def test_processor(feature_size, chunksize):
    """Tests FileProcessor.

    :param feature_size: int, size of feature vector.
    :param chunksize: int, FileProcessor chunk size.
    """
    sep = '\t'
    n_rows = 50
    feature = 3
    with TemporaryDirectory() as tmp_dir:
        input_path = os.path.join(tmp_dir, 'data.tsv')
        output_path = os.path.join(tmp_dir, 'data_proc.tsv')
        data, feature_values = generate_data(
            n_rows=n_rows,
            feature=feature,
            feature_size=feature_size,
            seed=42,
        )
        data.to_csv(input_path, sep=sep, index=False)

        reader_params = {
            'chunksize': chunksize,
            'sep': sep,
        }
        transformers = (
            Standardizer,
            MaxIndex,
            MaxFeatureAbsMeanDiff,
        )
        processor = FileProcessor(transformers, reader_params=reader_params)
        processor.train(input_path)
        processor.process(input_path, output_path)
        processed = pd.read_csv(output_path, sep=sep)

        # check feature_{i}_stand_{index}
        expected_stand = (
            (feature_values - feature_values.mean(axis=0))
            / feature_values.std(axis=0, ddof=1)
        )
        stand = processed.filter(regex=f'feature_{feature}_stand_[0-9]+')
        assert np.allclose(expected_stand, stand)
        assert np.allclose(stand.mean(axis=0), 0)
        assert np.allclose(stand.std(axis=0, ddof=1), 1)

        # check max_feature_{i}_index
        expected_max = feature_values.max(axis=1)
        max_index = processed[f'max_feature_{feature}_index'].values
        max_mask = (
            max_index.reshape((-1, 1))
            == np.arange(feature_values.shape[1]).reshape((1, -1))
        )
        fact_max = feature_values[max_mask]
        assert np.allclose(expected_max, fact_max)

        # check max_feature_{i}_abs_mean_diff
        expected_max_mean = np.broadcast_to(
            feature_values.mean(axis=0), shape=max_mask.shape
        )[max_mask]
        expected_abs_mean_diff = np.abs(expected_max - expected_max_mean)
        abs_mean_diff = processed[f'max_feature_{feature}_abs_mean_diff']
        assert np.allclose(expected_abs_mean_diff, abs_mean_diff)
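# How `feature_size` and `chunksize` reach test_processor is not shown here.
# A minimal sketch, assuming pytest parametrization supplies them (the
# parameter values below are illustrative, not from the original suite):
import pytest


@pytest.mark.parametrize('chunksize', [7, 25, 100])
@pytest.mark.parametrize('feature_size', [4, 64])
def test_processor_parametrized(feature_size, chunksize):
    # Delegate to the test above with concrete parameter values.
    test_processor(feature_size, chunksize)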
def backup(config, database):
    """Process directories to be backed up."""
    logger = logging.getLogger('mylog')
    clean_removed(config, database)
    if not consistency_check(config, database):
        logger.warning('Consistency check detected problems!')
        if not query_yes_no('Continue?'):
            sys.exit()

    dirs = config.get('Backup', 'to_backup').split()
    logger.info('Directories to backup: ' + ','.join(dirs))
    exclude_dirs = config.get('Backup', 'to_exclude').split()
    logger.info('Directories to exclude: ' + ','.join(exclude_dirs))

    file_proc = FileProcessor(config, database, encrypt_file)

    # Count files to show progress later
    total = 0
    for directory in dirs:
        for subdir, _, files in os.walk(directory):
            if subdir in exclude_dirs:
                continue
            total += len(files)

    count = 0
    for directory in dirs:
        logger.debug('Processing directory ' + directory)
        for subdir, _, files in os.walk(directory):
            if subdir in exclude_dirs:
                logger.debug('Skipping directory ' + subdir)
                continue
            for single_file in files:
                fpath = os.path.join(subdir, single_file)
                logger.debug('Processing file ' + fpath)
                logger.info(str(count * 100 // total) + ' %')
                file_proc.process(fpath)
                count += 1
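# `query_yes_no` is an external helper used by backup(); a minimal sketch of
# what such a prompt might look like (assumption, not the project's own code):
def query_yes_no(question):
    """Ask a yes/no question on stdin and return the answer as a bool."""
    while True:
        answer = input(question + ' [y/n] ').strip().lower()
        if answer in ('y', 'yes'):
            return True
        if answer in ('n', 'no'):
            return False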
@attr.s
class SecondPassCrawler:
    session = attr.ib()
    navigator = attr.ib(init=False)
    file_processor = attr.ib(init=False)

    def __attrs_post_init__(self):
        self.navigator = SeleniumNavigator(loading_strategy='none')
        self.file_processor = FileProcessor()

    def exit(self):
        self.navigator.close_browser()
        self.session.close()

    def get_urls(self):
        # results = self.session.query(Report).all()
        # return (report.url for report in results)
        # Hardcoded single report URL; the commented query above would pull
        # report URLs from the database instead.
        return ['http://media.ethics.ga.gov/search/Campaign/Campaign_ReportOptions.aspx?NameID=16067&FilerID=C2012000744&CDRID=59991']

    def add_scrapelog_to_db(self, url, content, dtime):
        slog = ScrapeLog(scrape_date=dtime, raw_data=content, page_url=url)
        try:
            self.session.add(slog)
            self.session.commit()
        except Exception as e:
            self.session.rollback()
            logging.info(e)

    def crawl_download_link(self):
        parser = CSVLinkParser(self.navigator.page_source())
        parsed_link = parser.parse()
        if parsed_link is not None:
            logging.info(f'Parsed link: {parsed_link}')
            url = self.navigator.get_current_url()
            self.navigator.click_link(parsed_link)
            logging.info('Clicking download link for csv file.')
            content, dtime = self.file_processor.process()
            self.add_scrapelog_to_db(url, content, dtime)
            self.file_processor.delete_csv()

    def crawl_view_contributions_ids(self):
        logging.info(f'Current page: {self.navigator.get_current_url()}')
        parser = ContributionsViewParser(self.navigator.page_source())
        parsed_link = parser.parse()
        if parsed_link is not None:
            logging.info(f'Parsed link: {parsed_link}')
            self.navigator.click_link(parsed_link)
            self.navigator.wait_for_csv_link()
            self.crawl_download_link()

    def crawl(self):
        urls = self.get_urls()
        for url in urls:
            logging.info(f'Current url: {url}')
            self.navigator.navigate(url)
            self.navigator.wait_for_contributions_id()
            self.crawl_view_contributions_ids()
header = True
for seed in tqdm(range(100), desc='generating data'):
    data, _ = generate_data(
        n_rows=100_000,
        feature=feature,
        feature_size=512,
        seed=seed,
    )
    data.to_csv(
        input_path,
        sep=sep,
        index=False,
        mode='a',
        header=header,
    )
    header = False

reader_params = {
    'chunksize': 10_000,
    'sep': sep,
}
transformers = (
    Standardizer,
    MaxIndex,
    MaxFeatureAbsMeanDiff,
)
processor = FileProcessor(transformers, reader_params=reader_params)
processor.train(input_path)
processor.process(input_path, output_path)
@attr.s
class SecondPassCrawler:
    session = attr.ib()
    navigator = attr.ib(init=False)
    file_processor = attr.ib(init=False)
    letter = attr.ib()

    def __attrs_post_init__(self):
        logging.info('attrs post init called')
        self.navigator = SeleniumNavigator(loading_strategy='none',
                                           letter=self.letter)
        self.file_processor = FileProcessor(letter=self.letter)

    def exit(self):
        self.navigator.close_browser()
        self.session.close()

    def get_urls(self):
        _ids = self.session.query(Candidate).filter(
            Candidate.Lastname.ilike('zorn')).all()
        # ids_ = \
        #     self.session.query(Candidate).filter(Lastname.like("%z%")).all()
        reports = []
        for _id in _ids:
            results = self.session.query(Report).filter_by(
                CandidateId=_id.CandidateId).all()
            logging.info(results)
            for result in results:
                reports.append((result.CandidateId, result.Url))
        return reports
        # return ['http://media.ethics.ga.gov/search/Campaign/Campaign_ReportOptions.aspx?NameID=16067&FilerID=C2012000744&CDRID=59991']

    def add_scrapelog_to_db(self, _id, url, content, dtime):
        slog = ScrapeLog(CandidateId=_id, ScrapeDate=dtime,
                         RawData=content, PageURL=url)
        try:
            self.session.add(slog)
            self.session.commit()
        except Exception as e:
            self.session.rollback()
            logging.info(e)

    def crawl_download_link(self, _id):
        parser = CSVLinkParser(self.navigator.page_source())
        parsed_link = parser.parse()
        if parsed_link is not None:
            logging.info(f'Parsed link: {parsed_link}')
            url = self.navigator.get_current_url()
            self.navigator.click_link(parsed_link)
            logging.info('Clicking download link for csv file.')
            content, dtime = self.file_processor.process()
            logging.info('Adding scrapelog to database')
            self.add_scrapelog_to_db(_id, url, content, dtime)
            self.file_processor.delete_csv()

    def crawl_view_contributions_ids(self, _id):
        logging.info(f'Current page: {self.navigator.get_current_url()}')
        parser = ContributionsViewParser(self.navigator.page_source())
        parsed_link = parser.parse()
        if parsed_link is not None:
            logging.info(f'Parsed link: {parsed_link}')
            self.navigator.click_link(parsed_link)
            self.navigator.wait_for_csv_link()
            self.crawl_download_link(_id)

    def crawl(self):
        for _id, url in self.get_urls():
            logging.info(f'Current url: {url}')
            self.navigator.navigate(url)
            self.navigator.wait_for_contributions_id()
            self.crawl_view_contributions_ids(_id)
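# A minimal usage sketch for the crawler above, assuming a SQLAlchemy session
# factory named `Session` is available (the factory name is an assumption):
def run_second_pass(letter):
    session = Session()
    crawler = SecondPassCrawler(session=session, letter=letter)
    try:
        crawler.crawl()
    finally:
        crawler.exit()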
parser.add_argument(
    '-o', '--output',
    default=os.path.join(DATA_DIR, OUTPUT_FILE),
    help='Path to file where processed data will be stored.',
)
parser.add_argument(
    '-c', '--chunk_size',
    type=int,
    default=128,
    help='Rows to process at a time.',
)
args = parser.parse_args()


if __name__ == '__main__':
    train_path = args.train
    test_path = args.input
    output_path = args.output

    reader_params = {
        'chunksize': args.chunk_size,
        'sep': SEPARATOR,
    }
    transformers = (
        Standardizer,
        MaxIndex,
        MaxFeatureAbsMeanDiff,
    )
    processor = FileProcessor(transformers, reader_params=reader_params)
    processor.train(train_path)
    processor.process(test_path, output_path)
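# The construction of `parser` and the arguments behind args.train and
# args.input happen earlier in the script and are not shown above; a plausible
# sketch of that front matter (the -t/-i flags and the DATA_DIR, TRAIN_FILE,
# INPUT_FILE constants are assumptions):
parser = argparse.ArgumentParser(
    description='Apply FileProcessor transformers to a TSV file.',
)
parser.add_argument(
    '-t', '--train',
    default=os.path.join(DATA_DIR, TRAIN_FILE),
    help='Path to file used to fit the transformers.',
)
parser.add_argument(
    '-i', '--input',
    default=os.path.join(DATA_DIR, INPUT_FILE),
    help='Path to file with data to process.',
)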