def test_generate_statistics_some_codes(self):
    needed_feature_codes = random.sample(self.feature_codes, k=2)
    file_processor = FileProcessor(self.input_file_path)
    actual_stats = file_processor.generate_statistics(needed_feature_codes)
    _, feature_codes, features = read_test_data_as_arrays(
        self.input_file_path)
    selected_indexes = []
    for code in needed_feature_codes:
        indexes = np.where(feature_codes == code)[0].tolist()
        selected_indexes += indexes
    selected_features = features[selected_indexes]
    expected_stats = {
        "count": np.uint32(selected_features.shape[0]),
        "mean": selected_features.mean(axis=0),
        "std": selected_features.std(axis=0, ddof=1),
        "max": np.amax(selected_features, axis=0),
        "min": np.amin(selected_features, axis=0)
    }
    for metric in expected_stats:
        np.testing.assert_allclose(actual_stats[metric],
                                   expected_stats[metric])
def main():
    fileConfig('logging_config.ini')
    db_util = DatabaseUtility()
    primary_ui = PrimaryUI()
    primary_ui.set_doc_names(db_util.get_saved_doc_names())
    file_processor = FileProcessor()
    while True:
        event, file_names = primary_ui.Read()
        if event is not None:
            # Process the data
            if file_names[0] != '':
                if event == primary_ui.IMPORT:
                    db_util.save_docs(
                        file_processor.convert_file_names_to_name_data_dict(
                            file_names[0]))
                # elif event == primary_ui.EXPORT:
        else:
            break
def __init__(self, input_file, input_dir, file_regex, latest, debug, recursive=False):
    """
    Return an instance of JsonFileProcessor.

    Parameters:
    input_file (string): file (full path) to check for data
    input_dir (string): directory (full path) to check for data files
    file_regex (string): regex matching the names of data files to process
    latest (Boolean): check for latest files only
    debug (Boolean): enable debug logging
    recursive (Boolean): check the search directory recursively

    """
    self.debug = debug
    root_logger.info("Debug: %s", debug)
    if input_file:
        self.file_names = FileProcessor.match_file(input_file, file_regex)
        root_logger.info("Found %d json files for %s in %s",
                         self.file_count(), file_regex, input_file)
    if input_dir:
        self.file_names = FileProcessor.dir_to_files(
            input_dir, file_regex, latest, recursive)
        root_logger.info("Found %d json files for %s in %s",
                         self.file_count(), file_regex, input_dir)
def test_run(self):
    mock_repo = mock.Mock(spec=FileRepoABC)
    mock_repo.get_files.return_value = [BytesIO(b"foo"), BytesIO(b"bar")]
    fp = FileProcessor(mock_repo, "some/path")

    results = fp.run()

    self.assertEqual(results, ["acbd18db4cc2f85cedef654fccc4a4d8",
                               "37b51d194a7513e45b56f6524f2d51f2"])
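# A minimal sketch of the behaviour the test above exercises: read every file
# the repository yields and return the MD5 hex digest of each one's contents,
# in order. The class name, body, and the no-argument get_files call are
# assumptions drawn from the test, not a confirmed implementation.
import hashlib


class FileProcessorSketch:
    def __init__(self, repo, path):
        self.repo = repo
        self.path = path

    def run(self):
        return [hashlib.md5(file_obj.read()).hexdigest()
                for file_obj in self.repo.get_files()]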
def test_transform_features_all_codes(self):
    file_processor = FileProcessor(self.input_file_path)
    file_processor.transform_features(self.output_file_path)
    job_ids, feature_codes, features = read_test_data_as_arrays(
        self.input_file_path)
    means = features.mean(axis=0)
    expected_z_scores = stats.zscore(
        features, axis=0, ddof=1)  # using sample standard deviation
    expected_argmaxs = np.argmax(features, axis=1)
    expected_maxs = np.amax(features, axis=1)
    expected_diffs = np.abs(expected_maxs - means[expected_argmaxs])
    line_index = 0
    with open(self.output_file_path) as f:
        next(f)  # skip the header line
        for line in f:
            job_id, z_scores, argmax, diff = _parse_transformed_line(line)
            np.testing.assert_equal(job_id, job_ids[line_index])
            np.testing.assert_allclose(z_scores, expected_z_scores[line_index])
            np.testing.assert_equal(argmax, expected_argmaxs[line_index])
            np.testing.assert_almost_equal(diff, expected_diffs[line_index])
            line_index += 1
def main(): """ Use custom file processor to pull bcket hashes """ gcs_repo = GCSFileRepo(APP_CREDENTIALS_JSON, COVID_BUCKET) fp = FileProcessor(gcs_repo, None) gcs_bucket_hashes = fp.run() print(gcs_bucket_hashes)
def __init__(self, input_file, input_dir, db_params_dict, metric, debug): """Return a new instance of FitBitData given the location of the data files, paramters for accessing the database, and if the data should be stored in metric units.""" self.metric = metric self.fitbitdb = FitBitDB.FitBitDB(db_params_dict, debug) if input_file: self.file_names = FileProcessor.match_file(input_file, r'.*\.csv') if input_dir: self.file_names = FileProcessor.dir_to_files(input_dir, r'.*\.csv')
@attr.s
class SecondPassCrawler:

    session = attr.ib()
    navigator = attr.ib(init=False)
    file_processor = attr.ib(init=False)

    def __attrs_post_init__(self):
        self.navigator = SeleniumNavigator(loading_strategy='none')
        self.file_processor = FileProcessor()

    def exit(self):
        self.navigator.close_browser()
        self.session.close()

    def get_urls(self):
        # results = self.session.query(Report).all()
        # return (report.url for report in results)
        return ['http://media.ethics.ga.gov/search/Campaign/Campaign_ReportOptions.aspx?NameID=16067&FilerID=C2012000744&CDRID=59991']

    def add_scrapelog_to_db(self, url, content, dtime):
        slog = ScrapeLog(scrape_date=dtime, raw_data=content, page_url=url)
        try:
            self.session.add(slog)
            self.session.commit()
        except Exception as e:
            self.session.rollback()
            logging.info(e)

    def crawl_download_link(self):
        parser = CSVLinkParser(self.navigator.page_source())
        parsed_link = parser.parse()
        if parsed_link is not None:
            logging.info(f'Parsed link: {parsed_link}')
            url = self.navigator.get_current_url()
            self.navigator.click_link(parsed_link)
            logging.info('Clicking download link for csv file.')
            content, dtime = self.file_processor.process()
            self.add_scrapelog_to_db(url, content, dtime)
            self.file_processor.delete_csv()

    def crawl_view_contributions_ids(self):
        logging.info(f'Current page: {self.navigator.get_current_url()}')
        parser = ContributionsViewParser(self.navigator.page_source())
        parsed_link = parser.parse()
        if parsed_link is not None:
            logging.info(f'Parsed link: {parsed_link}')
            self.navigator.click_link(parsed_link)
            self.navigator.wait_for_csv_link()
            self.crawl_download_link()

    def crawl(self):
        urls = self.get_urls()
        for url in urls:
            logging.info(f'Current url: {url}')
            self.navigator.navigate(url)
            self.navigator.wait_for_contributions_id()
            self.crawl_view_contributions_ids()
def __init__(self, input_file, input_dir, db_params_dict, metric, debug):
    self.metric = metric
    self.mshealth_db = MSHealthDB.MSHealthDB(db_params_dict, debug)
    if input_file:
        self.file_names = FileProcessor.match_file(
            input_file, r'Daily_Summary_.*\.csv')
    if input_dir:
        self.file_names = FileProcessor.dir_to_files(
            input_dir, r'Daily_Summary_.*\.csv')
def main(): """ :return: """ file_processor = FileProcessor("") db_stock_data = StockData(config['postgresql']['host'], config['postgresql']['database']) # get stock symbols from data file # pharma_stocks = file_processor.read_file("lookup_data/pharma_stocks") fortune_500_0_stocks = file_processor.read_file("lookup_data/fortune_500_0") s_p_500_data = pd.read_csv("lookup_data/s_p_500.csv", delimiter=',') s_p_500_data_stocks = s_p_500_data['stock'].tolist() stock_data_list = [] dt_utc_now = datetime.utcnow() stocks = s_p_500_data_stocks + fortune_500_0_stocks invalid_data = [] print('getting data from yahoo . . .') for stock in stocks: stock_data = get_stock_info(stock, dt_utc_now) if stock_data: stock_data_list.append(stock_data) else: invalid_data.append(stock) file_processor.save_file('data/fortune_500_0.json',str(stock_data_list)) print('adding to database . . .') for stock_data in stock_data_list: try: db_stock_data.insert_stock_data(stock_data) except Exception as ex: # todo log exception invalid_data.append(stock_data.name) continue print('invalid_data:') print(invalid_data) # ['AET', 'APC', 'ANDV', 'BHGE', 'BBT', 'BF.B', 'CA', 'CSRA', 'DWDP', 'DPS', 'EVHC', # 'ESRX', 'GGP', 'HCP', 'LLL', 'LUK', 'KORS', 'MON', 'NFX', 'PX', 'RHT', 'COL', # 'SCG', 'SYMC', 'TWX', 'TMK', 'TSS', 'WYN', 'XL'] # ['AET', 'APC', 'ANDV', 'BHGE', 'BBT', 'BF.B', 'CA', 'CSRA', 'DWDP', 'DPS', 'EVHC', # 'ESRX', 'GGP', 'HRS', 'HCP', 'LLL', 'KORS', 'MON', 'NFX', 'PX', 'RHT', 'COL', 'SCG', # 'TWX', 'TMK', 'TSS', 'WYN', 'XL', 'BRK.B', 'GD', 'JEC', 'LUK', 'MCK', 'CRM', # 'STI', 'SYMC', 'VIAB', 'MCK', 'CRM'] print('total records added: {0}'.format(len(stock_data_list)))
def __init__(self, input_file, input_dir, db_params_dict, metric, debug): """Return an instance of MSHealthData given an input file or files and information on the databse to put it in.""" self.metric = metric self.mshealth_db = MSHealthDB.MSHealthDB(db_params_dict, debug) if input_file: self.file_names = FileProcessor.match_file( input_file, r'Daily_Summary_.*\.csv') if input_dir: self.file_names = FileProcessor.dir_to_files( input_dir, r'Daily_Summary_.*\.csv')
def test_zip_bytes_io(s3):
    file_processor = FileProcessor({}, {}, s3)

    actual = file_processor._zip_bytes_io(pytest.bucket_origin, pytest.key)

    assert type(actual) is BytesIO
    with ZipFile(actual) as zip_obj:
        assert zip_obj.namelist() == [pytest.file_zipped]
        with zip_obj.open(pytest.file_zipped) as file_obj:
            assert file_obj.read() == pytest.body_zipped.encode()
def test_upload_file_obj(s3, s3_prefix):
    bytes_io = BytesIO()
    key = f'{s3_prefix}/{pytest.file_zipped}'
    os.environ['BUCKET_PROCESSED'] = pytest.bucket_processed
    file_processor = FileProcessor({}, {}, s3)
    zip_bytes_io = file_processor._zip_bytes_io(pytest.bucket_origin, pytest.key)

    file_processor._upload_file_obj(zip_bytes_io)

    s3.download_fileobj(pytest.bucket_processed, key, bytes_io)
    assert type(bytes_io) is BytesIO
    assert bytes_io.getvalue() == pytest.body_zipped.encode()
def test_valid_bucket_key_list(event):
    file_processor = FileProcessor(event, {}, {})

    actual = file_processor._bucket_key_list()

    assert type(actual) is list
    for i in range(len(event)):
        assert 'bucket' in actual[i]
        assert actual[i]['bucket'] == event['Records'][i]['s3']['bucket']['name']
        assert 'key' in actual[i]
        assert actual[i]['key'].startswith('test')
        assert actual[i]['key'].endswith('.zip')
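# A hedged sketch of the _bucket_key_list helper the test above exercises: it is
# assumed to flatten an S3 event notification into a list of
# {'bucket': ..., 'key': ...} dicts. The standard S3 event layout
# (Records -> s3 -> bucket/object) is an assumption; only the bucket-name path
# is confirmed by the test itself.
def _bucket_key_list_sketch(event):
    return [
        {
            'bucket': record['s3']['bucket']['name'],
            'key': record['s3']['object']['key'],
        }
        for record in event['Records']
    ]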
def __init__(self, input_file, input_dir, db_params_dict, metric, debug):
    self.metric = metric
    self.mshealth_db = MSHealthDB.MSHealthDB(db_params_dict, debug)
    self.cols_map = {
        'Date': ('timestamp', CsvImporter.map_mdy_date),
        'Weight': ('weight', MSVaultData.__map_weight),
    }
    if input_file:
        self.file_names = FileProcessor.match_file(
            input_file, r'HealthVault_Weight_.*\.csv')
    if input_dir:
        self.file_names = FileProcessor.dir_to_files(
            input_dir, r'HealthVault_Weight_.*\.csv')
def __init__(self, input_file, input_dir, db_params_dict, metric, debug): """Return an instance of MSVaultData given an input file or files and information on the databse to put it in.""" self.metric = metric self.mshealth_db = MSHealthDB.MSHealthDB(db_params_dict, debug) self.cols_map = { 'Date': ('timestamp', CsvImporter.map_mdy_date), 'Weight': ('weight', MSVaultData.__map_weight), } if input_file: self.file_names = FileProcessor.match_file( input_file, r'HealthVault_Weight_.*\.csv') if input_dir: self.file_names = FileProcessor.dir_to_files( input_dir, r'HealthVault_Weight_.*\.csv')
def test_run_returns_md5_of_contents(self):
    with mock.patch('file_processor.storage'):
        fp = FileProcessor('some/path')
        with mock.patch.object(fp, 'bucket') as mock_bucket:
            mock_blob = mock.Mock(name='mock_blob')

            def mock_download_to_file(bytez):
                bytez.write(b'foobarbaz')

            mock_blob.download_to_file = mock_download_to_file
            mock_bucket.list_blobs.return_value = [mock_blob]

            # WHEN
            results = fp.run()

            # THEN
            self.assertEqual(results, ['6df23dc03f9b64cc38a0fc1483df6e21'])
def test_transform_features_some_codes(self):
    selected_codes = random.sample(self.feature_codes, k=2)
    file_processor = FileProcessor(self.input_file_path)
    file_processor.transform_features(self.output_file_path, selected_codes)
    job_ids, feature_codes, features = read_test_data_as_arrays(
        self.input_file_path)
    selected_indexes = []
    for code in selected_codes:
        indexes = np.where(feature_codes == code)[0].tolist()
        selected_indexes += indexes
    selected_indexes = sorted(selected_indexes)
    selected_job_ids = job_ids[selected_indexes]
    selected_features = features[selected_indexes]
    means = selected_features.mean(axis=0)
    expected_z_scores = stats.zscore(
        selected_features, axis=0, ddof=1)  # using sample standard deviation
    expected_argmaxs = np.argmax(selected_features, axis=1)
    expected_maxs = np.amax(selected_features, axis=1)
    expected_diffs = np.abs(expected_maxs - means[expected_argmaxs])
    line_index = 0
    with open(self.output_file_path) as f:
        next(f)  # skip the header line
        for line in f:
            job_id, z_scores, argmax, diff = _parse_transformed_line(line)
            np.testing.assert_equal(job_id, selected_job_ids[line_index])
            np.testing.assert_allclose(z_scores, expected_z_scores[line_index])
            np.testing.assert_equal(argmax, expected_argmaxs[line_index])
            np.testing.assert_almost_equal(diff, expected_diffs[line_index])
            line_index += 1
    self.assertEqual(line_index, len(selected_indexes))
def backup(config, database):
    """Process directories to be backed up."""
    logger = logging.getLogger('mylog')
    clean_removed(config, database)
    if not consistency_check(config, database):
        logger.warning('Consistency check detected problems!')
        if not query_yes_no('Continue?'):
            sys.exit()
    dirs = config.get('Backup', 'to_backup').split()
    logger.info('Directories to backup: ' + ','.join(dirs))
    exclude_dirs = config.get('Backup', 'to_exclude').split()
    logger.info('Directories to exclude: ' + ','.join(exclude_dirs))
    file_proc = FileProcessor(config, database, encrypt_file)

    # Count files to show progress later
    total = 0
    for directory in dirs:
        for subdir, _, files in os.walk(directory):
            if subdir in exclude_dirs:
                continue
            total += len(files)

    count = 0
    for directory in dirs:
        logger.debug('Processing directory ' + directory)
        for subdir, _, files in os.walk(directory):
            if subdir in exclude_dirs:
                logger.debug('Skipping directory ' + subdir)
                continue
            for single_file in files:
                fpath = os.path.join(subdir, single_file)
                logger.debug('Processing file ' + fpath)
                logger.info(str((count * 100) / total) + ' %')
                file_proc.process(fpath)
                count = count + 1
def __init__(self, input_dir, latest, measurement_system, debug):
    logger.info("Processing daily FIT data")
    self.measurement_system = measurement_system
    self.debug = debug
    if input_dir:
        self.file_names = FileProcessor.dir_to_files(
            input_dir, Fit.file.name_regex, latest, True)
def main():
    print('IN Code Generator started....\nAnalysing input file...')
    input_file_path, package_name = parse_args()
    input_file_contents = FileProcessor.get_file_contents(input_file_path)
    if input_file_contents:
        file_contents: [FileContent] = CodeProcessor.process_file_contents(input_file_contents)
        FileProcessor.remove_output_dir()
        for file_content in file_contents:
            file_content.package_name = package_name
            if file_content:
                file_exports: FileProcessor = FileProcessor(file_content)
                file_exports.process_and_export_templates()
def copy_monitoring(self, monitoring_dir, latest):
    """Copy daily monitoring data FIT files from a USB mounted Garmin device to the given directory."""
    device_monitoring_dir = GarminDBConfigManager.device_monitoring_dir(self.device_mount_dir)
    logger.info("Copying monitoring files from %s to %s", device_monitoring_dir, monitoring_dir)
    file_names = FileProcessor.dir_to_files(device_monitoring_dir, Fit.file.name_regex, latest)
    for file in progressbar.progressbar(file_names):
        shutil.copy(file, monitoring_dir)
def process_input(self, f_name, query):
    """
    @summary: Processes the input file in chunks using the FileProcessor class
              and returns the matches found for the query
    @param f_name: file name to be processed
    @param query: query to be searched
    @return: dictionary with the query as the key and the list of top matching
             elements as the value
    """
    fp_obj = FileProcessor()
    data_leftover = ""
    with open(f_name) as fp:
        for chunk in fp_obj.read_in_chunks(fp, config.CHUNK_SIZE):
            curr_list, data_leftover = self.convert_to_list(
                (data_leftover + chunk).strip('[]'))
            self.prefix_match(curr_list, query)
    return self.convert_to_dict(self.result_list, query)
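# A minimal sketch of the chunked reader consumed above, assuming read_in_chunks
# is a generator that yields successive fixed-size string chunks until EOF. This
# mirrors how process_input uses it; the real FileProcessor may differ.
class ChunkedFileProcessorSketch:
    def read_in_chunks(self, file_obj, chunk_size=1024):
        """Yield successive chunks of at most chunk_size characters."""
        while True:
            chunk = file_obj.read(chunk_size)
            if not chunk:
                break
            yield chunk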
def test_generate_statistics_all_codes(self):
    file_processor = FileProcessor(self.input_file_path)
    actual_stats = file_processor.generate_statistics()
    _, _, features = read_test_data_as_arrays(self.input_file_path)
    expected_stats = {
        "count": np.uint32(features.shape[0]),
        "mean": features.mean(axis=0),
        "std": features.std(axis=0, ddof=1),
        "max": np.amax(features, axis=0),
        "min": np.amin(features, axis=0)
    }
    for metric in expected_stats:
        np.testing.assert_allclose(actual_stats[metric],
                                   expected_stats[metric])
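# Both generate_statistics tests above build their expected values the same way;
# a small helper capturing that shape, for reference only (not part of the module
# under test). Note "std" uses ddof=1, i.e. the sample standard deviation.
import numpy as np


def expected_feature_statistics(features):
    return {
        "count": np.uint32(features.shape[0]),
        "mean": features.mean(axis=0),
        "std": features.std(axis=0, ddof=1),
        "max": np.amax(features, axis=0),
        "min": np.amin(features, axis=0),
    }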
class TestFileProcessor(TestCase):

    def setUp(self):
        self.processor = FileProcessor()

    def getFakeFile(self, string):
        fake_file = FakeFile()
        fake_file.write(string)
        fake_file.seek(0)
        return fake_file

    def test_readFile_returns_list_of_words_from_file(self):
        test_str = "Test String"
        self.assertTrue(
            isinstance(self.processor.readFile(self.getFakeFile(test_str)),
                       list))

    def test_readFile_returns_list_of_words_from_file_splitted_by_space_by_default(
            self):
        test_str = "Test String"
        self.assertEqual(self.processor.readFile(self.getFakeFile(test_str)),
                         [["Test", "String"]])

    def test_readFile_returns_list_of_words_from_file_splitted_by_tab(self):
        test_str = "Test\tString"
        self.assertEqual(
            self.processor.readFile(self.getFakeFile(test_str), "\t"),
            [["Test", "String"]])

    def test_readFile_returns_comma_separated_values_as_list(self):
        test_str = "test,str"
        self.assertEqual(
            self.processor.readFile(self.getFakeFile(test_str), ","),
            [["test", "str"]])

    def test_readFile_returns_each_line_as_new_list(self):
        test_str = "test,this\nline,str"
        self.assertEqual(
            self.processor.readFile(self.getFakeFile(test_str), ","),
            [["test", "this"], ["line", "str"]])

    def test_retrieveData_returns_dict_of_variables_and_last_element_as_desired_class(
            self):
        test_list = ["test_var1", "test_var2", "test_var3", "90", "test_class"]
        (data, des_class) = self.processor.retrieveData(test_list)
        self.assertTrue(isinstance(data, dict))
        self.assertEqual(des_class, "test_class")

    def test_retrieveData_returns_string_and_number_variables_in_separate_list(
            self):
        test_list = [
            "test_var1", "123.5samp", "sadkf123", "90", "test_var2",
            "3434.134", "test_class"
        ]
        (data, des_class) = self.processor.retrieveData(test_list)
        self.assertEqual(
            data, {
                "string": ["test_var1", "123.5samp", "sadkf123", "test_var2"],
                "num": [90, 3434.134]
            })
def test_processor(feature_size, chunksize):
    """Tests FileProcessor.

    :param feature_size: int, size of feature vector.
    :param chunksize: int, FileProcessor chunk size.
    """
    sep = '\t'
    n_rows = 50
    feature = 3
    with TemporaryDirectory() as tmp_dir:
        input_path = os.path.join(tmp_dir, 'data.tsv')
        output_path = os.path.join(tmp_dir, 'data_proc.tsv')
        data, feature_values = generate_data(n_rows=n_rows,
                                             feature=feature,
                                             feature_size=feature_size,
                                             seed=42)
        data.to_csv(input_path, sep=sep, index=False)

        reader_params = {
            'chunksize': chunksize,
            'sep': sep,
        }
        transformers = (
            Standardizer,
            MaxIndex,
            MaxFeatureAbsMeanDiff,
        )
        processor = FileProcessor(transformers, reader_params=reader_params)
        processor.train(input_path)
        processor.process(input_path, output_path)

        processed = pd.read_csv(output_path, sep=sep)

        # check feature_{i}_stand_{index}
        expected_stand = ((feature_values - feature_values.mean(axis=0)) /
                          feature_values.std(axis=0, ddof=1))
        stand = processed.filter(regex=f'feature_{feature}_stand_[0-9]+')
        assert np.allclose(expected_stand, stand)
        assert np.allclose(stand.mean(axis=0), 0)
        assert np.allclose(stand.std(axis=0, ddof=1), 1)

        # check max_feature_{i}_index
        expected_max = feature_values.max(axis=1)
        max_index = processed[f'max_feature_{feature}_index'].values
        max_mask = (max_index.reshape((-1, 1)) ==
                    np.arange(feature_values.shape[1]).reshape((1, -1)))
        fact_max = feature_values[max_mask]
        assert np.allclose(expected_max, fact_max)

        # check max_feature_{i}_abs_mean_diff
        expected_max_mean = np.broadcast_to(feature_values.mean(axis=0),
                                            shape=max_mask.shape)[max_mask]
        expected_abs_mean_diff = np.abs(expected_max - expected_max_mean)
        abs_mean_diff = processed[f'max_feature_{feature}_abs_mean_diff']
        assert np.allclose(expected_abs_mean_diff, abs_mean_diff)
class FileHandler(PatternMatchingEventHandler):

    def __init__(self):
        PatternMatchingEventHandler.__init__(self, patterns=["*.txt"])
        self.file_processor = FileProcessor()

    def on_created(self, event):
        logging.info("New file: {file}".format(file=event.src_path))
        self.process(event)

    def process(self, event):
        filename = event.src_path
        data = self.file_processor.parse_data(filename)
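# A hedged usage sketch showing how a handler like FileHandler above is typically
# wired into a watchdog observer. The watched path 'incoming/' is a hypothetical
# example, not taken from the original project.
import time
from watchdog.observers import Observer

observer = Observer()
observer.schedule(FileHandler(), path='incoming/', recursive=False)
observer.start()
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    observer.stop()
observer.join()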
def test_check_lookup(self):
    expected_result = FileProcessor.get_rate_code_lookup(TEST_ZIP_FILE)

    # Check that inputs are processed correctly and ZIP codes map to the right region
    self.assertEqual(expected_result['36749'], 'AL-11')
    self.assertEqual(expected_result['36703'], 'AL-11')
    self.assertEqual(expected_result['84310'], 'UT-2')
    # Check that leading zeros were maintained
    self.assertEqual(expected_result['05770'], 'VT-1')
    # Check that conflicted rows were dropped
    self.assertIsNone(expected_result.get('84409'))
def __init__(self, input_dir, latest, measurement_system, debug): """ Return an instance of GarminMonitoringFitData. Parameters: input_dir (string): directory (full path) to check for monitoring data files latest (Boolean): check for latest files only measurement_system (enum): which measurement system to use when importing the files debug (Boolean): enable debug logging """ logger.info("Processing daily FIT data") self.measurement_system = measurement_system self.debug = debug if input_dir: self.file_names = FileProcessor.dir_to_files( input_dir, Fit.file.name_regex, latest, True)
def __init__(self, input_dir, latest, measurement_system, debug): """ Return an instance of GarminTcxData. Parameters: db_params_dict (dict): configuration data for accessing the database input_dir (string): directory (full path) to check for data files latest (Boolean): check for latest files only measurement_system (enum): which measurement system to use when importing the files debug (Boolean): enable debug logging """ logger.debug("Processing activities tcx data") self.measurement_system = measurement_system self.debug = debug if input_dir: self.file_names = FileProcessor.dir_to_files( input_dir, self.tcx_filename_regex, latest)
def test_find_second_lowest(self):
    """Test that the second-lowest lookup works on a small set of test data."""
    expected_result = FileProcessor.get_second_lowest_cost_lookup(
        TEST_PLAN_FILE)

    # Check that the two regions that should have second-best rates are included and correct
    self.assertEqual(expected_result['GA-7'], 300.62)
    self.assertEqual(expected_result['MI-4'], 150.6767)
    # Check that when two plans tie for the lowest rate, the third item is returned
    # as the second lowest (see assumption in comments)
    self.assertEqual(expected_result['TX-15'], 260.05)
    # Check that a region with only one silver plan isn't added to the final dictionary
    self.assertNotIn('FL-60', expected_result)
    # Check that a region with no silver plans is not added to the final dictionary
    self.assertNotIn('IL-5', expected_result)
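# A hedged sketch of the second-lowest logic the test above implies: per rate
# area, keep only silver plans, deduplicate rates, and report the second-lowest
# distinct rate when at least two exist. The column names ('rate_area',
# 'metal_level', 'rate') and the assumption that 'rate_area' already holds the
# combined 'STATE-number' code are guesses about the plan file layout.
import csv
from collections import defaultdict


def second_lowest_silver_rates_sketch(plan_file_path):
    rates = defaultdict(set)
    with open(plan_file_path, newline='') as f:
        for row in csv.DictReader(f):
            if row['metal_level'].strip().lower() == 'silver':
                rates[row['rate_area']].add(float(row['rate']))
    return {area: sorted(values)[1]
            for area, values in rates.items() if len(values) >= 2}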
def file_read(filename):
    fp = FileProcessor(filename, ' ')
    return fp.get_lines_as_array()
def __init__(self):
    PatternMatchingEventHandler.__init__(self, patterns=["*.txt"])
    self.file_processor = FileProcessor()