def test_errors_csv(self, mock_check_cron):
    folder_prefix = 'dummy-prefix-2018-03-22/'
    test_util.write_cloud_str(self.hpo_bucket, folder_prefix + 'person.csv',
                              ".\n .,.,.")
    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)

        # check the result file was put in bucket
        list_bucket_result = gcs_utils.list_bucket(self.hpo_bucket)
        bucket_item_names = [
            item['name'] for item in list_bucket_result
            if item['name'].startswith(folder_prefix)
        ]
        expected_items = ['person.csv'] + common.IGNORE_LIST
        expected_items = [folder_prefix + item for item in expected_items]
        self.assertSetEqual(set(bucket_item_names), set(expected_items))

        # check content of the file is correct
        actual_result = test_util.read_cloud_file(
            self.hpo_bucket, folder_prefix + common.ERRORS_CSV)
        with open(test_util.BAD_PERSON_FILE_BQ_LOAD_ERRORS_CSV, 'r') as f:
            expected = f.read()
        self.assertEqual(expected, actual_result)

def test_all_files_unparseable_output(self, mock_check_cron):
    # TODO possible bug: if no pre-existing table, results in bq table not found error
    folder_prefix = 'dummy-prefix-2018-03-22/'
    for cdm_table in common.CDM_FILES:
        test_util.write_cloud_str(self.hpo_bucket, folder_prefix + cdm_table,
                                  ".\n .")
    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)

        # check the result file was put in bucket
        list_bucket_result = gcs_utils.list_bucket(self.hpo_bucket)
        bucket_item_names = [
            item['name'] for item in list_bucket_result
            if item['name'].startswith(folder_prefix)
        ]
        expected_items = common.CDM_FILES + common.IGNORE_LIST
        expected_items = [
            folder_prefix + item_name for item_name in expected_items
        ]
        self.assertSetEqual(set(bucket_item_names), set(expected_items))

        # check content of the file is correct
        actual_result = test_util.read_cloud_file(
            self.hpo_bucket, folder_prefix + common.RESULT_CSV)
        actual_result = resources._csv_file_to_list(
            StringIO.StringIO(actual_result))
        expected = [{
            'cdm_file_name': cdm_file_name,
            'found': '1',
            'parsed': '0',
            'loaded': '0'
        } for cdm_file_name in common.CDM_FILES]
        self.assertEqual(expected, actual_result)

def test_bad_file_names(self, mock_check_cron):
    folder_prefix = 'dummy-prefix-2018-03-22/'
    exclude_file_list = ["person_final.csv",
                         "condition_occurence.csv",  # misspelled
                         "avisit_occurrence.csv",
                         "procedure_occurrence.tsv"]  # unsupported file extension
    exclude_file_list = [folder_prefix + item for item in exclude_file_list]
    expected_result_items = []
    for file_name in exclude_file_list:
        test_util.write_cloud_str(self.hpo_bucket, file_name, ".")
        expected_item = dict(file_name=file_name.split('/')[1],
                             message=main.UNKNOWN_FILE)
        expected_result_items.append(expected_item)
    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)

        # check content of the bucket is correct
        expected_bucket_items = exclude_file_list + [
            folder_prefix + item for item in common.IGNORE_LIST
        ]  # [common.RESULT_CSV, common.WARNINGS_CSV]
        list_bucket_result = gcs_utils.list_bucket(self.hpo_bucket)
        actual_bucket_items = [item['name'] for item in list_bucket_result]
        self.assertSetEqual(set(expected_bucket_items), set(actual_bucket_items))

        # check content of the warnings file is correct
        actual_result = test_util.read_cloud_file(
            self.hpo_bucket, folder_prefix + common.WARNINGS_CSV)
        actual_result_file = StringIO.StringIO(actual_result)
        actual_result_items = resources._csv_file_to_list(actual_result_file)
        # sort in order to compare
        expected_result_items.sort()
        actual_result_items.sort()
        self.assertListEqual(expected_result_items, actual_result_items)

def _test_validate_missing_files_output(self, mock_check_cron):
    # enable exception propagation as described at https://goo.gl/LqDgnj
    folder_prefix = 'dummy-prefix-2018-03-22/'
    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)

        # check the result files were placed in bucket
        bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
        item_names = []
        for item in bucket_items:
            item_names.append(item['name'])
        for ignore_file in common.IGNORE_LIST:
            self.assertIn(folder_prefix + ignore_file, item_names)

        # check content of result.csv is correct
        # TODO fix this for all cdm files and use object comparison
        actual_result = test_util.read_cloud_file(
            self.hpo_bucket, folder_prefix + common.RESULT_CSV)
        actual = resources._csv_file_to_list(
            StringIO.StringIO(actual_result))
        expected = [{
            'cdm_file_name': cdm_file_name,
            'found': '0',
            'parsed': '0',
            'loaded': '0'
        } for cdm_file_name in common.REQUIRED_FILES]
        self.assertEqual(expected, actual)
        self.assertFalse(
            main.all_required_files_loaded(test_util.FAKE_HPO_ID,
                                           folder_prefix))

def test_errors_csv(self, mock_check_cron):
    folder_prefix = 'dummy-prefix-2018-03-22/'
    test_util.write_cloud_str(self.hpo_bucket, folder_prefix + 'person.csv',
                              ".\n .,.,.")
    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)

        # check the result file was put in bucket
        list_bucket_result = gcs_utils.list_bucket(self.hpo_bucket)
        bucket_item_names = [
            item['name'] for item in list_bucket_result
            if item['name'].startswith(folder_prefix)
        ]
        expected_items = ['person.csv'] + common.IGNORE_LIST
        expected_items = [folder_prefix + item for item in expected_items]
        self.assertSetEqual(set(bucket_item_names), set(expected_items))

        # check content of the file is correct
        actual_result = test_util.read_cloud_file(
            self.hpo_bucket, folder_prefix + common.ERRORS_CSV)
        actual = resources._csv_file_to_list(
            StringIO.StringIO(actual_result))
        for row in actual:
            row.pop('message', None)
        expected = [{'file_name': 'person.csv', 'type': 'error'}]
        self.assertEqual(actual, expected)

def test_validate_five_persons_success(self, mock_check_cron):
    prefix = 'dummy-prefix-2018-03-22/'
    expected_result_items = resources._csv_to_list(
        test_util.FIVE_PERSONS_SUCCESS_RESULT_CSV)
    json_export_files = self.get_json_export_files(test_util.FAKE_HPO_ID)

    # upload all five_persons files
    for cdm_file in test_util.FIVE_PERSONS_FILES:
        test_util.write_cloud_file(self.hpo_bucket, cdm_file, prefix=prefix)
    expected_tables = [
        'person', 'visit_occurrence', 'condition_occurrence',
        'procedure_occurrence', 'drug_exposure', 'measurement'
    ]
    cdm_files = [table + '.csv' for table in expected_tables]

    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)

        # check the result file was put in bucket
        expected_object_names = cdm_files + common.IGNORE_LIST + json_export_files
        expected_objects = [prefix + item for item in expected_object_names]
        list_bucket_result = gcs_utils.list_bucket(self.hpo_bucket)
        actual_objects = [item['name'] for item in list_bucket_result]
        self.assertSetEqual(set(expected_objects), set(actual_objects))

        # result says file found, parsed, loaded
        actual_result = test_util.read_cloud_file(
            self.hpo_bucket, prefix + common.RESULT_CSV)
        actual_result_file = StringIO.StringIO(actual_result)
        actual_result_items = resources._csv_file_to_list(actual_result_file)
        expected_result_items.sort()
        actual_result_items.sort()
        self.assertListEqual(expected_result_items, actual_result_items)
        self.assertTrue(
            main.all_required_files_loaded(test_util.FAKE_HPO_ID,
                                           folder_prefix=prefix))

        # check tables exist and are clustered as expected
        for table in expected_tables:
            fields_file = os.path.join(resources.fields_path, table + '.json')
            table_id = bq_utils.get_table_id(test_util.FAKE_HPO_ID, table)
            table_info = bq_utils.get_table_info(table_id)
            with open(fields_file, 'r') as fp:
                fields = json.load(fp)
                field_names = [field['name'] for field in fields]
                if 'person_id' in field_names:
                    self.table_has_clustering(table_info)

def test_pii_files_ignore(self, mock_check_cron):
    folder_prefix = 'dummy-prefix-2018-03-22/'
    test_util.write_cloud_str(self.hpo_bucket,
                              folder_prefix + 'pii_person.csv',
                              contents_str='.')
    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)
        actual_result = test_util.read_cloud_file(
            self.hpo_bucket, folder_prefix + common.WARNINGS_CSV)
        with open(test_util.EMPTY_WARNINGS_CSV, 'r') as f:
            expected = f.read()
        self.assertEqual(expected, actual_result)

def test_validate_five_persons_success(self, mock_check_cron):
    prefix = 'dummy-prefix-2018-03-22/'
    expected_result_items = resources._csv_to_list(
        test_util.FIVE_PERSONS_SUCCESS_RESULT_CSV)
    json_export_files = self.get_json_export_files(test_util.FAKE_HPO_ID)

    # upload all five_persons files
    for cdm_file in test_util.FIVE_PERSONS_FILES:
        test_util.write_cloud_file(self.hpo_bucket, cdm_file, prefix=prefix)

    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)

        # check the result file was put in bucket
        expected_bucket_items = common.REQUIRED_FILES + common.IGNORE_LIST + json_export_files
        # keep this test unchanged by adding all of the previously required files
        expected_bucket_items = expected_bucket_items + [
            'measurement.csv', 'procedure_occurrence.csv', 'drug_exposure.csv',
            'condition_occurrence.csv', 'visit_occurrence.csv'
        ]
        expected_bucket_items = [prefix + item for item in expected_bucket_items]

        list_bucket_result = gcs_utils.list_bucket(self.hpo_bucket)
        actual_bucket_items = [item['name'] for item in list_bucket_result]
        self.assertSetEqual(set(expected_bucket_items), set(actual_bucket_items))

        # result says file found, parsed, loaded
        actual_result = test_util.read_cloud_file(
            self.hpo_bucket, prefix + common.RESULT_CSV)
        actual_result_file = StringIO.StringIO(actual_result)
        actual_result_items = resources._csv_file_to_list(actual_result_file)
        expected_result_items.sort()
        actual_result_items.sort()
        self.assertListEqual(expected_result_items, actual_result_items)
        self.assertTrue(
            main.all_required_files_loaded(test_util.FAKE_HPO_ID,
                                           folder_prefix=prefix))

def test_five_person_data_retraction(self):
    folder_prefix = 'dummy-prefix-2018-03-22/'
    pid = 17
    expected_result = {}
    for file_path in test_util.FIVE_PERSONS_FILES:
        # generate results files
        file_name = file_path.split('/')[-1]
        expected_result[file_name] = []
        with open(file_path) as f:
            for line in f:
                line = line.strip()
                # keep only rows whose person id column does not match the retracted pid
                if (file_name in rd.PID_IN_COL1 and rd.get_integer(line.split(",")[0]) != pid) or \
                        (file_name in rd.PID_IN_COL2 and rd.get_integer(line.split(",")[1]) != pid):
                    expected_result[file_name].append(line)

        # write file to cloud for testing
        test_util.write_cloud_file(self.hpo_bucket, file_path,
                                   prefix=folder_prefix)

    with mock.patch('__builtin__.raw_input', return_value='Y') as _raw_input:
        retract_result = rd.run_retraction(pid, self.hpo_bucket,
                                           folder=folder_prefix, force=True)

    actual_result = {}
    for file_path in test_util.FIVE_PERSONS_FILES:
        file_name = file_path.split('/')[-1]
        actual_result_contents = test_util.read_cloud_file(
            self.hpo_bucket, folder_prefix + file_name)
        # convert to list and remove last list item since it is a newline
        actual_result[file_name] = actual_result_contents.split('\n')[:-1]

    for key in expected_result.keys():
        self.assertListEqual(expected_result[key], actual_result[key])

    # metadata for each updated file is returned
    # TODO test that files lacking records for PID are not updated
    self.assertEqual(len(retract_result[folder_prefix]),
                     len(expected_result.keys()))

def test_pii_files_loaded(self, mock_check_cron):
    # tests if pii files are loaded
    folder_prefix = 'dummy-prefix-2018-03-22/'
    expected_result_items = resources._csv_to_list(
        test_util.PII_FILE_LOAD_RESULT_CSV)
    test_util.write_cloud_file(self.hpo_bucket, test_util.PII_NAME_FILE,
                               prefix=folder_prefix)
    test_util.write_cloud_file(self.hpo_bucket,
                               test_util.PII_MRN_BAD_PERSON_ID_FILE,
                               prefix=folder_prefix)

    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)
        actual_result = test_util.read_cloud_file(
            self.hpo_bucket, folder_prefix + common.RESULT_CSV)
        actual_result_file = StringIO.StringIO(actual_result)
        actual_result_items = resources._csv_file_to_list(actual_result_file)

        # sort in order to compare
        expected_result_items.sort()
        actual_result_items.sort()
        self.assertListEqual(expected_result_items, actual_result_items)
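
# The methods above read as instance methods of a unittest.TestCase subclass in
# which `self.hpo_bucket` is prepared in setUp and the `mock_check_cron`
# argument is injected by a mock.patch decorator on each test that accepts it.
# A minimal sketch of that assumed scaffolding follows; the class name, the
# patch target, and the setUp/tearDown helper calls are assumptions made for
# illustration and are not taken from this file.
import unittest

import mock

import gcs_utils
import test_util


class ValidationTest(unittest.TestCase):  # hypothetical class name

    def setUp(self):
        # assumed fixture: the bucket every test writes to and reads back from
        self.hpo_bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)

    def tearDown(self):
        # assumed cleanup so objects from one test do not leak into the next
        test_util.empty_bucket(self.hpo_bucket)

    @mock.patch('api_util.check_cron')  # assumed patch target for mock_check_cron
    def test_example(self, mock_check_cron):
        # each decorated test above receives the patched check_cron this way
        pass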