    @mock.patch('api_util.check_cron')  # assumed patch target; injects mock_check_cron
    def test_all_files_unparseable_output(self, mock_check_cron):
        # TODO possible bug: if there is no pre-existing table, this results in
        #  a BigQuery "table not found" error
        folder_prefix = 'dummy-prefix-2018-03-22/'
        for cdm_table in common.CDM_FILES:
            test_util.write_cloud_str(self.hpo_bucket,
                                      folder_prefix + cdm_table, ".\n .")

        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.VALIDATE_HPO_FILES_URL)

            # check the result file was put in bucket
            list_bucket_result = gcs_utils.list_bucket(self.hpo_bucket)
            bucket_item_names = [
                item['name'] for item in list_bucket_result
                if item['name'].startswith(folder_prefix)
            ]
            expected_items = common.CDM_FILES + common.IGNORE_LIST
            expected_items = [
                folder_prefix + item_name for item_name in expected_items
            ]
            self.assertSetEqual(set(bucket_item_names), set(expected_items))

            # check content of the file is correct
            actual_result = test_util.read_cloud_file(
                self.hpo_bucket, folder_prefix + common.RESULT_CSV)
            actual_result = resources._csv_file_to_list(
                StringIO.StringIO(actual_result))
            expected = [{
                'cdm_file_name': cdm_file_name,
                'found': '1',
                'parsed': '0',
                'loaded': '0'
            } for cdm_file_name in common.CDM_FILES]
            self.assertEqual(expected, actual_result)

    def _load_datasets(self):
        """
        Load the five-persons test data for each test HPO
        """
        # expected_tables is used to check the output:
        # it maps each output table name to its list of expected records,
        # e.g. "unioned_ehr_visit_occurrence" -> [{...}, {...}, ...]
        expected_tables = dict()
        running_jobs = []
        for cdm_table in common.CDM_TABLES:
            cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                         cdm_table + '.csv')
            output_table = ehr_union.output_table_for(cdm_table)
            expected_tables[output_table] = []
            for hpo_id in self.hpo_ids:
                # upload the csv into the hpo bucket
                bucket = gcs_utils.get_hpo_bucket(hpo_id)
                if os.path.exists(cdm_file_name):
                    test_util.write_cloud_file(bucket, cdm_file_name)
                    csv_rows = resources._csv_to_list(cdm_file_name)
                else:
                    # uploading a header-only placeholder results in an empty table
                    test_util.write_cloud_str(bucket, cdm_table + '.csv',
                                              'dummy\n')
                    csv_rows = []
                # load the table from the csv
                result = bq_utils.load_cdm_csv(hpo_id, cdm_table)
                running_jobs.append(result['jobReference']['jobId'])
                expected_tables[output_table] += list(csv_rows)
        incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
        if len(incomplete_jobs) > 0:
            message = "Job id(s) %s failed to complete" % incomplete_jobs
            raise RuntimeError(message)
        self.expected_tables = expected_tables
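
    @mock.patch('api_util.check_cron')  # assumed patch target; injects mock_check_cron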
    def test_bad_file_names(self, mock_check_cron):
        folder_prefix = 'dummy-prefix-2018-03-22/'
        exclude_file_list = ["person_final.csv",  # unexpected suffix
                             "condition_occurence.csv",  # misspelled
                             "avisit_occurrence.csv",  # not a CDM table name
                             "procedure_occurrence.tsv"]  # unsupported file extension

        exclude_file_list = [folder_prefix + item for item in exclude_file_list]
        expected_result_items = []
        for file_name in exclude_file_list:
            test_util.write_cloud_str(self.hpo_bucket, file_name, ".")
            expected_item = dict(file_name=file_name.split('/')[1], message=main.UNKNOWN_FILE)
            expected_result_items.append(expected_item)

        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.VALIDATE_HPO_FILES_URL)

            # check content of the bucket is correct
            expected_bucket_items = exclude_file_list + [folder_prefix + item for item in common.IGNORE_LIST]
            list_bucket_result = gcs_utils.list_bucket(self.hpo_bucket)
            actual_bucket_items = [item['name'] for item in list_bucket_result]
            self.assertSetEqual(set(expected_bucket_items), set(actual_bucket_items))

            # check content of the warnings file is correct
            actual_result = test_util.read_cloud_file(self.hpo_bucket,
                                                      folder_prefix + common.WARNINGS_CSV)
            actual_result_file = StringIO.StringIO(actual_result)
            actual_result_items = resources._csv_file_to_list(actual_result_file)
            # sort in order to compare
            expected_result_items.sort()
            actual_result_items.sort()
            self.assertListEqual(expected_result_items, actual_result_items)
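
    @mock.patch('api_util.check_cron')  # assumed patch target; injects mock_check_cron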
    def test_errors_csv(self, mock_check_cron):
        folder_prefix = 'dummy-prefix-2018-03-22/'
        test_util.write_cloud_str(self.hpo_bucket,
                                  folder_prefix + 'person.csv', ".\n .,.,.")

        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.VALIDATE_HPO_FILES_URL)

            # check the result file was put in bucket
            list_bucket_result = gcs_utils.list_bucket(self.hpo_bucket)
            bucket_item_names = [
                item['name'] for item in list_bucket_result
                if item['name'].startswith(folder_prefix)
            ]
            expected_items = ['person.csv'] + common.IGNORE_LIST
            expected_items = [folder_prefix + item for item in expected_items]
            self.assertSetEqual(set(bucket_item_names), set(expected_items))

            # check content of the file is correct
            actual_result = test_util.read_cloud_file(
                self.hpo_bucket, folder_prefix + common.ERRORS_CSV)
            with open(test_util.BAD_PERSON_FILE_BQ_LOAD_ERRORS_CSV, 'r') as f:
                expected = f.read()
                self.assertEqual(expected, actual_result)

def load_dataset_from_files(dataset_id, path):
    app_id = bq_utils.app_identity.get_application_id()
    bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
    test_util.empty_bucket(bucket)
    job_ids = []
    for table in common.CDM_TABLES:
        filename = table + '.csv'
        schema = os.path.join(resources.fields_path, table + '.json')
        f = os.path.join(path, filename)
        if os.path.exists(f):
            with open(f, 'r') as fp:
                gcs_utils.upload_object(bucket, filename, fp)
        else:
            # upload a header-only placeholder so the load creates an empty table
            test_util.write_cloud_str(bucket, filename, '\n')
        gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket,
                                                     filename=filename)
        load_results = bq_utils.load_csv(schema,
                                         gcs_path,
                                         app_id,
                                         dataset_id,
                                         table,
                                         allow_jagged_rows=True)
        load_job_id = load_results['jobReference']['jobId']
        job_ids.append(load_job_id)
    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    if len(incomplete_jobs) > 0:
        message = "Job id(s) %s failed to complete" % incomplete_jobs
        raise RuntimeError(message)
    test_util.empty_bucket(bucket)
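
# A minimal usage sketch for load_dataset_from_files, assuming a scratch
# dataset; bq_utils.get_dataset_id() is an assumption drawn from the same
# codebase and may differ in practice.
def example_load_five_persons():
    dataset_id = bq_utils.get_dataset_id()  # assumed helper for the test dataset
    load_dataset_from_files(dataset_id, test_util.FIVE_PERSONS_PATH)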

    def _load_datasets(self):
        load_jobs = []
        self.expected_tables = dict()
        for cdm_table in common.CDM_TABLES:
            cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                         cdm_table + '.csv')
            result_table = ehr_merge.result_table_for(cdm_table)
            if os.path.exists(cdm_file_name):
                # one copy for chs, the other for pitt
                csv_rows = resources._csv_to_list(cdm_file_name)
                self.expected_tables[result_table] = csv_rows + list(csv_rows)
                test_util.write_cloud_file(self.chs_bucket, cdm_file_name)
                test_util.write_cloud_file(self.pitt_bucket, cdm_file_name)
            else:
                self.expected_tables[result_table] = []
                test_util.write_cloud_str(self.chs_bucket, cdm_table + '.csv',
                                          'dummy\n')
                test_util.write_cloud_str(self.pitt_bucket, cdm_table + '.csv',
                                          'dummy\n')
            chs_load_results = bq_utils.load_cdm_csv(CHS_HPO_ID, cdm_table)
            pitt_load_results = bq_utils.load_cdm_csv(PITT_HPO_ID, cdm_table)
            chs_load_job_id = chs_load_results['jobReference']['jobId']
            pitt_load_job_id = pitt_load_results['jobReference']['jobId']
            load_jobs.append(chs_load_job_id)
            load_jobs.append(pitt_load_job_id)
        incomplete_jobs = bq_utils.wait_on_jobs(load_jobs)
        if len(incomplete_jobs) > 0:
            raise RuntimeError('BigQuery jobs %s failed to complete' %
                               incomplete_jobs)
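
    @mock.patch('api_util.check_cron')  # assumed patch target; injects mock_check_cron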
    def test_errors_csv(self, mock_check_cron):
        folder_prefix = 'dummy-prefix-2018-03-22/'
        test_util.write_cloud_str(self.hpo_bucket,
                                  folder_prefix + 'person.csv', ".\n .,.,.")

        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.VALIDATE_HPO_FILES_URL)

            # check the result file was put in bucket
            list_bucket_result = gcs_utils.list_bucket(self.hpo_bucket)
            bucket_item_names = [
                item['name'] for item in list_bucket_result
                if item['name'].startswith(folder_prefix)
            ]
            expected_items = ['person.csv'] + common.IGNORE_LIST
            expected_items = [folder_prefix + item for item in expected_items]
            self.assertSetEqual(set(bucket_item_names), set(expected_items))

            # check content of the file is correct
            actual_result = test_util.read_cloud_file(
                self.hpo_bucket, folder_prefix + common.ERRORS_CSV)
            actual = resources._csv_file_to_list(
                StringIO.StringIO(actual_result))
            for row in actual:
                row.pop('message', None)
            expected = [{'file_name': 'person.csv', 'type': 'error'}]
            self.assertEqual(actual, expected)

    def _load_dataset(self):
        for cdm_table in common.CDM_TABLES:
            cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                         cdm_table + '.csv')
            if os.path.exists(cdm_file_name):
                test_util.write_cloud_file(self.hpo_bucket, cdm_file_name)
            else:
                test_util.write_cloud_str(self.hpo_bucket, cdm_table + '.csv',
                                          'dummy\n')
            bq_utils.load_cdm_csv(FAKE_HPO_ID, cdm_table)
    def test_check_processed(self):
        folder_prefix = 'folder/'
        test_util.write_cloud_str(self.hpo_bucket, folder_prefix + 'person.csv', '\n')
        test_util.write_cloud_str(self.hpo_bucket, folder_prefix + common.PROCESSED_TXT, '\n')

        bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
        result = main._get_to_process_list(self.hpo_bucket, bucket_items, force_process=False)
        self.assertListEqual([], result)
        result = main._get_to_process_list(self.hpo_bucket, bucket_items, force_process=True)
        self.assertListEqual(result, [folder_prefix])
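
    @mock.patch('api_util.check_cron')  # assumed patch target; injects mock_check_cron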
    def test_pii_files_ignore(self, mock_check_cron):
        folder_prefix = 'dummy-prefix-2018-03-22/'
        test_util.write_cloud_str(self.hpo_bucket, folder_prefix + 'pii_person.csv', contents_str='.')

        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.VALIDATE_HPO_FILES_URL)
            actual_result = test_util.read_cloud_file(self.hpo_bucket, folder_prefix + common.WARNINGS_CSV)
            with open(test_util.EMPTY_WARNINGS_CSV, 'r') as f:
                expected = f.read()
                self.assertEqual(expected, actual_result)
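
    @mock.patch('api_util.check_cron')  # assumed patch target; injects mock_check_cron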
    def test_latest_folder_validation(self, mock_check_cron):
        folder_prefix_1 = 'dummy-prefix-2018-03-22-v1/'
        folder_prefix_2 = 'dummy-prefix-2018-03-22-v2/'
        folder_prefix_3 = 'dummy-prefix-2018-03-22-v3/'
        exclude_file_list = [folder_prefix_1 + 'person.csv',
                             folder_prefix_2 + 'blah.csv',
                             folder_prefix_3 + 'visit_occurrence.csv']
        for filename in exclude_file_list:
            test_util.write_cloud_str(self.hpo_bucket, filename, ".\n .")

        main.app.testing = True
        with main.app.test_client() as c:
            return_string = c.get(test_util.VALIDATE_HPO_FILES_URL).data

def _upload_file_to_bucket(bucket, dataset_id, path, table):
    app_id = bq_utils.app_identity.get_application_id()
    filename = table + '.csv'
    schema = os.path.join(resources.fields_path, table + '.json')
    f = os.path.join(path, filename)
    if os.path.exists(f):
        with open(f, 'r') as fp:
            gcs_utils.upload_object(bucket, filename, fp)
    else:
        # upload a header-only placeholder so the load creates an empty table
        test_util.write_cloud_str(bucket, filename, '\n')
    gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket,
                                                 filename=filename)
    load_results = bq_utils.load_csv(schema, gcs_path, app_id, dataset_id,
                                     table, allow_jagged_rows=True)
    load_job_id = load_results['jobReference']['jobId']
    return load_job_id
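
# A minimal usage sketch for _upload_file_to_bucket, assuming the fake HPO
# bucket and a scratch dataset; it mirrors how the other snippets collect job
# ids and wait on them.
def example_upload_all_tables(dataset_id, path):
    bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
    job_ids = [_upload_file_to_bucket(bucket, dataset_id, path, table)
               for table in common.CDM_TABLES]
    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    if len(incomplete_jobs) > 0:
        raise RuntimeError('Job id(s) %s failed to complete' % incomplete_jobs)
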
    def test_folder_list(self):
        folder_prefix_1 = 'dummy-prefix-2018-03-22-v1/'
        folder_prefix_2 = 'dummy-prefix-2018-03-22-v2/'
        folder_prefix_3 = 'dummy-prefix-2018-03-22-v3/'
        file_list = [
            folder_prefix_1 + 'person.csv', folder_prefix_2 + 'blah.csv',
            folder_prefix_3 + 'visit_occurrence.csv', 'person.csv'
        ]

        for filename in file_list:
            test_util.write_cloud_str(self.hpo_bucket, filename, ".\n .")

        bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
        folder_list = main._get_to_process_list(self.hpo_bucket, bucket_items)
        self.assertListEqual(folder_list, [folder_prefix_3])

    def _load_datasets(self):
        load_jobs = []
        for cdm_table in common.CDM_TABLES:
            cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                         cdm_table + '.csv')
            if os.path.exists(cdm_file_name):
                test_util.write_cloud_file(self.chs_bucket, cdm_file_name)
                test_util.write_cloud_file(self.pitt_bucket, cdm_file_name)
            else:
                test_util.write_cloud_str(self.chs_bucket, cdm_table + '.csv',
                                          'dummy\n')
                test_util.write_cloud_str(self.pitt_bucket, cdm_table + '.csv',
                                          'dummy\n')
            chs_load_results = bq_utils.load_cdm_csv(CHS_HPO_ID, cdm_table)
            pitt_load_results = bq_utils.load_cdm_csv(PITT_HPO_ID, cdm_table)
            chs_load_job_id = chs_load_results['jobReference']['jobId']
            pitt_load_job_id = pitt_load_results['jobReference']['jobId']
            load_jobs.append(chs_load_job_id)
            load_jobs.append(pitt_load_job_id)
        incomplete_jobs = bq_utils.wait_on_jobs(load_jobs)
        if len(incomplete_jobs) > 0:
            raise RuntimeError('BigQuery jobs %s failed to complete' %
                               incomplete_jobs)
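
    @mock.patch('api_util.check_cron')  # assumed patch target; injects mock_check_cron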
    def test_validation_done_folder(self, mock_check_cron):
        folder_prefix_v1 = 'dummy-prefix-2018-03-22-v1/'
        folder_prefix = 'dummy-prefix-2018-03-22/'

        # upload all five_persons files
        test_util.write_cloud_str(self.hpo_bucket, folder_prefix_v1 + 'person.csv', contents_str='.')
        test_util.write_cloud_str(self.hpo_bucket, folder_prefix + 'person.csv', contents_str='.')
        test_util.write_cloud_str(self.hpo_bucket, folder_prefix + common.PROCESSED_TXT, contents_str='.')

        main.app.testing = True
        with main.app.test_client() as c:
            return_string = c.get(test_util.VALIDATE_HPO_FILES_URL).data
            self.assertFalse(folder_prefix in return_string)
            self.assertFalse(folder_prefix_v1 in return_string)