Example #1
    def test_copy_five_persons(self, mock_check_cron):
        # upload each five_persons file twice: once under the folder prefix
        # and once under a doubled (nested) prefix
        for cdm_file in test_util.FIVE_PERSONS_FILES:
            test_util.write_cloud_file(self.hpo_bucket,
                                       cdm_file,
                                       prefix=self.folder_prefix)
            test_util.write_cloud_file(self.hpo_bucket,
                                       cdm_file,
                                       prefix=self.folder_prefix +
                                       self.folder_prefix)

        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.COPY_HPO_FILES_URL)
            prefix = test_util.FAKE_HPO_ID + '/' + self.hpo_bucket + '/' + self.folder_prefix
            expected_bucket_items = [
                prefix + item.split(os.sep)[-1]
                for item in test_util.FIVE_PERSONS_FILES
            ]
            expected_bucket_items.extend([
                prefix + self.folder_prefix + item.split(os.sep)[-1]
                for item in test_util.FIVE_PERSONS_FILES
            ])

            list_bucket_result = gcs_utils.list_bucket(
                gcs_utils.get_drc_bucket())
            actual_bucket_items = [item['name'] for item in list_bucket_result]
            self.assertSetEqual(set(expected_bucket_items),
                                set(actual_bucket_items))
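
Every snippet in this listing receives mock arguments such as mock_check_cron, which implies @mock.patch decorators that were stripped during extraction. A minimal sketch of the presumed wiring, assuming a hypothetical patch target and class name (neither appears in the source):

import unittest
from unittest import mock


class ValidationTest(unittest.TestCase):  # hypothetical class name

    # hypothetical target: the cron-request guard applied to app handlers;
    # mock.patch injects the mock as the trailing test argument
    @mock.patch('app_util.check_cron')
    def test_copy_five_persons(self, mock_check_cron):
        ...  # body as in Example #1 above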
Example #2
    def test_pii_files_loaded(self, mock_check_cron):
        # tests if pii files are loaded
        test_file_paths = [
            test_util.PII_NAME_FILE, test_util.PII_MRN_BAD_PERSON_ID_FILE
        ]
        test_file_names = [os.path.basename(f) for f in test_file_paths]
        test_util.write_cloud_file(self.hpo_bucket,
                                   test_util.PII_NAME_FILE,
                                   prefix=self.folder_prefix)
        test_util.write_cloud_file(self.hpo_bucket,
                                   test_util.PII_MRN_BAD_PERSON_ID_FILE,
                                   prefix=self.folder_prefix)

        rs = resources.csv_to_list(test_util.PII_FILE_LOAD_RESULT_CSV)
        expected_results = [(r['file_name'], int(r['found']), int(r['parsed']),
                             int(r['loaded'])) for r in rs]
        for f in common.SUBMISSION_FILES:
            if f not in test_file_names:
                expected_result = (f, 0, 0, 0)
                expected_results.append(expected_result)

        bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
        folder_items = main.get_folder_items(bucket_items, self.folder_prefix)
        r = main.validate_submission(self.hpo_id, self.hpo_bucket, folder_items,
                                     self.folder_prefix)
        self.assertSetEqual(set(expected_results), set(r['results']))
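
The comparison above hinges on resources.csv_to_list returning one dict per csv row, keyed by column name. Judging from the keys read out of each row, the result csv presumably looks something like this (rows illustrative, not taken from the source):

file_name,found,parsed,loaded
pii_name.csv,1,1,1
pii_mrn.csv,1,1,0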
Example #3
    def test_integration_five_person_data_retraction(self, mock_hpo_bucket,
                                                     mock_bucket,
                                                     mock_extract_pids):
        mock_hpo_bucket.return_value = self.site_bucket
        mock_bucket.return_value = self.bucket
        mock_extract_pids.return_value = self.pids
        lines_to_remove = {}
        total_lines_prior = {}
        for file_path in test_util.FIVE_PERSONS_FILES:
            # count total lines and the lines expected to be removed per file
            file_name = file_path.split('/')[-1]
            table_name = file_name.split('.')[0]
            lines_to_remove[file_name] = 0
            total_lines_prior[file_name] = 0
            with open(file_path) as f:
                # skip header
                next(f)
                for line in f:
                    line = line.strip()
                    if line != '':
                        if (table_name in rd.PID_IN_COL1 and int(line.split(",")[0]) in self.pids) or \
                                (table_name in rd.PID_IN_COL2 and int(line.split(",")[1]) in self.pids):
                            lines_to_remove[file_name] += 1
                        total_lines_prior[file_name] += 1

            # write file to cloud for testing
            test_util.write_cloud_file(self.bucket,
                                       file_path,
                                       prefix=self.folder_prefix_1)
            test_util.write_cloud_file(self.bucket,
                                       file_path,
                                       prefix=self.folder_prefix_2)

        retract_result = rd.run_gcs_retraction(self.project_id,
                                               self.sandbox_dataset_id,
                                               self.pid_table_id,
                                               self.hpo_id,
                                               folder='all_folders',
                                               force_flag=True)

        total_lines_post = {}
        for file_path in test_util.FIVE_PERSONS_FILES:
            file_name = file_path.split('/')[-1]
            actual_result_contents = test_util.read_cloud_file(
                self.bucket, self.folder_prefix_1 + file_name)
            # split into lines; drop the header and the trailing empty string
            # left by the final newline
            total_lines_post[file_name] = len(
                actual_result_contents.split('\n')[1:-1])

        for key in total_lines_prior.keys():
            if key in lines_to_remove:
                self.assertEqual(
                    lines_to_remove[key],
                    total_lines_prior[key] - total_lines_post[key])
            else:
                self.assertEqual(total_lines_prior[key], total_lines_post[key])

        # metadata for each updated file is returned
        self.assertEqual(len(retract_result[self.folder_prefix_1]),
                         len(lines_to_remove.keys()))
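
The final assertion implies that run_gcs_retraction returns a mapping from folder prefix to per-file metadata. A rough sketch of that presumed shape (keys and fields are assumptions inferred from the assertions, not from the source):

retract_result = {
    '2019-01-01/folder_1/': [
        # one metadata entry per file rewritten in this folder
        {'file_name': 'person.csv', 'lines_removed': 2},
    ],
}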
Example #4
    def test_validate_five_persons_success(self, mock_check_cron):
        expected_results = []
        test_file_names = [
            os.path.basename(f) for f in test_util.FIVE_PERSONS_FILES
        ]

        for cdm_file in common.SUBMISSION_FILES:
            if cdm_file in test_file_names:
                expected_result = (cdm_file, 1, 1, 1)
                test_file = os.path.join(test_util.FIVE_PERSONS_PATH, cdm_file)
                test_util.write_cloud_file(self.hpo_bucket,
                                           test_file,
                                           prefix=self.folder_prefix)
            else:
                expected_result = (cdm_file, 0, 0, 0)
            expected_results.append(expected_result)
        bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
        folder_items = main.get_folder_items(bucket_items, self.folder_prefix)
        r = main.validate_submission(self.hpo_id, self.hpo_bucket, folder_items,
                                     self.folder_prefix)
        self.assertSetEqual(set(r['results']), set(expected_results))

        # check tables exist and are clustered as expected
        for table in resources.CDM_TABLES + common.PII_TABLES:
            fields_file = os.path.join(resources.fields_path, table + '.json')
            table_id = bq_utils.get_table_id(test_util.FAKE_HPO_ID, table)
            table_info = bq_utils.get_table_info(table_id)
            with open(fields_file, 'r') as fp:
                fields = json.load(fp)
                field_names = [field['name'] for field in fields]
                if 'person_id' in field_names:
                    self.table_has_clustering(table_info)
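
The table_has_clustering helper is referenced above but not shown. A plausible sketch, assuming table_info is the dict returned by the BigQuery tables.get endpoint (the exact assertions are an assumption):

    def table_has_clustering(self, table_info):
        # the tables.get response exposes a 'clustering' block listing the
        # clustering fields; tables with a person_id column are expected
        # to be clustered on it
        clustering = table_info.get('clustering')
        self.assertIsNotNone(clustering)
        self.assertIn('person_id', clustering.get('fields', []))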
Example #5
    def _test_html_report_five_person(self, mock_check_cron):
        # Not sure this test is still relevant (see hpo_report module and tests)
        # TODO refactor or remove this test
        folder_prefix = '2019-01-01/'
        for cdm_file in test_util.FIVE_PERSONS_FILES:
            test_util.write_cloud_file(self.hpo_bucket,
                                       cdm_file,
                                       prefix=folder_prefix)
        # achilles sometimes fails due to rate limits.
        # using both success and failure cases allows it to fail gracefully
        # until there is a fix for achilles
        with open(test_util.FIVE_PERSON_RESULTS_FILE, 'r') as f:
            expected_result_achilles_success = self._remove_timestamp_tags_from_results(
                f.read())
        with open(test_util.FIVE_PERSON_RESULTS_ACHILLES_ERROR_FILE, 'r') as f:
            expected_result_achilles_failure = self._remove_timestamp_tags_from_results(
                f.read())
        expected_results = [
            expected_result_achilles_success, expected_result_achilles_failure
        ]
        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.VALIDATE_HPO_FILES_URL)
            actual_result = test_util.read_cloud_file(
                self.hpo_bucket, folder_prefix + common.RESULTS_HTML)
            actual_result_file = self._remove_timestamp_tags_from_results(
                StringIO(actual_result).getvalue())
            self.assertIn(actual_result_file, expected_results)
Example #6
    def _load_dataset(self):
        for cdm_table in resources.CDM_TABLES:
            cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                         cdm_table + '.csv')
            if os.path.exists(cdm_file_name):
                test_util.write_cloud_file(self.hpo_bucket, cdm_file_name)
            else:
                # no local csv for this table: stage a placeholder so the
                # load job still creates an empty table
                test_util.write_cloud_str(self.hpo_bucket, cdm_table + '.csv',
                                          'dummy\n')
            bq_utils.load_cdm_csv(FAKE_HPO_ID, cdm_table)
Example #7
    def _load_datasets(self):
        """
        Load five persons data for each test hpo
        # expected_tables is for testing output
        # it maps table name to list of expected records ex: "unioned_ehr_visit_occurrence" -> [{}, {}, ...]
        """
        expected_tables = dict()
        running_jobs = []
        for cdm_table in resources.CDM_TABLES:
            output_table = ehr_union.output_table_for(cdm_table)
            expected_tables[output_table] = []
            for hpo_id in self.hpo_ids:
                # upload csv into hpo bucket
                if hpo_id == NYC_HPO_ID:
                    cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                                 cdm_table + '.csv')
                else:
                    cdm_file_name = os.path.join(
                        test_util.PITT_FIVE_PERSONS_PATH, cdm_table + '.csv')
                bucket = gcs_utils.get_hpo_bucket(hpo_id)
                if os.path.exists(cdm_file_name):
                    test_util.write_cloud_file(bucket, cdm_file_name)
                    csv_rows = resources.csv_to_list(cdm_file_name)
                else:
                    # results in empty table
                    test_util.write_cloud_str(bucket, cdm_table + '.csv',
                                              'dummy\n')
                    csv_rows = []
                # load table from csv
                result = bq_utils.load_cdm_csv(hpo_id, cdm_table)
                running_jobs.append(result['jobReference']['jobId'])
                expected_tables[output_table] += list(csv_rows)
        # ensure person to observation output is as expected: each person row
        # is expected to yield 4 observation rows
        output_table_person = ehr_union.output_table_for(
            combine_ehr_rdr.PERSON_TABLE)
        output_table_observation = ehr_union.output_table_for(
            combine_ehr_rdr.OBSERVATION_TABLE)
        expected_tables[output_table_observation] += 4 * expected_tables[
            output_table_person]

        incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
        if len(incomplete_jobs) > 0:
            message = "Job id(s) %s failed to complete" % incomplete_jobs
            raise RuntimeError(message)
        self.expected_tables = expected_tables
Example #8
    def _load_dataset(self, hpo_id):
        for cdm_table in resources.CDM_TABLES:
            cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                         cdm_table + '.csv')
            if os.path.exists(cdm_file_name):
                test_util.write_cloud_file(self.hpo_bucket, cdm_file_name)
            else:
                test_util.write_cloud_str(self.hpo_bucket, cdm_table + '.csv',
                                          'dummy\n')
            bq_utils.load_cdm_csv(hpo_id, cdm_table)

        # ensure concept table exists
        if not bq_utils.table_exists(common.CONCEPT):
            bq_utils.create_standard_table(common.CONCEPT, common.CONCEPT)
            q = """INSERT INTO {dataset}.concept
            SELECT * FROM {vocab}.concept""".format(
                dataset=self.dataset, vocab=common.VOCABULARY_DATASET)
            bq_utils.query(q)
Example #9
    def load_test_data(self, hpo_id=None):
        """
        Load test achilles heel results data into bigquery from a csv file

        :param hpo_id: if specified, prefix to use on csv test file and bq table, otherwise no prefix is used
        :return: contents of the file as list of objects
        """

        table_name = common.ACHILLES_HEEL_RESULTS
        if hpo_id is not None:
            table_id = bq_utils.get_table_id(hpo_id, table_name)
        else:
            table_id = table_name
        test_file_name = table_id + '.csv'
        test_file_path = os.path.join(test_util.TEST_DATA_PATH, test_file_name)
        test_util.write_cloud_file(self.bucket, test_file_path)
        gcs_path = 'gs://' + self.bucket + '/' + test_file_name
        load_results = bq_utils.load_csv(table_name, gcs_path, self.app_id,
                                         self.dataset_id, table_id)
        job_id = load_results['jobReference']['jobId']
        bq_utils.wait_on_jobs([job_id])
        return resources.csv_to_list(test_file_path)
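
Per the docstring, hpo_id only affects naming; a brief usage sketch (the 'fake' id is illustrative, and the prefixed table id assumes get_table_id simply joins hpo id and table name):

        # loads achilles_heel_results.csv into table achilles_heel_results
        rows = self.load_test_data()
        # loads fake_achilles_heel_results.csv into a matching prefixed table
        hpo_rows = self.load_test_data(hpo_id='fake')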
Example #10
    def test_html_report_five_person(self, mock_check_cron, mock_first_run,
                                     mock_rdr_date, mock_required_files_loaded):
        mock_required_files_loaded.return_value = False
        mock_first_run.return_value = False
        rdr_date = '2020-01-01'
        mock_rdr_date.return_value = rdr_date
        for cdm_file in test_util.FIVE_PERSONS_FILES:
            test_util.write_cloud_file(self.hpo_bucket,
                                       cdm_file,
                                       prefix=self.folder_prefix)
        # load person table in RDR
        bq_utils.load_table_from_csv(self.project_id, self.rdr_dataset_id,
                                     common.PERSON,
                                     test_util.FIVE_PERSONS_PERSON_CSV)

        # Load measurement_concept_sets
        required_labs.load_measurement_concept_sets_table(
            project_id=self.project_id, dataset_id=self.bigquery_dataset_id)
        # Load measurement_concept_sets_descendants
        required_labs.load_measurement_concept_sets_descendants_table(
            project_id=self.project_id, dataset_id=self.bigquery_dataset_id)

        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.VALIDATE_HPO_FILES_URL)
            actual_result = test_util.read_cloud_file(
                self.hpo_bucket, self.folder_prefix + common.RESULTS_HTML)

        # ensure emails are not sent (this is not the first validation run)
        bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
        folder_items = main.get_folder_items(bucket_items, self.folder_prefix)
        self.assertFalse(main.is_first_validation_run(folder_items))

        # parse html
        soup = bs(actual_result, features="lxml")
        missing_pii_html_table = soup.find('table', id='missing_pii')
        table_headers = missing_pii_html_table.find_all('th')
        self.assertEqual('Missing Participant Record Type',
                         table_headers[0].get_text())
        self.assertEqual('Count', table_headers[1].get_text())

        table_rows = missing_pii_html_table.find_next('tbody').find_all('tr')
        missing_record_types = [
            table_row.find('td').text for table_row in table_rows
        ]
        self.assertIn(main_consts.EHR_NO_PII, missing_record_types)
        self.assertIn(main_consts.PII_NO_EHR, missing_record_types)
        self.assertIn(main_consts.EHR_NO_RDR.format(date=rdr_date),
                      missing_record_types)
        self.assertIn(main_consts.EHR_NO_PARTICIPANT_MATCH,
                      missing_record_types)

        required_lab_html_table = soup.find('table', id='required-lab')
        table_headers = required_lab_html_table.find_all('th')
        self.assertEqual(3, len(table_headers))
        self.assertEqual('Ancestor Concept ID', table_headers[0].get_text())
        self.assertEqual('Ancestor Concept Name', table_headers[1].get_text())
        self.assertEqual('Found', table_headers[2].get_text())

        table_rows = required_lab_html_table.find_next('tbody').find_all('tr')
        table_rows_last_column = [
            table_row.find_all('td')[-1] for table_row in table_rows
        ]
        submitted_labs = [
            row for row in table_rows_last_column
            if 'result-1' in row.attrs['class']
        ]
        missing_labs = [
            row for row in table_rows_last_column
            if 'result-0' in row.attrs['class']
        ]
        self.assertTrue(len(table_rows) > 0)
        self.assertTrue(len(submitted_labs) > 0)
        self.assertTrue(len(missing_labs) > 0)
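
Taken together, the examples share one staging pattern: push a local file into a bucket with write_cloud_file, exercise the code under test, then inspect the bucket. A condensed sketch of that round trip using only calls that appear above (bucket and path values illustrative):

# stage a local csv under a dated folder prefix
test_util.write_cloud_file(bucket, '/tmp/person.csv', prefix='2019-01-01/')
# objects land at <prefix> + <basename of the local file>
contents = test_util.read_cloud_file(bucket, '2019-01-01/person.csv')
names = [item['name'] for item in gcs_utils.list_bucket(bucket)]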