Example #1
    def _load_datasets(self):
        """
        Load the five-person test data into each test HPO's bucket.

        Also builds self.expected_tables, which maps each output table
        name to the list of records expected in it, e.g.
        "unioned_ehr_visit_occurrence" -> [{}, {}, ...]
        """
        expected_tables = dict()
        running_jobs = []
        for cdm_table in resources.CDM_TABLES:
            output_table = ehr_union.output_table_for(cdm_table)
            expected_tables[output_table] = []
            for hpo_id in self.hpo_ids:
                # upload csv into hpo bucket
                if hpo_id == NYC_HPO_ID:
                    cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                                 cdm_table + '.csv')
                else:
                    cdm_file_name = os.path.join(
                        test_util.PITT_FIVE_PERSONS_PATH, cdm_table + '.csv')
                bucket = gcs_utils.get_hpo_bucket(hpo_id)
                if os.path.exists(cdm_file_name):
                    test_util.write_cloud_file(bucket, cdm_file_name)
                    csv_rows = resources.csv_to_list(cdm_file_name)
                else:
                    # results in empty table
                    test_util.write_cloud_str(bucket, cdm_table + '.csv',
                                              'dummy\n')
                    csv_rows = []
                # load table from csv
                result = bq_utils.load_cdm_csv(hpo_id, cdm_table)
                running_jobs.append(result['jobReference']['jobId'])
                expected_tables[output_table] += list(csv_rows)
        # the union pipeline derives four observation records from each
        # person record, so include them in the expected observation output
        output_table_person = ehr_union.output_table_for(
            combine_ehr_rdr.PERSON_TABLE)
        output_table_observation = ehr_union.output_table_for(
            combine_ehr_rdr.OBSERVATION_TABLE)
        expected_tables[output_table_observation] += 4 * expected_tables[
            output_table_person]

        incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
        if incomplete_jobs:
            raise RuntimeError(
                'Job id(s) %s failed to complete' % incomplete_jobs)
        self.expected_tables = expected_tables
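
For reference, a rough sketch of what a helper like bq_utils.wait_on_jobs might look like if written against the public google-cloud-bigquery client. The behavior (block on each job, return the ids of any that did not finish cleanly) is an assumption based on how the test uses it, not the project's actual implementation:

from google.cloud import bigquery

def wait_on_jobs(job_ids, project=None, timeout=180):
    # Hypothetical stand-in for bq_utils.wait_on_jobs: wait on each load
    # job and collect the ids of jobs that time out or fail.
    client = bigquery.Client(project=project)
    incomplete = []
    for job_id in job_ids:
        job = client.get_job(job_id)
        try:
            job.result(timeout=timeout)  # blocks until done; raises on error
        except Exception:
            incomplete.append(job_id)
    return incomplete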
Example #2
    def test_bad_file_names(self):
        bad_file_names = [
            "avisit_occurrence.csv",  # unknown table name
            "condition_occurence.csv",  # misspelled
            "person_final.csv",  # unexpected suffix
            "procedure_occurrence.tsv"  # unsupported file extension
        ]
        expected_warnings = []
        for file_name in bad_file_names:
            test_util.write_cloud_str(self.hpo_bucket,
                                      self.folder_prefix + file_name, ".")
            expected_item = (file_name, common.UNKNOWN_FILE)
            expected_warnings.append(expected_item)
        bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
        folder_items = main.get_folder_items(bucket_items, self.folder_prefix)
        r = main.validate_submission(self.hpo_id, self.hpo_bucket, folder_items,
                                     self.folder_prefix)
        self.assertCountEqual(expected_warnings, r['warnings'])
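
The warnings this test expects come from classifying each submitted file name; a minimal sketch of that kind of check, with illustrative table names and an UNKNOWN_FILE constant standing in for common.UNKNOWN_FILE:

CDM_FILES = {'person.csv', 'visit_occurrence.csv',
             'condition_occurrence.csv', 'procedure_occurrence.csv'}
UNKNOWN_FILE = 'Unknown file'

def classify_files(file_names):
    # Anything that is not an exactly spelled table name with a .csv
    # extension (misspellings, .tsv files, extra suffixes) is flagged.
    return [(name, UNKNOWN_FILE) for name in file_names
            if name not in CDM_FILES]

Running classify_files(bad_file_names) would yield the same (file_name, UNKNOWN_FILE) pairs the test builds by hand in expected_warnings.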
Example #3
    def _load_dataset(self, hpo_id):
        for cdm_table in resources.CDM_TABLES:
            cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                         cdm_table + '.csv')
            if os.path.exists(cdm_file_name):
                test_util.write_cloud_file(self.hpo_bucket, cdm_file_name)
            else:
                # tables with no test data get an empty placeholder file
                test_util.write_cloud_str(self.hpo_bucket, cdm_table + '.csv',
                                          'dummy\n')
            bq_utils.load_cdm_csv(hpo_id, cdm_table)

        # ensure the concept table exists, populating it from the
        # vocabulary dataset if it was just created
        if not bq_utils.table_exists(common.CONCEPT):
            bq_utils.create_standard_table(common.CONCEPT, common.CONCEPT)
            q = """INSERT INTO {dataset}.concept
            SELECT * FROM {vocab}.concept""".format(
                dataset=self.dataset, vocab=common.VOCABULARY_DATASET)
            bq_utils.query(q)
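
The existence check and the INSERT above could also collapse into a single idempotent statement; a hedged sketch using BigQuery's CREATE TABLE IF NOT EXISTS DDL (dataset names are placeholders, and using the google-cloud-bigquery client directly is an assumption; the project routes queries through bq_utils.query):

from google.cloud import bigquery

def ensure_concept_table(dataset, vocab_dataset, project=None):
    client = bigquery.Client(project=project)
    # CREATE TABLE IF NOT EXISTS makes the step idempotent: a rerun is a
    # no-op once the concept table has been created and populated.
    client.query(
        'CREATE TABLE IF NOT EXISTS `{dataset}.concept` AS '
        'SELECT * FROM `{vocab}.concept`'.format(dataset=dataset,
                                                 vocab=vocab_dataset)
    ).result()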
Example #4
    @staticmethod
    def _upload_file_to_bucket(bucket, dataset_id, path, table):
        app_id = bq_utils.app_identity.get_application_id()
        filename = table + '.csv'

        file_path = os.path.join(path, filename)
        try:
            with open(file_path, 'rb') as fp:
                gcs_utils.upload_object(bucket, filename, fp)
        except OSError:
            # no local file for this table: upload an empty placeholder
            # so the downstream load still produces an (empty) table
            test_util.write_cloud_str(bucket, filename, '\n')

        gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket,
                                                     filename=filename)
        load_results = bq_utils.load_csv(table,
                                         gcs_path,
                                         app_id,
                                         dataset_id,
                                         table,
                                         allow_jagged_rows=True)
        load_job_id = load_results['jobReference']['jobId']
        return load_job_id
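
For comparison, a minimal sketch of the same GCS-to-BigQuery load written against the public google-cloud-bigquery client instead of the project's bq_utils.load_csv wrapper. Schema autodetection here is an assumption; the project presumably supplies the CDM table schemas:

from google.cloud import bigquery

def load_csv_from_gcs(project, dataset_id, table, gcs_path):
    client = bigquery.Client(project=project)
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,
        allow_jagged_rows=True,  # mirrors allow_jagged_rows=True above
        autodetect=True,  # assumption; the project likely passes schemas
    )
    load_job = client.load_table_from_uri(
        gcs_path, '{}.{}.{}'.format(project, dataset_id, table),
        job_config=job_config)
    load_job.result()  # block until the load job completes
    return load_job.job_id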