Example #1
 def setUpClass(cls):
     cls.testbed = testbed.Testbed()
     cls.testbed.activate()
     cls.testbed.init_app_identity_stub()
     cls.testbed.init_memcache_stub()
     cls.testbed.init_urlfetch_stub()
     cls.testbed.init_blobstore_stub()
     cls.testbed.init_datastore_v3_stub()
     fake_bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
     dataset_id = bq_utils.get_dataset_id()
     test_util.delete_all_tables(dataset_id)
     test_util.get_synpuf_results_files()
     test_util.populate_achilles(fake_bucket)
Example #2
 def load_dataset_from_files(dataset_id, path, mappings=False):
     bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
     test_util.empty_bucket(bucket)
     job_ids = []
     for table in resources.CDM_TABLES:
         job_ids.append(CombineEhrRdrTest._upload_file_to_bucket(bucket, dataset_id, path, table))
         if mappings and table in DOMAIN_TABLES:
             mapping_table = '_mapping_{table}'.format(table=table)
             job_ids.append(CombineEhrRdrTest._upload_file_to_bucket(bucket, dataset_id, path, mapping_table))
     incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
     if len(incomplete_jobs) > 0:
         message = "Job id(s) %s failed to complete" % incomplete_jobs
         raise RuntimeError(message)
     test_util.empty_bucket(bucket)
Example #3
def process_hpo(hpo_id, force_run=False):
    """
    runs validation for a single hpo_id

    :param hpo_id: which hpo_id to run for
    :param force_run: if True, process the latest submission whether or not it
        has already been processed before
    :raises
    BucketDoesNotExistError:
      Raised when a configured bucket does not exist
    InternalValidationError:
      Raised when an internal error is encountered during validation
    """
    try:
        logging.info(f"Processing hpo_id {hpo_id}")
        bucket = gcs_utils.get_hpo_bucket(hpo_id)
        bucket_items = list_bucket(bucket)
        folder_prefix = _get_submission_folder(bucket, bucket_items, force_run)
        if folder_prefix is None:
            logging.info(
                f"No submissions to process in {hpo_id} bucket {bucket}")
        else:
            folder_items = []
            if is_valid_folder_prefix_name(folder_prefix):
                # perform validation
                folder_items = get_folder_items(bucket_items, folder_prefix)
                summary = validate_submission(hpo_id, bucket, folder_items,
                                              folder_prefix)
                report_data = generate_metrics(hpo_id, bucket, folder_prefix,
                                               summary)
            else:
                # do not perform validation
                report_data = generate_empty_report(hpo_id, folder_prefix)
            perform_reporting(hpo_id, report_data, folder_items, bucket,
                              folder_prefix)
    except BucketDoesNotExistError as bucket_error:
        bucket = bucket_error.bucket
        # App Engine converts an env var that is set but left empty to the string 'None'
        if bucket and bucket.lower() != 'none':
            logging.warning(
                f"Bucket '{bucket}' configured for hpo_id '{hpo_id}' does not exist"
            )
        else:
            logging.info(
                f"Bucket '{bucket}' configured for hpo_id '{hpo_id}' is empty/unset"
            )
    except HttpError as http_error:
        message = (f"Failed to process hpo_id '{hpo_id}' due to the following "
                   f"HTTP error: {http_error.content.decode()}")
        logging.exception(message)
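A minimal usage sketch for process_hpo. The validation.main module path is an assumption inferred from the 'validation.main.get_hpo_name' patches used in the tests on this page, and the hpo_id value is made up.

import logging

from validation import main  # assumed module path

logging.basicConfig(level=logging.INFO)

# Re-process the latest submission for a single site even if it has
# already been processed before.
main.process_hpo('nyc', force_run=True)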
Example #4
    def test_merge_with_unmatched_schema(self):
        running_jobs = []
        with open(NYC_FIVE_PERSONS_MEASUREMENT_CSV, 'rb') as fp:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket('nyc'), 'measurement.csv', fp)
        result = bq_utils.load_cdm_csv('nyc', 'measurement')
        running_jobs.append(result['jobReference']['jobId'])

        with open(PITT_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket('pitt'), 'person.csv', fp)
        result = bq_utils.load_cdm_csv('pitt', 'person')
        running_jobs.append(result['jobReference']['jobId'])

        incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
        self.assertEqual(len(incomplete_jobs), 0, 'loading tables {},{} timed out'.format('nyc_measurement', 'pitt_person'))

        table_names = ['nyc_measurement', 'pitt_person']
        success, error = bq_utils.merge_tables(
          bq_utils.get_dataset_id(),
          table_names,
          bq_utils.get_dataset_id(),
          'merged_nyc_pitt'
        )
        self.assertFalse(success)
Example #5
 def setUp(self):
     self.hpo_id = test_util.FAKE_HPO_ID
     self.bucket = gcs_utils.get_hpo_bucket(self.hpo_id)
     self.site_bucket = 'test_bucket'
     self.folder_1 = '2019-01-01-v1/'
     self.folder_2 = '2019-02-02-v2/'
     self.folder_prefix_1 = self.hpo_id + '/' + self.site_bucket + '/' + self.folder_1
     self.folder_prefix_2 = self.hpo_id + '/' + self.site_bucket + '/' + self.folder_2
     self.pids = [17, 20]
     self.skip_pids = [10, 25]
     self.project_id = 'project_id'
     self.sandbox_dataset_id = bq_utils.get_unioned_dataset_id()
     self.pid_table_id = 'pid_table'
     self._empty_bucket()
Example #6
    def test_target_bucket_upload(self):
        bucket_nyc = gcs_utils.get_hpo_bucket('nyc')
        folder_prefix = 'test-folder-fake/'
        test_util.empty_bucket(bucket_nyc)

        main._upload_achilles_files(hpo_id=None,
                                    folder_prefix=folder_prefix,
                                    target_bucket=bucket_nyc)
        actual_bucket_files = set(
            [item['name'] for item in gcs_utils.list_bucket(bucket_nyc)])
        expected_bucket_files = set([
            'test-folder-fake/' + item
            for item in resources.ALL_ACHILLES_INDEX_FILES
        ])
        self.assertSetEqual(expected_bucket_files, actual_bucket_files)
Example #7
 def setUp(self):
     super(ValidationTest, self).setUp()
     self.testbed = testbed.Testbed()
     self.testbed.activate()
     self.testbed.init_app_identity_stub()
     self.testbed.init_memcache_stub()
     self.testbed.init_urlfetch_stub()
     self.testbed.init_blobstore_stub()
     self.testbed.init_datastore_v3_stub()
     self.hpo_id = test_util.FAKE_HPO_ID
     self.hpo_bucket = gcs_utils.get_hpo_bucket(self.hpo_id)
     self.bigquery_dataset_id = bq_utils.get_dataset_id()
     self.folder_prefix = '2019-01-01/'
     self._empty_bucket()
     test_util.delete_all_tables(self.bigquery_dataset_id)
     self._create_drug_class_table()
Example #8
    def setUp(self):
        self.hpo_bucket = gcs_utils.get_hpo_bucket(FAKE_HPO_ID)
        self.project_id = app_identity.get_application_id()
        self.dataset_id = bq_utils.get_dataset_id()
        self.rdr_dataset_id = bq_utils.get_rdr_dataset_id()
        self.folder_prefix = '2019-01-01/'
        test_util.delete_all_tables(self.dataset_id)
        test_util.empty_bucket(self.hpo_bucket)

        mock_get_hpo_name = mock.patch('validation.main.get_hpo_name')

        self.mock_get_hpo_name = mock_get_hpo_name.start()
        self.mock_get_hpo_name.return_value = 'Fake HPO'
        self.addCleanup(mock_get_hpo_name.stop)

        self._load_data()
Example #9
    def setUp(self):
        self.hpo_id = test_util.FAKE_HPO_ID
        self.hpo_bucket = gcs_utils.get_hpo_bucket(self.hpo_id)
        self.project_id = app_identity.get_application_id()
        self.rdr_dataset_id = bq_utils.get_rdr_dataset_id()
        mock_get_hpo_name = mock.patch('validation.main.get_hpo_name')

        self.mock_get_hpo_name = mock_get_hpo_name.start()
        self.mock_get_hpo_name.return_value = 'Fake HPO'
        self.addCleanup(mock_get_hpo_name.stop)

        self.bigquery_dataset_id = bq_utils.get_dataset_id()
        self.folder_prefix = '2019-01-01-v1/'
        self._empty_bucket()
        test_util.delete_all_tables(self.bigquery_dataset_id)
        self._create_drug_class_table(self.bigquery_dataset_id)
Example #10
def load_pii_csv(hpo_id, pii_table_name, source_folder_prefix=""):
    """
    Load PII file from a bucket into a table in bigquery
    :param hpo_id: ID for the HPO site
    :param pii_table_name: name of the PII table
    :param source_folder_prefix: folder prefix in the bucket where the CSV file is located
    :return: an object describing the associated bigquery job
    """
    if pii_table_name not in common.PII_TABLES:
        raise ValueError('{} is not a valid table to load'.format(pii_table_name))

    app_id = app_identity.get_application_id()
    dataset_id = get_dataset_id()
    bucket = gcs_utils.get_hpo_bucket(hpo_id)
    fields_filename = os.path.join(resources.fields_path, pii_table_name + '.json')
    gcs_object_path = 'gs://%s/%s%s.csv' % (bucket, source_folder_prefix, pii_table_name)
    table_id = get_table_id(hpo_id, pii_table_name)
    return load_csv(fields_filename, gcs_object_path, app_id, dataset_id, table_id)
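A hypothetical usage sketch for load_pii_csv. It assumes the function lives in bq_utils alongside load_cdm_csv, which the tests on this page call through that module; the table name and folder prefix are made up.

import bq_utils  # assumed to expose load_pii_csv

# Kick off the load job for one PII table and block until it finishes.
result = bq_utils.load_pii_csv('pitt', 'pii_name', source_folder_prefix='2019-01-01-v1/')
job_id = result['jobReference']['jobId']
incomplete_jobs = bq_utils.wait_on_jobs([job_id])
if incomplete_jobs:
    raise RuntimeError('Job id(s) %s failed to complete' % incomplete_jobs)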
Example #11
    def _load_datasets(self):
        """
        Load the five-person test data for each test hpo
        # expected_tables is for testing output
        # it maps table name to list of expected records ex: "unioned_ehr_visit_occurrence" -> [{}, {}, ...]
        """
        expected_tables = dict()
        running_jobs = []
        for cdm_table in resources.CDM_TABLES:
            output_table = ehr_union.output_table_for(cdm_table)
            expected_tables[output_table] = []
            for hpo_id in self.hpo_ids:
                # upload csv into hpo bucket
                if hpo_id == NYC_HPO_ID:
                    cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                                 cdm_table + '.csv')
                else:
                    cdm_file_name = os.path.join(
                        test_util.PITT_FIVE_PERSONS_PATH, cdm_table + '.csv')
                bucket = gcs_utils.get_hpo_bucket(hpo_id)
                if os.path.exists(cdm_file_name):
                    test_util.write_cloud_file(bucket, cdm_file_name)
                    csv_rows = resources.csv_to_list(cdm_file_name)
                else:
                    # results in empty table
                    test_util.write_cloud_str(bucket, cdm_table + '.csv',
                                              'dummy\n')
                    csv_rows = []
                # load table from csv
                result = bq_utils.load_cdm_csv(hpo_id, cdm_table)
                running_jobs.append(result['jobReference']['jobId'])
                expected_tables[output_table] += list(csv_rows)
        # ensure person to observation output is as expected
        output_table_person = ehr_union.output_table_for(
            combine_ehr_rdr.PERSON_TABLE)
        output_table_observation = ehr_union.output_table_for(
            combine_ehr_rdr.OBSERVATION_TABLE)
        expected_tables[output_table_observation] += 4 * expected_tables[
            output_table_person]

        incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
        if len(incomplete_jobs) > 0:
            message = "Job id(s) %s failed to complete" % incomplete_jobs
            raise RuntimeError(message)
        self.expected_tables = expected_tables
Example #12
 def setUp(self):
     self.testbed = testbed.Testbed()
     self.testbed.activate()
     self.testbed.init_app_identity_stub()
     self.testbed.init_memcache_stub()
     self.testbed.init_urlfetch_stub()
     self.testbed.init_blobstore_stub()
     self.testbed.init_datastore_v3_stub()
     self.hpo_id = test_util.FAKE_HPO_ID
     self.bucket = gcs_utils.get_hpo_bucket(self.hpo_id)
     self.site_bucket = 'test_bucket'
     self.folder_1 = '2019-01-01-v1/'
     self.folder_2 = '2019-02-02-v2/'
     self.folder_prefix_1 = self.hpo_id + '/' + self.site_bucket + '/' + self.folder_1
     self.folder_prefix_2 = self.hpo_id + '/' + self.site_bucket + '/' + self.folder_2
     self.pids = [17, 20]
     self.skip_pids = [10, 25]
     self._empty_bucket()
Example #13
def _upload_achilles_files(hpo_id, folder_prefix):
    """uploads achilles web files to the corresponding hpo bucket

    :hpo_id: which hpo bucket these files go into
    :folder_prefix: folder in the bucket to upload the files under
    :returns: list of upload results

    """
    results = []
    bucket = gcs_utils.get_hpo_bucket(hpo_id)
    for filename in common.ACHILLES_INDEX_FILES:
        logging.debug('Uploading achilles file `%s` to bucket `%s`' %
                      (filename, bucket))
        bucket_file_name = filename.split(resources.resource_path +
                                          os.sep)[1].strip()
        with open(filename, 'r') as fp:
            upload_result = gcs_utils.upload_object(
                bucket, folder_prefix + bucket_file_name, fp)
            results.append(upload_result)
    return results
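A brief usage sketch for _upload_achilles_files; the hpo_id and folder prefix are made up, and the import path is an assumption based on the main._upload_achilles_files calls in the tests above.

import logging

from validation import main  # assumed module path

# Push the Achilles index files into the site's bucket under a dated
# submission folder and report how many objects were written.
upload_results = main._upload_achilles_files('nyc', '2019-01-01-v1/')
logging.info('Uploaded %d achilles index files', len(upload_results))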
Example #14
def load_cdm_csv(hpo_id, cdm_table_name, source_folder_prefix="", dataset_id=None):
    """
    Load CDM file from a bucket into a table in bigquery
    :param hpo_id: ID for the HPO site
    :param cdm_table_name: name of the CDM table
    :param source_folder_prefix: folder prefix in the bucket where the CSV file is located
    :param dataset_id: dataset to load into; defaults to the dataset returned by get_dataset_id()
    :return: an object describing the associated bigquery job
    """
    if cdm_table_name not in resources.CDM_TABLES:
        raise ValueError('{} is not a valid table to load'.format(cdm_table_name))

    app_id = app_identity.get_application_id()
    if dataset_id is None:
        dataset_id = get_dataset_id()
    bucket = gcs_utils.get_hpo_bucket(hpo_id)
    fields_filename = os.path.join(resources.fields_path, cdm_table_name + '.json')
    gcs_object_path = 'gs://%s/%s%s.csv' % (bucket, source_folder_prefix, cdm_table_name)
    table_id = get_table_id(hpo_id, cdm_table_name)
    allow_jagged_rows = cdm_table_name == 'observation'
    return load_csv(fields_filename, gcs_object_path, app_id, dataset_id, table_id, allow_jagged_rows=allow_jagged_rows)
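A hypothetical sketch showing the optional dataset_id argument of load_cdm_csv, which the tests on this page do not exercise; the dataset name and folder prefix are made up, and the bq_utils import path is taken from the tests above.

import bq_utils

# Load a site's person.csv into an explicitly named dataset instead of
# the default one returned by get_dataset_id().
result = bq_utils.load_cdm_csv('pitt', 'person',
                               source_folder_prefix='2019-01-01-v1/',
                               dataset_id='ehr_snapshot')
job_id = result['jobReference']['jobId']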
Example #15
 def test_run_export_with_target_bucket_and_hpo_id(self):
     folder_prefix = 'dummy-prefix-2018-03-24/'
     bucket_nyc = gcs_utils.get_hpo_bucket('nyc')
     main.run_export(hpo_id=test_util.FAKE_HPO_ID, folder_prefix=folder_prefix, target_bucket=bucket_nyc)
     bucket_objects = gcs_utils.list_bucket(bucket_nyc)
     actual_object_names = [obj['name'] for obj in bucket_objects]
     for report in common.ALL_REPORT_FILES:
         prefix = folder_prefix + common.ACHILLES_EXPORT_PREFIX_STRING + test_util.FAKE_HPO_ID + '/'
         expected_object_name = prefix + report
         self.assertIn(expected_object_name, actual_object_names)
     datasources_json_path = folder_prefix + common.ACHILLES_EXPORT_DATASOURCES_JSON
     self.assertIn(datasources_json_path, actual_object_names)
     datasources_json = gcs_utils.get_object(bucket_nyc, datasources_json_path)
     datasources_actual = json.loads(datasources_json)
     datasources_expected = {
         'datasources': [
             {'name': test_util.FAKE_HPO_ID, 'folder': test_util.FAKE_HPO_ID, 'cdmVersion': 5}
         ]
     }
     self.assertDictEqual(datasources_expected, datasources_actual)
Example #16
    def setUp(self):
        self.testbed = testbed.Testbed()
        self.testbed.activate()
        self.testbed.init_app_identity_stub()
        self.testbed.init_memcache_stub()
        self.testbed.init_urlfetch_stub()
        self.testbed.init_blobstore_stub()
        self.testbed.init_datastore_v3_stub()
        self.hpo_id = test_util.FAKE_HPO_ID
        self.hpo_bucket = gcs_utils.get_hpo_bucket(self.hpo_id)
        mock_get_hpo_name = mock.patch('validation.main.get_hpo_name')

        self.mock_get_hpo_name = mock_get_hpo_name.start()
        self.mock_get_hpo_name.return_value = 'Fake HPO'
        self.addCleanup(mock_get_hpo_name.stop)

        self.bigquery_dataset_id = bq_utils.get_dataset_id()
        self.folder_prefix = '2019-01-01/'
        self._empty_bucket()
        test_util.delete_all_tables(self.bigquery_dataset_id)
        self._create_drug_class_table()
Example #17
def copy_files(hpo_id):
    """copies over files from hpo bucket to drc bucket

    :hpo_id: hpo from which to copy

    """
    hpo_bucket = gcs_utils.get_hpo_bucket(hpo_id)
    drc_private_bucket = gcs_utils.get_drc_bucket()

    bucket_items = gcs_utils.list_bucket(hpo_bucket)

    prefix = hpo_id + '/' + hpo_bucket + '/'

    for item in bucket_items:
        item_name = item['name']
        gcs_utils.copy_object(source_bucket=hpo_bucket,
                              source_object_id=item_name,
                              destination_bucket=drc_private_bucket,
                              destination_object_id=prefix + item_name)

    return '{"copy-status": "done"}'
Example #18
def run_export(hpo_id=None, folder_prefix="", target_bucket=None):
    """
    Run export queries for an HPO and store JSON payloads in specified folder in (optional) target bucket

    :param hpo_id: ID of the HPO to run export for. This is the data source name in the report.
    :param folder_prefix: Relative base path under which to store the report. Empty by default.
    :param target_bucket: Bucket to save report. If None, use bucket associated with hpo_id.
    """
    results = []

    # Use a separate variable rather than hpo_id because hpo_id must remain None in the calls below
    datasource_name = 'default'
    if hpo_id is None:
        if target_bucket is None:
            raise RuntimeError(
                'Cannot export if neither hpo_id nor target_bucket is specified.'
            )
    else:
        datasource_name = hpo_id
        if target_bucket is None:
            target_bucket = gcs_utils.get_hpo_bucket(hpo_id)

    logging.info('Exporting %s report to bucket %s', datasource_name,
                 target_bucket)

    # Run export queries and store json payloads in specified folder in the target bucket
    reports_prefix = folder_prefix + ACHILLES_EXPORT_PREFIX_STRING + datasource_name + '/'
    for export_name in common.ALL_REPORTS:
        sql_path = os.path.join(export.EXPORT_PATH, export_name)
        result = export.export_from_path(sql_path, hpo_id)
        content = json.dumps(result)
        fp = StringIO(content)
        result = gcs_utils.upload_object(
            target_bucket, reports_prefix + export_name + '.json', fp)
        results.append(result)
    result = save_datasources_json(hpo_id=hpo_id,
                                   folder_prefix=folder_prefix,
                                   target_bucket=target_bucket)
    results.append(result)
    return results
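A hypothetical sketch that exports several sites' reports into one shared bucket using the target_bucket parameter; the bucket name, folder prefix, and hpo_ids are made up.

# Write each site's Achilles report under the same folder of a shared bucket.
shared_bucket = 'drc-shared-report-bucket'
for hpo_id in ['nyc', 'pitt']:
    run_export(hpo_id=hpo_id,
               folder_prefix='2019-01-01-v1/',
               target_bucket=shared_bucket)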
Example #19
    def test_run_export_with_target_bucket(self):
        folder_prefix = 'dummy-prefix-2018-03-24/'
        bucket_nyc = gcs_utils.get_hpo_bucket('nyc')
        test_util.get_synpuf_results_files()
        test_util.populate_achilles(self.hpo_bucket, hpo_id=None)
        main.run_export(folder_prefix=folder_prefix, target_bucket=bucket_nyc)
        bucket_objects = gcs_utils.list_bucket(bucket_nyc)
        actual_object_names = [obj['name'] for obj in bucket_objects]
        for report in common.ALL_REPORT_FILES:
            expected_object_name = folder_prefix + common.ACHILLES_EXPORT_PREFIX_STRING + 'default' + '/' + report
            self.assertIn(expected_object_name, actual_object_names)

        datasources_json_path = folder_prefix + common.ACHILLES_EXPORT_DATASOURCES_JSON
        self.assertIn(datasources_json_path, actual_object_names)
        datasources_json = gcs_utils.get_object(bucket_nyc, datasources_json_path)
        datasources_actual = json.loads(datasources_json)
        datasources_expected = {
            'datasources': [
                {'name': 'default', 'folder': 'default', 'cdmVersion': 5}
            ]
        }
        self.assertDictEqual(datasources_expected, datasources_actual)
Example #20
def process_hpo(hpo_id, force_run=False):
    """
    runs validation for a single hpo_id

    :param hpo_id: which hpo_id to run for
    :param force_run: if True, process the latest submission whether or not it
        has already been processed before
    :raises
    BucketDoesNotExistError:
      Raised when a configured bucket does not exist
    InternalValidationError:
      Raised when an internal error is encountered during validation
    """
    try:
        logging.info('Processing hpo_id %s', hpo_id)
        bucket = gcs_utils.get_hpo_bucket(hpo_id)
        bucket_items = list_bucket(bucket)
        folder_prefix = _get_submission_folder(bucket, bucket_items, force_run)
        if folder_prefix is None:
            logging.info('No submissions to process in %s bucket %s', hpo_id,
                         bucket)
        else:
            if is_valid_folder_prefix_name(folder_prefix):
                # perform validation
                summary = validate_submission(hpo_id, bucket, bucket_items,
                                              folder_prefix)
                generate_metrics(hpo_id, bucket, folder_prefix, summary)
            else:
                # do not perform validation. Generate empty report and processed.txt
                generate_empty_report(hpo_id, bucket, folder_prefix)
    except BucketDoesNotExistError as bucket_error:
        bucket = bucket_error.bucket
        logging.warning(
            'Bucket `%s` configured for hpo_id `%s` does not exist', bucket,
            hpo_id)
    except HttpError as http_error:
        message = 'Failed to process hpo_id `%s` due to the following HTTP error: %s' % (
            hpo_id, http_error.content.decode())
        logging.exception(message)
Example #21
def run_export(hpo_id, folder_prefix):
    """
    Run export queries for an HPO, store the JSON payloads in its bucket, and update the datasources.json file
    """
    results = []
    logging.info('running export for hpo_id %s' % hpo_id)
    # TODO : add check for required tables
    hpo_bucket = gcs_utils.get_hpo_bucket(hpo_id)
    _reports_prefix = ACHILLES_EXPORT_PREFIX_STRING + hpo_id + "/"
    for export_name in common.ALL_REPORTS:
        sql_path = os.path.join(export.EXPORT_PATH, export_name)
        result = export.export_from_path(sql_path, hpo_id)
        content = json.dumps(result)
        fp = StringIO.StringIO(content)
        result = gcs_utils.upload_object(
            hpo_bucket,
            folder_prefix + _reports_prefix + export_name + '.json', fp)
        results.append(result)

    datasources_json_result = save_datasources_json(hpo_id, folder_prefix)
    results.append(datasources_json_result)

    return results
Example #22
 def load_dataset_from_files(dataset_id, path):
     app_id = bq_utils.app_identity.get_application_id()
     bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
     test_util.empty_bucket(bucket)
     job_ids = []
     for table in common.CDM_TABLES:
         filename = table + '.csv'
         schema = os.path.join(resources.fields_path, table + '.json')
         f = os.path.join(path, filename)
         if os.path.exists(os.path.join(path, filename)):
             with open(f, 'r') as fp:
                 gcs_utils.upload_object(bucket, filename, fp)
         else:
             test_util.write_cloud_str(bucket, filename, '\n')
         gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket, filename=filename)
         load_results = bq_utils.load_csv(schema, gcs_path, app_id, dataset_id, table, allow_jagged_rows=True)
         load_job_id = load_results['jobReference']['jobId']
         job_ids.append(load_job_id)
     incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
     if len(incomplete_jobs) > 0:
         message = "Job id(s) %s failed to complete" % incomplete_jobs
         raise RuntimeError(message)
     test_util.empty_bucket(bucket)
Example #23
 def test_load_ehr_observation(self):
     hpo_id = 'pitt'
     dataset_id = bq_utils.get_dataset_id()
     table_id = bq_utils.get_table_id(hpo_id, table_name='observation')
     q = 'SELECT observation_id FROM {dataset_id}.{table_id} ORDER BY observation_id'.format(
         dataset_id=dataset_id,
         table_id=table_id)
     expected_observation_ids = [int(row['observation_id'])
                                 for row in resources._csv_to_list(PITT_FIVE_PERSONS_OBSERVATION_CSV)]
     with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
         gcs_utils.upload_object(gcs_utils.get_hpo_bucket(hpo_id), 'observation.csv', fp)
     result = bq_utils.load_cdm_csv(hpo_id, 'observation')
     job_id = result['jobReference']['jobId']
     incomplete_jobs = bq_utils.wait_on_jobs([job_id])
     self.assertEqual(len(incomplete_jobs), 0, 'pitt_observation load job did not complete')
     load_job_result = bq_utils.get_job_details(job_id)
     load_job_result_status = load_job_result['status']
     load_job_errors = load_job_result_status.get('errors')
     self.assertIsNone(load_job_errors, msg='pitt_observation load job failed: ' + str(load_job_errors))
     query_results_response = bq_utils.query(q)
     query_job_errors = query_results_response.get('errors')
     self.assertIsNone(query_job_errors)
     actual_result = [int(row['f'][0]['v']) for row in query_results_response['rows']]
     self.assertListEqual(actual_result, expected_observation_ids)
Example #24
 def setUp(self):
     self.hpo_bucket = gcs_utils.get_hpo_bucket(FAKE_HPO_ID)
     self.gcs_path = '/'.join([self.hpo_bucket, 'dummy'])
     self._empty_bucket()
Example #25
 def setUp(self):
     self.hpo_bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
     test_util.empty_bucket(self.hpo_bucket)
     test_util.delete_all_tables(bq_utils.get_dataset_id())
Example #26
 def tearDown(self):
     self._empty_bucket()
     bucket_nyc = gcs_utils.get_hpo_bucket('nyc')
     test_util.empty_bucket(bucket_nyc)
     test_util.empty_bucket(gcs_utils.get_drc_bucket())
     self.testbed.deactivate()
Example #27
 def setUpClass(cls):
     fake_bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
     dataset_id = bq_utils.get_dataset_id()
     test_util.delete_all_tables(dataset_id)
     test_util.get_synpuf_results_files()
     test_util.populate_achilles(fake_bucket)
Example #28
 def tearDown(self):
     self._empty_bucket()
     bucket_nyc = gcs_utils.get_hpo_bucket('nyc')
     test_util.empty_bucket(bucket_nyc)
     test_util.empty_bucket(gcs_utils.get_drc_bucket())
     test_util.delete_all_tables(self.bigquery_dataset_id)
Example #29
 def _empty_hpo_buckets(self):
     for hpo_id in self.hpo_ids:
         bucket = gcs_utils.get_hpo_bucket(hpo_id)
         test_util.empty_bucket(bucket)
Example #30
def run_validation(hpo_id, force_run=False):
    """
    runs validation for a single hpo_id

    :param hpo_id: which hpo_id to run for
    :param force_run: if True, process the latest submission whether or not it has already been processed before
    :raises
    BucketDoesNotExistError:
      Raised when a configured bucket does not exist
    InternalValidationError:
      Raised when an internal error is encountered during validation
    """
    logging.info(' Validating hpo_id %s' % hpo_id)
    bucket = gcs_utils.get_hpo_bucket(hpo_id)
    bucket_items = list_bucket(bucket)
    to_process_folder_list = _get_to_process_list(bucket, bucket_items,
                                                  force_run)

    for folder_prefix in to_process_folder_list:
        logging.info('Processing gs://%s/%s' % (bucket, folder_prefix))
        # separate cdm from the unknown (unexpected) files
        found_cdm_files = []
        unknown_files = []
        found_pii_files = []
        folder_items = [
            item['name'].split('/')[1] for item in bucket_items
            if item['name'].startswith(folder_prefix)
        ]
        for item in folder_items:
            if _is_cdm_file(item):
                found_cdm_files.append(item)
            elif _is_pii_file(item):
                found_pii_files.append(item)
            else:
                is_known_file = item in common.IGNORE_LIST
                if not is_known_file:
                    unknown_files.append(item)

        errors = []
        results = []

        # Create all tables first to simplify downstream processes
        # (e.g. ehr_union doesn't have to check if tables exist)
        for file_name in common.CDM_FILES + common.PII_FILES:
            table_name = file_name.split('.')[0]
            table_id = bq_utils.get_table_id(hpo_id, table_name)
            bq_utils.create_standard_table(table_name,
                                           table_id,
                                           drop_existing=True)

        for cdm_file_name in common.CDM_FILES:
            file_results, file_errors = perform_validation_on_file(
                cdm_file_name, found_cdm_files, hpo_id, folder_prefix, bucket)
            results.extend(file_results)
            errors.extend(file_errors)

        for pii_file_name in common.PII_FILES:
            file_results, file_errors = perform_validation_on_file(
                pii_file_name, found_pii_files, hpo_id, folder_prefix, bucket)
            results.extend(file_results)
            errors.extend(file_errors)

        # (filename, message) for each unknown file
        warnings = [(unknown_file, UNKNOWN_FILE)
                    for unknown_file in unknown_files]

        # output to GCS
        _save_result_in_gcs(bucket, folder_prefix + RESULT_CSV, results)
        _save_errors_warnings_in_gcs(bucket, folder_prefix + ERRORS_CSV,
                                     errors, warnings)

        if all_required_files_loaded(hpo_id, folder_prefix=folder_prefix):
            run_achilles(hpo_id)
            run_export(hpo_id=hpo_id, folder_prefix=folder_prefix)

        logging.info('Uploading achilles index files to `gs://%s/%s`.' %
                     (bucket, folder_prefix))
        _upload_achilles_files(hpo_id, folder_prefix)

        now_datetime_string = datetime.datetime.now().strftime(
            '%Y-%m-%dT%H:%M:%S')
        logging.info(
            'Processing complete. Saving timestamp %s to `gs://%s/%s`.' %
            (now_datetime_string, bucket,
             folder_prefix + common.PROCESSED_TXT))
        _write_string_to_file(bucket, folder_prefix + common.PROCESSED_TXT,
                              now_datetime_string)
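A minimal driver sketch for run_validation; the hpo_id values are made up, and the loop is assumed to run in the module that defines the function above.

# Validate the latest unprocessed submissions for each configured site.
for hpo_id in ['nyc', 'pitt']:
    run_validation(hpo_id)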