Example 1
    def _load_data(self):

        # Load measurement_concept_sets
        required_labs.load_measurement_concept_sets_table(
            project_id=self.project_id, dataset_id=self.dataset_id)
        # Load measurement_concept_sets_descendants
        required_labs.load_measurement_concept_sets_descendants_table(
            project_id=self.project_id, dataset_id=self.dataset_id)

        # Create empty concept and concept_ancestor tables if they do not exist
        if not bq_utils.table_exists(common.CONCEPT, self.dataset_id):
            bq_utils.create_standard_table(common.CONCEPT, common.CONCEPT)
        if not bq_utils.table_exists(common.CONCEPT_ANCESTOR, self.dataset_id):
            bq_utils.create_standard_table(common.CONCEPT_ANCESTOR,
                                           common.CONCEPT_ANCESTOR)

        # measurement.csv must be loaded into the dataset in advance for the other integration tests
        ehr_measurement_result = bq_utils.load_table_from_csv(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            table_name=bq_utils.get_table_id(FAKE_HPO_ID, common.MEASUREMENT),
            csv_path=test_util.FIVE_PERSONS_MEASUREMENT_CSV,
            fields=resources.fields_for(common.MEASUREMENT))
        bq_utils.wait_on_jobs(
            [ehr_measurement_result['jobReference']['jobId']])
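All of these examples share one pattern: submit one or more BigQuery jobs through bq_utils, collect each job's jobReference.jobId, block on bq_utils.wait_on_jobs, and treat any ids still returned as failures. A minimal sketch of that pattern follows; the helper name wait_or_raise is invented here, and it assumes only the bq_utils behavior visible in these snippets.

def wait_or_raise(job_results):
    """Block on the given job results; raise if any job failed to complete."""
    job_ids = [result['jobReference']['jobId'] for result in job_results]
    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    if incomplete_jobs:
        raise RuntimeError('Job id(s) %s failed to complete' % incomplete_jobs)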
Example 2
def populate_achilles(hpo_bucket, hpo_id=FAKE_HPO_ID, include_heel=True):
    from validation import achilles, achilles_heel
    import app_identity

    app_id = app_identity.get_application_id()

    test_file_name = achilles.ACHILLES_ANALYSIS + '.csv'
    achilles_analysis_file_path = os.path.join(TEST_DATA_EXPORT_PATH,
                                               test_file_name)
    schema_name = achilles.ACHILLES_ANALYSIS
    write_cloud_file(hpo_bucket, achilles_analysis_file_path)
    gcs_path = 'gs://' + hpo_bucket + '/' + test_file_name
    dataset_id = bq_utils.get_dataset_id()
    table_id = bq_utils.get_table_id(hpo_id, achilles.ACHILLES_ANALYSIS)
    bq_utils.load_csv(schema_name, gcs_path, app_id, dataset_id, table_id)

    table_names = [achilles.ACHILLES_RESULTS, achilles.ACHILLES_RESULTS_DIST]
    if include_heel:
        table_names.append(achilles_heel.ACHILLES_HEEL_RESULTS)

    running_jobs = []
    for table_name in table_names:
        test_file_name = table_name + '.csv'
        test_file_path = os.path.join(TEST_DATA_EXPORT_SYNPUF_PATH,
                                      table_name + '.csv')
        write_cloud_file(hpo_bucket, test_file_path)
        gcs_path = 'gs://' + hpo_bucket + '/' + test_file_name
        dataset_id = bq_utils.get_dataset_id()
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        load_results = bq_utils.load_csv(table_name, gcs_path, app_id,
                                         dataset_id, table_id)
        running_jobs.append(load_results['jobReference']['jobId'])
    bq_utils.wait_on_jobs(running_jobs)
Example 3
    def load_test_data(self, hpo_id: str = None):
        """
        Load test achilles heel results data from a csv file into BigQuery

        :param hpo_id: if specified, prefix to use on csv test file and bq table, otherwise no prefix is used
        :return: contents of the file as list of objects
        """

        table_name: str = common.ACHILLES_HEEL_RESULTS
        if hpo_id is not None:
            table_id: str = bq_utils.get_table_id(hpo_id, table_name)
        else:
            table_id: str = table_name
        test_file_name: str = f'{table_id}.csv'
        test_file_path: str = os.path.join(test_util.TEST_DATA_PATH,
                                           test_file_name)

        target_bucket = self.storage_client.get_bucket(self.bucket)
        test_blob = target_bucket.blob(test_file_name)
        test_blob.upload_from_filename(test_file_path)

        gcs_path: str = f'gs://{self.bucket}/{test_file_name}'
        load_results = bq_utils.load_csv(table_name, gcs_path, self.project_id,
                                         self.dataset_id, table_id)
        job_id = load_results['jobReference']['jobId']
        bq_utils.wait_on_jobs([job_id])
        return resources.csv_to_list(test_file_path)
Example 4
 def load_dataset_from_files(dataset_id, path):
     app_id = bq_utils.app_identity.get_application_id()
     bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
     test_util.empty_bucket(bucket)
     job_ids = []
     for table in common.CDM_TABLES:
         filename = table + '.csv'
         schema = os.path.join(resources.fields_path, table + '.json')
         f = os.path.join(path, filename)
         if os.path.exists(f):
             with open(f, 'r') as fp:
                 gcs_utils.upload_object(bucket, filename, fp)
         else:
             test_util.write_cloud_str(bucket, filename, '\n')
         gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket,
                                                      filename=filename)
         load_results = bq_utils.load_csv(schema,
                                          gcs_path,
                                          app_id,
                                          dataset_id,
                                          table,
                                          allow_jagged_rows=True)
         load_job_id = load_results['jobReference']['jobId']
         job_ids.append(load_job_id)
     incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
     if len(incomplete_jobs) > 0:
         message = "Job id(s) %s failed to complete" % incomplete_jobs
         raise RuntimeError(message)
     test_util.empty_bucket(bucket)
Example 5
def run_heel(hpo_id):
    # very long test
    commands = _get_heel_commands(hpo_id)
    count = 0
    for command in commands:
        count = count + 1
        logging.debug(' ---- running query # {}'.format(count))
        logging.debug(' ---- Running `%s`...\n' % command)
        if sql_wrangle.is_to_temp_table(command):
            table_id = sql_wrangle.get_temp_table_name(command)
            query = sql_wrangle.get_temp_table_query(command)
            insert_query_job_result = bq_utils.query(query, False, table_id)
            query_job_id = insert_query_job_result['jobReference']['jobId']

            incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
            if len(incomplete_jobs) > 0:
                logging.critical('tempresults was not created within 30 seconds')
                raise RuntimeError('Tempresults taking too long to create')
        elif sql_wrangle.is_truncate(command):
            table_id = sql_wrangle.get_truncate_table_name(command)
            query = 'DELETE FROM %s WHERE TRUE' % table_id
            bq_utils.query(query)
        elif sql_wrangle.is_drop(command):
            table_id = sql_wrangle.get_drop_table_name(command)
            bq_utils.delete_table(table_id)
        else:
            bq_utils.query(command)
            time.sleep(1)
Example 6
 def _load_datasets(self):
     """
     Load five persons data for each test hpo
     """
     # expected_tables is for testing output
     # it maps table name to list of expected records ex: "unioned_ehr_visit_occurrence" -> [{}, {}, ...]
     expected_tables = dict()
     running_jobs = []
     for cdm_table in common.CDM_TABLES:
         cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                      cdm_table + '.csv')
         output_table = ehr_union.output_table_for(cdm_table)
         expected_tables[output_table] = []
         for hpo_id in self.hpo_ids:
             # upload csv into hpo bucket
             bucket = gcs_utils.get_hpo_bucket(hpo_id)
             if os.path.exists(cdm_file_name):
                 test_util.write_cloud_file(bucket, cdm_file_name)
                 csv_rows = resources._csv_to_list(cdm_file_name)
             else:
                 # results in empty table
                 test_util.write_cloud_str(bucket, cdm_table + '.csv',
                                           'dummy\n')
                 csv_rows = []
             # load table from csv
             result = bq_utils.load_cdm_csv(hpo_id, cdm_table)
             running_jobs.append(result['jobReference']['jobId'])
             expected_tables[output_table] += list(csv_rows)
     incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
     if len(incomplete_jobs) > 0:
         message = "Job id(s) %s failed to complete" % incomplete_jobs
         raise RuntimeError(message)
     self.expected_tables = expected_tables
Example 7
 def _load_datasets(self):
     load_jobs = []
     self.expected_tables = dict()
     for cdm_table in common.CDM_TABLES:
         cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                      cdm_table + '.csv')
         result_table = ehr_merge.result_table_for(cdm_table)
         if os.path.exists(cdm_file_name):
             # one copy for chs, the other for pitt
             csv_rows = resources._csv_to_list(cdm_file_name)
             self.expected_tables[result_table] = csv_rows + list(csv_rows)
             test_util.write_cloud_file(self.chs_bucket, cdm_file_name)
             test_util.write_cloud_file(self.pitt_bucket, cdm_file_name)
         else:
             self.expected_tables[result_table] = []
             test_util.write_cloud_str(self.chs_bucket, cdm_table + '.csv',
                                       'dummy\n')
             test_util.write_cloud_str(self.pitt_bucket, cdm_table + '.csv',
                                       'dummy\n')
         chs_load_results = bq_utils.load_cdm_csv(CHS_HPO_ID, cdm_table)
         pitt_load_results = bq_utils.load_cdm_csv(PITT_HPO_ID, cdm_table)
         chs_load_job_id = chs_load_results['jobReference']['jobId']
         pitt_load_job_id = pitt_load_results['jobReference']['jobId']
         load_jobs.append(chs_load_job_id)
         load_jobs.append(pitt_load_job_id)
     incomplete_jobs = bq_utils.wait_on_jobs(load_jobs)
     if len(incomplete_jobs) > 0:
         raise RuntimeError('BigQuery jobs %s failed to complete' %
                            incomplete_jobs)
Example 8
def query(q, dst_table_id, dst_dataset_id, write_disposition='WRITE_APPEND'):
    """
    Run query and save results to a table

    :param q: SQL statement
    :param dst_table_id: save results in a table with the specified id
    :param dst_dataset_id: identifies output dataset
    :param write_disposition: WRITE_TRUNCATE, WRITE_EMPTY, or WRITE_APPEND (default, to preserve schema)
    :return: query result
    """
    query_job_result = bq_utils.query(q,
                                      destination_table_id=dst_table_id,
                                      destination_dataset_id=dst_dataset_id,
                                      write_disposition=write_disposition)
    query_job_id = query_job_result['jobReference']['jobId']
    logging.info(f'Job {query_job_id} started for table {dst_table_id}')
    job_status = query_job_result['status']
    error_result = job_status.get('errorResult')
    if error_result is not None:
        msg = f'Job {query_job_id} failed because: {error_result}'
        raise bq_utils.InvalidOperationError(msg)
    incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
    if len(incomplete_jobs) > 0:
        raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
    return query_job_result
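For illustration, a hypothetical call to the wrapper above; the SQL, dataset, and table names are placeholders rather than anything from the source.

result = query(
    'SELECT person_id FROM my_dataset.person',  # placeholder SQL
    dst_table_id='person_copy',                 # placeholder destination table
    dst_dataset_id='my_dataset',                # placeholder destination dataset
    write_disposition='WRITE_TRUNCATE')
logging.info(f"Copy finished as job {result['jobReference']['jobId']}")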
Example 9
    def test_load_csv(self):
        from google.appengine.api import app_identity

        app_id = app_identity.get_application_id()
        table_name = 'achilles_analysis'
        schema_file_name = table_name + '.json'
        csv_file_name = table_name + '.csv'
        schema_path = os.path.join(resources.fields_path, schema_file_name)
        local_csv_path = os.path.join(test_util.TEST_DATA_EXPORT_PATH,
                                      csv_file_name)
        with open(local_csv_path, 'r') as fp:
            response = gcs_utils.upload_object(self.hpo_bucket, csv_file_name,
                                               fp)
        hpo_bucket = self.hpo_bucket
        gcs_object_path = 'gs://%(hpo_bucket)s/%(csv_file_name)s' % locals()
        dataset_id = bq_utils.get_dataset_id()
        load_results = bq_utils.load_csv(schema_path, gcs_object_path, app_id,
                                         dataset_id, table_name)

        load_job_id = load_results['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
        self.assertEqual(len(incomplete_jobs), 0,
                         'loading table {} timed out'.format(table_name))
        query_response = bq_utils.query('SELECT COUNT(1) FROM %(table_name)s' %
                                        locals())
        self.assertEqual(query_response['kind'], 'bigquery#queryResponse')
Example 10
def run_heel(hpo_id):
    """
    Run heel commands

    :param hpo_id:  string name for the hpo identifier
    :returns: None
    :raises RuntimeError: Raised if BigQuery takes longer than 30 seconds
        to complete a job on a temporary table
    """
    # very long test
    commands = _get_heel_commands(hpo_id)
    count = 0
    for command in commands:
        count = count + 1
        logging.debug(' ---- running query # {}'.format(count))
        logging.debug(' ---- Running `%s`...\n' % command)
        if sql_wrangle.is_to_temp_table(command):
            table_id = sql_wrangle.get_temp_table_name(command)
            query = sql_wrangle.get_temp_table_query(command)
            insert_query_job_result = bq_utils.query(query, False, table_id)
            query_job_id = insert_query_job_result['jobReference']['jobId']

            incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
            if len(incomplete_jobs) > 0:
                logging.critical('tempresults was not created within 30 seconds')
                raise RuntimeError('Tempresults taking too long to create')
        elif sql_wrangle.is_truncate(command):
            table_id = sql_wrangle.get_truncate_table_name(command)
            query = 'DELETE FROM %s WHERE TRUE' % table_id
            bq_utils.query(query)
        elif sql_wrangle.is_drop(command):
            table_id = sql_wrangle.get_drop_table_name(command)
            bq_utils.delete_table(table_id)
        else:
            bq_utils.query(command)
Example 11
def create_mapping_table(hpos_with_visit, project_id, dataset_id):
    """ creates the visit mapping table

    :hpos_with_visit: hpos that should be including the visit occurrence table
    :project_id: project with the dataset
    :dataset_id: dataset with the tables
    :returns: string if visit table failed; otherwise none

    """
    # list of hpos with visit table and creating visit id mapping table queries
    visit_hpo_queries = []
    for hpo in hpos_with_visit:
        visit_hpo_queries.append(VISIT_ID_HPO_BLOCK % locals())
    union_all_blocks = '\n UNION ALL'.join(visit_hpo_queries)
    visit_mapping_query = VIST_ID_MAPPING_QUERY_SKELETON % locals()
    logging.info('Loading ' + VISIT_ID_MAPPING_TABLE)
    query_result = query(visit_mapping_query,
                         destination_table_id=VISIT_ID_MAPPING_TABLE,
                         write_disposition='WRITE_TRUNCATE')
    visit_mapping_query_job_id = query_result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([visit_mapping_query_job_id])
    if len(incomplete_jobs) == 0:
        query_result = bq_utils.get_job_details(visit_mapping_query_job_id)
        if 'errors' in query_result['status']:
            errors = query_result['status']['errors']
            message = 'Failed to load %s due to the following error(s): %s' % (
                VISIT_ID_MAPPING_TABLE, errors)
            logging.error(message)
            raise RuntimeError(message)
    else:
        message = 'Failed to load %s. Job with ID %s never completed.' % (
            VISIT_ID_MAPPING_TABLE, visit_mapping_query_job_id)
        logging.error(message)
        raise RuntimeError(message)
Example 12
def create_match_values_table(project, rdr_dataset, destination_dataset):
    """
    Get the desired matching values from the combined observation table.

    This retrieves all possible matching values from the observation table
    used in participant matching.  Returned query data is limited to the
    person_id, observation_concept_id, and the value_as_string fields.

    :param project:  The project name to query
    :param rdr_dataset:  The rdr dataset name to query
    :param destination_dataset:  The dataset to write the result table to

    :return: The name of the interim table
    """
    query_string = consts.ALL_PPI_OBSERVATION_VALUES.format(
        project=project,
        dataset=rdr_dataset,
        table=consts.OBSERVATION_TABLE,
        pii_list=','.join(consts.PII_CODES_LIST))

    LOGGER.info("Participant validation ran the query\n%s", query_string)
    results = bq_utils.query(query_string,
                             destination_dataset_id=destination_dataset,
                             destination_table_id=consts.ID_MATCH_TABLE,
                             write_disposition='WRITE_TRUNCATE',
                             batch=True)

    query_job_id = results['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
    if incomplete_jobs != []:
        raise bq_utils.BigQueryJobWaitError(incomplete_jobs)

    return consts.ID_MATCH_TABLE
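A hedged usage sketch of create_match_values_table; the project and dataset names below are placeholders invented for the example.

# all three arguments are placeholders
match_table = create_match_values_table(project='my-project',
                                         rdr_dataset='rdr_dataset',
                                         destination_dataset='validation_dataset')
# returns consts.ID_MATCH_TABLE, the name of the interim table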
Example 13
 def test_load_ehr_observation(self):
     observation_query = "SELECT observation_id FROM {}.{} ORDER BY observation_id"
     hpo_id = 'pitt'
     expected_observation_ids = [
         int(row['observation_id']) for row in resources._csv_to_list(
             PITT_FIVE_PERSONS_OBSERVATION_CSV)
     ]
     with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
         gcs_utils.upload_object(gcs_utils.get_hpo_bucket(hpo_id),
                                 'observation.csv', fp)
     result = bq_utils.load_cdm_csv(hpo_id, 'observation')
     job_id = result['jobReference']['jobId']
     incomplete_jobs = bq_utils.wait_on_jobs([job_id])
     self.assertEqual(len(incomplete_jobs), 0,
                      'pitt_observation load job did not complete')
     load_job_result = bq_utils.get_job_details(job_id)
     load_job_result_status = load_job_result['status']
     load_job_errors = load_job_result_status.get('errors')
     self.assertIsNone(load_job_errors,
                       msg='pitt_observation load job failed: ' +
                       str(load_job_errors))
     query_string = observation_query.format(bq_utils.get_dataset_id(),
                                             'pitt_observation')
     query_results_response = bq_utils.query_table(query_string)
     query_job_errors = query_results_response.get('errors')
     self.assertIsNone(query_job_errors)
     actual_result = [
         int(row['f'][0]['v']) for row in query_results_response['rows']
     ]
     self.assertListEqual(actual_result, expected_observation_ids)
Example 14
def retraction_query_runner(queries):
    query_job_ids = []
    for query_dict in queries:
        logger.debug('Retracting from %s.%s using query %s' %
                     (query_dict[DEST_DATASET], query_dict[DEST_TABLE],
                      query_dict[QUERY]))
        if query_dict[DELETE_FLAG]:
            job_results = bq_utils.query(q=query_dict[QUERY], batch=True)
            rows_affected = job_results['numDmlAffectedRows']
            logger.debug('%s rows deleted from %s.%s' %
                         (rows_affected, query_dict[DEST_DATASET],
                          query_dict[DEST_TABLE]))
        else:
            job_results = bq_utils.query(
                q=query_dict[QUERY],
                destination_table_id=query_dict[DEST_TABLE],
                write_disposition=WRITE_TRUNCATE,
                destination_dataset_id=query_dict[DEST_DATASET],
                batch=True)
        query_job_id = job_results['jobReference']['jobId']
        query_job_ids.append(query_job_id)

    incomplete_jobs = bq_utils.wait_on_jobs(query_job_ids)
    if incomplete_jobs:
        logger.debug('Failed on {count} job ids {ids}'.format(
            count=len(incomplete_jobs), ids=incomplete_jobs))
        logger.debug('Terminating retraction')
        raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
Example 15
 def test_load_ehr_observation(self):
     hpo_id = 'pitt'
     dataset_id = self.dataset_id
     table_id = bq_utils.get_table_id(hpo_id, table_name='observation')
     q = 'SELECT observation_id FROM {dataset_id}.{table_id} ORDER BY observation_id'.format(
         dataset_id=dataset_id, table_id=table_id)
     expected_observation_ids = [
         int(row['observation_id'])
         for row in resources.csv_to_list(PITT_FIVE_PERSONS_OBSERVATION_CSV)
     ]
     sc_bucket = self.client.get_bucket(gcs_utils.get_hpo_bucket(hpo_id))
     bucket_blob = sc_bucket.blob('observation.csv')
     with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
         bucket_blob.upload_from_file(fp)
     result = bq_utils.load_cdm_csv(hpo_id, 'observation')
     job_id = result['jobReference']['jobId']
     incomplete_jobs = bq_utils.wait_on_jobs([job_id])
     self.assertEqual(len(incomplete_jobs), 0,
                      'pitt_observation load job did not complete')
     load_job_result = bq_utils.get_job_details(job_id)
     load_job_result_status = load_job_result['status']
     load_job_errors = load_job_result_status.get('errors')
     self.assertIsNone(load_job_errors,
                       msg='pitt_observation load job failed: ' +
                       str(load_job_errors))
     query_results_response = bq_utils.query(q)
     query_job_errors = query_results_response.get('errors')
     self.assertIsNone(query_job_errors)
     actual_result = [
         int(row['f'][0]['v']) for row in query_results_response['rows']
     ]
     self.assertCountEqual(actual_result, expected_observation_ids)
Example 16
def run_analyses(hpo_id):
    """
    Run the achilles analyses
    :param hpo_id: identifies the hpo site
    :return: None
    """
    commands = _get_run_analysis_commands(hpo_id)
    for command in commands:
        logging.debug(' ---- Running `%s`...\n' % command)
        if sql_wrangle.is_to_temp_table(command):
            table_id = sql_wrangle.get_temp_table_name(command)
            query = sql_wrangle.get_temp_table_query(command)
            insert_query_job_result = bq_utils.query(query, False, table_id)
            query_job_id = insert_query_job_result['jobReference']['jobId']

            incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
            if len(incomplete_jobs) > 0:
                logging.critical('tempresults was not created within 15 seconds')
                raise RuntimeError('Tempresults taking too long to create')
        elif sql_wrangle.is_truncate(command):
            table_id = sql_wrangle.get_truncate_table_name(command)
            if bq_utils.table_exists(table_id):
                bq_utils.delete_table(table_id)
        elif sql_wrangle.is_drop(command):
            table_id = sql_wrangle.get_drop_table_name(command)
            if bq_utils.table_exists(table_id):
                bq_utils.delete_table(table_id)
        else:
            bq_utils.query(command)
Example 17
    def test_merge_with_unmatched_schema(self):
        running_jobs = []
        with open(NYC_FIVE_PERSONS_MEASUREMENT_CSV, 'rb') as fp:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket('nyc'),
                                    'measurement.csv', fp)
        result = bq_utils.load_cdm_csv('nyc', 'measurement')
        running_jobs.append(result['jobReference']['jobId'])

        with open(PITT_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket('pitt'),
                                    'person.csv', fp)
        result = bq_utils.load_cdm_csv('pitt', 'person')
        running_jobs.append(result['jobReference']['jobId'])

        incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
        self.assertEqual(
            len(incomplete_jobs), 0,
            'loading tables {},{} timed out'.format('nyc_measurement',
                                                    'pitt_person'))

        table_names = ['nyc_measurement', 'pitt_person']
        success, error = bq_utils.merge_tables(bq_utils.get_dataset_id(),
                                               table_names,
                                               bq_utils.get_dataset_id(),
                                               'merged_nyc_pitt')
        self.assertFalse(success)
Example 18
def merge(dataset_id, project_id):
    """merge hpo ehr data

    :dataset_id: source and target dataset
    :project_id: project in which everything happens
    :returns: list of tables generated successfully

    """
    logging.info('Starting merge')
    existing_tables = bq_utils.list_dataset_contents(dataset_id)
    hpos_to_merge = []
    hpos_with_visit = []
    for item in resources.hpo_csv():
        hpo_id = item['hpo_id']
        if hpo_id + '_person' in existing_tables:
            hpos_to_merge.append(hpo_id)
        if hpo_id + '_visit_occurrence' in existing_tables:
            hpos_with_visit.append(hpo_id)
    logging.info('HPOs to merge: %s' % hpos_to_merge)
    logging.info('HPOs with visit_occurrence: %s' % hpos_with_visit)
    create_mapping_table(hpos_with_visit, project_id, dataset_id)

    # before loading [drop and] create all tables to ensure they are set up properly
    for cdm_file_name in common.CDM_FILES:
        cdm_table_name = cdm_file_name.split('.')[0]
        result_table = result_table_for(cdm_table_name)
        bq_utils.create_standard_table(cdm_table_name,
                                       result_table,
                                       drop_existing=True)

    jobs_to_wait_on = []
    for table_name in common.CDM_TABLES:
        q = construct_query(table_name, hpos_to_merge, hpos_with_visit,
                            project_id, dataset_id)
        logging.info('Merging table: ' + table_name)
        result_table = result_table_for(table_name)
        query_result = query(q,
                             destination_table_id=result_table,
                             write_disposition='WRITE_TRUNCATE')
        query_job_id = query_result['jobReference']['jobId']
        jobs_to_wait_on.append(query_job_id)

    incomplete_jobs = bq_utils.wait_on_jobs(jobs_to_wait_on)
    if len(incomplete_jobs) == 0:
        tables_created = []
        for job_id in jobs_to_wait_on:
            job_details = bq_utils.get_job_details(job_id)
            status = job_details['status']
            table = job_details['configuration']['query']['destinationTable'][
                'tableId']
            if 'errors' in status:
                logging.error('Job ID %s errors: %s' %
                              (job_id, status['errors']))
            else:
                tables_created.append(table)
        return tables_created
    else:
        message = "Merge failed because job id(s) %s did not complete." % incomplete_jobs
        logging.error(message)
        raise RuntimeError(message)
Example 19
    def _load_datasets(self):
        """
        Load five persons data for the nyc and pitt test hpos and rdr data for the excluded_hpo.

        expected_tables is for testing output; it maps each table name to its list of
        expected records, e.g. "unioned_ehr_visit_occurrence" -> [{}, {}, ...]
        """
        expected_tables: dict = {}
        running_jobs: list = []
        for cdm_table in resources.CDM_TABLES:
            output_table: str = ehr_union.output_table_for(cdm_table)
            expected_tables[output_table] = []
            for hpo_id in self.hpo_ids:
                # upload csv into hpo bucket
                cdm_filename: str = f'{cdm_table}.csv'
                if hpo_id == NYC_HPO_ID:
                    cdm_filepath: str = os.path.join(
                        test_util.FIVE_PERSONS_PATH, cdm_filename)
                elif hpo_id == PITT_HPO_ID:
                    cdm_filepath: str = os.path.join(
                        test_util.PITT_FIVE_PERSONS_PATH, cdm_filename)
                elif hpo_id == EXCLUDED_HPO_ID:
                    if cdm_table in [
                            'observation', 'person', 'visit_occurrence'
                    ]:
                        cdm_filepath: str = os.path.join(
                            test_util.RDR_PATH, cdm_filename)
                bucket: str = gcs_utils.get_hpo_bucket(hpo_id)
                gcs_bucket = self.storage_client.get_bucket(bucket)
                if os.path.exists(cdm_filepath):

                    csv_rows = resources.csv_to_list(cdm_filepath)
                    cdm_blob = gcs_bucket.blob(cdm_filename)
                    cdm_blob.upload_from_filename(cdm_filepath)

                else:
                    # results in empty table
                    cdm_blob = gcs_bucket.blob(cdm_filename)
                    cdm_blob.upload_from_string('dummy\n')
                    csv_rows: list = []
                # load table from csv
                result = bq_utils.load_cdm_csv(hpo_id, cdm_table)
                running_jobs.append(result['jobReference']['jobId'])
                if hpo_id != EXCLUDED_HPO_ID:
                    expected_tables[output_table] += list(csv_rows)
        # ensure person to observation output is as expected
        output_table_person: str = ehr_union.output_table_for(common.PERSON)
        output_table_observation: str = ehr_union.output_table_for(
            common.OBSERVATION)
        expected_tables[output_table_observation] += 4 * expected_tables[
            output_table_person]

        incomplete_jobs: list = bq_utils.wait_on_jobs(running_jobs)
        if len(incomplete_jobs) > 0:
            message: str = "Job id(s) %s failed to complete" % incomplete_jobs
            raise RuntimeError(message)
        self.expected_tables = expected_tables
Example 20
def perform_validation_on_file(file_name, found_file_names, hpo_id,
                               folder_prefix, bucket):
    """
    Attempts to load a csv file into BigQuery

    :param file_name: name of the file to validate
    :param found_file_names: files found in the submission folder
    :param hpo_id: identifies the hpo site
    :param folder_prefix: directory containing the submission
    :param bucket: bucket containing the submission
    :return: tuple (results, errors) where
     results is list of tuples (file_name, found, parsed, loaded)
     errors is list of tuples (file_name, message)
    """
    errors = []
    results = []
    logging.info(f"Validating file '{file_name}'")
    found = parsed = loaded = 0
    table_name = file_name.split('.')[0]

    if file_name in found_file_names:
        found = 1
        load_results = bq_utils.load_from_csv(hpo_id, table_name,
                                              folder_prefix)
        load_job_id = load_results['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])

        if not incomplete_jobs:
            job_resource = bq_utils.get_job_details(job_id=load_job_id)
            job_status = job_resource['status']
            if 'errorResult' in job_status:
                # These are issues (which we report back) as opposed to internal errors
                issues = [item['message'] for item in job_status['errors']]
                errors.append((file_name, ' || '.join(issues)))
                logging.info(
                    f"Issues found in gs://{bucket}/{folder_prefix}/{file_name}"
                )
                for issue in issues:
                    logging.info(issue)
            else:
                # Processed ok
                parsed = loaded = 1
        else:
            # Incomplete jobs are internal unrecoverable errors.
            # Aborting the process allows for this submission to be validated when system recovers.
            message = (
                f"Loading hpo_id '{hpo_id}' table '{table_name}' failed because "
                f"job id '{load_job_id}' did not complete.\n")
            message += f"Aborting processing 'gs://{bucket}/{folder_prefix}'."
            logging.error(message)
            raise InternalValidationError(message)

    if file_name in common.SUBMISSION_FILES:
        results.append((file_name, found, parsed, loaded))

    return results, errors
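A usage sketch of the helper above, with an invented hpo id, bucket, and folder prefix; only the call shape comes from the source.

results, errors = perform_validation_on_file(
    file_name='person.csv',
    found_file_names=['person.csv', 'measurement.csv'],  # files seen in the submission
    hpo_id='fake_hpo',                                    # placeholder hpo id
    folder_prefix='2020-01-01-v1/',                       # placeholder submission folder
    bucket='fake-hpo-bucket')                             # placeholder bucket name
if errors:
    logging.warning(f'Validation issues: {errors}')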
Example 21
    def test_integration_queries_to_retract_from_fake_dataset(self, mock_list_datasets, mock_is_ehr_dataset):
        mock_list_datasets.return_value = [{'id': self.project_id+':'+self.bq_dataset_id}]
        mock_is_ehr_dataset.return_value = True

        job_ids = []
        row_count_queries = {}
        # load the cdm files into dataset
        for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
            cdm_file_name = os.path.basename(cdm_file)
            cdm_table = cdm_file_name.split('.')[0]
            hpo_table = bq_utils.get_table_id(self.hpo_id, cdm_table)
            # store query for checking number of rows to delete
            row_count_queries[hpo_table] = EXPECTED_ROWS_QUERY.format(dataset_id=self.bq_dataset_id,
                                                                      table_id=hpo_table,
                                                                      pids=retract_data_bq.int_list_to_bq(
                                                                                self.person_ids))
            retract_data_bq.logger.debug('Preparing to load table %s.%s' % (self.bq_dataset_id,
                                                                            hpo_table))
            with open(cdm_file, 'rb') as f:
                gcs_utils.upload_object(gcs_utils.get_hpo_bucket(self.hpo_id), cdm_file_name, f)
            result = bq_utils.load_cdm_csv(self.hpo_id, cdm_table, dataset_id=self.bq_dataset_id)
            retract_data_bq.logger.debug('Loading table %s.%s' % (self.bq_dataset_id,
                                                                  hpo_table))
            job_id = result['jobReference']['jobId']
            job_ids.append(job_id)
        incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
        self.assertEqual(len(incomplete_jobs), 0, 'NYC five person load job did not complete')
        retract_data_bq.logger.debug('All tables loaded successfully')

        # use query results to count number of expected row deletions
        expected_row_count = {}
        for table in row_count_queries:
            result = bq_utils.query(row_count_queries[table])
            expected_row_count[table] = retract_data_bq.to_int(result['totalRows'])

        # separate check to find number of actual deleted rows
        q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
        q_result = bq_utils.query(q)
        result = bq_utils.response2rows(q_result)
        row_count_before_retraction = {}
        for row in result:
            row_count_before_retraction[row['table_id']] = row['row_count']
        deid_flag = False

        # perform retraction
        retract_data_bq.run_retraction(self.test_project_id, self.person_ids, self.hpo_id, deid_flag)

        # find actual deleted rows
        q_result = bq_utils.query(q)
        result = bq_utils.response2rows(q_result)
        row_count_after_retraction = {}
        for row in result:
            row_count_after_retraction[row['table_id']] = row['row_count']
        for table in expected_row_count:
            self.assertEqual(expected_row_count[table],
                             row_count_before_retraction[table] - row_count_after_retraction[table])
Example 22
def load_measurement_concept_sets_descendants_table(project_id, dataset_id):
    """
    Loads the measurement_concept_sets_descendants table using LOINC group and LOINC hierarchy
    into project_id.ehr_ops

    :param project_id: Project where the dataset resides
    :param dataset_id: Dataset where the required lab table needs to be created
    :return: None
    """

    check_and_copy_tables(project_id, dataset_id)

    identify_labs_query = IDENTIFY_LABS_QUERY.format(
        project_id=project_id,
        ehr_ops_dataset_id=dataset_id,
        vocab_dataset_id=dataset_id,
        measurement_concept_sets=MEASUREMENT_CONCEPT_SETS_TABLE)

    try:
        LOGGER.info(f"Running query {identify_labs_query}")
        results = bq_utils.query(
            identify_labs_query,
            use_legacy_sql=False,
            destination_table_id=MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE,
            retry_count=bq_consts.BQ_DEFAULT_RETRY_COUNT,
            write_disposition=bq_consts.WRITE_TRUNCATE,
            destination_dataset_id=dataset_id,
            batch=None)

    except (oauth2client.client.HttpAccessTokenRefreshError,
            googleapiclient.errors.HttpError):
        LOGGER.exception(f"FAILED:  Clean rule not executed:\n{app_identity}")

    query_job_id = results['jobReference']['jobId']
    bq_utils.wait_on_jobs([query_job_id])

    updated_rows = results.get("totalRows")
    if updated_rows is not None:
        LOGGER.info(
            f"Query returned {updated_rows} rows for {dataset_id}.{MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE}"
        )
Example 23
    def load_test_data(self, hpo_id=None):
        """
        Load test achilles heel results data from a csv file into BigQuery

        :param hpo_id: if specified, prefix to use on csv test file and bq table, otherwise no prefix is used
        :return: contents of the file as list of objects
        """
        schema_path = os.path.join(resources.fields_path,
                                   common.ACHILLES_HEEL_RESULTS + '.json')
        table_id = common.ACHILLES_HEEL_RESULTS
        if hpo_id is not None:
            table_id = bq_utils.get_table_id(hpo_id,
                                             common.ACHILLES_HEEL_RESULTS)
        test_file_name = table_id + '.csv'
        test_file_path = os.path.join(test_util.TEST_DATA_PATH, test_file_name)
        test_util.write_cloud_file(self.bucket, test_file_path)
        gcs_path = 'gs://' + self.bucket + '/' + test_file_name
        load_results = bq_utils.load_csv(schema_path, gcs_path, self.app_id,
                                         self.dataset_id, table_id)
        job_id = load_results['jobReference']['jobId']
        bq_utils.wait_on_jobs([job_id])
        return resources._csv_to_list(test_file_path)
Example 24
def populate_achilles(hpo_id=FAKE_HPO_ID, include_heel=True):
    from validation import achilles, achilles_heel
    import app_identity

    app_id = app_identity.get_application_id()
    test_resources_bucket = RESOURCES_BUCKET_FMT.format(project_id=app_id)
    table_names = [
        achilles.ACHILLES_ANALYSIS, achilles.ACHILLES_RESULTS,
        achilles.ACHILLES_RESULTS_DIST
    ]
    if include_heel:
        table_names.append(achilles_heel.ACHILLES_HEEL_RESULTS)

    running_jobs = []
    for table_name in table_names:
        gcs_path = f'gs://{test_resources_bucket}/{table_name}.csv'
        dataset_id = bq_utils.get_dataset_id()
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        load_results = bq_utils.load_csv(table_name, gcs_path, app_id,
                                         dataset_id, table_id)
        running_jobs.append(load_results['jobReference']['jobId'])
    bq_utils.wait_on_jobs(running_jobs)
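Example 24's loader might be exercised like this; the hpo id is a placeholder, and omitting it falls back to FAKE_HPO_ID as in the code above.

populate_achilles(hpo_id='fake_hpo', include_heel=False)  # skip achilles_heel_results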
Example 25
    def test_query_result(self):
        with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)
        result = bq_utils.load_cdm_csv(FAKE_HPO_ID, PERSON)

        load_job_id = result['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
        self.assertEqual(len(incomplete_jobs), 0, 'loading table {} timed out'.format(PERSON))

        table_id = bq_utils.get_table_id(FAKE_HPO_ID, PERSON)
        q = 'SELECT person_id FROM %s' % table_id
        result = bq_utils.query(q)
        self.assertEqual(5, int(result['totalRows']))
Example 26
    def test_load_cdm_csv(self):
        with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)
        result = bq_utils.load_cdm_csv(FAKE_HPO_ID, PERSON)
        self.assertEqual(result['status']['state'], 'RUNNING')

        load_job_id = result['jobReference']['jobId']
        table_id = result['configuration']['load']['destinationTable']['tableId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
        self.assertEqual(len(incomplete_jobs), 0, 'loading table {} timed out'.format(table_id))
        table_info = bq_utils.get_table_info(table_id)
        num_rows = table_info.get('numRows')
        self.assertEqual(num_rows, '5')
Example 27
    def test_merge_with_good_data(self):
        running_jobs = []
        with open(NYC_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket('nyc'),
                                    'person.csv', fp)
        result = bq_utils.load_cdm_csv('nyc', 'person')
        running_jobs.append(result['jobReference']['jobId'])

        with open(PITT_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket('pitt'),
                                    'person.csv', fp)
        result = bq_utils.load_cdm_csv('pitt', 'person')
        running_jobs.append(result['jobReference']['jobId'])

        nyc_person_ids = [
            int(row['person_id'])
            for row in resources._csv_to_list(NYC_FIVE_PERSONS_PERSON_CSV)
        ]
        pitt_person_ids = [
            int(row['person_id'])
            for row in resources._csv_to_list(PITT_FIVE_PERSONS_PERSON_CSV)
        ]
        expected_result = nyc_person_ids + pitt_person_ids
        expected_result.sort()

        incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
        self.assertEqual(
            len(incomplete_jobs), 0,
            'loading tables {},{} timed out'.format('nyc_person',
                                                    'pitt_person'))

        dataset_id = bq_utils.get_dataset_id()
        table_ids = ['nyc_person', 'pitt_person']
        merged_table_id = 'merged_nyc_pitt'
        success_flag, error = bq_utils.merge_tables(dataset_id, table_ids,
                                                    dataset_id,
                                                    merged_table_id)

        self.assertTrue(success_flag)
        self.assertEqual(error, "")

        query_string = 'SELECT person_id FROM {dataset_id}.{table_id}'.format(
            dataset_id=dataset_id, table_id=merged_table_id)
        merged_query_job_result = bq_utils.query(query_string)

        self.assertIsNone(merged_query_job_result.get('errors', None))
        actual_result = [
            int(row['f'][0]['v']) for row in merged_query_job_result['rows']
        ]
        actual_result.sort()
        self.assertListEqual(expected_result, actual_result)
Example 28
 def test_query_result(self):
     sc_bucket = self.client.get_bucket(self.hpo_bucket)
     bucket_blob = sc_bucket.blob('person.csv')
     with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
         bucket_blob.upload_from_file(fp)
     result = bq_utils.load_cdm_csv(FAKE_HPO_ID, common.PERSON)
     load_job_id = result['jobReference']['jobId']
     incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
     self.assertEqual(len(incomplete_jobs), 0,
                      'loading table {} timed out'.format(common.PERSON))
     table_id = bq_utils.get_table_id(FAKE_HPO_ID, common.PERSON)
     q = 'SELECT person_id FROM %s' % table_id
     result = bq_utils.query(q)
     self.assertEqual(5, int(result['totalRows']))
Example 29
def query(q, dst_table_id, write_disposition='WRITE_APPEND'):
    """
    Run query and block until job is done
    :param q: SQL statement
    :param dst_table_id: save results in a table with the specified id
    :param write_disposition: WRITE_TRUNCATE, WRITE_EMPTY, or WRITE_APPEND (default, to preserve schema)
    """
    dst_dataset_id = bq_utils.get_ehr_rdr_dataset_id()
    query_job_result = bq_utils.query(q, destination_table_id=dst_table_id, write_disposition=write_disposition,
                                      destination_dataset_id=dst_dataset_id)
    query_job_id = query_job_result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
    if len(incomplete_jobs) > 0:
        raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
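A sketch of calling the blocking wrapper above, assuming the EHR+RDR dataset is configured; the SQL and table name are placeholders.

query('SELECT person_id FROM unioned_ehr_person',  # placeholder SQL
      dst_table_id='person_snapshot',              # placeholder destination table
      write_disposition='WRITE_TRUNCATE')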
Example 30
 def load_dataset_from_files(dataset_id, path, mappings=False):
     bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
     test_util.empty_bucket(bucket)
     job_ids = []
     for table in resources.CDM_TABLES:
         job_ids.append(CombineEhrRdrTest._upload_file_to_bucket(bucket, dataset_id, path, table))
         if mappings and table in DOMAIN_TABLES:
             mapping_table = '_mapping_{table}'.format(table=table)
             job_ids.append(CombineEhrRdrTest._upload_file_to_bucket(bucket, dataset_id, path, mapping_table))
     incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
     if len(incomplete_jobs) > 0:
         message = "Job id(s) %s failed to complete" % incomplete_jobs
         raise RuntimeError(message)
     test_util.empty_bucket(bucket)