Example #1
def merge(dataset_id, project_id):
    """merge hpo ehr data

    :dataset_id: source and target dataset
    :project_id: project in which everything happens
    :returns: list of tables generated successfully

    """
    logging.info('Starting merge')
    existing_tables = bq_utils.list_dataset_contents(dataset_id)
    hpos_to_merge = []
    hpos_with_visit = []
    for item in resources.hpo_csv():
        hpo_id = item['hpo_id']
        if hpo_id + '_person' in existing_tables:
            hpos_to_merge.append(hpo_id)
        if hpo_id + '_visit_occurrence' in existing_tables:
            hpos_with_visit.append(hpo_id)
    logging.info('HPOs to merge: %s' % hpos_to_merge)
    logging.info('HPOs with visit_occurrence: %s' % hpos_with_visit)
    create_mapping_table(hpos_with_visit, project_id, dataset_id)

    # before loading, drop and re-create all result tables to ensure they are set up properly
    for cdm_file_name in common.CDM_FILES:
        cdm_table_name = cdm_file_name.split('.')[0]
        result_table = result_table_for(cdm_table_name)
        bq_utils.create_standard_table(cdm_table_name,
                                       result_table,
                                       drop_existing=True)

    jobs_to_wait_on = []
    for table_name in common.CDM_TABLES:
        q = construct_query(table_name, hpos_to_merge, hpos_with_visit,
                            project_id, dataset_id)
        logging.info('Merging table: ' + table_name)
        result_table = result_table_for(table_name)
        query_result = query(q,
                             destination_table_id=result_table,
                             write_disposition='WRITE_TRUNCATE')
        query_job_id = query_result['jobReference']['jobId']
        jobs_to_wait_on.append(query_job_id)

    incomplete_jobs = bq_utils.wait_on_jobs(jobs_to_wait_on)
    if len(incomplete_jobs) == 0:
        tables_created = []
        for job_id in jobs_to_wait_on:
            job_details = bq_utils.get_job_details(job_id)
            status = job_details['status']
            table = job_details['configuration']['query']['destinationTable'][
                'tableId']
            if 'errors' in status:
                logging.error('Job ID %s errors: %s' %
                              (job_id, status['errors']))
            else:
                tables_created.append(table)
        return tables_created
    else:
        message = "Merge failed because job id(s) %s did not complete." % incomplete_jobs
        logging.error(message)
        raise RuntimeError(message)
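
A minimal driver sketch showing how merge might be invoked; the dataset and project values and the environment-variable names are placeholders, not part of the original module:

# Hypothetical driver: the dataset/project names and the environment variables
# used to look them up are assumptions for illustration only.
import os

if __name__ == '__main__':
    dataset_id = os.environ.get('EHR_DATASET_ID', 'ehr_dataset')
    project_id = os.environ.get('APPLICATION_ID', 'my-curation-project')
    merged_tables = merge(dataset_id, project_id)
    for table in merged_tables:
        print('merged table created: %s' % table)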
Example #2
 def test_load_ehr_observation(self):
     hpo_id = 'pitt'
     dataset_id = self.dataset_id
     table_id = bq_utils.get_table_id(hpo_id, table_name='observation')
     q = 'SELECT observation_id FROM {dataset_id}.{table_id} ORDER BY observation_id'.format(
         dataset_id=dataset_id, table_id=table_id)
     expected_observation_ids = [
         int(row['observation_id'])
         for row in resources.csv_to_list(PITT_FIVE_PERSONS_OBSERVATION_CSV)
     ]
     sc_bucket = self.client.get_bucket(gcs_utils.get_hpo_bucket(hpo_id))
     bucket_blob = sc_bucket.blob('observation.csv')
     with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
         bucket_blob.upload_from_file(fp)
     result = bq_utils.load_cdm_csv(hpo_id, 'observation')
     job_id = result['jobReference']['jobId']
     incomplete_jobs = bq_utils.wait_on_jobs([job_id])
     self.assertEqual(len(incomplete_jobs), 0,
                      'pitt_observation load job did not complete')
     load_job_result = bq_utils.get_job_details(job_id)
     load_job_result_status = load_job_result['status']
     load_job_errors = load_job_result_status.get('errors')
     self.assertIsNone(load_job_errors,
                       msg='pitt_observation load job failed: ' +
                       str(load_job_errors))
     query_results_response = bq_utils.query(q)
     query_job_errors = query_results_response.get('errors')
     self.assertIsNone(query_job_errors)
     actual_result = [
         int(row['f'][0]['v']) for row in query_results_response['rows']
     ]
     self.assertCountEqual(actual_result, expected_observation_ids)
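
The row['f'][0]['v'] access above follows the BigQuery REST API response layout, where each entry under rows is a dict holding its cells in an 'f' list and each cell's value under 'v'. A small helper sketch for pulling the first column out of such a response, assuming that same layout:

def first_column_as_ints(query_response):
    """Extract the first column of a BigQuery query response as ints.

    Assumes the REST response shape used above: rows under 'rows', each row
    a dict whose cells sit in 'f' with values under 'v'.
    """
    return [int(row['f'][0]['v']) for row in query_response.get('rows', [])]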
Example #3
def create_mapping_table(hpos_with_visit, project_id, dataset_id):
    """ creates the visit mapping table

    :hpos_with_visit: hpos that should be including the visit occurrence table
    :project_id: project with the dataset
    :dataset_id: dataset with the tables
    :returns: string if visit table failed; otherwise none

    """
    # build one visit_id mapping sub-query per HPO that has a visit_occurrence table
    visit_hpo_queries = []
    for hpo in hpos_with_visit:
        visit_hpo_queries.append(VISIT_ID_HPO_BLOCK % locals())
    union_all_blocks = '\n UNION ALL'.join(visit_hpo_queries)
    visit_mapping_query = VIST_ID_MAPPING_QUERY_SKELETON % locals()
    logging.info('Loading ' + VISIT_ID_MAPPING_TABLE)
    query_result = query(visit_mapping_query,
                         destination_table_id=VISIT_ID_MAPPING_TABLE,
                         write_disposition='WRITE_TRUNCATE')
    visit_mapping_query_job_id = query_result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([visit_mapping_query_job_id])
    if len(incomplete_jobs) == 0:
        query_result = bq_utils.get_job_details(visit_mapping_query_job_id)
        if 'errors' in query_result['status']:
            errors = query_result['status']['errors']
            message = 'Failed to load %s due to the following error(s): %s' % (
                VISIT_ID_MAPPING_TABLE, errors)
            logging.error(message)
            raise RuntimeError(message)
    else:
        message = 'Failed to load %s. Job with ID %s never completed.' % (
            VISIT_ID_MAPPING_TABLE, visit_mapping_query_job_id)
        logging.error(message)
        raise RuntimeError(message)
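
The mapping query above is stitched together from one sub-query per HPO using string templates and locals(). A simplified, self-contained sketch of the same assembly idea; the inlined SQL templates here are stand-ins, not the module's real VISIT_ID_HPO_BLOCK and VIST_ID_MAPPING_QUERY_SKELETON constants:

# Hypothetical stand-ins for the module-level templates; the real SQL differs.
EXAMPLE_HPO_BLOCK = '\nSELECT visit_occurrence_id, "%(hpo)s" AS hpo_id FROM %(hpo)s_visit_occurrence'
EXAMPLE_SKELETON = 'SELECT ROW_NUMBER() OVER() AS global_visit_id, * FROM (%(union_all_blocks)s)'


def build_visit_mapping_sql(hpos_with_visit):
    """Assemble a UNION ALL mapping query from per-HPO blocks (illustrative only)."""
    blocks = [EXAMPLE_HPO_BLOCK % {'hpo': hpo} for hpo in hpos_with_visit]
    union_all_blocks = '\n UNION ALL'.join(blocks)
    return EXAMPLE_SKELETON % {'union_all_blocks': union_all_blocks}


print(build_visit_mapping_sql(['pitt', 'nyc']))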
Example #4
 def test_load_ehr_observation(self):
     observation_query = "SELECT observation_id FROM {}.{} ORDER BY observation_id"
     hpo_id = 'pitt'
     expected_observation_ids = [
         int(row['observation_id']) for row in resources._csv_to_list(
             PITT_FIVE_PERSONS_OBSERVATION_CSV)
     ]
     with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
         gcs_utils.upload_object(gcs_utils.get_hpo_bucket(hpo_id),
                                 'observation.csv', fp)
     result = bq_utils.load_cdm_csv(hpo_id, 'observation')
     job_id = result['jobReference']['jobId']
     incomplete_jobs = bq_utils.wait_on_jobs([job_id])
     self.assertEqual(len(incomplete_jobs), 0,
                      'pitt_observation load job did not complete')
     load_job_result = bq_utils.get_job_details(job_id)
     load_job_result_status = load_job_result['status']
     load_job_errors = load_job_result_status.get('errors')
     self.assertIsNone(load_job_errors,
                       msg='pitt_observation load job failed: ' +
                       str(load_job_errors))
     query_string = observation_query.format(bq_utils.get_dataset_id(),
                                             'pitt_observation')
     query_results_response = bq_utils.query_table(query_string)
     query_job_errors = query_results_response.get('errors')
     self.assertIsNone(query_job_errors)
     actual_result = [
         int(row['f'][0]['v']) for row in query_results_response['rows']
     ]
     self.assertListEqual(actual_result, expected_observation_ids)
Example #5
def perform_validation_on_file(file_name, found_file_names, hpo_id,
                               folder_prefix, bucket):
    """
    Attempts to load a csv file into BigQuery

    :param file_name: name of the file to validate
    :param found_file_names: files found in the submission folder
    :param hpo_id: identifies the hpo site
    :param folder_prefix: directory containing the submission
    :param bucket: bucket containing the submission
    :return: tuple (results, errors) where
     results is list of tuples (file_name, found, parsed, loaded)
     errors is list of tuples (file_name, message)
    """
    errors = []
    results = []
    logging.info(f"Validating file '{file_name}'")
    found = parsed = loaded = 0
    table_name = file_name.split('.')[0]

    if file_name in found_file_names:
        found = 1
        load_results = bq_utils.load_from_csv(hpo_id, table_name,
                                              folder_prefix)
        load_job_id = load_results['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])

        if not incomplete_jobs:
            job_resource = bq_utils.get_job_details(job_id=load_job_id)
            job_status = job_resource['status']
            if 'errorResult' in job_status:
                # These are issues (which we report back) as opposed to internal errors
                issues = [item['message'] for item in job_status['errors']]
                errors.append((file_name, ' || '.join(issues)))
                logging.info(
                    f"Issues found in gs://{bucket}/{folder_prefix}/{file_name}"
                )
                for issue in issues:
                    logging.info(issue)
            else:
                # Processed ok
                parsed = loaded = 1
        else:
            # Incomplete jobs are internal unrecoverable errors.
            # Aborting the process allows for this submission to be validated when system recovers.
            message = (
                f"Loading hpo_id '{hpo_id}' table '{table_name}' failed because "
                f"job id '{load_job_id}' did not complete.\n")
            message += f"Aborting processing 'gs://{bucket}/{folder_prefix}'."
            logging.error(message)
            raise InternalValidationError(message)

    if file_name in common.SUBMISSION_FILES:
        results.append((file_name, found, parsed, loaded))

    return results, errors
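
A hedged sketch of how a caller might drive this helper over one submission folder; the bucket name, folder prefix, and file lists below are placeholders:

# Illustrative driver only: the bucket, folder, and file names are made up.
all_results, all_errors = [], []
found_file_names = ['person.csv', 'observation.csv']
for file_name in ['person.csv', 'observation.csv', 'visit_occurrence.csv']:
    results, errors = perform_validation_on_file(file_name,
                                                 found_file_names,
                                                 hpo_id='pitt',
                                                 folder_prefix='2019-01-01/',
                                                 bucket='example-hpo-bucket')
    all_results.extend(results)
    all_errors.extend(errors)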
Example #6
def perform_validation_on_file(file_name, found_file_names, hpo_id,
                               folder_prefix, bucket):
    errors = []
    results = []
    logging.info('Validating file `{file_name}`'.format(file_name=file_name))
    found = parsed = loaded = 0
    table_name = file_name.split('.')[0]

    if file_name in found_file_names:
        found = 1
        load_results = bq_utils.load_from_csv(hpo_id, table_name,
                                              folder_prefix)
        load_job_id = load_results['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])

        if len(incomplete_jobs) == 0:
            job_resource = bq_utils.get_job_details(job_id=load_job_id)
            job_status = job_resource['status']
            if 'errorResult' in job_status:
                # These are issues (which we report back) as opposed to internal errors
                issues = [item['message'] for item in job_status['errors']]
                errors.append((file_name, ' || '.join(issues)))
                logging.info(
                    'Issues found in gs://{bucket}/{folder_prefix}/{file_name}'
                    .format(bucket=bucket,
                            folder_prefix=folder_prefix,
                            file_name=file_name))
                for issue in issues:
                    logging.info(issue)
            else:
                # Processed ok
                parsed = loaded = 1
        else:
            # Incomplete jobs are internal unrecoverable errors.
            # Aborting the process allows for this submission to be validated when system recovers.
            message_fmt = 'Loading hpo_id `%s` table `%s` failed because job id `%s` did not complete.'
            message = message_fmt % (hpo_id, table_name, load_job_id)
            message += ' Aborting processing `gs://%s/%s`.' % (bucket,
                                                               folder_prefix)
            logging.error(message)
            raise InternalValidationError(message)

    if file_name in common.REQUIRED_FILES or found:
        results.append((file_name, found, parsed, loaded))

    return results, errors
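
The (file_name, found, parsed, loaded) tuples returned here are what later get written out as the results report. A minimal sketch of serializing them with the standard csv module; the header row is an assumption for illustration, not the validation module's actual column names:

import csv
import io


def results_to_csv(results):
    """Render (file_name, found, parsed, loaded) tuples as CSV text (illustrative)."""
    output = io.StringIO()
    writer = csv.writer(output)
    writer.writerow(['file_name', 'found', 'parsed', 'loaded'])  # assumed header
    writer.writerows(results)
    return output.getvalue()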
Example #7
def run_validation(hpo_id, force_run=False):
    """
    runs validation for a single hpo_id

    :param hpo_id: which hpo_id to run for
    :param force_run: if True, process the latest submission whether or not it has already been processed before
    :raises
    BucketDoesNotExistError:
      Raised when a configured bucket does not exist
    InternalValidationError:
      Raised when an internal error is encountered during validation
    """
    logging.info('Validating hpo_id %s' % hpo_id)
    bucket = gcs_utils.get_hpo_bucket(hpo_id)
    bucket_items = list_bucket(bucket)
    to_process_folder_list = _get_to_process_list(bucket, bucket_items,
                                                  force_run)

    for folder_prefix in to_process_folder_list:
        logging.info('Processing gs://%s/%s' % (bucket, folder_prefix))
        # separate cdm from the unknown (unexpected) files
        found_cdm_files = []
        unknown_files = []
        folder_items = [
            item['name'].split('/')[1] for item in bucket_items
            if item['name'].startswith(folder_prefix)
        ]
        for item in folder_items:
            if _is_cdm_file(item):
                found_cdm_files.append(item)
            else:
                is_known_file = item in common.IGNORE_LIST or is_pii(item)
                if not is_known_file:
                    unknown_files.append(item)

        errors = []
        results = []
        found_cdm_file_names = found_cdm_files

        # Create all tables first to simplify downstream processes
        # (e.g. ehr_union doesn't have to check if tables exist)
        for cdm_file_name in common.CDM_FILES:
            cdm_table_name = cdm_file_name.split('.')[0]
            table_id = bq_utils.get_table_id(hpo_id, cdm_table_name)
            bq_utils.create_standard_table(cdm_table_name,
                                           table_id,
                                           drop_existing=True)

        for cdm_file_name in common.CDM_FILES:
            logging.info('Validating file `{file_name}`'.format(
                file_name=cdm_file_name))
            found = parsed = loaded = 0
            cdm_table_name = cdm_file_name.split('.')[0]

            if cdm_file_name in found_cdm_file_names:
                found = 1
                load_results = bq_utils.load_cdm_csv(hpo_id, cdm_table_name,
                                                     folder_prefix)
                load_job_id = load_results['jobReference']['jobId']
                incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])

                if len(incomplete_jobs) == 0:
                    job_resource = bq_utils.get_job_details(job_id=load_job_id)
                    job_status = job_resource['status']
                    if 'errorResult' in job_status:
                        # These are issues (which we report back) as opposed to internal errors
                        issues = [
                            item['message'] for item in job_status['errors']
                        ]
                        errors.append((cdm_file_name, ' || '.join(issues)))
                        logging.info(
                            'Issues found in gs://{bucket}/{folder_prefix}/{cdm_file_name}'
                            .format(bucket=bucket,
                                    folder_prefix=folder_prefix,
                                    cdm_file_name=cdm_file_name))
                        for issue in issues:
                            logging.info(issue)
                    else:
                        # Processed ok
                        parsed = loaded = 1
                else:
                    # Incomplete jobs are internal unrecoverable errors.
                    # Aborting the process allows for this submission to be validated when system recovers.
                    message_fmt = 'Loading hpo_id `%s` table `%s` failed because job id `%s` did not complete.'
                    message = message_fmt % (hpo_id, cdm_table_name,
                                             load_job_id)
                    message += ' Aborting processing `gs://%s/%s`.' % (
                        bucket, folder_prefix)
                    logging.error(message)
                    raise InternalValidationError(message)

            if cdm_file_name in common.REQUIRED_FILES or found:
                results.append((cdm_file_name, found, parsed, loaded))

        # (filename, message) for each unknown file
        warnings = [(unknown_file, UNKNOWN_FILE)
                    for unknown_file in unknown_files]

        # output to GCS
        _save_result_in_gcs(bucket, folder_prefix + RESULT_CSV, results)
        _save_warnings_in_gcs(bucket, folder_prefix + WARNINGS_CSV, warnings)
        _save_errors_in_gcs(bucket, folder_prefix + ERRORS_CSV, errors)

        if all_required_files_loaded(hpo_id, folder_prefix=folder_prefix):
            run_achilles(hpo_id)
            run_export(hpo_id=hpo_id, folder_prefix=folder_prefix)

        logging.info('Uploading achilles index files to `gs://%s/%s`.' %
                     (bucket, folder_prefix))
        _upload_achilles_files(hpo_id, folder_prefix)

        now_datetime_string = datetime.datetime.now().strftime(
            '%Y-%m-%dT%H:%M:%S')
        logging.info(
            'Processing complete. Saving timestamp %s to `gs://%s/%s`.' %
            (now_datetime_string, bucket,
             folder_prefix + common.PROCESSED_TXT))
        _write_string_to_file(bucket, folder_prefix + common.PROCESSED_TXT,
                              now_datetime_string)
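
A minimal sketch of sweeping validation across every configured site, assuming resources.hpo_csv() yields dicts with an 'hpo_id' key as in Example #1; the wrapper function and its error handling are hypothetical:

def run_validation_for_all_hpos(force_run=False):
    """Run validation for each HPO listed in the site CSV (illustrative only)."""
    for item in resources.hpo_csv():
        hpo_id = item['hpo_id']
        try:
            run_validation(hpo_id, force_run=force_run)
        except BucketDoesNotExistError as e:
            # Skip sites whose bucket is not configured; assumption for this sketch.
            logging.warning('Skipping %s: %s' % (hpo_id, e))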