Example 1
def test_hpo_query(self):
    with patch('validation.metrics.completeness.get_cols') as mock_get_cols:
        nyc_cu_cols = self.get_nyc_cu_cols()
        mock_get_cols.return_value = nyc_cu_cols
        query = completeness.get_hpo_completeness_query(self.hpo_id)
        # For now, check that the expected column expressions appear in the query
        # TODO find a more robust way to test the output
        for nyc_cu_col in nyc_cu_cols:
            column_exp = "'%s' AS column_name" % nyc_cu_col[consts.COLUMN_NAME]
            self.assertIn(column_exp, query)
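For context, here is a minimal, self-contained sketch of a query builder that this test would pass against: one SELECT per column, unioned together, so the query contains a "'<column>' AS column_name" expression for every column returned by get_cols. The helper name, parameters, and SQL shape are assumptions for illustration, not the project's implementation.

# Hypothetical sketch only -- not the real get_hpo_completeness_query.
def sketch_completeness_query(hpo_id, cols):
    selects = [
        "SELECT '%s' AS column_name, 0 AS num_complete FROM `%s.%s`" %
        (col['column_name'], hpo_id, col['table_name'])
        for col in cols
    ]
    return ' UNION ALL '.join(selects)

# Usage mirroring the assertion in the test (column dicts are made up):
query = sketch_completeness_query('nyc_cu', [{'column_name': 'person_id',
                                              'table_name': 'person'}])
assert "'person_id' AS column_name" in query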
Example 2
def generate_metrics(hpo_id, bucket, folder_prefix, summary):
    """
    Generate metrics regarding a submission

    :param hpo_id: identifies the HPO site
    :param bucket: name of the bucket with the submission
    :param folder_prefix: folder containing the submission
    :param summary: file summary from validation
     {results: [(file_name, found, parsed, loaded)],
      errors: [(file_name, message)],
      warnings: [(file_name, message)]}
    :return: dict of report data (the summary augmented with computed metrics)
    """
    report_data = summary.copy()
    processed_datetime_str = datetime.datetime.now().strftime(
        '%Y-%m-%dT%H:%M:%S')
    error_occurred = False

    # TODO separate query generation, query execution, writing to GCS
    gcs_path = 'gs://%s/%s' % (bucket, folder_prefix)
    report_data[report_consts.HPO_NAME_REPORT_KEY] = get_hpo_name(hpo_id)
    report_data[report_consts.FOLDER_REPORT_KEY] = folder_prefix
    report_data[report_consts.TIMESTAMP_REPORT_KEY] = processed_datetime_str
    results = report_data['results']
    try:
        # TODO modify achilles to run successfully when tables are empty
        # achilles queries will raise exceptions (e.g. division by zero) if files not present
        if all_required_files_loaded(results):
            logging.info('Running achilles on %s.', folder_prefix)
            run_achilles(hpo_id)
            run_export(datasource_id=hpo_id, folder_prefix=folder_prefix)
            logging.info('Uploading achilles index files to `%s`.', gcs_path)
            _upload_achilles_files(hpo_id, folder_prefix)
            heel_error_query = get_heel_error_query(hpo_id)
            report_data[report_consts.HEEL_ERRORS_REPORT_KEY] = query_rows(
                heel_error_query)
        else:
            report_data[
                report_consts.
                SUBMISSION_ERROR_REPORT_KEY] = 'Required files are missing'
            logging.info(
                'Required files are missing in %s. Skipping achilles.',
                gcs_path)

        # non-unique key metrics
        logging.info('Getting non-unique key stats for %s...', hpo_id)
        nonunique_metrics_query = get_duplicate_counts_query(hpo_id)
        report_data[
            report_consts.NONUNIQUE_KEY_METRICS_REPORT_KEY] = query_rows(
                nonunique_metrics_query)

        # drug class metrics
        logging.info('Getting drug class for %s...', hpo_id)
        drug_class_metrics_query = get_drug_class_counts_query(hpo_id)
        report_data[report_consts.DRUG_CLASS_METRICS_REPORT_KEY] = query_rows(
            drug_class_metrics_query)

        # missing PII
        logging.info('Getting missing record stats for %s...', hpo_id)
        missing_pii_query = get_hpo_missing_pii_query(hpo_id)
        missing_pii_results = query_rows(missing_pii_query)
        report_data[report_consts.MISSING_PII_KEY] = missing_pii_results

        # completeness
        logging.info('Getting completeness stats for %s...', hpo_id)
        completeness_query = completeness.get_hpo_completeness_query(hpo_id)
        report_data[report_consts.COMPLETENESS_REPORT_KEY] = query_rows(
            completeness_query)

        # lab concept metrics
        logging.info('Getting lab concepts for %s...', hpo_id)
        lab_concept_metrics_query = required_labs.get_lab_concept_summary_query(
            hpo_id)
        report_data[report_consts.LAB_CONCEPT_METRICS_REPORT_KEY] = query_rows(
            lab_concept_metrics_query)

        logging.info(
            'Processing complete. Saving timestamp %s to `gs://%s/%s`.',
            processed_datetime_str, bucket,
            folder_prefix + common.PROCESSED_TXT)
        _write_string_to_file(bucket, folder_prefix + common.PROCESSED_TXT,
                              processed_datetime_str)

    except HttpError as err:
        # a cloud error occurred - log details for troubleshooting
        logging.exception(
            'Failed to generate full report due to the following cloud error:\n\n%s',
            err.content)
        error_occurred = True

        # re-raise error
        raise err
    finally:
        # report all results collected (attempt even if cloud error occurred)
        report_data[report_consts.ERROR_OCCURRED_REPORT_KEY] = error_occurred
        results_html = hpo_report.render(report_data)
        _write_string_to_file(bucket, folder_prefix + common.RESULTS_HTML,
                              results_html)
    return report_data
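A usage sketch of the function above, showing the summary shape described in the docstring. The HPO id, bucket, folder, and file names are made up for illustration.

# Hypothetical call site -- all values are illustrative only.
summary = {
    'results': [('person.csv', True, True, True),
                ('measurement.csv', True, True, False)],
    'errors': [('measurement.csv', 'load failed')],
    'warnings': [],
}
report_data = generate_metrics(hpo_id='nyc_cu',
                               bucket='example-bucket',
                               folder_prefix='2020-01-01-v1/',
                               summary=summary)
print(report_data[report_consts.ERROR_OCCURRED_REPORT_KEY])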
Example 3
def generate_metrics(hpo_id, bucket, folder_prefix, summary):
    """
    Generate metrics regarding a submission

    :param hpo_id: identifies the HPO site
    :param bucket: name of the bucket with the submission
    :param folder_prefix: folder containing the submission
    :param summary: file summary from validation
     {results: [(file_name, found, parsed, loaded)],
      errors: [(file_name, message)],
      warnings: [(file_name, message)]}
    :return: dict of report data (the summary augmented with computed metrics)
    """
    report_data = summary.copy()
    error_occurred = False

    # TODO separate query generation, query execution, writing to GCS
    gcs_path = f"gs://{bucket}/{folder_prefix}"
    report_data[report_consts.HPO_NAME_REPORT_KEY] = get_hpo_name(hpo_id)
    report_data[report_consts.FOLDER_REPORT_KEY] = folder_prefix
    results = report_data['results']
    try:
        # TODO modify achilles to run successfully when tables are empty
        # achilles queries will raise exceptions (e.g. division by zero) if files not present
        if all_required_files_loaded(results):
            logging.info(f"Running achilles on {folder_prefix}.")
            run_achilles(hpo_id)
            run_export(datasource_id=hpo_id, folder_prefix=folder_prefix)
            logging.info(f"Uploading achilles index files to '{gcs_path}'.")
            _upload_achilles_files(hpo_id, folder_prefix)
            heel_error_query = get_heel_error_query(hpo_id)
            report_data[report_consts.HEEL_ERRORS_REPORT_KEY] = query_rows(
                heel_error_query)
        else:
            report_data[
                report_consts.
                SUBMISSION_ERROR_REPORT_KEY] = "Required files are missing"
            logging.info(
                f"Required files are missing in {gcs_path}. Skipping achilles.")

        # non-unique key metrics
        logging.info(f"Getting non-unique key stats for {hpo_id}")
        nonunique_metrics_query = get_duplicate_counts_query(hpo_id)
        report_data[
            report_consts.NONUNIQUE_KEY_METRICS_REPORT_KEY] = query_rows(
                nonunique_metrics_query)

        # drug class metrics
        logging.info(f"Getting drug class for {hpo_id}")
        drug_class_metrics_query = get_drug_class_counts_query(hpo_id)
        report_data[report_consts.DRUG_CLASS_METRICS_REPORT_KEY] = query_rows(
            drug_class_metrics_query)

        # missing PII
        logging.info(f"Getting missing record stats for {hpo_id}")
        missing_pii_query = get_hpo_missing_pii_query(hpo_id)
        missing_pii_results = query_rows(missing_pii_query)
        report_data[report_consts.MISSING_PII_KEY] = missing_pii_results

        # completeness
        logging.info(f"Getting completeness stats for {hpo_id}")
        completeness_query = completeness.get_hpo_completeness_query(hpo_id)
        report_data[report_consts.COMPLETENESS_REPORT_KEY] = query_rows(
            completeness_query)

        # lab concept metrics
        logging.info(f"Getting lab concepts for {hpo_id}")
        lab_concept_metrics_query = required_labs.get_lab_concept_summary_query(
            hpo_id)
        report_data[report_consts.LAB_CONCEPT_METRICS_REPORT_KEY] = query_rows(
            lab_concept_metrics_query)

        logging.info(f"Processing complete.")
    except HttpError as err:
        # a cloud error occurred - log details for troubleshooting
        logging.exception(
            f"Failed to generate full report due to the following cloud error:\n\n{err.content}"
        )
        error_occurred = True
    finally:
        # report all results collected (attempt even if cloud error occurred)
        report_data[report_consts.ERROR_OCCURRED_REPORT_KEY] = error_occurred
    return report_data
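Both versions rely on query_rows to execute each generated query. Below is a minimal sketch of what such a helper could look like using the google-cloud-bigquery client; this is an assumption made for illustration, and the project's actual helper may use a different client or return shape.

from google.cloud import bigquery


def query_rows(query):
    # Hypothetical sketch of the query_rows helper used above: run the query
    # and return its rows as plain dicts. The real implementation may differ.
    client = bigquery.Client()
    return [dict(row.items()) for row in client.query(query).result()]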