Example #1
def load_measurement_concept_sets_table(project_id, dataset_id):
    """
    Loads the required lab table from resource_files/measurement_concept_sets.csv
    into project_id.dataset_id

    :param project_id: Project where the dataset resides
    :param dataset_id: Dataset where the required lab table needs to be created
    :return: None
    """

    check_and_copy_tables(project_id, dataset_id)

    try:
        LOGGER.info(
            'Upload {measurement_concept_sets_table}.csv to {dataset_id} in {project_id}'
            .format(
                measurement_concept_sets_table=MEASUREMENT_CONCEPT_SETS_TABLE,
                dataset_id=dataset_id,
                project_id=project_id))

        bq_utils.load_table_from_csv(project_id, dataset_id,
                                     MEASUREMENT_CONCEPT_SETS_TABLE)

    except (oauth2client.client.HttpAccessTokenRefreshError,
            googleapiclient.errors.HttpError):

        LOGGER.exception(
            f"FAILED:  CSV file could not be uploaded:\n{app_identity}")
Example #2
def load_operational_pii_fields_lookup_table(project_id, sandbox_dataset_id):
    """
        Loads the operational pii fields from resources/operational_pii_fields.csv
        into project_id.sandbox_dataset_id.operational_pii_fields in BQ

        :param project_id: Project where the sandbox dataset resides
        :param sandbox_dataset_id: Dataset where the smoking lookup table needs to be created
        :return: None
    """
    bq_utils.load_table_from_csv(project_id,
                                 sandbox_dataset_id,
                                 OPERATIONAL_PII_FIELDS_TABLE,
                                 csv_path=None,
                                 fields=None)
Example #3
def load_smoking_lookup_table(project_id, sandbox_dataset_id):
    """
    Loads the smoking lookup table from resource_files/smoking_lookup.csv
    into project_id.sandbox_dataset_id.smoking_lookup in BQ

    :param project_id: Project where the sandbox dataset resides
    :param sandbox_dataset_id: Dataset where the smoking lookup table needs to be created
    :return: None
    """
    bq_utils.load_table_from_csv(project_id,
                                 sandbox_dataset_id,
                                 SMOKING_LOOKUP_TABLE,
                                 csv_path=None,
                                 fields=SMOKING_LOOKUP_FIELDS)
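The calls in Examples #1-#3 suggest the helper's shape. Below is an inferred signature only, a sketch reconstructed from these call sites rather than the actual bq_utils source; the fallback behavior for None arguments is an assumption.

# Inferred signature -- a sketch based on the call sites above, not the
# real bq_utils implementation. The None-argument fallbacks are assumptions.
def load_table_from_csv(project_id, dataset_id, table_name,
                        csv_path=None, fields=None):
    """Load a CSV into project_id.dataset_id.table_name.

    csv_path=None presumably resolves to the packaged
    resource_files/<table_name>.csv; fields=None presumably loads the
    packaged schema for table_name. Returns the BigQuery load job
    resource (see the jobReference usage in Example #4).
    """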
Example #4
    def _load_data(self):

        # Load measurement_concept_sets
        required_labs.load_measurement_concept_sets_table(
            project_id=self.project_id, dataset_id=self.dataset_id)
        # Load measurement_concept_sets_descendants
        required_labs.load_measurement_concept_sets_descendants_table(
            project_id=self.project_id, dataset_id=self.dataset_id)

        # Create concept and concept_ancestor empty tables if not exist
        if not bq_utils.table_exists(common.CONCEPT, self.dataset_id):
            bq_utils.create_standard_table(common.CONCEPT, common.CONCEPT)
        if not bq_utils.table_exists(common.CONCEPT_ANCESTOR, self.dataset_id):
            bq_utils.create_standard_table(common.CONCEPT_ANCESTOR,
                                           common.CONCEPT_ANCESTOR)

        # Load measurement.csv into the dataset in advance for the other integration tests
        ehr_measurement_result = bq_utils.load_table_from_csv(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            table_name=bq_utils.get_table_id(FAKE_HPO_ID, common.MEASUREMENT),
            csv_path=test_util.FIVE_PERSONS_MEASUREMENT_CSV,
            fields=resources.fields_for(common.MEASUREMENT))
        bq_utils.wait_on_jobs(
            [ehr_measurement_result['jobReference']['jobId']])
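Note the final blocking step: the load helper appears to return the raw BigQuery job resource, so the test can wait for the load to finish before anything queries the table. A minimal sketch of that pattern, with the response shape following the standard BigQuery jobs API:

# The load response is assumed to follow the BigQuery jobs.insert shape,
# i.e. result = {'jobReference': {'projectId': ..., 'jobId': ...}, ...}.
job_id = ehr_measurement_result['jobReference']['jobId']
# wait_on_jobs is assumed to poll until completion and return the IDs of
# any jobs still running when it gave up.
incomplete_jobs = bq_utils.wait_on_jobs([job_id])
assert not incomplete_jobs, 'measurement CSV load did not finish'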
Example #5
def load_analyses(hpo_id):
    """
    Populate achilles lookup table
    :param hpo_id: hpo_id of the site to run achilles on
    :return: None
    """
    project_id = app_identity.get_application_id()
    dataset_id = bq_utils.get_dataset_id()
    if hpo_id is None:
        table_prefix = ""
    else:
        table_prefix = hpo_id + '_'
    table_name = table_prefix + ACHILLES_ANALYSIS
    csv_path = os.path.join(resources.resource_files_path,
                            ACHILLES_ANALYSIS + '.csv')
    schema = resources.fields_for(ACHILLES_ANALYSIS)
    bq_utils.load_table_from_csv(project_id, dataset_id, table_name, csv_path,
                                 schema)
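A hypothetical invocation, assuming ACHILLES_ANALYSIS resolves to the table name achilles_analysis:

# Hypothetical calls -- table names below assume
# ACHILLES_ANALYSIS == 'achilles_analysis'.
load_analyses('fake_hpo')  # loads <dataset>.fake_hpo_achilles_analysis
load_analyses(None)        # no site prefix: loads <dataset>.achilles_analysis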
Example #6
def get_update_questions_answers_not_mapped_to_omop(project_id, dataset_id,
                                                    sandbox_dataset_id):
    """

    This function gets the queries required to update the questions and answers that were unmapped to OMOP concepts

    :param project_id: Name of the project
    :param dataset_id: Name of the dataset where the queries should be run
    :param sandbox_dataset_id: Name of the sandbox dataset
    :return:
    """

    bq_utils.load_table_from_csv(project_id=project_id,
                                 dataset_id=sandbox_dataset_id,
                                 table_name=OLD_MAP_SHORT_CODES_TABLE,
                                 fields=OLD_MAP_SHORT_CODES_TABLE_FIELDS)

    queries_list = []

    # Update concept_ids to questions using OLD_MAP_SHORT_CODES_TABLE.
    query = dict()
    query[cdr_consts.QUERY] = UPDATE_QUESTIONS_MAP_QUERY.format(
        dataset=dataset_id,
        project=project_id,
        old_map=OLD_MAP_SHORT_CODES_TABLE,
        sandbox=sandbox_dataset_id)
    queries_list.append(query)

    # Update concept_ids to answers using OLD_MAP_SHORT_CODES_TABLE.
    query = dict()
    query[cdr_consts.QUERY] = UPDATE_ANSWERS_MAP_QUERY.format(
        dataset=dataset_id,
        project=project_id,
        old_map=OLD_MAP_SHORT_CODES_TABLE,
        sandbox=sandbox_dataset_id)
    queries_list.append(query)

    return queries_list
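Each returned dict carries one rendered statement under cdr_consts.QUERY, so a caller can run them in order. A minimal sketch, assuming bq_utils.query (used in Example #7) accepts these statements as-is:

# Sketch: run each cleaning query in order. Assumes bq_utils.query
# executes a single statement; error handling is omitted.
for query in get_update_questions_answers_not_mapped_to_omop(
        project_id, dataset_id, sandbox_dataset_id):
    bq_utils.query(query[cdr_consts.QUERY])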
Example #7
    def test_load_table_from_csv(self):
        table_id = 'test_csv_table'
        csv_file = 'load_csv_test_data.csv'
        csv_path = os.path.join(test_util.TEST_DATA_PATH, csv_file)
        with open(csv_path, 'r') as f:
            expected = list(csv.DictReader(f))
        bq_utils.load_table_from_csv(self.project_id, self.dataset_id,
                                     table_id, csv_path, self.TEST_FIELDS)
        q = """ SELECT *
                FROM `{project_id}.{dataset_id}.{table_id}`""".format(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            table_id=table_id)
        r = bq_utils.query(q)
        actual = bq_utils.response2rows(r)

        # Convert the epoch times to datetime with time zone
        for row in actual:
            row['timestamp_field'] = time.strftime(
                self.DT_FORMAT + ' UTC', time.gmtime(row['timestamp_field']))
        expected.sort(key=lambda row: row['integer_field'])
        actual.sort(key=lambda row: row['integer_field'])
        for i, _ in enumerate(expected):
            self.assertCountEqual(expected[i], actual[i])
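The epoch-to-string conversion works because response2rows presumably returns TIMESTAMP values as Unix epoch seconds, while the CSV holds formatted strings; rendering both sides in the same DT_FORMAT + ' UTC' shape lets them compare as plain strings. For example:

import time

# 1577836800 is 2020-01-01 00:00:00 UTC. DT_FORMAT is assumed to be
# '%Y-%m-%d %H:%M:%S' for this illustration.
print(time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime(1577836800)))
# -> 2020-01-01 00:00:00 UTC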
Example #8
    def test_html_report_five_person(self, mock_check_cron, mock_first_run,
                                     mock_rdr_date, mock_required_files_loaded):
        mock_required_files_loaded.return_value = False
        mock_first_run.return_value = False
        rdr_date = '2020-01-01'
        mock_rdr_date.return_value = rdr_date
        for cdm_file in test_util.FIVE_PERSONS_FILES:
            test_util.write_cloud_file(self.hpo_bucket,
                                       cdm_file,
                                       prefix=self.folder_prefix)
        # load person table in RDR
        bq_utils.load_table_from_csv(self.project_id, self.rdr_dataset_id,
                                     common.PERSON,
                                     test_util.FIVE_PERSONS_PERSON_CSV)

        # Load measurement_concept_sets
        required_labs.load_measurement_concept_sets_table(
            project_id=self.project_id, dataset_id=self.bigquery_dataset_id)
        # Load measurement_concept_sets_descendants
        required_labs.load_measurement_concept_sets_descendants_table(
            project_id=self.project_id, dataset_id=self.bigquery_dataset_id)

        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.VALIDATE_HPO_FILES_URL)
            actual_result = test_util.read_cloud_file(
                self.hpo_bucket, self.folder_prefix + common.RESULTS_HTML)

        # ensure emails are not sent
        bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
        folder_items = main.get_folder_items(bucket_items, self.folder_prefix)
        self.assertFalse(main.is_first_validation_run(folder_items))

        # parse html
        soup = bs(actual_result, features="lxml")
        missing_pii_html_table = soup.find('table', id='missing_pii')
        table_headers = missing_pii_html_table.find_all('th')
        self.assertEqual('Missing Participant Record Type',
                         table_headers[0].get_text())
        self.assertEqual('Count', table_headers[1].get_text())

        table_rows = missing_pii_html_table.find_next('tbody').find_all('tr')
        missing_record_types = [
            table_row.find('td').text for table_row in table_rows
        ]
        self.assertIn(main_consts.EHR_NO_PII, missing_record_types)
        self.assertIn(main_consts.PII_NO_EHR, missing_record_types)
        self.assertIn(main_consts.EHR_NO_RDR.format(date=rdr_date),
                      missing_record_types)
        self.assertIn(main_consts.EHR_NO_PARTICIPANT_MATCH,
                      missing_record_types)

        required_lab_html_table = soup.find('table', id='required-lab')
        table_headers = required_lab_html_table.find_all('th')
        self.assertEqual(3, len(table_headers))
        self.assertEqual('Ancestor Concept ID', table_headers[0].get_text())
        self.assertEqual('Ancestor Concept Name', table_headers[1].get_text())
        self.assertEqual('Found', table_headers[2].get_text())

        table_rows = required_lab_html_table.find_next('tbody').find_all('tr')
        table_rows_last_column = [
            table_row.find_all('td')[-1] for table_row in table_rows
        ]
        submitted_labs = [
            row for row in table_rows_last_column
            if 'result-1' in row.attrs['class']
        ]
        missing_labs = [
            row for row in table_rows_last_column
            if 'result-0' in row.attrs['class']
        ]
        self.assertTrue(len(table_rows) > 0)
        self.assertTrue(len(submitted_labs) > 0)
        self.assertTrue(len(missing_labs) > 0)
Example #9
def get_queries_health_insurance(project_id, dataset_id, sandbox_dataset_id,
                                 pid_file):
    """
    Queries to run for updating health insurance information

    :param project_id: project id associated with the dataset to run the queries on
    :param dataset_id: dataset id to run the queries on
    :param sandbox_dataset_id: dataset id of the sandbox
    :param pid_file: path to the file containing pids
    :return: list of query dicts
    """

    pids_list = extract_pids_from_file(pid_file)
    pids = ', '.join([str(pid) for pid in pids_list])

    bq_utils.load_table_from_csv(project_id,
                                 sandbox_dataset_id,
                                 INSURANCE_LOOKUP,
                                 csv_path=None,
                                 fields=INSURANCE_LOOKUP_FIELDS)

    queries = []

    sandbox_query = dict()
    sandbox_query[cdr_consts.QUERY] = SANDBOX_CREATE_QUERY.format(
        project_id=project_id,
        combined_dataset_id=dataset_id,
        sandbox_dataset_id=sandbox_dataset_id,
        new_insurance_rows=NEW_INSURANCE_ROWS,
        insurance_lookup=INSURANCE_LOOKUP,
        ORIGINAL_OBSERVATION_SOURCE_CONCEPT_ID=
        ORIGINAL_OBSERVATION_SOURCE_CONCEPT_ID,
        HCAU_OBSERVATION_SOURCE_CONCEPT_ID=HCAU_OBSERVATION_SOURCE_CONCEPT_ID,
        pids=pids)
    queries.append(sandbox_query)

    invalidate_query = dict()
    invalidate_query[cdr_consts.QUERY] = UPDATE_INVALID_QUERY.format(
        project_id=project_id,
        combined_dataset_id=dataset_id,
        ORIGINAL_OBSERVATION_SOURCE_CONCEPT_ID=
        ORIGINAL_OBSERVATION_SOURCE_CONCEPT_ID,
        pids=pids)
    queries.append(invalidate_query)

    delete_query = dict()
    delete_query[
        cdr_consts.QUERY] = DELETE_ORIGINAL_FOR_HCAU_PARTICIPANTS.format(
            project_id=project_id,
            combined_dataset_id=dataset_id,
            sandbox_dataset_id=sandbox_dataset_id,
            new_insurance_rows=NEW_INSURANCE_ROWS,
            ORIGINAL_OBSERVATION_SOURCE_CONCEPT_ID=
            ORIGINAL_OBSERVATION_SOURCE_CONCEPT_ID)
    queries.append(delete_query)

    insert_query = dict()
    insert_query[
        cdr_consts.QUERY] = INSERT_ANSWERS_FOR_HCAU_PARTICIPANTS.format(
            project_id=project_id,
            combined_dataset_id=dataset_id,
            sandbox_dataset_id=sandbox_dataset_id,
            new_insurance_rows=NEW_INSURANCE_ROWS)
    queries.append(insert_query)

    return queries
Example #10
    def test_html_report_five_person(self, mock_check_cron, mock_first_run,
                                     mock_required_files_loaded,
                                     mock_has_all_required_files,
                                     mock_updated_datetime_object):
        mock_required_files_loaded.return_value = False
        mock_first_run.return_value = False
        mock_has_all_required_files.return_value = True
        mock_updated_datetime_object.return_value = (
            datetime.datetime.today() - datetime.timedelta(minutes=7))

        for cdm_file in test_util.FIVE_PERSONS_FILES:
            blob_name = f'{self.folder_prefix}{os.path.basename(cdm_file)}'
            test_blob = self.storage_bucket.blob(blob_name)
            test_blob.upload_from_filename(cdm_file)

        # load person table in RDR
        bq_utils.load_table_from_csv(self.project_id, self.rdr_dataset_id,
                                     common.PERSON,
                                     test_util.FIVE_PERSONS_PERSON_CSV)

        # Load measurement_concept_sets
        required_labs.load_measurement_concept_sets_table(
            project_id=self.project_id, dataset_id=self.bigquery_dataset_id)
        # Load measurement_concept_sets_descendants
        required_labs.load_measurement_concept_sets_descendants_table(
            project_id=self.project_id, dataset_id=self.bigquery_dataset_id)

        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.VALIDATE_HPO_FILES_URL)
            actual_result = test_util.read_cloud_file(
                self.hpo_bucket, self.folder_prefix + common.RESULTS_HTML)

        # ensure emails are not sent
        bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
        folder_items = main.get_folder_items(bucket_items, self.folder_prefix)
        self.assertFalse(main.is_first_validation_run(folder_items))

        # parse html
        soup = bs(actual_result, features="lxml")
        missing_pii_html_table = soup.find('table', id='missing_pii')
        table_headers = missing_pii_html_table.find_all('th')
        self.assertEqual('Missing Participant Record Type',
                         table_headers[0].get_text())
        self.assertEqual('Count', table_headers[1].get_text())

        table_rows = missing_pii_html_table.find_next('tbody').find_all('tr')
        missing_record_types = [
            table_row.find('td').text for table_row in table_rows
        ]
        self.assertIn(main_consts.EHR_NO_PII, missing_record_types)
        self.assertIn(main_consts.PII_NO_EHR, missing_record_types)

        # the missing from RDR component is obsolete (see DC-1932)
        # this is to confirm it was removed successfully from the report
        rdr_date = '2020-01-01'
        self.assertNotIn(main_consts.EHR_NO_RDR.format(date=rdr_date),
                         missing_record_types)
        self.assertIn(main_consts.EHR_NO_PARTICIPANT_MATCH,
                      missing_record_types)

        required_lab_html_table = soup.find('table', id='required-lab')
        table_headers = required_lab_html_table.find_all('th')
        self.assertEqual(3, len(table_headers))
        self.assertEqual('Ancestor Concept ID', table_headers[0].get_text())
        self.assertEqual('Ancestor Concept Name', table_headers[1].get_text())
        self.assertEqual('Found', table_headers[2].get_text())

        table_rows = required_lab_html_table.find_next('tbody').find_all('tr')
        table_rows_last_column = [
            table_row.find_all('td')[-1] for table_row in table_rows
        ]
        submitted_labs = [
            row for row in table_rows_last_column
            if 'result-1' in row.attrs['class']
        ]
        missing_labs = [
            row for row in table_rows_last_column
            if 'result-0' in row.attrs['class']
        ]
        self.assertTrue(len(table_rows) > 0)
        self.assertTrue(len(submitted_labs) > 0)
        self.assertTrue(len(missing_labs) > 0)