Example #1
0
def get_pii_values(project, pii_dataset, hpo, table, field):
    """
    Get values from the site's PII table.

    :param project: The name of the project to query for pii values
    :param pii_dataset:  The name of the dataset to query for pii values.
    :param hpo:  hpo string to use when identifying table names for lookup.
    :param table:  The name of the table suffix to query for pii values.
    :param field:  The field name to look up values for

    :return:  A list of tuples with the first tuple element as the person_id
    and the second tuple element as the phone number.
    [(1, '5558675309'), (48, '5558004600'), (99, '5551002000')]
    """
    query_string = consts.PII_VALUES.format(project=project,
                                            dataset=pii_dataset,
                                            hpo_site_str=hpo,
                                            field=field,
                                            table_suffix=table)

    LOGGER.debug("Participant validation ran the query\n%s", query_string)

    results = bq_utils.query(query_string)
    row_results = bq_utils.large_response_to_rowlist(results)

    result_list = []
    for item in row_results:
        person_id = item.get(consts.PERSON_ID_FIELD)
        value = item.get(field)

        value = _get_utf8_string(value)
        result_list.append((person_id, value))

    return result_list
Example #2
0
def get_location_pii(project, rdr_dataset, pii_dataset, hpo, table, field):
    """
    Get the actual value for a location field.

    ;param project:  The project name
    :param rdr_dataset:  The dataset to get actual location info from
    :param pii_dataset:  The dataset with a location_id.  location_id comes from
        a pii_address table.
    :param hpo:  site identifier used to prefix table names
    :param table:  table name to retrieve pii values for
    :param field:  The actual field to retrieve a value for:  either address_one,
        address_two, city, state, or zip.

    :return:  a list of [(person_id, value)] tuples.
    :raises:  oauth2client.client.HttpAccessTokenRefreshError,
              googleapiclient.errors.HttpError
    """
    location_ids = get_pii_values(
        project,
        pii_dataset,
        hpo,
        table,
        consts.LOCATION_ID_FIELD
    )

    location_id_list = []
    location_id_dict = {}
    for location_id in location_ids:
        location_id_list.append(location_id[1])
        location_id_dict[int(location_id[1])] = location_id[0]


    location_id_str = ', '.join(location_id_list)
    query_string = consts.PII_LOCATION_VALUES.format(
        project=project,
        dataset=rdr_dataset,
        field=field,
        id_list=location_id_str
    )

    LOGGER.debug("Participant validation ran the query\n%s", query_string)

    results = bq_utils.query(query_string)
    row_results = bq_utils.large_response_to_rowlist(results)

    result_list = []
    for item in row_results:
        location_id = item.get(consts.LOCATION_ID_FIELD)
        value = item.get(field)

        value = _get_utf8_string(value)

        person_id = location_id_dict.get(location_id, '')
        result_list.append((person_id, value))

    return result_list
Example #3
0
def get_rdr_match_values(project, dataset, table_name, concept_id):
    """
    Get the desired matching values from the combined observation table.

    This retrieves all possible matching values from the observation table
    used in participant matching.  Returned query data is limited to the
    person_id, observation_concept_id, and the value_as_string fields.

    :param project: The name of the project to query for rdr values
    :param dataset:  The name of the dataset to query for rdr values.  In this
        module, it is likely the validation dataset
    :param table_name:  The name of the table to query for rdr values.  In
        this module, it is likely the validation dataset
    :param concept_id: the id of the concept to verify from the RDR data

    :return:  A dictionary with observation_source_concept_id as the key.
        The value is a dictionary of person_ids with the associated value
        of the concept_id.
        For example:
        {person_id_1:  "email_address", person_id_2: "email_address"}
        {person_id_1: "first_name", person_id_2: "first_name"}
        {person_id_1: "last_name", person_id_2: "last_name"}
    :raises:  oauth2client.client.HttpAccessTokenRefreshError,
              googleapiclient.errors.HttpError
    """
    query_string = consts.PPI_OBSERVATION_VALUES.format(
        project=project,
        dataset=dataset,
        table=table_name,
        field_value=concept_id
    )

    LOGGER.debug("Participant validation ran the query\n%s", query_string)
    results = bq_utils.query(query_string)
    row_results = bq_utils.large_response_to_rowlist(results)

    result_dict = {}
    for item in row_results:
        person_id = item.get(consts.PERSON_ID_FIELD)
        value = item.get(consts.STRING_VALUE_FIELD)
        value = _get_utf8_string(value)

        exists = result_dict.get(person_id)
        if exists is None:
            result_dict[person_id] = value
        else:
            if exists == value:
                pass
            else:
                LOGGER.error("Trying to reset value for person_id\t%s.")

    return result_dict
Example #4
0
def get_ehr_person_values(project, dataset, table_name, column_name):
    """
    Get the desired matching values from the combined observation table.

    This retrieves all possible matching values from the observation table
    used in participant matching.  Returned query data is limited to the
    person_id, observation_concept_id, and the value_as_string fields.

    :param project: The name of the project to query for ehr values
    :param dataset:  The name of the dataset to query for ehr values
    :param table_name:  The name of the table to query for ehr values
    :param column_name:  String name of the field to return from the table

    :return:  A dictionary with observation_source_concept_id as the key.
        The value is a dictionary of person_ids with the associated value
        of the concept_id.
        For example:
        {person_id_1:  "email_address", person_id_2: "email_address"}
    :raises:  oauth2client.client.HttpAccessTokenRefreshError,
              googleapiclient.errors.HttpError
    """
    query_string = consts.EHR_PERSON_VALUES.format(
        project=project,
        dataset=dataset,
        table=table_name,
        field=column_name,
    )

    LOGGER.info("Participant validation ran the query\n%s", query_string)
    results = bq_utils.query(query_string)
    row_results = bq_utils.large_response_to_rowlist(results)

    field_type = _get_field_type(table_name, column_name)

    result_dict = {}
    for item in row_results:
        person_id = item.get(consts.PERSON_ID_FIELD)
        value = item.get(column_name)
        value = _get_string(value, field_type)

        exists = result_dict.get(person_id)
        if exists is None:
            result_dict[person_id] = value
        else:
            pass

    return result_dict
Example #5
0
def create_site_validation_report(project, dataset, hpo_list, bucket,
                                  filename):
    """
    Write the validation csv from the site validation table.

    :param project:  The project name
    :param dataset:  The dataset where the validtion table exists
    :param hpo_list:  A list of hpo strings to create a csv for
    :param bucket:  The bucket to write the csv to.
    :param filename:  The file name to give the csv report.
    """
    if not isinstance(hpo_list, list):
        hpo_list = [hpo_list]

    fields = [
        consts.PERSON_ID_FIELD, consts.FIRST_NAME_FIELD,
        consts.LAST_NAME_FIELD, consts.BIRTH_DATE_FIELD, consts.SEX_FIELD,
        consts.ADDRESS_MATCH_FIELD, consts.PHONE_NUMBER_FIELD,
        consts.EMAIL_FIELD, consts.ALGORITHM_FIELD
    ]

    fields_str = ','.join(fields) + '\n'

    # sets up a file stream to write to the bucket
    report_file = StringIO()
    report_file.write(fields_str)

    # write to the report file
    read_errors = 0
    for site in hpo_list:
        result_table = site + consts.VALIDATION_TABLE_SUFFIX
        query_string = consts.VALIDATION_RESULTS_VALUES.format(
            project=project, dataset=dataset, table=result_table)

        try:
            results = bq_utils.query(query_string, batch=True)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError):
            LOGGER.exception(
                "Encountered an exception when selecting site records")
            report_file.write("Unable to report id validation match records "
                              "for site:\t{}.\n".format(site))
            read_errors += 1
            continue

        row_results = bq_utils.large_response_to_rowlist(results)
        for item in row_results:
            address_values = [
                item.get(consts.ADDRESS_ONE_FIELD),
                item.get(consts.ADDRESS_TWO_FIELD),
                item.get(consts.CITY_FIELD),
                item.get(consts.STATE_FIELD),
                item.get(consts.ZIP_CODE_FIELD)
            ]
            values = [
                str(item.get(consts.PERSON_ID_FIELD)),
                item.get(consts.FIRST_NAME_FIELD),
                item.get(consts.LAST_NAME_FIELD),
                item.get(consts.BIRTH_DATE_FIELD),
                item.get(consts.SEX_FIELD),
                get_address_match(address_values),
                item.get(consts.PHONE_NUMBER_FIELD),
                item.get(consts.EMAIL_FIELD),
                item.get(consts.ALGORITHM_FIELD)
            ]
            values_str = ','.join(values) + '\n'
            report_file.write(values_str)

    # reset the stream and write to the bucket
    report_file.seek(0)
    report_result = gcs_utils.upload_object(bucket, filename, report_file)
    report_file.close()

    LOGGER.info(f"Wrote validation report csv: {bucket}{filename}")
    return report_result, read_errors