Ejemplo n.º 1
0
def create_empty_dataset(project_id, dataset_id, snapshot_dataset_id):
    """
    Create the dataset that will hold a snapshot of ``dataset_id``.

    The dataset is requested through ``create_dataset`` with
    ``overwrite_existing=True``.

    :param project_id: identifier of the project passed to ``create_dataset``
    :param dataset_id: identifier of the dataset being snapshotted; it is only
        used to build the description of the new dataset
    :param snapshot_dataset_id: identifier of the snapshot dataset to create
    :return: None
    """
    snapshot_description = 'Snapshot of {dataset_id}'.format(
        dataset_id=dataset_id)

    create_dataset(project_id=project_id,
                   dataset_id=snapshot_dataset_id,
                   description=snapshot_description,
                   overwrite_existing=True)
Ejemplo n.º 2
0
def create_sandbox_dataset(project_id, dataset_id):
    """
    Request the sandbox dataset that accompanies ``dataset_id``.

    The sandbox identifier is derived with ``get_sandbox_dataset_id`` and the
    dataset is requested through ``create_dataset`` with
    ``overwrite_existing=bq_consts.FALSE``.

    :param project_id: identifier of the project passed to ``create_dataset``
    :param dataset_id: identifier of the dataset the sandbox is created for
    :return: the identifier of the sandbox dataset
    """
    sandbox_id = get_sandbox_dataset_id(dataset_id)

    sandbox_name = 'Sandbox for {dataset_id}'.format(dataset_id=dataset_id)
    sandbox_description = (
        'Sandbox created for storing records affected by the cleaning rules applied to {dataset_id}'
        .format(dataset_id=dataset_id))

    create_dataset(project_id=project_id,
                   dataset_id=sandbox_id,
                   friendly_name=sandbox_name,
                   description=sandbox_description,
                   overwrite_existing=bq_consts.FALSE)
    return sandbox_id
Ejemplo n.º 3
0
def match_participants(project, rdr_dataset, ehr_dataset, dest_dataset_id):
    """
    Entry point for performing participant matching of PPI, EHR, and PII data.

    Creates the destination dataset and one validation table per HPO site,
    then compares every identity field site by site and writes each site's
    accumulated results to its validation table.  The error types caught
    below are logged and counted instead of being raised, so a failure for
    one field or one site does not stop the remaining comparisons.

    :param project: a string representing the project name
    :param rdr_dataset:  the dataset created from the results given to us by
        the rdr team
    :param ehr_dataset:  the dataset containing the pii information for
        comparisons
    :param dest_dataset_id:  the desired identifier for the match values
        destination dataset.  The date string derived from rdr_dataset is
        appended when the last eight characters do not match
        consts.DRC_DATE_REGEX.

    :return: the total number of read and write errors encountered
    """
    LOGGER.info(f"Calling match_participants with:\n"
                f"project:\t{project}\n"
                f"rdr_dataset:\t{rdr_dataset}\n"
                f"ehr_dataset:\t{ehr_dataset}\n"
                f"dest_dataset_id:\t{dest_dataset_id}\n")

    # listed once up front and handed to every comparison below
    ehr_tables = bq_utils.list_dataset_contents(ehr_dataset)

    date_string = _get_date_string(rdr_dataset)

    if not re.match(consts.DRC_DATE_REGEX, dest_dataset_id[-8:]):
        dest_dataset_id += date_string

    # create new dataset for the intermediate tables and results
    dataset_result = bq_utils.create_dataset(
        dataset_id=dest_dataset_id,
        description=consts.DESTINATION_DATASET_DESCRIPTION.format(
            version='', rdr_dataset=rdr_dataset, ehr_dataset=ehr_dataset),
        overwrite_existing=True)

    # the dataset id is read back from the creation response; it falls back
    # to an empty string when the response lacks the expected keys
    validation_dataset = dataset_result.get(bq_consts.DATASET_REF, {})
    validation_dataset = validation_dataset.get(bq_consts.DATASET_ID, '')
    LOGGER.info(
        f"Created new validation results dataset:\t{validation_dataset}")

    # create intermediate observation table in new dataset
    readers.create_match_values_table(project, rdr_dataset, dest_dataset_id)

    hpo_sites = readers.get_hpo_site_names()

    #TODO:  create a proper config file to store this path
    field_list = resources.fields_for('identity_match')

    for site_name in hpo_sites:
        bq_utils.create_table(site_name + consts.VALIDATION_TABLE_SUFFIX,
                              field_list,
                              drop_existing=True,
                              dataset_id=validation_dataset)

    read_errors = 0
    write_errors = 0

    for site in hpo_sites:
        LOGGER.info(f"Beginning identity validation for site: {site}")
        # results are accumulated per site and written once all fields
        # have been compared
        results = {}

        # validate first names
        try:
            match_values = _compare_name_fields(project, validation_dataset,
                                                ehr_dataset, site,
                                                consts.OBS_PII_NAME_FIRST,
                                                consts.FIRST_NAME_FIELD,
                                                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.FIRST_NAME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.FIRST_NAME_FIELD)
            LOGGER.info(f"Validated first names for: {site}")

        # validate last names
        try:
            match_values = _compare_name_fields(project, validation_dataset,
                                                ehr_dataset, site,
                                                consts.OBS_PII_NAME_LAST,
                                                consts.LAST_NAME_FIELD,
                                                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.LAST_NAME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.LAST_NAME_FIELD)
            LOGGER.info(f"Validated last names for: {site}")

        # Middle name validation is intentionally disabled.  To re-enable it,
        # compare consts.OBS_PII_NAME_MIDDLE against consts.MIDDLE_NAME_FIELD
        # with _compare_name_fields and add the matches to results, following
        # the first and last name sections above.
        LOGGER.info("Not validating middle names")

        # validate zip codes
        try:
            match_values = _compare_zip_codes(
                project, validation_dataset, rdr_dataset, ehr_dataset, site,
                consts.OBS_PII_STREET_ADDRESS_ZIP, consts.ZIP_CODE_FIELD,
                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.ZIP_CODE_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.ZIP_CODE_FIELD)
            LOGGER.info(f"Validated zip codes for: {site}")

        # validate city
        try:
            match_values = _compare_cities(project, validation_dataset,
                                           rdr_dataset, ehr_dataset, site,
                                           consts.OBS_PII_STREET_ADDRESS_CITY,
                                           consts.CITY_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.CITY_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            # city matches belong under the city field; they were previously
            # filed under the zip code field
            results = _add_matches_to_results(results, match_values,
                                              consts.CITY_FIELD)
            LOGGER.info(f"Validated city names for: {site}")

        # validate state
        try:
            match_values = _compare_states(project, validation_dataset,
                                           rdr_dataset, ehr_dataset, site,
                                           consts.OBS_PII_STREET_ADDRESS_STATE,
                                           consts.STATE_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.STATE_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.STATE_FIELD)
            LOGGER.info(f"Validated states for: {site}")

        # validate street addresses; a single comparison yields the matches
        # for both address lines
        try:
            address_one_matches, address_two_matches = _compare_street_addresses(
                project, validation_dataset, rdr_dataset, ehr_dataset, site,
                consts.OBS_PII_STREET_ADDRESS_ONE,
                consts.OBS_PII_STREET_ADDRESS_TWO, consts.ADDRESS_ONE_FIELD,
                consts.ADDRESS_TWO_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for fields: {consts.ADDRESS_ONE_FIELD}, {consts.ADDRESS_TWO_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, address_one_matches,
                                              consts.ADDRESS_ONE_FIELD)
            results = _add_matches_to_results(results, address_two_matches,
                                              consts.ADDRESS_TWO_FIELD)
            LOGGER.info(f"Validated street addresses for: {site}")

        # validate email addresses
        try:
            match_values = _compare_email_addresses(
                project, validation_dataset, ehr_dataset, site,
                consts.OBS_PII_EMAIL_ADDRESS, consts.EMAIL_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.EMAIL_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.EMAIL_FIELD)
            LOGGER.info(f"Validated email addresses for: {site}")

        # validate phone numbers
        try:
            match_values = _compare_phone_numbers(project, validation_dataset,
                                                  ehr_dataset, site,
                                                  consts.OBS_PII_PHONE,
                                                  consts.PHONE_NUMBER_FIELD,
                                                  ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.PHONE_NUMBER_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.PHONE_NUMBER_FIELD)
            LOGGER.info(f"Validated phone numbers for: {site}")

        # validate genders
        try:
            match_values = _compare_genders(project, validation_dataset,
                                            ehr_dataset, site,
                                            consts.OBS_PII_SEX, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.SEX_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.SEX_FIELD)
            LOGGER.info(f"Validated genders for: {site}")

        # validate birth dates
        try:
            match_values = _compare_birth_dates(project, validation_dataset,
                                                ehr_dataset, site,
                                                consts.OBS_PII_BIRTH_DATETIME,
                                                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            # NOTE(review): this message uses BIRTH_DATETIME_FIELD while the
            # results are stored under BIRTH_DATE_FIELD -- confirm both
            # constants exist and that the difference is intended
            LOGGER.exception(
                f"Could not read data for field: {consts.BIRTH_DATETIME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.BIRTH_DATE_FIELD)
            LOGGER.info(f"Validated birth dates for: {site}")

        LOGGER.info("Writing results to BQ table")
        # write dictionary to a table
        try:
            writers.write_to_result_table(project, validation_dataset, site,
                                          results)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError):
            LOGGER.exception(
                f"Did not write site information to validation dataset:  {site}"
            )
            write_errors += 1
        else:
            # only report success when the write did not fail
            LOGGER.info(f"Wrote validation results for site: {site}")

    LOGGER.info(f"FINISHED: Validation dataset created:  {validation_dataset}")

    if read_errors > 0:
        LOGGER.error(
            f"Encountered {read_errors} read errors creating validation dataset:\t{validation_dataset}"
        )

    if write_errors > 0:
        LOGGER.error(
            f"Encountered {write_errors} write errors creating validation dataset:\t{validation_dataset}"
        )

    return read_errors + write_errors
Ejemplo n.º 4
0
def _append_site_field_matches(site, match_values, project, validation_dataset,
                               field_name):
    """
    Append one field's match values to a site's result table.

    :param site: the hpo site whose result table receives the values
    :param match_values: the match values produced by a field comparison
    :param project: a string representing the project name
    :param validation_dataset: the dataset holding the result tables
    :param field_name: the name of the field the match values belong to

    :return: 1 if the write failed with one of the caught API errors
        (the failure is logged), otherwise 0
    """
    try:
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset, field_name)
    except (oauth2client.client.HttpAccessTokenRefreshError,
            googleapiclient.errors.HttpError):
        LOGGER.exception(
            "Unable to insert records in table:\t%s\tfor field: %s", site,
            field_name)
        return 1
    return 0


def _apply_site_cleanup(step, success_message, failure_message, project,
                        validation_dataset, site):
    """
    Run one clean-up step against a site's result table.

    :param step: a callable invoked as step(project, validation_dataset, site)
    :param success_message: message logged when the step completes
    :param failure_message: message logged, followed by the site name, when
        the step fails with one of the caught API errors
    :param project: a string representing the project name
    :param validation_dataset: the dataset holding the result tables
    :param site: the hpo site whose result table is cleaned up

    :return: 1 if the step failed with one of the caught API errors,
        otherwise 0
    """
    try:
        step(project, validation_dataset, site)
    except (oauth2client.client.HttpAccessTokenRefreshError,
            googleapiclient.errors.HttpError):
        # log the failure so that it is not only visible as an error count
        LOGGER.exception("%s for site:\t%s", failure_message, site)
        return 1
    LOGGER.info(success_message)
    return 0


def match_participants(project, rdr_dataset, ehr_dataset, dest_dataset_id):
    """
    Entry point for performing participant matching of PPI, EHR, and PII data.

    Creates the destination dataset and one validation table per HPO site,
    then compares the identity fields one field at a time across all sites,
    appending each comparison to the site's result table.  Finally the
    per-field records of every site are merged and cleaned up.  Read errors
    reported by the comparison helpers and the caught write errors are
    counted instead of being raised.

    :param project: a string representing the project name
    :param rdr_dataset:  the dataset created from the results given to us by
        the rdr team
    :param ehr_dataset:  the dataset containing the pii information for
        comparisons
    :param dest_dataset_id:  the desired identifier for the match values
        destination dataset.  The date string derived from rdr_dataset is
        appended when the last eight characters do not match
        consts.DRC_DATE_REGEX.

    :return: a tuple of (results, error_count).  results is a dictionary that
        this function never populates, so it is always empty; error_count is
        the total number of read and write errors encountered.
    """
    LOGGER.info(
        'Calling match_participants with:\n'
        'project:\t%s\n'
        'rdr_dataset:\t%s\n'
        'ehr_dataset:\t%s\n'
        'dest_dataset_id:\t%s\n', project, rdr_dataset, ehr_dataset,
        dest_dataset_id)

    date_string = _get_date_string(rdr_dataset)

    if not re.match(consts.DRC_DATE_REGEX, dest_dataset_id[-8:]):
        dest_dataset_id += date_string

    # create new dataset for the intermediate tables and results
    dataset_result = bq_utils.create_dataset(
        dataset_id=dest_dataset_id,
        description=consts.DESTINATION_DATASET_DESCRIPTION.format(
            version='', rdr_dataset=rdr_dataset, ehr_dataset=ehr_dataset),
        overwrite_existing=True)

    # the dataset id is read back from the creation response; it falls back
    # to an empty string when the response lacks the expected keys
    validation_dataset = dataset_result.get(bq_consts.DATASET_REF, {})
    validation_dataset = validation_dataset.get(bq_consts.DATASET_ID, '')
    LOGGER.info('Created new validation results dataset:\t%s',
                validation_dataset)

    # create intermediate observation table in new dataset
    readers.create_match_values_table(project, rdr_dataset, dest_dataset_id)

    hpo_sites = readers.get_hpo_site_names()

    #TODO:  create a proper config file to store this path
    field_list = resources.fields_for('identity_match')

    for site_name in hpo_sites:
        bq_utils.create_table(site_name + consts.VALIDATION_TABLE_SUFFIX,
                              field_list,
                              drop_existing=True,
                              dataset_id=validation_dataset)

    read_errors = 0
    write_errors = 0
    # returned to the caller unchanged; kept for interface compatibility
    results = {}

    # Each comparison helper returns its match values together with an
    # exception slot; a non-None exception is counted as a read error and the
    # write for that site and field is skipped.

    # validate first names
    for site in hpo_sites:
        match_values, exc = _compare_name_fields(project, validation_dataset,
                                                 ehr_dataset, site,
                                                 consts.OBS_PII_NAME_FIRST,
                                                 consts.FIRST_NAME_FIELD)

        if exc is not None:
            read_errors += 1
        else:
            # write first name matches for hpo to table
            write_errors += _append_site_field_matches(
                site, match_values, project, validation_dataset,
                consts.FIRST_NAME_FIELD)

    LOGGER.info('Validated first names')

    # validate last names
    for site in hpo_sites:
        match_values, exc = _compare_name_fields(project, validation_dataset,
                                                 ehr_dataset, site,
                                                 consts.OBS_PII_NAME_LAST,
                                                 consts.LAST_NAME_FIELD)

        if exc is not None:
            read_errors += 1
        else:
            # write last name matches for hpo to table
            write_errors += _append_site_field_matches(
                site, match_values, project, validation_dataset,
                consts.LAST_NAME_FIELD)

    LOGGER.info('Validated last names')

    # validate middle names
    for site in hpo_sites:
        match_values, exc = _compare_name_fields(project, validation_dataset,
                                                 ehr_dataset, site,
                                                 consts.OBS_PII_NAME_MIDDLE,
                                                 consts.MIDDLE_NAME_FIELD)

        if exc is not None:
            read_errors += 1
        else:
            # write middle name matches for hpo to table
            write_errors += _append_site_field_matches(
                site, match_values, project, validation_dataset,
                consts.MIDDLE_NAME_FIELD)

    LOGGER.info('Validated middle names')

    # validate zip codes
    for site in hpo_sites:
        match_values, exc = _compare_zip_codes(
            project, validation_dataset, rdr_dataset, ehr_dataset, site,
            consts.OBS_PII_STREET_ADDRESS_ZIP, consts.ZIP_CODE_FIELD)

        if exc is not None:
            read_errors += 1
        else:
            # write zip code matches for hpo to table
            write_errors += _append_site_field_matches(
                site, match_values, project, validation_dataset,
                consts.ZIP_CODE_FIELD)

    LOGGER.info('Validated zip codes')

    # validate city
    for site in hpo_sites:
        match_values, exc = _compare_cities(project, validation_dataset,
                                            rdr_dataset, ehr_dataset, site,
                                            consts.OBS_PII_STREET_ADDRESS_CITY,
                                            consts.CITY_FIELD)

        if exc is not None:
            read_errors += 1
        else:
            # write city matches for hpo to table
            write_errors += _append_site_field_matches(
                site, match_values, project, validation_dataset,
                consts.CITY_FIELD)

    LOGGER.info('Validated city names')

    # validate state
    for site in hpo_sites:
        match_values, exc = _compare_states(
            project, validation_dataset, rdr_dataset, ehr_dataset, site,
            consts.OBS_PII_STREET_ADDRESS_STATE, consts.STATE_FIELD)

        if exc is not None:
            read_errors += 1
        else:
            # write state matches for hpo to table
            write_errors += _append_site_field_matches(
                site, match_values, project, validation_dataset,
                consts.STATE_FIELD)

    LOGGER.info('Validated states')

    # validate street addresses; a single comparison yields the matches for
    # both address lines, which are written (and counted) independently
    for site in hpo_sites:
        address_one_matches, address_two_matches, exc = _compare_street_addresses(
            project, validation_dataset, rdr_dataset, ehr_dataset, site,
            consts.OBS_PII_STREET_ADDRESS_ONE,
            consts.OBS_PII_STREET_ADDRESS_TWO, consts.ADDRESS_ONE_FIELD,
            consts.ADDRESS_TWO_FIELD)

        if exc is not None:
            read_errors += 1
        else:
            # write street address matches for hpo to table
            write_errors += _append_site_field_matches(
                site, address_one_matches, project, validation_dataset,
                consts.ADDRESS_ONE_FIELD)
            write_errors += _append_site_field_matches(
                site, address_two_matches, project, validation_dataset,
                consts.ADDRESS_TWO_FIELD)

    LOGGER.info('Validated street addresses')

    # validate email addresses
    for site in hpo_sites:
        match_values, exc = _compare_email_addresses(
            project, validation_dataset, ehr_dataset, site,
            consts.OBS_PII_EMAIL_ADDRESS, consts.EMAIL_FIELD)

        if exc is not None:
            read_errors += 1
        else:
            # write email matches for hpo to table
            write_errors += _append_site_field_matches(
                site, match_values, project, validation_dataset,
                consts.EMAIL_FIELD)

    LOGGER.info('Validated email addresses')

    # validate phone numbers
    for site in hpo_sites:
        match_values, exc = _compare_phone_numbers(project, validation_dataset,
                                                   ehr_dataset, site,
                                                   consts.OBS_PII_PHONE,
                                                   consts.PHONE_NUMBER_FIELD)

        if exc is not None:
            read_errors += 1
        else:
            # write phone number matches for hpo to table
            write_errors += _append_site_field_matches(
                site, match_values, project, validation_dataset,
                consts.PHONE_NUMBER_FIELD)

    LOGGER.info('Validated phone numbers')

    # validate genders
    for site in hpo_sites:
        match_values, exc = _compare_genders(project, validation_dataset,
                                             ehr_dataset, site,
                                             consts.OBS_PII_SEX)

        if exc is not None:
            read_errors += 1
        else:
            # write gender matches for hpo to table
            write_errors += _append_site_field_matches(
                site, match_values, project, validation_dataset,
                consts.SEX_FIELD)

    LOGGER.info('Validated genders')

    # validate birth dates
    for site in hpo_sites:
        match_values, exc = _compare_birth_dates(project, validation_dataset,
                                                 ehr_dataset, site,
                                                 consts.OBS_PII_BIRTH_DATETIME)

        if exc is not None:
            read_errors += 1
        else:
            # write birthday match for hpo to table
            write_errors += _append_site_field_matches(
                site, match_values, project, validation_dataset,
                consts.BIRTH_DATE_FIELD)

    LOGGER.info('Validated birth dates')

    # generate single clean record for each participant at each site; every
    # step is attempted even when an earlier step failed for the same site
    for site in hpo_sites:
        write_errors += _apply_site_cleanup(
            writers.merge_fields_into_single_record,
            'Merged participant match records',
            'Unable to merge participant match records', project,
            validation_dataset, site)

        write_errors += _apply_site_cleanup(
            writers.remove_sparse_records,
            'Removed sparse participant match records',
            'Unable to remove sparse participant match records', project,
            validation_dataset, site)

        write_errors += _apply_site_cleanup(
            writers.change_nulls_to_missing_value,
            'Changed nulls to missing values in participant match records',
            'Unable to change nulls to missing values in participant match records',
            project, validation_dataset, site)

    LOGGER.info("Finished creating validation dataset")

    if read_errors > 0:
        LOGGER.error(
            "Encountered %d read errors creating validation dataset:\t%s",
            read_errors, validation_dataset)

    if write_errors > 0:
        LOGGER.error(
            "Encountered %d write errors creating validation dataset:\t%s",
            write_errors, validation_dataset)

    return results, read_errors + write_errors
Ejemplo n.º 5
0
def match_participants(project, rdr_dataset, ehr_dataset, dest_dataset_id):
    """
    Entry point for performing participant matching of PPI, EHR, and PII data.

    Creates (or overwrites) the destination dataset, builds one validation
    table per HPO site, runs every field comparison for every site, appends
    the match values to the site tables, consolidates the records, and finally
    writes one report per site plus one aggregate report.

    If the last 8 characters of ``dest_dataset_id`` do not match
    ``consts.DRC_DATE_REGEX``, the date string derived from ``rdr_dataset`` is
    appended to it before the dataset is created.

    :param project: a string representing the project name
    :param rdr_dataset:  the dataset created from the results given to us by
        the rdr team
    :param ehr_dataset:  the dataset containing the pii information for
        comparisons
    :param dest_dataset_id:  the desired identifier for the match values
        destination dataset

    :return: a results dictionary.  NOTE: nothing in this function populates
        it, so it is always returned empty; the match values themselves are
        written to the validation tables and reports.
    """
    date_string = _get_date_string(rdr_dataset)

    if not re.match(consts.DRC_DATE_REGEX, dest_dataset_id[-8:]):
        dest_dataset_id += date_string

    # create new dataset for the intermediate tables and results
    dataset_result = bq_utils.create_dataset(
        dataset_id=dest_dataset_id,
        description=consts.DESTINATION_DATASET_DESCRIPTION.format(
            version='',
            rdr_dataset=rdr_dataset,
            ehr_dataset=ehr_dataset
        ),
        overwrite_existing=True)

    # the dataset id is read back from the create_dataset response; it
    # is an empty string when the response lacks the expected keys
    validation_dataset = dataset_result.get(bq_consts.DATASET_REF, {})
    validation_dataset = validation_dataset.get(bq_consts.DATASET_ID, '')

    # create intermediate observation table in new dataset
    readers.create_match_values_table(project, rdr_dataset, dest_dataset_id)

    hpo_sites = readers.get_hpo_site_names()

    #TODO:  create a proper config file to store this path
    field_list = resources.fields_for('identity_match')

    for site_name in hpo_sites:
        bq_utils.create_table(
            site_name + consts.VALIDATION_TABLE_SUFFIX,
            field_list,
            drop_existing=True,
            dataset_id=validation_dataset
        )

    results = {}

    # Source datasets a comparison reads from, in the positional order the
    # comparison functions expect them (after project and validation dataset).
    ehr_only = (ehr_dataset,)
    rdr_and_ehr = (rdr_dataset, ehr_dataset)

    # Ordered validation steps.  Each entry is:
    #   (comparison function,
    #    source datasets passed before the site name,
    #    extra positional arguments passed after the site name,
    #    result table field(s) the returned match values are written to)
    # A step with several result fields must return one collection of match
    # values per field, in the same order.
    validation_steps = [
        # first names
        (_compare_name_fields, ehr_only,
         (consts.OBS_PII_NAME_FIRST, consts.FIRST_NAME_FIELD),
         (consts.FIRST_NAME_FIELD,)),
        # last names
        (_compare_name_fields, ehr_only,
         (consts.OBS_PII_NAME_LAST, consts.LAST_NAME_FIELD),
         (consts.LAST_NAME_FIELD,)),
        # middle names
        (_compare_name_fields, ehr_only,
         (consts.OBS_PII_NAME_MIDDLE, consts.MIDDLE_NAME_FIELD),
         (consts.MIDDLE_NAME_FIELD,)),
        # zip codes
        (_compare_zip_codes, rdr_and_ehr,
         (consts.OBS_PII_STREET_ADDRESS_ZIP, consts.ZIP_CODE_FIELD),
         (consts.ZIP_CODE_FIELD,)),
        # cities
        (_compare_cities, rdr_and_ehr,
         (consts.OBS_PII_STREET_ADDRESS_CITY, consts.CITY_FIELD),
         (consts.CITY_FIELD,)),
        # states
        (_compare_states, rdr_and_ehr,
         (consts.OBS_PII_STREET_ADDRESS_STATE, consts.STATE_FIELD),
         (consts.STATE_FIELD,)),
        # street addresses: one comparison yields both address lines
        (_compare_street_addresses, rdr_and_ehr,
         (consts.OBS_PII_STREET_ADDRESS_ONE,
          consts.OBS_PII_STREET_ADDRESS_TWO,
          consts.ADDRESS_ONE_FIELD,
          consts.ADDRESS_TWO_FIELD),
         (consts.ADDRESS_ONE_FIELD, consts.ADDRESS_TWO_FIELD)),
        # email addresses
        (_compare_email_addresses, ehr_only,
         (consts.OBS_PII_EMAIL_ADDRESS, consts.EMAIL_FIELD),
         (consts.EMAIL_FIELD,)),
        # phone numbers
        (_compare_phone_numbers, ehr_only,
         (consts.OBS_PII_PHONE, consts.PHONE_NUMBER_FIELD),
         (consts.PHONE_NUMBER_FIELD,)),
        # genders
        (_compare_genders, ehr_only,
         (consts.OBS_PII_SEX,),
         (consts.SEX_FIELD,)),
        # birth dates
        (_compare_birth_dates, ehr_only,
         (consts.OBS_PII_BIRTH_DATETIME,),
         (consts.BIRTH_DATE_FIELD,)),
    ]

    # Each field is validated for every site before moving to the next field.
    for compare, source_datasets, extra_args, result_fields in validation_steps:
        for site in hpo_sites:
            compare_args = ((project, validation_dataset) + source_datasets +
                            (site,) + extra_args)
            outcome = compare(*compare_args)

            if len(result_fields) == 1:
                match_sets = (outcome,)
            else:
                # fail before writing anything if the comparison did not
                # return exactly one set of match values per result field
                match_sets = tuple(outcome)
                if len(match_sets) != len(result_fields):
                    raise ValueError(
                        'expected {expected} sets of match values, got {actual}'
                        .format(expected=len(result_fields),
                                actual=len(match_sets)))

            # write the matches for this hpo and field to the site's table
            for result_field, match_values in zip(result_fields, match_sets):
                writers.append_to_result_table(
                    site,
                    match_values,
                    project,
                    validation_dataset,
                    result_field
                )

    # generate single clean record for each participant at each site
    for site in hpo_sites:
        writers.merge_fields_into_single_record(project, validation_dataset, site)
        writers.remove_sparse_records(project, validation_dataset, site)
        writers.change_nulls_to_missing_value(project, validation_dataset, site)

    # generate hpo site reports; the report path is the same for every site,
    # only the destination bucket differs
    site_report_filename = os.path.join(
        consts.REPORT_DIRECTORY.format(date=date_string),
        consts.REPORT_TITLE
    )
    for site in hpo_sites:
        bucket = gcs_utils.get_hpo_bucket(site)
        writers.create_site_validation_report(
            project, validation_dataset, [site], bucket, site_report_filename
        )

    # generate aggregate site report
    bucket = gcs_utils.get_drc_bucket()
    filename = os.path.join(validation_dataset, consts.REPORT_TITLE)
    writers.create_site_validation_report(
        project, validation_dataset, hpo_sites, bucket, filename
    )

    return results