Example No. 1
    def tearDown(self):
        self._empty_bucket()
        to_delete_list = gcs_utils.list_bucket(gcs_utils.get_drc_bucket())
        for bucket_item in to_delete_list:
            gcs_utils.delete_object(gcs_utils.get_drc_bucket(),
                                    bucket_item['name'])
        self.testbed.deactivate()
Example No. 2
def get_most_recent(app_id=None, drc_bucket=None, report_for=None):
    """
    Query audit logs for paths to the most recent datasources.json files in the DRC bucket.

    Note: Results are cached in a local json file to avoid unnecessary queries.
    :param app_id: identifies the GCP project
    :param drc_bucket: identifies the DRC bucket
    :param report_for: denotes which query to use, between achilles and results
    :return: list of dict with keys `file_path`, `upload_timestamp`
    """
    if app_id is None:
        app_id = app_identity.get_application_id()
    if drc_bucket is None:
        drc_bucket = gcs_utils.get_drc_bucket()
    if report_for == common.REPORT_FOR_ACHILLES:
        if not os.path.exists(common.LATEST_REPORTS_JSON):
            query = LATEST_REPORTS_QUERY.format(app_id=app_id, drc_bucket=drc_bucket, year=common.LOG_YEAR)
            query_job = bq_utils.query(query)
            result = bq_utils.response2rows(query_job)
            with open(common.LATEST_REPORTS_JSON, 'w') as fp:
                json.dump(result, fp, sort_keys=True, indent=4)
        with open(common.LATEST_REPORTS_JSON, 'r') as fp:
            return json.load(fp)
    elif report_for == common.REPORT_FOR_RESULTS:
        if not os.path.exists(common.LATEST_RESULTS_JSON):
            query = LATEST_RESULTS_QUERY.format(app_id=app_id, drc_bucket=drc_bucket, year=common.LOG_YEAR)
            query_job = bq_utils.query(query)
            result = bq_utils.response2rows(query_job)
            with open(common.LATEST_RESULTS_JSON, 'w') as fp:
                json.dump(result, fp, sort_keys=True, indent=4)
        with open(common.LATEST_RESULTS_JSON, 'r') as fp:
            return json.load(fp)
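The caching shown above is a simple file-backed memoization: the audit-log query only runs when the local JSON file is missing, and the file is then read back and returned on every call. A minimal, generic sketch of that pattern, with hypothetical names (cached_query and run_query are not part of the curation codebase):

import json
import os


def cached_query(cache_path, run_query):
    """Run `run_query` only if `cache_path` does not exist, then return the cached rows."""
    if not os.path.exists(cache_path):
        result = run_query()
        with open(cache_path, 'w') as fp:
            json.dump(result, fp, sort_keys=True, indent=4)
    with open(cache_path, 'r') as fp:
        return json.load(fp)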
Example No. 3
def run_retraction_cron():
    project_id = bq_utils.app_identity.get_application_id()
    hpo_id = bq_utils.get_retraction_hpo_id()
    person_ids_file = bq_utils.get_retraction_person_ids_file_name()
    research_ids_file = bq_utils.get_retraction_research_ids_file_name()
    person_ids = retract_data_bq.extract_pids_from_file(person_ids_file)
    research_ids = retract_data_bq.extract_pids_from_file(research_ids_file)
    logging.info('Running retraction on research_ids')
    retract_data_bq.run_retraction(project_id,
                                   research_ids,
                                   hpo_id,
                                   deid_flag=True)
    logging.info('Completed retraction on research_ids')
    logging.info('Running retraction on person_ids')
    retract_data_bq.run_retraction(project_id,
                                   person_ids,
                                   hpo_id,
                                   deid_flag=False)
    logging.info('Completed retraction on person_ids')
    bucket = gcs_utils.get_drc_bucket()
    hpo_bucket = gcs_utils.get_hpo_bucket(hpo_id)
    logging.info('Running retraction from bucket folders')
    retract_data_gcs.run_retraction(person_ids,
                                    bucket,
                                    hpo_id,
                                    hpo_bucket,
                                    folder=None,
                                    force_flag=True)
    logging.info('Completed retraction from bucket folders')
    return 'retraction-complete'
Example No. 4
    def test_copy_five_persons(self, mock_check_cron):
        # upload all five_persons files
        for cdm_pathfile in test_util.FIVE_PERSONS_FILES:
            test_filename: str = os.path.basename(cdm_pathfile)

            blob_name: str = f'{self.folder_prefix}{test_filename}'
            test_blob = self.storage_bucket.blob(blob_name)
            test_blob.upload_from_filename(cdm_pathfile)

            blob_name: str = f'{self.folder_prefix}{self.folder_prefix}{test_filename}'
            test_blob = self.storage_bucket.blob(blob_name)
            test_blob.upload_from_filename(cdm_pathfile)

        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.COPY_HPO_FILES_URL)
            prefix = test_util.FAKE_HPO_ID + '/' + self.hpo_bucket + '/' + self.folder_prefix
            expected_bucket_items = [
                prefix + item.split(os.sep)[-1]
                for item in test_util.FIVE_PERSONS_FILES
            ]
            expected_bucket_items.extend([
                prefix + self.folder_prefix + item.split(os.sep)[-1]
                for item in test_util.FIVE_PERSONS_FILES
            ])

            list_bucket_result = gcs_utils.list_bucket(
                gcs_utils.get_drc_bucket())
            actual_bucket_items = [item['name'] for item in list_bucket_result]
            self.assertSetEqual(set(expected_bucket_items),
                                set(actual_bucket_items))
Example No. 5
    def setUp(self):
        self.app_id = app_identity.get_application_id()
        self.dataset_id = bq_utils.get_dataset_id()
        self.bucket = gcs_utils.get_drc_bucket()
        test_util.empty_bucket(self.bucket)
        test_util.delete_all_tables(self.dataset_id)
        self.load_test_data(hpo_id=HPO_NYC)
Example No. 6
def copy_files(hpo_id):
    """copies over files from hpo bucket to drc bucket

    :hpo_id: hpo from which to copy
    :return: json string indicating the job has finished
    """
    hpo_bucket = gcs_utils.get_hpo_bucket(hpo_id)
    drc_private_bucket = gcs_utils.get_drc_bucket()

    bucket_items = list_bucket(hpo_bucket)

    ignored_items = 0
    filtered_bucket_items = []
    for item in bucket_items:
        item_root = item['name'].split('/')[0] + '/'
        if item_root.lower() in common.IGNORE_DIRECTORIES:
            ignored_items += 1
        else:
            filtered_bucket_items.append(item)

    logging.info("Ignoring %d items in %s", ignored_items, hpo_bucket)

    prefix = hpo_id + '/' + hpo_bucket + '/'

    for item in filtered_bucket_items:
        item_name = item['name']
        gcs_utils.copy_object(source_bucket=hpo_bucket,
                              source_object_id=item_name,
                              destination_bucket=drc_private_bucket,
                              destination_object_id=prefix + item_name)

    return '{"copy-status": "done"}'
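The destination naming produced by the copy loop above places every source object under a per-HPO, per-bucket prefix inside the DRC bucket. A rough illustration with invented identifiers (the hpo_id, bucket name, and object path below are placeholders):

hpo_id = 'fake_hpo'                  # placeholder site identifier
hpo_bucket = 'fake-hpo-bucket'       # placeholder source bucket
item_name = '2019-01-01/person.csv'  # placeholder object in the HPO bucket

destination_object_id = hpo_id + '/' + hpo_bucket + '/' + item_name
print(destination_object_id)  # fake_hpo/fake-hpo-bucket/2019-01-01/person.csv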
Example No. 7
def write_results_to_drc_bucket(project, validation_dataset=None):
    """
    Write the results of participant matching to the drc bucket.

    :param project: a string representing the project name
    :param validation_dataset:  the identifier for the match values
        destination dataset

    :return: None
    :raises:  RuntimeError if validation_dataset is not defined.
    """
    LOGGER.info('Writing to the DRC bucket')
    if validation_dataset is None:
        LOGGER.error('Validation_dataset name is not defined.')
        raise RuntimeError('validation_dataset name cannot be None.')

    date_string = _get_date_string(validation_dataset)
    hpo_sites = readers.get_hpo_site_names()
    # generate aggregate site report
    bucket = gcs_utils.get_drc_bucket()
    filename = os.path.join(validation_dataset,
                            consts.REPORT_DIRECTORY.format(date=date_string),
                            consts.REPORT_TITLE)
    _, errors = writers.create_site_validation_report(project,
                                                      validation_dataset,
                                                      hpo_sites, bucket,
                                                      filename)

    if errors > 0:
        LOGGER.error(
            f"Encountered {errors} read errors when writing drc report")
Example No. 8
    def test_copy_five_persons(self, mock_check_cron):
        # upload all five_persons files
        for cdm_file in test_util.FIVE_PERSONS_FILES:
            test_util.write_cloud_file(self.hpo_bucket,
                                       cdm_file,
                                       prefix=self.folder_prefix)
            test_util.write_cloud_file(self.hpo_bucket,
                                       cdm_file,
                                       prefix=self.folder_prefix +
                                       self.folder_prefix)

        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.COPY_HPO_FILES_URL)
            prefix = test_util.FAKE_HPO_ID + '/' + self.hpo_bucket + '/' + self.folder_prefix
            expected_bucket_items = [
                prefix + item.split(os.sep)[-1]
                for item in test_util.FIVE_PERSONS_FILES
            ]
            expected_bucket_items.extend([
                prefix + self.folder_prefix + item.split(os.sep)[-1]
                for item in test_util.FIVE_PERSONS_FILES
            ])

            list_bucket_result = gcs_utils.list_bucket(
                gcs_utils.get_drc_bucket())
            actual_bucket_items = [item['name'] for item in list_bucket_result]
            self.assertSetEqual(set(expected_bucket_items),
                                set(actual_bucket_items))
Example No. 9
    def tearDown(self):
        self._empty_bucket()
        bucket_nyc = gcs_utils.get_hpo_bucket('nyc')
        test_util.empty_bucket(bucket_nyc)
        test_util.empty_bucket(gcs_utils.get_drc_bucket())
        test_util.delete_all_tables(self.bigquery_dataset_id)
        self.testbed.deactivate()
Example No. 10
    def setUp(self):
        super(CombineEhrRdrTest, self).setUp()
        self.APP_ID = bq_utils.app_identity.get_application_id()
        self.ehr_dataset_id = bq_utils.get_dataset_id()
        self.rdr_dataset_id = bq_utils.get_rdr_dataset_id()
        self.combined_dataset_id = bq_utils.get_ehr_rdr_dataset_id()
        self.drc_bucket = gcs_utils.get_drc_bucket()
        test_util.delete_all_tables(self.combined_dataset_id)
Example No. 11
    def setUp(self):
        self.project_id = app_identity.get_application_id()
        self.dataset_id = bq_utils.get_dataset_id()
        self.bucket: str = gcs_utils.get_drc_bucket()
        self.storage_client = StorageClient(self.project_id)

        self.storage_client.empty_bucket(self.bucket)
        test_util.delete_all_tables(self.dataset_id)
        self.load_test_data(hpo_id=HPO_NYC)
Example No. 12
    def _test_site_generation(self, mock_check_cron):
        self._empty_drc_bucket()
        self._empty_hpo_buckets()
        with main.app.test_request_context():
            result = main._generate_site()
            self.assertEqual(result, 'okay')

            # verify that page worked
            bucket = gcs_utils.get_drc_bucket()
            expected_files = [page_name + '.html'
                              for page_name in main.PAGE_NAMES] + [common.LOG_JSON]
            file_count = 0
            for stat in gcs_utils.list_bucket(bucket):
                filename = stat['name']
                self.assertIn(filename, expected_files)
                file_count += 1
            self.assertEqual(file_count, len(expected_files))
Example No. 13
def copy_files(hpo_id):
    """copies over files from hpo bucket to drc bucket

    :hpo_id: hpo from which to copy

    """
    hpo_bucket = gcs_utils.get_hpo_bucket(hpo_id)
    drc_private_bucket = gcs_utils.get_drc_bucket()

    bucket_items = gcs_utils.list_bucket(hpo_bucket)

    prefix = hpo_id + '/' + hpo_bucket + '/'

    for item in bucket_items:
        item_name = item['name']
        gcs_utils.copy_object(source_bucket=hpo_bucket,
                              source_object_id=item_name,
                              destination_bucket=drc_private_bucket,
                              destination_object_id=prefix + item_name)

    return '{"copy-status": "done"}'
Example No. 14
def _generate_site():
    """
    Construct html pages for each report, data_model, file_transfer and index.

    :param : 
    :return: returns 'okay' if succesful. logs critical error otherwise.
    """
    bucket = gcs_utils.get_drc_bucket()

    for page_name in PAGE_NAMES:
        # generate the page
        html = to_html(page_name)

        # write it to the drc shared bucket
        file_name = page_name + '.html'
        fp = StringIO.StringIO(html)
        gcs_utils.upload_object(bucket, file_name, fp)

    # aggregate result logs and write to bucket
    full_result_log = get_full_result_log()
    content = json.dumps(full_result_log)
    fp = StringIO.StringIO(content)
    gcs_utils.upload_object(bucket, LOG_JSON, fp)
    return 'okay'
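Note that StringIO.StringIO is the Python 2 module; under Python 3 the same in-memory file object comes from io.StringIO. A hedged sketch of the upload step, assuming gcs_utils.upload_object(bucket, object_name, fp) keeps the signature used above:

import json
from io import StringIO

import gcs_utils  # repo module, assumed importable


def upload_json(bucket, object_name, payload):
    """Serialize `payload` to JSON and upload it to the bucket from memory."""
    fp = StringIO(json.dumps(payload))
    return gcs_utils.upload_object(bucket, object_name, fp)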
Example No. 15
    def tearDown(self):
        self.storage_client.empty_bucket(self.hpo_bucket)
        bucket_nyc = gcs_utils.get_hpo_bucket('nyc')
        self.storage_client.empty_bucket(bucket_nyc)
        self.storage_client.empty_bucket(gcs_utils.get_drc_bucket())
        test_util.delete_all_tables(self.bigquery_dataset_id)
Example No. 16
    def tearDown(self):
        self._empty_bucket()
        bucket_nyc = gcs_utils.get_hpo_bucket('nyc')
        test_util.empty_bucket(bucket_nyc)
        test_util.empty_bucket(gcs_utils.get_drc_bucket())
        self.testbed.deactivate()
Example No. 17
    def setUp(self):
        self.ehr_dataset_id = bq_utils.get_dataset_id()
        self.rdr_dataset_id = bq_utils.get_rdr_dataset_id()
        self.combined_dataset_id = bq_utils.get_combined_dataset_id()
        self.drc_bucket = gcs_utils.get_drc_bucket()
        test_util.delete_all_tables(self.combined_dataset_id)
Example No. 18
    def _empty_drc_bucket(self):
        bucket = gcs_utils.get_drc_bucket()
        self._empty_bucket(bucket)
Example No. 19
def write_to_result_table(project, dataset, site, match_values):
    """
    Append items in match_values to the table generated from site name.

    Attempts to limit the insert query to less than 1MB.

    :param site:  string identifier for the hpo site.
    :param match_values:  dictionary of person_ids and match values for a field
    :param project:  the BigQuery project name
    :param dataset:  name of the dataset containing the table to append to

    :return: query results value
    :raises:  oauth2client.client.HttpAccessTokenRefreshError,
              googleapiclient.errors.HttpError
    """
    if not match_values:
        LOGGER.info(f"No values to insert for site: {site}")
        return None

    result_table = site + consts.VALIDATION_TABLE_SUFFIX
    bucket = gcs_utils.get_drc_bucket()
    path = dataset + '/intermediate_results/' + site + '.csv'

    field_list = [consts.PERSON_ID_FIELD]
    field_list.extend(consts.VALIDATION_FIELDS)
    field_list.append(consts.ALGORITHM_FIELD)

    results = StringIO()
    field_list_str = ','.join(field_list) + '\n'
    results.write(field_list_str)

    LOGGER.info(f"Generating csv values to write to storage for site: {site}")

    for person_key, person_values in match_values.items():
        str_list = [str(person_key)]
        for field in consts.VALIDATION_FIELDS:
            value = str(person_values.get(field, consts.MISSING))
            str_list.append(value)

        str_list.append(consts.YES)
        val_str = ','.join(str_list)
        results.write(val_str + '\n')

    LOGGER.info(f"Writing csv file to cloud storage for site: {site}")

    # write results
    results.seek(0)
    gcs_utils.upload_object(bucket, path, results)
    results.close()

    LOGGER.info(
        f"Wrote {len(match_values)} items to cloud storage for site: {site}")

    # wait on results to be written

    schema_path = os.path.join(fields_path, 'identity_match.json')

    LOGGER.info(
        f"Beginning load of identity match values from csv into BigQuery "
        f"for site: {site}")
    try:
        # load csv file into bigquery
        results = bq_utils.load_csv(schema_path,
                                    'gs://' + bucket + '/' + path,
                                    project,
                                    dataset,
                                    result_table,
                                    write_disposition=consts.WRITE_TRUNCATE)

        # ensure the load job finishes
        query_job_id = results['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
        if incomplete_jobs != []:
            raise bq_utils.BigQueryJobWaitError(incomplete_jobs)

    except (oauth2client.client.HttpAccessTokenRefreshError,
            googleapiclient.errors.HttpError):
        LOGGER.exception(
            f"Encountered an exception when loading records from csv for site: {site}"
        )
        raise

    LOGGER.info(f"Loaded match values for site: {site}")

    return results
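The intermediate CSV assembled in memory above has one header row (person_id, the validation fields, then the algorithm column) followed by one row per participant. A toy illustration of that payload, using placeholder field names and values rather than the real consts:

from io import StringIO

field_list = ['person_id', 'first_name', 'last_name', 'algorithm']  # placeholder fields
match_values = {1: {'first_name': 'match', 'last_name': 'no_match'}}

results = StringIO()
results.write(','.join(field_list) + '\n')
for person_id, values in match_values.items():
    row = [str(person_id)] + [values.get(f, 'missing') for f in field_list[1:-1]] + ['yes']
    results.write(','.join(row) + '\n')

print(results.getvalue())
# person_id,first_name,last_name,algorithm
# 1,match,no_match,yes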
Example No. 20
import json
import os

import gcs_utils
from tools.consolidated_reports import query_reports
from io import open

DRC_BUCKET_PATH = 'gs://%s/' % gcs_utils.get_drc_bucket()
DATASOURCES_PATH = 'curation_report/data/datasources.json'


def get_hpo_id(p):
    rel_path = p[len(DRC_BUCKET_PATH):]
    return rel_path[:rel_path.index('/')]


def get_report_path(p, hpo_id):
    return p.replace('datasources.json', hpo_id)


def get_submission_name(p):
    parts = p.split('/')
    for i in range(0, len(parts)):
        part = parts[i]
        if part == 'curation_report':
            return parts[i - 1]
    raise RuntimeError('Invalid submission path: %s' % p)


def transform_bq_list(uploads):
    """
Example No. 21
def run_gcs_retraction(project_id, sandbox_dataset_id, pid_table_id, hpo_id,
                       folder, force_flag):
    """
    Retract from a folder/folders in a GCS bucket all records associated with a pid

    :param project_id: project containing the sandbox dataset
    :param sandbox_dataset_id: dataset containing the pid_table
    :param pid_table_id: table containing the person_ids whose data needs to be retracted
    :param hpo_id: hpo_id of the site to run retraction on
    :param folder: the site's submission folder; if set to 'all_folders', retract from all folders by the site
    :param force_flag: if False then prompt for each file
    :return: metadata for each object updated in order to retract as a list of lists
    """

    # extract the pids
    pids = extract_pids_from_table(project_id, sandbox_dataset_id,
                                   pid_table_id)

    bucket = gcs_utils.get_drc_bucket()
    logger.info('Retracting from bucket %s' % bucket)

    site_bucket = gcs_utils.get_hpo_bucket(hpo_id)
    full_bucket_path = bucket + '/' + hpo_id + '/' + site_bucket
    folder_prefixes = gcs_utils.list_bucket_prefixes(full_bucket_path)
    # retract from latest folders first
    folder_prefixes.sort(reverse=True)

    result_dict = {}
    if folder == 'all_folders':
        to_process_folder_list = folder_prefixes
    else:
        folder_path = (full_bucket_path + '/' + folder
                       if folder[-1] == '/'
                       else full_bucket_path + '/' + folder + '/')

        if folder_path in folder_prefixes:
            to_process_folder_list = [folder_path]
        else:
            logger.info('Folder %s does not exist in %s. Exiting' %
                        (folder, full_bucket_path))
            return result_dict

    logger.info("Retracting data from the following folders:")
    logger.info([
        bucket + '/' + folder_prefix
        for folder_prefix in to_process_folder_list
    ])

    for folder_prefix in to_process_folder_list:
        logger.info('Processing gs://%s/%s' % (bucket, folder_prefix))
        # separate cdm from the unknown (unexpected) files
        bucket_items = gcs_utils.list_bucket_dir(bucket + '/' +
                                                 folder_prefix[:-1])
        found_files = []
        folder_items = [
            item['name'].split('/')[-1] for item in bucket_items
            if item['name'].startswith(folder_prefix)
        ]
        for item in folder_items:
            # Only retract from CDM or PII files
            item = item.lower()
            if item in resources.CDM_FILES or item in common.PII_FILES:
                found_files.append(item)

        logger.info('Found the following files to retract data from:')
        logger.info([
            bucket + '/' + folder_prefix + file_name
            for file_name in found_files
        ])

        logger.info("Proceed?")
        if force_flag:
            logger.info(
                "Attempting to force retract for folder %s in bucket %s" %
                (folder_prefix, bucket))
            response = "Y"
        else:
            # Make sure user types Y to proceed
            response = get_response()
        if response == "Y":
            folder_upload_output = retract(pids, bucket, found_files,
                                           folder_prefix, force_flag)
            result_dict[folder_prefix] = folder_upload_output
            logger.info("Retraction completed for folder %s/%s " %
                        (bucket, folder_prefix))
        elif response.lower() == "n":
            logger.info("Skipping folder %s" % folder_prefix)
    logger.info("Retraction from GCS complete")
    return result_dict
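The folder selection above reduces to a small normalization step: the requested folder is forced to end in '/' and must match one of the listed prefixes exactly, otherwise the function exits without retracting anything. A toy version of that check (the bucket path and prefixes are placeholders):

full_bucket_path = 'drc-bucket/fake_hpo/fake-hpo-bucket'  # placeholder
folder_prefixes = [full_bucket_path + '/2019-02-01/',
                   full_bucket_path + '/2019-01-01/']     # as returned by list_bucket_prefixes


def normalize(folder):
    """Append a trailing slash so the folder can match a listed prefix."""
    suffix = folder if folder.endswith('/') else folder + '/'
    return full_bucket_path + '/' + suffix


assert normalize('2019-01-01') in folder_prefixes
assert normalize('2019-01-01/') in folder_prefixes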
Example No. 22
def match_participants(project, rdr_dataset, ehr_dataset, dest_dataset_id):
    """
    Entry point for performing participant matching of PPI, EHR, and PII data.

    :param project: a string representing the project name
    :param rdr_dataset:  the dataset created from the results given to us by
        the rdr team
    :param ehr_dataset:  the dataset containing the pii information for
        comparisons
    :param dest_dataset_id:  the desired identifier for the match values
        destination dataset

    :return: results of the field comparison for each hpo
    """
    date_string = _get_date_string(rdr_dataset)

    if not re.match(consts.DRC_DATE_REGEX, dest_dataset_id[-8:]):
        dest_dataset_id += date_string

    # create new dataset for the intermediate tables and results
    dataset_result = bq_utils.create_dataset(
        dataset_id=dest_dataset_id,
        description=consts.DESTINATION_DATASET_DESCRIPTION.format(
            version='',
            rdr_dataset=rdr_dataset,
            ehr_dataset=ehr_dataset
        ),
        overwrite_existing=True)

    validation_dataset = dataset_result.get(bq_consts.DATASET_REF, {})
    validation_dataset = validation_dataset.get(bq_consts.DATASET_ID, '')

    # create intermediate observation table in new dataset
    readers.create_match_values_table(project, rdr_dataset, dest_dataset_id)

    hpo_sites = readers.get_hpo_site_names()

    #TODO:  create a proper config file to store this path
    field_list = resources.fields_for('identity_match')

    for site_name in hpo_sites:
        bq_utils.create_table(
            site_name + consts.VALIDATION_TABLE_SUFFIX,
            field_list,
            drop_existing=True,
            dataset_id=validation_dataset
        )

    results = {}

    # validate first names
    for site in hpo_sites:
        match_values = _compare_name_fields(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_NAME_FIRST,
            consts.FIRST_NAME_FIELD
        )

        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.FIRST_NAME_FIELD
        )

    # validate last names
    for site in hpo_sites:
        match_values = _compare_name_fields(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_NAME_LAST,
            consts.LAST_NAME_FIELD
        )
        # write last name matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.LAST_NAME_FIELD
        )

    # validate middle names
    for site in hpo_sites:
        match_values = _compare_name_fields(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_NAME_MIDDLE,
            consts.MIDDLE_NAME_FIELD
        )
        # write middle name matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.MIDDLE_NAME_FIELD
        )

    # validate zip codes
    for site in hpo_sites:
        match_values = _compare_zip_codes(
            project,
            validation_dataset,
            rdr_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_STREET_ADDRESS_ZIP,
            consts.ZIP_CODE_FIELD
        )
        # write zip code matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.ZIP_CODE_FIELD
        )

    # validate city
    for site in hpo_sites:
        match_values = _compare_cities(
            project,
            validation_dataset,
            rdr_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_STREET_ADDRESS_CITY,
            consts.CITY_FIELD
        )
        # write city matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.CITY_FIELD
        )

    # validate state
    for site in hpo_sites:
        match_values = _compare_states(
            project,
            validation_dataset,
            rdr_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_STREET_ADDRESS_STATE,
            consts.STATE_FIELD
        )
        # write state matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.STATE_FIELD
        )

    # validate street addresses
    for site in hpo_sites:
        address_one_matches, address_two_matches = _compare_street_addresses(
            project,
            validation_dataset,
            rdr_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_STREET_ADDRESS_ONE,
            consts.OBS_PII_STREET_ADDRESS_TWO,
            consts.ADDRESS_ONE_FIELD,
            consts.ADDRESS_TWO_FIELD
        )
        # write street address matches for hpo to table
        writers.append_to_result_table(
            site,
            address_one_matches,
            project,
            validation_dataset,
            consts.ADDRESS_ONE_FIELD
        )
        writers.append_to_result_table(
            site,
            address_two_matches,
            project,
            validation_dataset,
            consts.ADDRESS_TWO_FIELD
        )

    # validate email addresses
    for site in hpo_sites:
        match_values = _compare_email_addresses(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_EMAIL_ADDRESS,
            consts.EMAIL_FIELD
        )
        # write email matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.EMAIL_FIELD
        )

    # validate phone numbers
    for site in hpo_sites:
        match_values = _compare_phone_numbers(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_PHONE,
            consts.PHONE_NUMBER_FIELD
        )
        # write phone number matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.PHONE_NUMBER_FIELD
        )

    # validate genders
    for site in hpo_sites:
        match_values = _compare_genders(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_SEX
        )
        # write gender match for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.SEX_FIELD
        )

    # validate birth dates
    for site in hpo_sites:
        match_values = _compare_birth_dates(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_BIRTH_DATETIME
        )
        # write birthday match for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.BIRTH_DATE_FIELD
        )

    # generate single clean record for each participant at each site
    for site in hpo_sites:
        writers.merge_fields_into_single_record(project, validation_dataset, site)
        writers.remove_sparse_records(project, validation_dataset, site)
        writers.change_nulls_to_missing_value(project, validation_dataset, site)

    # generate hpo site reports
    for site in hpo_sites:
        bucket = gcs_utils.get_hpo_bucket(site)
        filename = os.path.join(
            consts.REPORT_DIRECTORY.format(date=date_string),
            consts.REPORT_TITLE
        )
        writers.create_site_validation_report(
            project, validation_dataset, [site], bucket, filename
        )

    # generate aggregate site report
    bucket = gcs_utils.get_drc_bucket()
    filename = os.path.join(validation_dataset, consts.REPORT_TITLE)
    writers.create_site_validation_report(
        project, validation_dataset, hpo_sites, bucket, filename
    )

    return results
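Every per-field block above follows the same shape: run one comparison per site, then append the matches to that site's validation table. A condensed sketch of that shape, shown only to make the repetition explicit; the list of comparisons here is illustrative and incomplete, not the module's actual dispatch table:

# (comparison function, observation concept, result field) triples; illustrative only
name_comparisons = [
    (_compare_name_fields, consts.OBS_PII_NAME_FIRST, consts.FIRST_NAME_FIELD),
    (_compare_name_fields, consts.OBS_PII_NAME_LAST, consts.LAST_NAME_FIELD),
]

for compare, concept, field in name_comparisons:
    for site in hpo_sites:
        match_values = compare(project, validation_dataset, ehr_dataset, site,
                               concept, field)
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset, field)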