Code example #1
File: main.py  Project: all-of-us/curation
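The snippet is shown without its module preamble; a minimal set of imports it appears to rely on is sketched below (the path of the retraction package is an assumption inferred from the identifiers used, not something the excerpt confirms).

import logging

import bq_utils
# Assumed location of the retraction helpers called below.
from retraction import retract_data_bq, retract_data_gcs
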
def run_retraction_cron():
    project_id = bq_utils.app_identity.get_application_id()
    output_project_id = bq_utils.get_output_project_id()
    hpo_id = bq_utils.get_retraction_hpo_id()
    retraction_type = bq_utils.get_retraction_type()
    pid_table_id = bq_utils.get_retraction_pid_table_id()
    sandbox_dataset_id = bq_utils.get_retraction_sandbox_dataset_id()

    # retract from bq
    dataset_ids = bq_utils.get_retraction_dataset_ids()
    logging.info(f"Dataset id/s to target from env variable: {dataset_ids}")
    logging.info(f"Running retraction on BQ datasets")
    if output_project_id:
        # retract from output dataset
        retract_data_bq.run_bq_retraction(output_project_id, sandbox_dataset_id,
                                          project_id, pid_table_id, hpo_id,
                                          dataset_ids, retraction_type)
    # retract from default dataset
    retract_data_bq.run_bq_retraction(project_id, sandbox_dataset_id,
                                      project_id, pid_table_id, hpo_id,
                                      dataset_ids, retraction_type)
    logging.info(f"Completed retraction on BQ datasets")

    # retract from gcs
    folder = bq_utils.get_retraction_submission_folder()
    logging.info(f"Submission folder/s to target from env variable: {folder}")
    logging.info(f"Running retraction from internal bucket folders")
    retract_data_gcs.run_gcs_retraction(project_id,
                                        sandbox_dataset_id,
                                        pid_table_id,
                                        hpo_id,
                                        folder,
                                        force_flag=True)
    logging.info(f"Completed retraction from internal bucket folders")
    return 'retraction-complete'
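
The function reads all of its configuration from environment variables (via the bq_utils getters) and returns a plain status string, which suggests it is invoked as a cron-style HTTP handler rather than called directly. A minimal sketch of that wiring, assuming a Flask app object and a hypothetical route path (neither is taken from the excerpt):

from flask import Flask

app = Flask(__name__)


@app.route('/admin/v1/retract', methods=['GET'])
def retraction_endpoint():
    # The cron scheduler hits this URL; the handler simply delegates to
    # run_retraction_cron() defined above and returns its status string.
    return run_retraction_cron()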
Code example #2
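As with the first snippet, the test body is shown without its module preamble or class definition; the imports it appears to rely on are roughly the following (the paths of the project-internal modules are assumptions):

import logging
import os

import bq_utils
import gcs_utils
# Assumed locations of the test fixtures and retraction module.
from tests import test_util
from retraction import retract_data_bq
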
    def test_integration_queries_to_retract_from_fake_dataset(
            self, mock_list_datasets, mock_is_ehr_dataset,
            mock_is_unioned_dataset, mock_is_combined_dataset,
            mock_is_deid_dataset):
        mock_list_datasets.return_value = [self.bq_dataset_id]
        mock_is_deid_dataset.return_value = False
        mock_is_combined_dataset.return_value = False
        mock_is_unioned_dataset.return_value = False
        mock_is_ehr_dataset.return_value = True

        # create and load person_ids to pid table
        bq_utils.create_table(self.pid_table_id,
                              retract_data_bq.PID_TABLE_FIELDS,
                              drop_existing=True,
                              dataset_id=self.bq_dataset_id)
        bq_formatted_insert_values = ', '.join([
            '(%s, %s)' % (person_id, research_id)
            for (person_id, research_id) in self.person_research_ids
        ])
        q = INSERT_PID_TABLE.format(
            dataset_id=self.bq_dataset_id,
            pid_table_id=self.pid_table_id,
            person_research_ids=bq_formatted_insert_values)
        bq_utils.query(q)

        job_ids = []
        row_count_queries = {}
        # load the cdm files into dataset
        for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
            cdm_file_name = os.path.basename(cdm_file)
            cdm_table = cdm_file_name.split('.')[0]
            hpo_table = bq_utils.get_table_id(self.hpo_id, cdm_table)
            # store query for checking number of rows to delete
            row_count_queries[hpo_table] = EXPECTED_ROWS_QUERY.format(
                dataset_id=self.bq_dataset_id,
                table_id=hpo_table,
                pid_table_id=self.pid_table_id)
            logging.info('Preparing to load table %s.%s' %
                         (self.bq_dataset_id, hpo_table))
            with open(cdm_file, 'rb') as f:
                gcs_utils.upload_object(gcs_utils.get_hpo_bucket(self.hpo_id),
                                        cdm_file_name, f)
            result = bq_utils.load_cdm_csv(self.hpo_id,
                                           cdm_table,
                                           dataset_id=self.bq_dataset_id)
            logging.info('Loading table %s.%s' %
                         (self.bq_dataset_id, hpo_table))
            job_id = result['jobReference']['jobId']
            job_ids.append(job_id)
        incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
        self.assertEqual(len(incomplete_jobs), 0,
                         'NYC five person load job did not complete')
        logging.info('All tables loaded successfully')

        # use query results to count number of expected row deletions
        expected_row_count = {}
        for table in row_count_queries:
            result = bq_utils.query(row_count_queries[table])
            expected_row_count[table] = int(result['totalRows'])

        # separate check to find number of actual deleted rows
        q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
        q_result = bq_utils.query(q)
        result = bq_utils.response2rows(q_result)
        row_count_before_retraction = {}
        for row in result:
            row_count_before_retraction[row['table_id']] = row['row_count']

        # perform retraction
        retract_data_bq.run_bq_retraction(self.test_project_id,
                                          self.bq_dataset_id,
                                          self.test_project_id,
                                          self.pid_table_id, self.hpo_id,
                                          self.dataset_ids,
                                          self.retraction_type)

        # find actual deleted rows
        q_result = bq_utils.query(q)
        result = bq_utils.response2rows(q_result)
        row_count_after_retraction = {}
        for row in result:
            row_count_after_retraction[row['table_id']] = row['row_count']
        for table in expected_row_count:
            self.assertEqual(
                expected_row_count[table], row_count_before_retraction[table] -
                row_count_after_retraction[table])
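
The five mock arguments in the test signature above imply a stack of mock.patch decorators on the method. Patch decorators are applied bottom-up, so the decorator closest to the def supplies the first mock argument after self. A sketch of what that stack could look like, with hypothetical patch targets (the real target paths are not part of the excerpt):

from unittest import mock

# Inside the test class; the target strings below are placeholders that
# only illustrate the ordering implied by the argument list.
@mock.patch('retraction.retract_data_bq.is_deid_dataset')
@mock.patch('retraction.retract_data_bq.is_combined_dataset')
@mock.patch('retraction.retract_data_bq.is_unioned_dataset')
@mock.patch('retraction.retract_data_bq.is_ehr_dataset')
@mock.patch('retraction.retract_data_bq.list_datasets')
def test_integration_queries_to_retract_from_fake_dataset(
        self, mock_list_datasets, mock_is_ehr_dataset,
        mock_is_unioned_dataset, mock_is_combined_dataset,
        mock_is_deid_dataset):
    ...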
Code example #3
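This newer version of the same test talks to BigQuery through the google-cloud-bigquery client and pandas; the imports it appears to rely on are roughly these (the aliases for the project-internal modules, in particular rbq, are assumptions):

import logging
import os

import pandas as pd
from google.cloud import bigquery

# Assumed project-internal imports; `rbq` is presumably an alias for the
# retraction module whose run_bq_retraction is called below.
from utils import bq
from retraction import retract_data_bq as rbq
from tests import test_util
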
    def test_integration_queries_to_retract_from_fake_dataset(
            self, mock_list_datasets, mock_is_ehr_dataset,
            mock_is_unioned_dataset, mock_is_combined_dataset,
            mock_is_deid_dataset):
        mock_list_datasets.return_value = [self.bq_dataset_id]
        mock_is_deid_dataset.return_value = False
        mock_is_combined_dataset.return_value = False
        mock_is_unioned_dataset.return_value = False
        mock_is_ehr_dataset.return_value = True

        # create and load person_ids to pid table
        bq.create_tables(
            self.client,
            self.test_project_id, [
                f'{self.test_project_id}.{self.bq_dataset_id}.{self.pid_table_id}'
            ],
            exists_ok=False,
            fields=[rbq.PID_TABLE_FIELDS])
        bq_formatted_insert_values = ', '.join([
            f'({person_id}, {research_id})'
            for (person_id, research_id) in self.person_research_ids
        ])
        q = INSERT_PID_TABLE.format(
            dataset_id=self.bq_dataset_id,
            pid_table_id=self.pid_table_id,
            person_research_ids=bq_formatted_insert_values)
        job = self.client.query(q)
        job.result()

        row_count_queries = {}
        # load the cdm files into dataset
        for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
            cdm_file_name = os.path.basename(cdm_file)
            cdm_table = cdm_file_name.split('.')[0]
            hpo_table = f'{self.hpo_id}_{cdm_table}'
            # store query for checking number of rows to delete
            row_count_queries[hpo_table] = EXPECTED_ROWS_QUERY.format(
                dataset_id=self.bq_dataset_id,
                table_id=hpo_table,
                pid_table_id=self.pid_table_id)
            logging.info(
                f'Preparing to load table {self.bq_dataset_id}.{hpo_table}')
            with open(cdm_file, 'rb') as f:
                job_config = bigquery.LoadJobConfig()
                job_config.source_format = bigquery.SourceFormat.CSV
                job_config.skip_leading_rows = 1
                job_config.write_disposition = 'WRITE_EMPTY'
                job_config.schema = bq.get_table_schema(cdm_table)
                load_job = self.client.load_table_from_file(
                    f,
                    f'{self.test_project_id}.{self.bq_dataset_id}.{hpo_table}',
                    job_config=job_config)
                load_job.result()
        logging.info('All tables loaded successfully')

        # use query results to count number of expected row deletions
        expected_row_count = {}
        for table in row_count_queries:
            job = self.client.query(row_count_queries[table])
            result = job.result()
            expected_row_count[table] = result.to_dataframe()['count'].to_list()[0]

        # separate check to find number of actual deleted rows
        q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
        job = self.client.query(q)
        result = job.result().to_dataframe()
        row_counts_before_retraction = pd.Series(
            result.row_count.values, index=result.table_id).to_dict()

        # perform retraction
        rbq.run_bq_retraction(self.test_project_id, self.bq_dataset_id,
                              self.test_project_id, self.pid_table_id,
                              self.hpo_id, self.dataset_ids,
                              self.retraction_type)

        # find actual deleted rows
        job = self.client.query(q)
        result = job.result().to_dataframe()
        row_counts_after_retraction = pd.Series(
            result.row_count.values, index=result.table_id).to_dict()

        for table in expected_row_count:
            self.assertEqual(
                expected_row_count[table],
                row_counts_before_retraction[table] -
                row_counts_after_retraction[table])
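
Both versions of the test format module-level query templates (INSERT_PID_TABLE, TABLE_ROWS_QUERY, EXPECTED_ROWS_QUERY) that the excerpts do not include. Based only on how they are formatted and how their results are read above, plausible reconstructions of the first two might look like this (assumptions, not the project's actual definitions); EXPECTED_ROWS_QUERY is consumed differently by the two versions (totalRows in example #2, a count column in example #3), so it is not reconstructed here.

# Hypothetical reconstructions, inferred from the .format() keywords and
# the columns the tests read back.
INSERT_PID_TABLE = '''
INSERT INTO `{dataset_id}.{pid_table_id}` (person_id, research_id)
VALUES {person_research_ids}
'''

# Per-table row counts for the dataset, read as table_id / row_count.
TABLE_ROWS_QUERY = '''
SELECT table_id, row_count
FROM `{dataset_id}.__TABLES__`
'''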