Example #1
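The snippet below shows the cron entry point for participant-data retraction: it resolves the retraction configuration through the bq_utils getters, retracts matching rows from the BigQuery datasets (the output project first, when configured, then the default project), and finally retracts matching rows from the internal GCS bucket folders. The module-level imports shown are assumptions about the project layout. Two integration tests for the GCS retraction follow the function.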
# NOTE: import paths below are assumed; adjust them to the surrounding project layout.
import logging

import bq_utils
from retraction import retract_data_bq, retract_data_gcs


def run_retraction_cron():
    project_id = bq_utils.app_identity.get_application_id()
    output_project_id = bq_utils.get_output_project_id()
    hpo_id = bq_utils.get_retraction_hpo_id()
    retraction_type = bq_utils.get_retraction_type()
    pid_table_id = bq_utils.get_retraction_pid_table_id()
    sandbox_dataset_id = bq_utils.get_retraction_sandbox_dataset_id()

    # retract from bq
    dataset_ids = bq_utils.get_retraction_dataset_ids()
    logging.info(f"Dataset id/s to target from env variable: {dataset_ids}")
    logging.info(f"Running retraction on BQ datasets")
    if output_project_id:
        # retract from output dataset
        retract_data_bq.run_bq_retraction(output_project_id, sandbox_dataset_id,
                                          project_id, pid_table_id, hpo_id,
                                          dataset_ids, retraction_type)
    # retract from default dataset
    retract_data_bq.run_bq_retraction(project_id, sandbox_dataset_id,
                                      project_id, pid_table_id, hpo_id,
                                      dataset_ids, retraction_type)
    logging.info(f"Completed retraction on BQ datasets")

    # retract from gcs
    folder = bq_utils.get_retraction_submission_folder()
    logging.info(f"Submission folder/s to target from env variable: {folder}")
    logging.info(f"Running retraction from internal bucket folders")
    retract_data_gcs.run_gcs_retraction(project_id,
                                        sandbox_dataset_id,
                                        pid_table_id,
                                        hpo_id,
                                        folder,
                                        force_flag=True)
    logging.info(f"Completed retraction from internal bucket folders")
    return 'retraction-complete'
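
# NOTE (assumption): the tests below are methods of a unittest.TestCase
# integration-test class; mock_hpo_bucket, mock_bucket and mock_extract_pids
# are injected by mock.patch decorators stacked on each test method (patch
# targets omitted here), which mock supplies in bottom-up order.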
    def test_integration_five_person_data_retraction(self, mock_hpo_bucket,
                                                     mock_bucket,
                                                     mock_extract_pids):
        mock_hpo_bucket.return_value = self.site_bucket
        mock_bucket.return_value = self.bucket
        mock_extract_pids.return_value = self.pids
        expected_lines_post = {}
        for file_path in test_util.FIVE_PERSONS_FILES:
            # build the expected post-retraction contents for each test file
            file_name = file_path.split('/')[-1]
            table_name = file_name.split('.')[0]
            expected_lines_post[file_name] = []
            with open(file_path, 'rb') as f:
                # skip header
                next(f)
                for line in f:
                    line = line.strip()
                    if line != b'':
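                        # keep only rows whose pid (in column 1 or column 2,
                        # depending on the table) is not among the retracted pids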
                        if not ((table_name in rd.PID_IN_COL1 and
                                 int(line.split(b",")[0]) in self.pids) or
                                (table_name in rd.PID_IN_COL2 and
                                 int(line.split(b",")[1]) in self.pids)):
                            expected_lines_post[file_name].append(line)

                # upload the full original file to both folder prefixes for the
                # retraction run (rewind=True seeks back to the start of f)
                blob = self.gcs_bucket.blob(self.folder_prefix_1 + file_name)
                blob.upload_from_file(f, rewind=True, content_type='text/csv')
                blob = self.gcs_bucket.blob(self.folder_prefix_2 + file_name)
                blob.upload_from_file(f, rewind=True, content_type='text/csv')

        rd.run_gcs_retraction(self.project_id,
                              self.sandbox_dataset_id,
                              self.pid_table_id,
                              self.hpo_id,
                              folder='all_folders',
                              force_flag=True,
                              bucket=self.bucket,
                              site_bucket=self.site_bucket)

        total_lines_post = {}
        for file_path in test_util.FIVE_PERSONS_FILES:
            file_name = file_path.split('/')[-1]
            blob = self.gcs_bucket.blob(self.folder_prefix_1 + file_name)
            actual_result_contents = blob.download_as_string().split(b'\n')
            # drop the header row and the trailing empty element from the final newline
            total_lines_post[file_name] = actual_result_contents[1:-1]

        for key in expected_lines_post:
            self.assertListEqual(expected_lines_post[key],
                                 total_lines_post[key])
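
    # Second scenario (same mocked parameters as above): the extracted pids come
    # from self.skip_pids, the files are uploaded via the test_util helpers, and
    # per-file line counts before and after retraction are compared.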
    def test_integration_five_person_data_retraction_skip(
            self, mock_hpo_bucket, mock_bucket, mock_extract_pids):
        mock_hpo_bucket.return_value = self.site_bucket
        mock_bucket.return_value = self.bucket
        mock_extract_pids.return_value = self.skip_pids
        self.folder_prefix_1 = self.hpo_id + '/' + self.site_bucket + '/' + self.folder_1
        self.folder_prefix_2 = self.hpo_id + '/' + self.site_bucket + '/' + self.folder_2
        lines_to_remove = {}
        total_lines_prior = {}
        for file_path in test_util.FIVE_PERSONS_FILES:
            # count the lines that retraction is expected to remove from each file
            file_name = file_path.split('/')[-1]
            table_name = file_name.split('.')[0]
            lines_to_remove[file_name] = 0
            total_lines_prior[file_name] = 0
            with open(file_path) as f:
                # skip header
                next(f)
                for line in f:
                    line = line.strip()
                    if line != '':
                        if ((table_name in rd.PID_IN_COL1 and
                             int(line.split(",")[0]) in self.skip_pids) or
                                (table_name in rd.PID_IN_COL2 and
                                 int(line.split(",")[1]) in self.skip_pids)):
                            lines_to_remove[file_name] += 1
                        total_lines_prior[file_name] += 1

            # write file to cloud for testing
            test_util.write_cloud_file(self.bucket,
                                       file_path,
                                       prefix=self.folder_prefix_1)
            test_util.write_cloud_file(self.bucket,
                                       file_path,
                                       prefix=self.folder_prefix_2)

        retract_result = rd.run_gcs_retraction(self.project_id,
                                               self.sandbox_dataset_id,
                                               self.pid_table_id,
                                               self.hpo_id,
                                               folder='all_folders',
                                               force_flag=True)

        total_lines_post = {}
        for file_path in test_util.FIVE_PERSONS_FILES:
            file_name = file_path.split('/')[-1]
            actual_result_contents = test_util.read_cloud_file(
                self.bucket, self.folder_prefix_1 + file_name)
            # drop the header row and the trailing empty element from the final newline
            total_lines_post[file_name] = len(
                actual_result_contents.split('\n')[1:-1])

        for key in total_lines_prior:
            if key in lines_to_remove:
                self.assertEqual(
                    lines_to_remove[key],
                    total_lines_prior[key] - total_lines_post[key])
            else:
                self.assertEqual(total_lines_prior[key], total_lines_post[key])

        # run_gcs_retraction returns metadata only for files it actually updated,
        # so the result for folder 1 should have one entry per file with lines removed
        lines_to_remove = {
            key: lines_to_remove[key]
            for key in lines_to_remove if lines_to_remove[key] > 0
        }
        self.assertEqual(len(retract_result[self.folder_prefix_1]),
                         len(lines_to_remove))
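
Run outside the cron scheduler, the entry point can also be invoked directly. A minimal sketch, assuming the function above lives in a module importable as retraction_cron (hypothetical name) and that the retraction settings read by the bq_utils getters are already configured:

# Hypothetical driver; 'retraction_cron' is an assumed module name for the
# file containing run_retraction_cron above.
from retraction_cron import run_retraction_cron

if __name__ == '__main__':
    # hpo_id, pid table, dataset ids and submission folders are all resolved
    # inside run_retraction_cron through the bq_utils getters, so the relevant
    # environment configuration must already be in place.
    status = run_retraction_cron()
    print(status)  # expected: 'retraction-complete'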