def main(raw_args=None): """ Truncate and store fitbit data. Assumes you are passing arguments either via command line or a list. """ parser = get_fitbit_parser() args, kwargs = clean_cdr.fetch_args_kwargs(parser, raw_args) pipeline_logging.configure(level=logging.INFO, add_console_handler=args.console_log) # Identify the cleaning classes being run for specified data_stage # and validate if all the required arguments are supplied cleaning_classes = clean_cdr.DATA_STAGE_RULES_MAPPING[consts.FITBIT] clean_cdr.validate_custom_params(cleaning_classes, **kwargs) # get credentials and create client impersonation_creds = auth.get_impersonation_credentials( args.run_as_email, SCOPES) client = bq.get_client(args.project_id, credentials=impersonation_creds) # create staging, sandbox, backup and clean datasets with descriptions and labels fitbit_datasets = create_fitbit_datasets(client, args.release_tag) copy_fitbit_tables_from_views(client, args.fitbit_dataset, fitbit_datasets[consts.BACKUP], table_prefix='v_') bq.copy_datasets(client, fitbit_datasets[consts.BACKUP], fitbit_datasets[consts.STAGING]) common_cleaning_args = [ '-p', args.project_id, '-d', fitbit_datasets[consts.STAGING], '-b', fitbit_datasets[consts.SANDBOX], '-s', '-a', consts.FITBIT ] fitbit_cleaning_args = args_parser.add_kwargs_to_args( common_cleaning_args, kwargs) clean_cdr.main(args=fitbit_cleaning_args) # Snapshot the staging dataset to final dataset bq.build_and_copy_contents(client, fitbit_datasets[consts.STAGING], fitbit_datasets[consts.CLEAN])
def main(raw_args=None): """ Clean an RDR import. Assumes you are passing arguments either via command line or a list. """ args, kwargs = parse_rdr_args(raw_args) pipeline_logging.configure(level=logging.INFO, add_console_handler=args.console_log) # specific check on truncation_date. It should not cause a failure if it is not set. if not args.truncation_date: LOGGER.info('truncation_date is unset. It will default to the current ' 'date in the truncation cleaning rule.') # validate we've got all required data before continuing cleaning_classes = clean_cdr.DATA_STAGE_RULES_MAPPING.get('rdr') clean_cdr.validate_custom_params(cleaning_classes, **kwargs) # get credentials and create client impersonation_creds = auth.get_impersonation_credentials( args.run_as_email, SCOPES) client = bq.get_client(args.curation_project_id, credentials=impersonation_creds) # create staging, sandbox, and clean datasets with descriptions and labels datasets = create_datasets(client, args.rdr_dataset, args.release_tag) # copy raw data into staging dataset copy_raw_rdr_tables(client, args.rdr_dataset, datasets.get('staging')) # clean the RDR staging dataset cleaning_args = [ '-p', args.curation_project_id, '-d', datasets.get('staging', 'UNSET'), '-b', datasets.get('sandbox', 'UNSET'), '--data_stage', 'rdr', '--truncation_date', args.truncation_date, '--export_date', args.export_date ] all_cleaning_args = add_kwargs_to_args(cleaning_args, kwargs) clean_cdr.main(args=all_cleaning_args) bq.build_and_copy_contents(client, datasets.get('staging', 'UNSET'), datasets.get('clean', 'UNSET')) # update sandbox description and labels sandbox_dataset = client.get_dataset(datasets.get( 'sandbox', 'UNSET')) # Make an API request. sandbox_dataset.description = ( f'Sandbox created for storing records affected by the cleaning ' f'rules applied to {datasets.get("clean")}') sandbox_dataset.labels['phase'] = 'sandbox' sandbox_dataset = client.update_dataset( sandbox_dataset, ["description"]) # Make an API request. full_dataset_id = f'{sandbox_dataset.project}.{sandbox_dataset.dataset_id}' LOGGER.info( f'Updated dataset `{full_dataset_id}` with description `{sandbox_dataset.description}`' ) LOGGER.info(f'RDR snapshot and cleaning, ' f'`{client.project}.{datasets.get("clean")}`, is complete.')
def test_clean_cdr(self, mock_fetch_args, mock_validate_args,
                   mock_get_query_list, mock_clean_dataset):
    from argparse import Namespace

    # Test clean_dataset() function call
    args = [
        '-p', self.project_id, '-d', self.dataset_id, '-b',
        self.sandbox_dataset_id, '--data_stage', 'ehr'
    ]

    # creates argparse namespace return value
    expected_args = Namespace(
        **{
            'project_id': self.project_id,
            'dataset_id': self.dataset_id,
            'sandbox_dataset_id': self.sandbox_dataset_id,
            'data_stage': DataStage.EHR,
            'console_log': False,
            'list_queries': False
        })
    expected_kwargs = {}
    mock_fetch_args.return_value = expected_args, expected_kwargs

    rules = cc.DATA_STAGE_RULES_MAPPING['ehr']

    cc.main(args)

    mock_validate_args.assert_called_once_with(rules, **expected_kwargs)
    mock_clean_dataset.assert_called_once_with(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        sandbox_dataset_id=self.sandbox_dataset_id,
        rules=rules,
        table_namer=DataStage.EHR.value)

    # Test get_queries() function call
    args = [
        '-p', self.project_id, '-d', self.dataset_id, '-b',
        self.sandbox_dataset_id, '--data_stage', 'ehr', '--list_queries', True
    ]
    expected_args = Namespace(
        **{
            'project_id': self.project_id,
            'dataset_id': self.dataset_id,
            'sandbox_dataset_id': self.sandbox_dataset_id,
            'data_stage': DataStage.EHR,
            'console_log': False,
            'list_queries': True
        })
    expected_kwargs = {}
    mock_fetch_args.return_value = expected_args, expected_kwargs

    cc.main(args)

    mock_get_query_list.assert_called_once_with(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        sandbox_dataset_id=self.sandbox_dataset_id,
        rules=rules,
        table_namer=DataStage.EHR.value)
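
# Hedged sketch (assumption, not from the original test module): the mock
# parameters of test_clean_cdr() are typically injected by stacked mock.patch
# decorators, which apply bottom-up, so the lowest decorator binds to the first
# mock argument after self. The patch targets below are guesses based on the
# calls the test asserts on, not the repository's actual module paths.
#
# @mock.patch('cdr_cleaner.clean_cdr.clean_dataset')
# @mock.patch('cdr_cleaner.clean_cdr.get_query_list')
# @mock.patch('cdr_cleaner.clean_cdr.validate_custom_params')
# @mock.patch('cdr_cleaner.clean_cdr.fetch_args_kwargs')
# def test_clean_cdr(self, mock_fetch_args, mock_validate_args,
#                    mock_get_query_list, mock_clean_dataset):
#     ...
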
def create_tier(credentials_filepath, project_id, tier, input_dataset,
                release_tag, deid_stage, run_as, **kwargs):
    """
    This function is the main entry point for the deid process.
    It passes the required parameters to the implementing functions.

    :param credentials_filepath: filepath to credentials to access GCP
    :param project_id: project_id associated with the input dataset
    :param tier: controlled or registered tier intended for the output dataset
    :param input_dataset: name of the input dataset
    :param release_tag: release tag for dataset in the format of YYYYq#r#
    :param deid_stage: deid stage (deid, base or clean)
    :param run_as: email address of the service account to impersonate
    :return: the intermediary datasets created for the tier (staging, sandbox, etc.)
    """
    # validation of params
    validate_create_tier_args(tier, deid_stage, release_tag)

    # today's date for QA handoff
    qa_handoff_date = datetime.strftime(datetime.now(), '%Y-%m-%d')

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        run_as, SCOPES, credentials_filepath)

    client = bq.get_client(project_id, credentials=impersonation_creds)

    # Get Final Dataset name
    final_dataset_name = get_dataset_name(tier, release_tag, deid_stage)

    # Create intermediary datasets and copy tables from input dataset to newly created dataset
    datasets = create_datasets(client, final_dataset_name, input_dataset, tier,
                               release_tag)
    bq.copy_datasets(client, input_dataset, datasets[consts.STAGING])

    # Run cleaning rules
    cleaning_args = [
        '-p', project_id, '-d', datasets[consts.STAGING], '-b',
        datasets[consts.SANDBOX], '--data_stage', f'{tier}_tier_{deid_stage}'
    ]

    # Will update the qa_handoff_date to current date
    if 'base' in deid_stage:
        versions = add_cdr_metadata.get_etl_version(datasets[consts.STAGING],
                                                    project_id)
        if not versions:
            raise RuntimeError(
                'etl version does not exist, make sure _cdr_metadata table was created in combined step'
            )
        add_cdr_metadata.main([
            '--component', add_cdr_metadata.INSERT, '--project_id', project_id,
            '--target_dataset', datasets[consts.STAGING], '--qa_handoff_date',
            qa_handoff_date, '--etl_version', versions[0]
        ])
    else:
        LOGGER.info(
            'deid_stage was not base, no data inserted into _cdr_metadata table'
        )

    controlled_tier_cleaning_args = add_kwargs_to_args(cleaning_args, kwargs)
    clean_cdr.main(args=controlled_tier_cleaning_args)

    # Snapshot the staging dataset to final dataset
    create_schemaed_snapshot_dataset(project_id, datasets[consts.STAGING],
                                     final_dataset_name, False)

    return datasets
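
# Hedged usage sketch: only the parameter names come from the create_tier()
# signature above; every literal value is a hypothetical placeholder, and this
# helper is never invoked by the pipeline itself.
def _example_create_tier_call():  # illustrative only
    return create_tier(
        credentials_filepath='/path/to/credentials.json',  # hypothetical path
        project_id='my-output-project',  # hypothetical project
        tier='controlled',  # or 'registered'
        input_dataset='2022q1r1_combined',  # hypothetical input dataset
        release_tag='2022q1r1',  # YYYYq#r# format
        deid_stage='base',  # 'deid', 'base', or 'clean'
        run_as='deid-sa@my-output-project.iam.gserviceaccount.com',
    )
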