def main(raw_args=None): """ Run a full RDR import. Assumes you are passing arguments either via command line or a list. """ args = parse_rdr_args(raw_args) pipeline_logging.configure(level=logging.INFO, add_console_handler=args.console_log) description = f'RDR DUMP loaded from {args.bucket} dated {args.export_date}' export_date = args.export_date.replace('-', '') new_dataset_name = f'rdr{export_date}' # get credentials and create client impersonation_creds = auth.get_impersonation_credentials( args.run_as_email, SCOPES) client = bq.get_client(args.curation_project_id, credentials=impersonation_creds) dataset_object = bq.define_dataset(client.project, new_dataset_name, description, {'export_date': args.export_date}) client.create_dataset(dataset_object) create_rdr_tables(client, new_dataset_name, args.bucket) copy_vocab_tables(client, new_dataset_name, args.vocabulary)
def main(args=None):
    pipeline_logging.configure(logging.DEBUG, add_console_handler=True)
    args = parse_args(args)
    query_list = create_queries(args.project_id, args.ticket_number,
                                args.pids_project_id, args.pids_dataset_id,
                                args.pids_table, args.dataset_list)
    client = bq.get_client(args.project_id)
    run_queries(query_list, client)
    LOGGER.info("Retraction complete")
def main(args=None):
    pipeline_logging.configure(logging.DEBUG, add_console_handler=True)
    parser = get_parser()
    args = parser.parse_args(args)
    client = bq.get_client(args.project_id)
    dataset_ids = ru.get_datasets_list(args.project_id, args.dataset_ids)
    LOGGER.info(
        f"Datasets to retract deactivated participants from: {dataset_ids}")
    run_deactivation(client, args.project_id, dataset_ids, args.fq_deact_table,
                     args.fq_pid_rid_table)
    LOGGER.info(
        f"Retraction of deactivated participants from {dataset_ids} complete")
def main(raw_args=None):
    args = get_arguments(raw_args)
    if not args.console_log:
        print(f'===============================================\n'
              f'Warning!! By not logging to the console you \n'
              f'may miss important information!\n'
              f'===============================================\n')
    pipeline_logging.configure(add_console_handler=args.console_log)
    run_deletion(args.project_id, args.name_substrings)
def main(first_n):
    pipeline_logging.configure(logging.INFO, add_console_handler=True)
    project_id = app_identity.get_application_id()
    sc = StorageClient(project_id)
    _check_project(sc)
    buckets_to_delete = _filter_stale_buckets(sc, first_n)

    for stale_bucket in buckets_to_delete:
        LOGGER.info(f"Running - sc.get_bucket({stale_bucket}).delete()")
        sc.get_bucket(stale_bucket).delete()

    return buckets_to_delete
def main(raw_args=None):
    # Parse the positional and keyword arguments required by the cleaning rules
    args, kwargs = parse_deid_args(raw_args)

    # Set the logging level
    pipeline_logging.configure(level=logging.DEBUG,
                               add_console_handler=args.console_log)

    # Identify the cleaning classes being run for the specified data stage
    # and validate that all the required arguments are supplied
    cleaning_classes = clean_cdr.DATA_STAGE_RULES_MAPPING[
        f'{args.tier}_tier_{args.deid_stage}']
    clean_cdr.validate_custom_params(cleaning_classes, **kwargs)

    # Run create_tier to generate the {args.tier}_tier_{args.deid_stage}
    # datasets and apply the cleaning rules
    datasets = create_tier(args.credentials_filepath, args.project_id,
                           args.tier, args.idataset, args.release_tag,
                           args.deid_stage, args.target_principal, **kwargs)

    return datasets
def main():
    parser = get_arg_parser()
    args = parser.parse_args()

    # Set up pipeline logging
    pipeline_logging.configure(level=logging.DEBUG, add_console_handler=True)

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        args.run_as_email, SCOPES)
    client = bq.get_client(args.project_id, credentials=impersonation_creds)

    # Populates the validation table for the site
    identify_rdr_ehr_match(client, args.project_id, args.hpo_id, EHR_OPS)

    LOGGER.info('Done.')
def main(raw_args=None): """ Truncate and store fitbit data. Assumes you are passing arguments either via command line or a list. """ parser = get_fitbit_parser() args, kwargs = clean_cdr.fetch_args_kwargs(parser, raw_args) pipeline_logging.configure(level=logging.INFO, add_console_handler=args.console_log) # Identify the cleaning classes being run for specified data_stage # and validate if all the required arguments are supplied cleaning_classes = clean_cdr.DATA_STAGE_RULES_MAPPING[consts.FITBIT] clean_cdr.validate_custom_params(cleaning_classes, **kwargs) # get credentials and create client impersonation_creds = auth.get_impersonation_credentials( args.run_as_email, SCOPES) client = bq.get_client(args.project_id, credentials=impersonation_creds) # create staging, sandbox, backup and clean datasets with descriptions and labels fitbit_datasets = create_fitbit_datasets(client, args.release_tag) copy_fitbit_tables_from_views(client, args.fitbit_dataset, fitbit_datasets[consts.BACKUP], table_prefix='v_') bq.copy_datasets(client, fitbit_datasets[consts.BACKUP], fitbit_datasets[consts.STAGING]) common_cleaning_args = [ '-p', args.project_id, '-d', fitbit_datasets[consts.STAGING], '-b', fitbit_datasets[consts.SANDBOX], '-s', '-a', consts.FITBIT ] fitbit_cleaning_args = args_parser.add_kwargs_to_args( common_cleaning_args, kwargs) clean_cdr.main(args=fitbit_cleaning_args) # Snapshot the staging dataset to final dataset bq.build_and_copy_contents(client, fitbit_datasets[consts.STAGING], fitbit_datasets[consts.CLEAN])
def main(first_n):
    pipeline_logging.configure(logging.INFO, add_console_handler=True)
    bq_client = bq.get_client(os.environ.get('GOOGLE_CLOUD_PROJECT'))
    _check_project(bq_client)
    datasets_to_delete = _filter_stale_datasets(bq_client, first_n)

    for stale_dataset in datasets_to_delete:
        LOGGER.info(f"Running - bq_client.delete_dataset({stale_dataset})")
        try:
            bq_client.delete_dataset(stale_dataset)
        except exceptions.BadRequest as e:
            LOGGER.warning(
                f"Failed to delete {stale_dataset}. Message: {e.message}")

    return datasets_to_delete
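# A hypothetical invocation of the stale-dataset cleanup above, assuming the
# GOOGLE_CLOUD_PROJECT environment variable is set; `first_n` caps how many
# stale datasets are deleted in a single run. This is only a usage sketch,
# not part of the module.
removed = main(first_n=5)
LOGGER.info(f"Deleted stale datasets: {removed}")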
def test_configure(self, mock_open):
    """
    Verify that root level and handlers are properly set after configure

    :param mock_open: mock to prevent side effect of opening file
    """
    # names are used to uniquely identify handlers both in standard logging
    # module and in this test case
    expected_hdlrs = [pl._FILE_HANDLER]

    pl.configure()

    # root level is set to default (i.e. INFO)
    self.assertEqual(logging.root.level, pl.DEFAULT_LOG_LEVEL)

    # handlers are added
    actual_hdlrs = [hdlr.name for hdlr in logging.root.handlers]
    self.assertEqual(expected_hdlrs, actual_hdlrs)

    # no duplicate handlers after additional calls to configure
    pl.configure()
    self.assertEqual(len(expected_hdlrs), len(logging.root.handlers))
    actual_hdlrs = [hdlr.name for hdlr in logging.root.handlers]
    self.assertEqual(expected_hdlrs, actual_hdlrs)

    # add console log handler to configuration
    pl.configure(add_console_handler=True)
    actual_hdlrs = [hdlr.name for hdlr in logging.root.handlers]
    expected_hdlrs = [pl._FILE_HANDLER, pl._CONSOLE_HANDLER]
    self.assertEqual(expected_hdlrs, actual_hdlrs)
def assert_sane_configure(self):
    """
    Verifies that a utils.pipeline_logger.configure() call executed successfully

    TODO: pass in expected basename of log file? A bit more flexible and
    explicit, but may not be worth the refactor.
    """
    # names are used to uniquely identify handlers both in standard logging
    # module and in this test case
    expected_hdlrs = [pl._FILE_HANDLER]

    # execute configuration
    pl.configure()

    # root level is set to default (i.e. INFO)
    self.assertEqual(logging.root.level, pl.DEFAULT_LOG_LEVEL)

    # handlers are added
    actual_hdlrs = [hdlr.name for hdlr in logging.root.handlers]
    self.assertEqual(expected_hdlrs, actual_hdlrs)

    # no duplicate handlers after additional calls to configure
    pl.configure()
    self.assertEqual(len(expected_hdlrs), len(logging.root.handlers))
    actual_hdlrs = [hdlr.name for hdlr in logging.root.handlers]
    self.assertEqual(expected_hdlrs, actual_hdlrs)

    for hdlr in logging.root.handlers:
        if isinstance(hdlr, logging.FileHandler):
            self.assert_correct_log_filename(hdlr, pl.get_log_filename())

    # add console log handler to configuration
    pl.configure(add_console_handler=True)
    actual_hdlrs = [hdlr.name for hdlr in logging.root.handlers]
    expected_hdlrs = [pl._FILE_HANDLER, pl._CONSOLE_HANDLER]
    self.assertEqual(expected_hdlrs, actual_hdlrs)
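# The two checks above only exercise the public surface of pipeline_logging
# (configure, DEFAULT_LOG_LEVEL, _FILE_HANDLER, _CONSOLE_HANDLER,
# get_log_filename). Below is a minimal sketch of a configure() implementation
# that would satisfy those assertions; the handler name strings, the log
# filename, and the setup details are assumptions, not the module's actual
# code.
import logging

DEFAULT_LOG_LEVEL = logging.INFO
_FILE_HANDLER = 'curation_file_handler'  # assumed handler name
_CONSOLE_HANDLER = 'curation_console_handler'  # assumed handler name


def get_log_filename():
    # Hypothetical log path; the real module may derive this differently.
    return 'logs/curation.log'


def configure(level=DEFAULT_LOG_LEVEL, add_console_handler=False):
    """
    Attach a named file handler (and optionally a console handler) to the
    root logger. Repeat calls must not duplicate handlers, per the tests above.
    """
    existing = {hdlr.name for hdlr in logging.root.handlers}

    if _FILE_HANDLER not in existing:
        file_handler = logging.FileHandler(get_log_filename())
        file_handler.name = _FILE_HANDLER
        logging.root.addHandler(file_handler)

    if add_console_handler and _CONSOLE_HANDLER not in existing:
        console_handler = logging.StreamHandler()
        console_handler.name = _CONSOLE_HANDLER
        logging.root.addHandler(console_handler)

    logging.root.setLevel(level)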
        'Identifies the target dataset where the vocabulary is to be loaded',
        required=False)
    return argument_parser


def get_release_date(release_date: datetime.date = None) -> str:
    """
    Get the name of a vocabulary release based on date

    :param release_date: date the vocabulary is released
    :return: name of vocabulary release
    """
    if not release_date:
        release_date = datetime.date.today()
    release_date_str = release_date.strftime("%Y%m%d")
    return release_date_str


def get_target_dataset_id(release_tag: str) -> str:
    return f'vocabulary{release_tag}'


if __name__ == '__main__':
    ARGS = get_arg_parser().parse_args()
    RELEASE_TAG = ARGS.release_date or get_release_date()
    TARGET_DATASET_ID = ARGS.target_dataset_id or get_target_dataset_id(
        RELEASE_TAG)
    pipeline_logging.configure(add_console_handler=True)
    main(ARGS.project_id, ARGS.bucket_name, ARGS.vocab_folder_path,
         TARGET_DATASET_ID)
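# A quick worked example of the two helpers above; the dates are arbitrary and
# only illustrate the expected formatting (datetime is already imported by the
# module).
assert get_release_date(datetime.date(2021, 5, 3)) == '20210503'
# With no argument, today's date is used, e.g. '20240115' on 2024-01-15.
assert get_target_dataset_id('20210503') == 'vocabulary20210503'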
def main(raw_args=None): """ Clean an RDR import. Assumes you are passing arguments either via command line or a list. """ args, kwargs = parse_rdr_args(raw_args) pipeline_logging.configure(level=logging.INFO, add_console_handler=args.console_log) # specific check on truncation_date. It should not cause a failure if it is not set. if not args.truncation_date: LOGGER.info('truncation_date is unset. It will default to the current ' 'date in the truncation cleaning rule.') # validate we've got all required data before continuing cleaning_classes = clean_cdr.DATA_STAGE_RULES_MAPPING.get('rdr') clean_cdr.validate_custom_params(cleaning_classes, **kwargs) # get credentials and create client impersonation_creds = auth.get_impersonation_credentials( args.run_as_email, SCOPES) client = bq.get_client(args.curation_project_id, credentials=impersonation_creds) # create staging, sandbox, and clean datasets with descriptions and labels datasets = create_datasets(client, args.rdr_dataset, args.release_tag) # copy raw data into staging dataset copy_raw_rdr_tables(client, args.rdr_dataset, datasets.get('staging')) # clean the RDR staging dataset cleaning_args = [ '-p', args.curation_project_id, '-d', datasets.get('staging', 'UNSET'), '-b', datasets.get('sandbox', 'UNSET'), '--data_stage', 'rdr', '--truncation_date', args.truncation_date, '--export_date', args.export_date ] all_cleaning_args = add_kwargs_to_args(cleaning_args, kwargs) clean_cdr.main(args=all_cleaning_args) bq.build_and_copy_contents(client, datasets.get('staging', 'UNSET'), datasets.get('clean', 'UNSET')) # update sandbox description and labels sandbox_dataset = client.get_dataset(datasets.get( 'sandbox', 'UNSET')) # Make an API request. sandbox_dataset.description = ( f'Sandbox created for storing records affected by the cleaning ' f'rules applied to {datasets.get("clean")}') sandbox_dataset.labels['phase'] = 'sandbox' sandbox_dataset = client.update_dataset( sandbox_dataset, ["description"]) # Make an API request. full_dataset_id = f'{sandbox_dataset.project}.{sandbox_dataset.dataset_id}' LOGGER.info( f'Updated dataset `{full_dataset_id}` with description `{sandbox_dataset.description}`' ) LOGGER.info(f'RDR snapshot and cleaning, ' f'`{client.project}.{datasets.get("clean")}`, is complete.')
def test_default_level(self, mock_open, mock_file_emit, mock_stream_emit):
    pl.configure(add_console_handler=True)
    self.assert_logs_handled(pl.DEFAULT_LOG_LEVEL, mock_file_emit,
                             mock_stream_emit)
        description='Add new mappings to our primary pid/rid mapping table.',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        '-r',
        '--fq_rdr_mapping',
        action='store',
        dest='rdr_mapping',
        help=('The fully qualified rdr mapping table name. '
              'The project_id will be extracted from this table name.'),
        type=check_table_name,
        required=True)
    parser.add_argument(
        '-i',
        '--run_as',
        action='store',
        dest='run_as',
        help='The email address of the service account to impersonate.',
        type=check_email_address)

    args = parser.parse_args()

    store_to_primary_mapping_table(args.rdr_mapping, run_as=args.run_as)
    LOGGER.info("Finished pid/rid/shift storage process.")


if __name__ == '__main__':
    from utils import pipeline_logging

    pipeline_logging.configure()
    process_mappings()
def main(raw_args=None):
    args = parse_deid_args(raw_args)
    pipeline_logging.configure(level=logging.DEBUG,
                               add_console_handler=args.console_log)
    create_tier(args.credentials_filepath, args.project_id, args.tier,
                args.idataset, args.release_tag, args.deid_stage)
        1333234, 1310066, 715725, 1310147, 702686, 1310054, 715726, 715724,
        715714, 1310146, 1310058
    ]

    def setup_validation(self, client, *args, **keyword_args):
        pass

    def validate_rule(self, client, *args, **keyword_args):
        pass


if __name__ == '__main__':
    import cdr_cleaner.args_parser as parser
    import cdr_cleaner.clean_cdr_engine as clean_engine

    ARGS = parser.default_parse_args()
    pipeline_logging.configure(level=logging.DEBUG, add_console_handler=True)

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(
            ARGS.project_id, ARGS.dataset_id, ARGS.sandbox_dataset_id,
            [(CopeSurveyResponseSuppression,)])
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id,
                                   ARGS.sandbox_dataset_id,
                                   [(CopeSurveyResponseSuppression,)])
def test_specific_level(self, mock_open, mock_file_emit, mock_stream_emit):
    pl.configure(logging.CRITICAL, add_console_handler=True)
    self.assert_logs_handled(logging.CRITICAL, mock_file_emit,
                             mock_stream_emit)
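# Because configure() sets the level on the root logger, module-level loggers
# created with logging.getLogger(__name__) inherit it, which is what the level
# tests above rely on. A small standalone usage sketch (not part of the test
# module):
import logging
from utils import pipeline_logging

LOGGER = logging.getLogger(__name__)

pipeline_logging.configure(logging.CRITICAL, add_console_handler=True)
LOGGER.info('below CRITICAL, so this record is filtered out')
LOGGER.critical('at CRITICAL, so this is written to the file and console handlers')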