def main(raw_args=None):
    """
    Run a full RDR import.

    Assumes you are passing arguments either via command line or a
    list.
    """
    args = parse_rdr_args(raw_args)

    pipeline_logging.configure(level=logging.INFO,
                               add_console_handler=args.console_log)

    description = f'RDR DUMP loaded from {args.bucket} dated {args.export_date}'
    export_date = args.export_date.replace('-', '')
    new_dataset_name = f'rdr{export_date}'

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        args.run_as_email, SCOPES)

    client = bq.get_client(args.curation_project_id,
                           credentials=impersonation_creds)

    dataset_object = bq.define_dataset(client.project, new_dataset_name,
                                       description,
                                       {'export_date': args.export_date})
    client.create_dataset(dataset_object)

    create_rdr_tables(client, new_dataset_name, args.bucket)
    copy_vocab_tables(client, new_dataset_name, args.vocabulary)
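A caller can also drive this entry point programmatically by passing the arguments as a list, as the docstring notes. The sketch below is hypothetical: the flag names and values are placeholders and would need to match whatever parse_rdr_args actually defines.

# Hypothetical invocation sketch -- flag names and values are illustrative
# only; parse_rdr_args defines the real interface.
main([
    '--run_as_email', 'loader@example.iam.gserviceaccount.com',
    '--curation_project_id', 'my-curation-project',
    '--bucket', 'my-rdr-export-bucket',
    '--export_date', '2022-01-01',
    '--vocabulary', 'my-vocabulary-dataset',
    '--console_log',
])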
Example #2
def main(args=None):
    pipeline_logging.configure(logging.DEBUG, add_console_handler=True)
    args = parse_args(args)
    query_list = create_queries(args.project_id, args.ticket_number,
                                args.pids_project_id, args.pids_dataset_id,
                                args.pids_table, args.dataset_list)
    client = bq.get_client(args.project_id)
    run_queries(query_list, client)
    LOGGER.info("Retraction complete")
Example #3
def main(args=None):
    pipeline_logging.configure(logging.DEBUG, add_console_handler=True)
    parser = get_parser()
    args = parser.parse_args(args)
    client = bq.get_client(args.project_id)
    dataset_ids = ru.get_datasets_list(args.project_id, args.dataset_ids)
    LOGGER.info(
        f"Datasets to retract deactivated participants from: {dataset_ids}")
    run_deactivation(client, args.project_id, dataset_ids, args.fq_deact_table,
                     args.fq_pid_rid_table)
    LOGGER.info(
        f"Retraction of deactivated participants from {dataset_ids} complete")
Example #4
def main(raw_args=None):
    args = get_arguments(raw_args)

    if not args.console_log:
        print(f'===============================================\n'
              f'Warning!!  By not logging to the console you \n'
              f'may miss important information!\n'
              f'===============================================\n')

    pipeline_logging.configure(add_console_handler=args.console_log)

    run_deletion(args.project_id, args.name_substrings)
Example #5
def main(first_n):

    pipeline_logging.configure(logging.INFO, add_console_handler=True)

    project_id = app_identity.get_application_id()
    sc = StorageClient(project_id)

    _check_project(sc)

    buckets_to_delete = _filter_stale_buckets(sc, first_n)

    for stale_bucket in buckets_to_delete:
        LOGGER.info(f"Running - sc.get_bucket({stale_bucket}).delete()")
        sc.get_bucket(stale_bucket).delete()

    return buckets_to_delete
Example #6
def main(raw_args=None):
    # Parse the arguments and keyword arguments required by the cleaning rules
    args, kwargs = parse_deid_args(raw_args)
    # Sets logging level
    pipeline_logging.configure(level=logging.DEBUG,
                               add_console_handler=args.console_log)
    # Identify the cleaning classes being run for specified data_stage
    # and validate if all the required arguments are supplied
    cleaning_classes = clean_cdr.DATA_STAGE_RULES_MAPPING[
        f'{args.tier}_tier_{args.deid_stage}']
    clean_cdr.validate_custom_params(cleaning_classes, **kwargs)

    # Runs create_tier in order to generate the {args.tier}_tier_{args.data_stage} datasets and apply cleaning rules
    datasets = create_tier(args.credentials_filepath, args.project_id,
                           args.tier, args.idataset, args.release_tag,
                           args.deid_stage, args.target_principal, **kwargs)
    return datasets
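The lookup above assumes DATA_STAGE_RULES_MAPPING associates each tier/stage key with the cleaning rule classes to run. A rough sketch of that assumed shape follows; the keys and rule classes are placeholders, not the project's real mapping.

# Assumed shape of clean_cdr.DATA_STAGE_RULES_MAPPING, keyed by the
# f'{tier}_tier_{deid_stage}' pattern used in this example.  The rule
# classes here are placeholders, not real curation cleaning rules.
class SomeCleaningRule:  # placeholder
    pass


class AnotherCleaningRule:  # placeholder
    pass


DATA_STAGE_RULES_MAPPING = {
    'controlled_tier_base': [(SomeCleaningRule,), (AnotherCleaningRule,)],
    'controlled_tier_clean': [(AnotherCleaningRule,)],
    'registered_tier_base': [(SomeCleaningRule,)],
}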
Example #7
def main():
    parser = get_arg_parser()
    args = parser.parse_args()

    # Set up pipeline logging
    pipeline_logging.configure(level=logging.DEBUG, add_console_handler=True)

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        args.run_as_email, SCOPES)

    client = bq.get_client(args.project_id, credentials=impersonation_creds)

    # Populates the validation table for the site
    identify_rdr_ehr_match(client, args.project_id, args.hpo_id, EHR_OPS)

    LOGGER.info('Done.')
Example #8
def main(raw_args=None):
    """
    Truncate and store fitbit data.

    Assumes you are passing arguments either via command line or a
    list.
    """
    parser = get_fitbit_parser()
    args, kwargs = clean_cdr.fetch_args_kwargs(parser, raw_args)

    pipeline_logging.configure(level=logging.INFO,
                               add_console_handler=args.console_log)

    # Identify the cleaning classes being run for specified data_stage
    # and validate if all the required arguments are supplied
    cleaning_classes = clean_cdr.DATA_STAGE_RULES_MAPPING[consts.FITBIT]
    clean_cdr.validate_custom_params(cleaning_classes, **kwargs)

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        args.run_as_email, SCOPES)

    client = bq.get_client(args.project_id, credentials=impersonation_creds)

    # create staging, sandbox, backup and clean datasets with descriptions and labels
    fitbit_datasets = create_fitbit_datasets(client, args.release_tag)

    copy_fitbit_tables_from_views(client,
                                  args.fitbit_dataset,
                                  fitbit_datasets[consts.BACKUP],
                                  table_prefix='v_')
    bq.copy_datasets(client, fitbit_datasets[consts.BACKUP],
                     fitbit_datasets[consts.STAGING])

    common_cleaning_args = [
        '-p', args.project_id,
        '-d', fitbit_datasets[consts.STAGING],
        '-b', fitbit_datasets[consts.SANDBOX],
        '-s', '-a', consts.FITBIT
    ]
    fitbit_cleaning_args = args_parser.add_kwargs_to_args(
        common_cleaning_args, kwargs)

    clean_cdr.main(args=fitbit_cleaning_args)

    # Snapshot the staging dataset to final dataset
    bq.build_and_copy_contents(client, fitbit_datasets[consts.STAGING],
                               fitbit_datasets[consts.CLEAN])
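args_parser.add_kwargs_to_args is used above (and again in the RDR cleaning example) to fold keyword arguments into a CLI-style argument list before handing it to clean_cdr.main. Its real implementation is not shown here; the sketch below only illustrates the assumed behavior.

# Assumed behavior only -- not the actual args_parser implementation.
def add_kwargs_to_args(arg_list, kwargs):
    """Append each keyword argument as a '--key value' pair."""
    extra_args = []
    for key, value in (kwargs or {}).items():
        extra_args.extend([f'--{key}', str(value)])
    return arg_list + extra_args

# e.g. add_kwargs_to_args(['-p', 'my-project'], {'cutoff_date': '2022-01-01'})
#      -> ['-p', 'my-project', '--cutoff_date', '2022-01-01']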
Example #9
def main(first_n):

    pipeline_logging.configure(logging.INFO, add_console_handler=True)

    bq_client = bq.get_client(os.environ.get('GOOGLE_CLOUD_PROJECT'))

    _check_project(bq_client)

    datasets_to_delete = _filter_stale_datasets(bq_client, first_n)

    for stale_dataset in datasets_to_delete:

        LOGGER.info(f"Running - bq_client.delete_dataset({stale_dataset})")

        try:
            bq_client.delete_dataset(stale_dataset)
        except exceptions.BadRequest as e:
            LOGGER.warning(
                f"Failed to delete {stale_dataset}. Message: {e.message}")

    return datasets_to_delete
Example #10
    def test_configure(self, mock_open):
        """
        Verify that root level and handlers are properly set after configure
        :param mock_open: mock to prevent side effect of opening file
        """
        # names are used to uniquely identify handlers both in standard logging module
        # and in this test case
        expected_hdlrs = [pl._FILE_HANDLER]

        pl.configure()
        # root level is set to default (i.e. INFO)
        self.assertEqual(logging.root.level, pl.DEFAULT_LOG_LEVEL)

        # handlers are added
        actual_hdlrs = [hdlr.name for hdlr in logging.root.handlers]
        self.assertEqual(expected_hdlrs, actual_hdlrs)

        # no duplicate handlers after additional calls to configure
        pl.configure()
        self.assertEqual(len(expected_hdlrs), len(logging.root.handlers))
        actual_hdlrs = [hdlr.name for hdlr in logging.root.handlers]
        self.assertEqual(expected_hdlrs, actual_hdlrs)

        # add console log handler to configuration
        pl.configure(add_console_handler=True)
        actual_hdlrs = [hdlr.name for hdlr in logging.root.handlers]
        expected_hdlrs = [pl._FILE_HANDLER, pl._CONSOLE_HANDLER]
        self.assertEqual(expected_hdlrs, actual_hdlrs)
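The test above depends on configure() being idempotent: repeated calls must not stack duplicate handlers, and handlers are identified by name. A minimal sketch of that technique follows, assuming handlers are deduplicated by name; the real utils.pipeline_logging module is not reproduced here, and the handler names and log file path are placeholders.

# Minimal sketch of name-based handler deduplication, matching the contract
# the test exercises.  Not the actual utils.pipeline_logging code.
import logging

_FILE_HANDLER = 'curation_file_handler'        # assumed handler names
_CONSOLE_HANDLER = 'curation_console_handler'
DEFAULT_LOG_LEVEL = logging.INFO


def configure(level=DEFAULT_LOG_LEVEL, add_console_handler=False):
    """Attach named handlers to the root logger, skipping any already present."""
    existing = {hdlr.name for hdlr in logging.root.handlers}

    if _FILE_HANDLER not in existing:
        file_handler = logging.FileHandler('pipeline.log')  # placeholder path
        file_handler.name = _FILE_HANDLER
        logging.root.addHandler(file_handler)

    if add_console_handler and _CONSOLE_HANDLER not in existing:
        console_handler = logging.StreamHandler()
        console_handler.name = _CONSOLE_HANDLER
        logging.root.addHandler(console_handler)

    logging.root.setLevel(level)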
Example #11
    def assert_sane_configure(self):
        """
        Verifies that a utils.pipeline_logging.configure() call executed successfully.

        TODO: pass in the expected basename of the log file? A bit more flexible and explicit, but may not be worth the refactor.
        """
        # names are used to uniquely identify handlers both in standard logging module
        # and in this test case
        expected_hdlrs = [pl._FILE_HANDLER]

        # execute configuration
        pl.configure()
        # root level is set to default (i.e. INFO)
        self.assertEqual(logging.root.level, pl.DEFAULT_LOG_LEVEL)

        # handlers are added
        actual_hdlrs = [hdlr.name for hdlr in logging.root.handlers]
        self.assertEqual(expected_hdlrs, actual_hdlrs)

        # no duplicate handlers after additional calls to configure
        pl.configure()
        self.assertEqual(len(expected_hdlrs), len(logging.root.handlers))
        actual_hdlrs = [hdlr.name for hdlr in logging.root.handlers]
        self.assertEqual(expected_hdlrs, actual_hdlrs)

        for hdlr in logging.root.handlers:
            if isinstance(hdlr, logging.FileHandler):
                self.assert_correct_log_filename(hdlr, pl.get_log_filename())

        # add console log handler to configuration
        pl.configure(add_console_handler=True)
        actual_hdlrs = [hdlr.name for hdlr in logging.root.handlers]
        expected_hdlrs = [pl._FILE_HANDLER, pl._CONSOLE_HANDLER]
        self.assertEqual(expected_hdlrs, actual_hdlrs)
Example #12
        'Identifies the target dataset where the vocabulary is to be loaded',
        required=False)
    return argument_parser


def get_release_date(release_date: datetime.date = None) -> str:
    """
    Get the name of a vocabulary release based on date

    :param release_date: date the vocabulary is released
    :return: name of vocabulary release
    """
    if not release_date:
        release_date = datetime.date.today()
    release_date_str = release_date.strftime("%Y%m%d")
    return release_date_str


def get_target_dataset_id(release_tag: str) -> str:
    return f'vocabulary{release_tag}'


if __name__ == '__main__':
    ARGS = get_arg_parser().parse_args()
    RELEASE_TAG = ARGS.release_date or get_release_date()
    TARGET_DATASET_ID = ARGS.target_dataset_id or get_target_dataset_id(
        RELEASE_TAG)
    pipeline_logging.configure(add_console_handler=True)
    main(ARGS.project_id, ARGS.bucket_name, ARGS.vocab_folder_path,
         TARGET_DATASET_ID)
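A quick usage note for the two helpers above; the inputs are just sample values.

# get_release_date and get_target_dataset_id as defined above:
import datetime

get_release_date(datetime.date(2021, 5, 4))   # -> '20210504'
get_release_date()                            # -> today's date as 'YYYYMMDD'
get_target_dataset_id('20210504')             # -> 'vocabulary20210504'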
Example #13
def main(raw_args=None):
    """
    Clean an RDR import.

    Assumes you are passing arguments either via command line or a
    list.
    """
    args, kwargs = parse_rdr_args(raw_args)

    pipeline_logging.configure(level=logging.INFO,
                               add_console_handler=args.console_log)

    # specific check on truncation_date. It should not cause a failure if it is not set.
    if not args.truncation_date:
        LOGGER.info('truncation_date is unset.  It will default to the current '
                    'date in the truncation cleaning rule.')

    # validate we've got all required data before continuing
    cleaning_classes = clean_cdr.DATA_STAGE_RULES_MAPPING.get('rdr')
    clean_cdr.validate_custom_params(cleaning_classes, **kwargs)

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        args.run_as_email, SCOPES)

    client = bq.get_client(args.curation_project_id,
                           credentials=impersonation_creds)

    # create staging, sandbox, and clean datasets with descriptions and labels
    datasets = create_datasets(client, args.rdr_dataset, args.release_tag)

    # copy raw data into staging dataset
    copy_raw_rdr_tables(client, args.rdr_dataset, datasets.get('staging'))

    # clean the RDR staging dataset
    cleaning_args = [
        '-p', args.curation_project_id,
        '-d', datasets.get('staging', 'UNSET'),
        '-b', datasets.get('sandbox', 'UNSET'),
        '--data_stage', 'rdr',
        '--truncation_date', args.truncation_date,
        '--export_date', args.export_date
    ]

    all_cleaning_args = add_kwargs_to_args(cleaning_args, kwargs)
    clean_cdr.main(args=all_cleaning_args)

    bq.build_and_copy_contents(client, datasets.get('staging', 'UNSET'),
                               datasets.get('clean', 'UNSET'))

    # update sandbox description and labels
    sandbox_dataset = client.get_dataset(datasets.get(
        'sandbox', 'UNSET'))  # Make an API request.
    sandbox_dataset.description = (
        f'Sandbox created for storing records affected by the cleaning '
        f'rules applied to {datasets.get("clean")}')
    sandbox_dataset.labels['phase'] = 'sandbox'
    sandbox_dataset = client.update_dataset(
        sandbox_dataset, ["description", "labels"])  # Make an API request.

    full_dataset_id = f'{sandbox_dataset.project}.{sandbox_dataset.dataset_id}'
    LOGGER.info(
        f'Updated dataset `{full_dataset_id}` with description `{sandbox_dataset.description}`'
    )

    LOGGER.info(f'RDR snapshot and cleaning, '
                f'`{client.project}.{datasets.get("clean")}`, is complete.')
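The .get('staging') / .get('sandbox') / .get('clean') lookups above imply that create_datasets returns a dict keyed by dataset role. A plausible shape is sketched below; the dataset names are illustrative only and the real naming convention is not shown here.

# Assumed return shape of create_datasets, inferred from the lookups above.
# The release-tag based names are placeholders.
datasets = {
    'staging': 'rdr2022q1r1_staging',
    'sandbox': 'rdr2022q1r1_sandbox',
    'clean': 'rdr2022q1r1',
}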
Example #14
    def test_default_level(self, mock_open, mock_file_emit, mock_stream_emit):
        pl.configure(add_console_handler=True)
        self.assert_logs_handled(pl.DEFAULT_LOG_LEVEL, mock_file_emit,
                                 mock_stream_emit)
Example #15
        description='Add new mappings to our primary pid/rid mapping table.',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        '-r',
        '--fq_rdr_mapping',
        action='store',
        dest='rdr_mapping',
        help=('The fully qualified rdr mapping table name.  '
              'The project_id will be extracted from this table name.'),
        type=check_table_name,
        required=True)
    parser.add_argument(
        '-i',
        '--run_as',
        action='store',
        dest='run_as',
        help=('The email address of the service account to impersonate.'),
        type=check_email_address)
    args = parser.parse_args()

    store_to_primary_mapping_table(args.rdr_mapping, run_as=args.run_as)

    LOGGER.info("Finished pid/rid/shift storage process.")


if __name__ == '__main__':
    from utils import pipeline_logging

    pipeline_logging.configure()
    process_mappings()
Example #16
def main(raw_args=None):
    args = parse_deid_args(raw_args)
    pipeline_logging.configure(level=logging.DEBUG,
                               add_console_handler=args.console_log)
    create_tier(args.credentials_filepath, args.project_id, args.tier,
                args.idataset, args.release_tag, args.deid_stage)
Example #17
            1333234, 1310066, 715725, 1310147, 702686, 1310054, 715726, 715724,
            715714, 1310146, 1310058
        ]

    def setup_validation(self, client, *args, **keyword_args):
        pass

    def validate_rule(self, client, *args, **keyword_args):
        pass


if __name__ == '__main__':
    import cdr_cleaner.args_parser as parser
    import cdr_cleaner.clean_cdr_engine as clean_engine

    ARGS = parser.default_parse_args()
    pipeline_logging.configure(level=logging.DEBUG, add_console_handler=True)

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(
            ARGS.project_id, ARGS.dataset_id, ARGS.sandbox_dataset_id,
            [(CopeSurveyResponseSuppression, )])
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id,
                                   ARGS.sandbox_dataset_id,
                                   [(CopeSurveyResponseSuppression, )])
Example #18
    def test_specific_level(self, mock_open, mock_file_emit, mock_stream_emit):
        pl.configure(logging.CRITICAL, add_console_handler=True)
        self.assert_logs_handled(logging.CRITICAL, mock_file_emit,
                                 mock_stream_emit)