Example 1
def main(raw_args=None):
    """
    Truncate and store Fitbit data.

    Assumes you are passing arguments either via command line or a
    list.
    """
    parser = get_fitbit_parser()
    args, kwargs = clean_cdr.fetch_args_kwargs(parser, raw_args)

    pipeline_logging.configure(level=logging.INFO,
                               add_console_handler=args.console_log)

    # Identify the cleaning classes being run for specified data_stage
    # and validate if all the required arguments are supplied
    cleaning_classes = clean_cdr.DATA_STAGE_RULES_MAPPING[consts.FITBIT]
    clean_cdr.validate_custom_params(cleaning_classes, **kwargs)

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        args.run_as_email, SCOPES)

    client = bq.get_client(args.project_id, credentials=impersonation_creds)

    # create staging, sandbox, backup and clean datasets with descriptions and labels
    fitbit_datasets = create_fitbit_datasets(client, args.release_tag)

    copy_fitbit_tables_from_views(client,
                                  args.fitbit_dataset,
                                  fitbit_datasets[consts.BACKUP],
                                  table_prefix='v_')
    bq.copy_datasets(client, fitbit_datasets[consts.BACKUP],
                     fitbit_datasets[consts.STAGING])

    common_cleaning_args = [
        '-p', args.project_id, '-d', fitbit_datasets[consts.STAGING], '-b',
        fitbit_datasets[consts.SANDBOX], '-s', '-a', consts.FITBIT
    ]
    fitbit_cleaning_args = args_parser.add_kwargs_to_args(
        common_cleaning_args, kwargs)

    clean_cdr.main(args=fitbit_cleaning_args)

    # Snapshot the staging dataset to final dataset
    bq.build_and_copy_contents(client, fitbit_datasets[consts.STAGING],
                               fitbit_datasets[consts.CLEAN])
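
As the docstring notes, main() can be driven programmatically by passing a list of raw arguments instead of relying on the command line. A minimal invocation sketch follows; the flag spellings are assumptions inferred from the attributes the function reads (project_id, fitbit_dataset, release_tag, run_as_email, console_log), since get_fitbit_parser() is not shown here.

# Hypothetical invocation; flag names are assumptions, not taken from
# get_fitbit_parser(), which is defined elsewhere in the pipeline.
main([
    '--project_id', 'my-curation-project',
    '--fitbit_dataset', 'raw_fitbit',
    '--release_tag', '2022q1r1',
    '--run_as', 'ops-sa@my-curation-project.iam.gserviceaccount.com',
    '--console_log',
])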
Example 2
def main(raw_args=None):
    """
    Clean an RDR import.

    Assumes you are passing arguments either via command line or a
    list.
    """
    args, kwargs = parse_rdr_args(raw_args)

    pipeline_logging.configure(level=logging.INFO,
                               add_console_handler=args.console_log)

    # specific check on truncation_date. It should not cause a failure if it is not set.
    if not args.truncation_date:
        LOGGER.info('truncation_date is unset.  It will default to the current '
                    'date in the truncation cleaning rule.')

    # validate we've got all required data before continuing
    cleaning_classes = clean_cdr.DATA_STAGE_RULES_MAPPING.get('rdr')
    clean_cdr.validate_custom_params(cleaning_classes, **kwargs)

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        args.run_as_email, SCOPES)

    client = bq.get_client(args.curation_project_id,
                           credentials=impersonation_creds)

    # create staging, sandbox, and clean datasets with descriptions and labels
    datasets = create_datasets(client, args.rdr_dataset, args.release_tag)

    # copy raw data into staging dataset
    copy_raw_rdr_tables(client, args.rdr_dataset, datasets.get('staging'))

    # clean the RDR staging dataset
    cleaning_args = [
        '-p', args.curation_project_id,
        '-d', datasets.get('staging', 'UNSET'),
        '-b', datasets.get('sandbox', 'UNSET'),
        '--data_stage', 'rdr',
        '--truncation_date', args.truncation_date,
        '--export_date', args.export_date
    ]

    all_cleaning_args = add_kwargs_to_args(cleaning_args, kwargs)
    clean_cdr.main(args=all_cleaning_args)

    bq.build_and_copy_contents(client, datasets.get('staging', 'UNSET'),
                               datasets.get('clean', 'UNSET'))

    # update sandbox description and labels
    sandbox_dataset = client.get_dataset(datasets.get(
        'sandbox', 'UNSET'))  # Make an API request.
    sandbox_dataset.description = (
        f'Sandbox created for storing records affected by the cleaning '
        f'rules applied to {datasets.get("clean")}')
    sandbox_dataset.labels['phase'] = 'sandbox'
    sandbox_dataset = client.update_dataset(
        sandbox_dataset, ["description", "labels"])  # Make an API request.

    full_dataset_id = f'{sandbox_dataset.project}.{sandbox_dataset.dataset_id}'
    LOGGER.info(
        f'Updated dataset `{full_dataset_id}` with description `{sandbox_dataset.description}`'
    )

    LOGGER.info(f'RDR snapshot and cleaning, '
                f'`{client.project}.{datasets.get("clean")}`, is complete.')
Example 3
    def test_clean_cdr(self, mock_fetch_args, mock_validate_args,
                       mock_get_query_list, mock_clean_dataset):

        from argparse import Namespace

        # Test clean_dataset() function call
        args = [
            '-p', self.project_id, '-d', self.dataset_id, '-b',
            self.sandbox_dataset_id, '--data_stage', 'ehr'
        ]
        # create the argparse Namespace returned by the mocked fetch_args_kwargs
        expected_args = Namespace(
            **{
                'project_id': self.project_id,
                'dataset_id': self.dataset_id,
                'sandbox_dataset_id': self.sandbox_dataset_id,
                'data_stage': DataStage.EHR,
                'console_log': False,
                'list_queries': False
            })

        expected_kargs = {}
        mock_fetch_args.return_value = expected_args, expected_kargs

        rules = cc.DATA_STAGE_RULES_MAPPING['ehr']

        cc.main(args)

        mock_validate_args.assert_called_once_with(rules, **expected_kargs)

        mock_clean_dataset.assert_called_once_with(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            sandbox_dataset_id=self.sandbox_dataset_id,
            rules=rules,
            table_namer=DataStage.EHR.value)

        # Test get_queries() function call
        args = [
            '-p', self.project_id, '-d', self.dataset_id, '-b',
            self.sandbox_dataset_id, '--data_stage', 'ehr', '--list_queries',
            True
        ]

        expected_args = Namespace(
            **{
                'project_id': self.project_id,
                'dataset_id': self.dataset_id,
                'sandbox_dataset_id': self.sandbox_dataset_id,
                'data_stage': DataStage.EHR,
                'console_log': False,
                'list_queries': True
            })

        expected_kargs = {}
        mock_fetch_args.return_value = expected_args, expected_kargs

        cc.main(args)

        mock_get_query_list.assert_called_once_with(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            sandbox_dataset_id=self.sandbox_dataset_id,
            rules=rules,
            table_namer=DataStage.EHR.value)
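
The @mock.patch decorators that supply the four mock parameters are not shown above. With unittest.mock, stacked patch decorators inject their mocks bottom-up, so mock_fetch_args appearing first implies fetch_args_kwargs is patched closest to the test function. A small self-contained demonstration of that ordering, patching standard-library targets rather than the curation modules:

import os
from unittest import mock

# The decorator nearest the function supplies the first mock argument.
@mock.patch('os.path.exists')   # outer patch -> second parameter
@mock.patch('os.getcwd')        # inner patch -> first parameter
def demo(mock_getcwd, mock_exists):
    mock_getcwd.return_value = '/tmp'
    mock_exists.return_value = True
    assert os.getcwd() == '/tmp'
    assert os.path.exists('anything')

demo()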
Example 4
def create_tier(credentials_filepath, project_id, tier, input_dataset,
                release_tag, deid_stage, run_as, **kwargs):
    """
    This function is the main entry point for the deid process.
    It passes the required parameters to the implementing functions.

    :param credentials_filepath: filepath to credentials to access GCP
    :param project_id: project_id associated with the input dataset
    :param tier: controlled or registered tier intended for the output dataset
    :param input_dataset: name of the input dataset
    :param release_tag: release tag for dataset in the format of YYYYq#r#
    :param deid_stage: deid stage (deid, base or clean)
    :param run_as: email address of the service account to impersonate
    :return: name of created controlled or registered dataset
    """
    # validation of params
    validate_create_tier_args(tier, deid_stage, release_tag)

    # today's date for QA handoff
    qa_handoff_date = datetime.now().strftime('%Y-%m-%d')

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        run_as, SCOPES, credentials_filepath)

    client = bq.get_client(project_id, credentials=impersonation_creds)

    # Get Final Dataset name
    final_dataset_name = get_dataset_name(tier, release_tag, deid_stage)

    # Create intermediary datasets and copy tables from input dataset to newly created dataset
    datasets = create_datasets(client, final_dataset_name, input_dataset, tier,
                               release_tag)
    bq.copy_datasets(client, input_dataset, datasets[consts.STAGING])

    # Run cleaning rules
    cleaning_args = [
        '-p', project_id, '-d', datasets[consts.STAGING], '-b',
        datasets[consts.SANDBOX], '--data_stage', f'{tier}_tier_{deid_stage}'
    ]

    # For the base stage, record the qa_handoff_date and ETL version in _cdr_metadata
    if 'base' in deid_stage:
        versions = add_cdr_metadata.get_etl_version(datasets[consts.STAGING],
                                                    project_id)
        if not versions:
            raise RuntimeError(
                'ETL version does not exist; make sure the _cdr_metadata table '
                'was created in the combined step')
        add_cdr_metadata.main([
            '--component', add_cdr_metadata.INSERT, '--project_id', project_id,
            '--target_dataset', datasets[consts.STAGING], '--qa_handoff_date',
            qa_handoff_date, '--etl_version', versions[0]
        ])
    else:
        LOGGER.info(
            'deid_stage is not base; no data inserted into the _cdr_metadata table'
        )

    controlled_tier_cleaning_args = add_kwargs_to_args(cleaning_args, kwargs)
    clean_cdr.main(args=controlled_tier_cleaning_args)

    # Snapshot the staging dataset to final dataset
    create_schemaed_snapshot_dataset(project_id, datasets[consts.STAGING],
                                     final_dataset_name, False)

    return datasets
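
validate_create_tier_args is not shown, but the docstring pins the expected release_tag format to YYYYq#r#. A minimal sketch of how that shape could be checked follows; the lowercase separators and the exact digit counts are assumptions, and the real validator may enforce more.

import re

# Hypothetical release_tag check for the documented YYYYq#r# format
# (e.g. '2022q1r3'); the real validate_create_tier_args may be stricter.
RELEASE_TAG_RE = re.compile(r'^\d{4}q\d+r\d+$')

def looks_like_release_tag(release_tag: str) -> bool:
    return bool(RELEASE_TAG_RE.fullmatch(release_tag))

assert looks_like_release_tag('2022q1r3')
assert not looks_like_release_tag('2022-q1-r3')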