Example #1
def create_datasets(client, rdr_dataset, release_tag):
    rdr_clean = f'{release_tag}_rdr'
    rdr_staging = f'{rdr_clean}_staging'
    rdr_sandbox = f'{rdr_clean}_sandbox'

    staging_desc = f'Intermediary dataset to apply cleaning rules on {rdr_dataset}'
    labels = {
        "phase": "staging",
        "release_tag": release_tag,
        "de_identified": "false"
    }
    staging_dataset_object = bq.define_dataset(client.project, rdr_staging,
                                               staging_desc, labels)
    client.create_dataset(staging_dataset_object)
    LOGGER.info(f'Created dataset `{client.project}.{rdr_staging}`')

    sandbox_desc = (f'Sandbox created for storing records affected by the '
                    f'cleaning rules applied to {rdr_staging}')
    labels["phase"] = "sandbox"
    sandbox_dataset_object = bq.define_dataset(client.project, rdr_sandbox,
                                               sandbox_desc, labels)
    client.create_dataset(sandbox_dataset_object)
    LOGGER.info(f'Created dataset `{client.project}.{rdr_sandbox}`')

    # TODO: replace this placeholder with an actual software version lookup
    version = 'implement getting software version'
    clean_desc = f'{version} clean version of {rdr_dataset}'
    labels["phase"] = "clean"
    clean_dataset_object = bq.define_dataset(client.project, rdr_clean,
                                             clean_desc, labels)
    client.create_dataset(clean_dataset_object)
    LOGGER.info(f'Created dataset `{client.project}.{rdr_clean}`')

    return {'clean': rdr_clean, 'staging': rdr_staging, 'sandbox': rdr_sandbox}
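
A minimal sketch of invoking this helper, assuming the project's bq module is importable; the project id, source dataset, and release tag are placeholders:

# Hypothetical usage; all ids and the tag are illustrative.
client = bq.get_client('my-curation-project')
datasets = create_datasets(client,
                           rdr_dataset='20210901_rdr_raw',
                           release_tag='2021q3r1')
print(datasets['clean'], datasets['staging'], datasets['sandbox'])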
Example #2
    def test_define_dataset(self):
        # Tests if project_id is given
        self.assertRaises(RuntimeError, bq.define_dataset, None,
                          self.dataset_id, self.description,
                          self.existing_labels_or_tags)

        # Tests if dataset_id is given
        self.assertRaises(RuntimeError, bq.define_dataset, self.project_id,
                          None, self.description, self.existing_labels_or_tags)

        # Tests if description is given
        self.assertRaises(RuntimeError, bq.define_dataset, self.project_id,
                          self.dataset_id, '',
                          self.existing_labels_or_tags)

        # Tests if no label or tag is given
        self.assertRaises(RuntimeError, bq.define_dataset, self.project_id,
                          self.dataset_id, self.description, None)

        # Pre-conditions
        results = bq.define_dataset(self.project_id, self.dataset_id,
                                    self.description,
                                    self.existing_labels_or_tags)

        # Post conditions
        self.assertIsInstance(results, bigquery.Dataset)
        self.assertEqual(results.labels, self.existing_labels_or_tags)
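
The contract the test asserts, as a minimal sketch: define_dataset only builds the bigquery.Dataset object, and persisting it is left to client.create_dataset; ids and labels here are placeholders.

# Sketch only; no dataset is created server-side by this call.
dataset = bq.define_dataset('my-test-project', 'my_dataset',
                            'describes my_dataset', {'phase': 'test'})
assert isinstance(dataset, bigquery.Dataset)
assert dataset.labels == {'phase': 'test'}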
Example #3
    @classmethod
    def setUpClass(cls):
        # get the test project
        if 'test' not in cls.project_id:
            raise RuntimeError(
                f'Tests should only run in a test environment. '
                f'Current environment is {cls.project_id}.')

        if not cls.fq_table_names:
            raise RuntimeError(
                'Provide a list of fully qualified table names the '
                'test will manipulate.')

        cls.client = bq.get_client(cls.project_id)

        # get or create datasets; cleaning rules can assume the datasets exist
        required_datasets = []
        for table_name in cls.fq_table_names + cls.fq_sandbox_table_names:
            dataset_id = table_name.split('.')[1]
            required_datasets.append(dataset_id)

        desc = (f"dataset created by {cls.__name__} to test a "
                f"cleaning rule. deletion candidate.")
        for dataset_id in set(required_datasets):
            dataset = bq.define_dataset(cls.project_id, dataset_id, desc,
                                        {'test': ''})
            cls.client.create_dataset(dataset, exists_ok=True)
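
A sketch of a concrete test class satisfying these preconditions; the base class name, project id, and table names are all hypothetical:

# Hypothetical subclass; BaseTest stands in for whatever class defines
# the setUpClass above.
class MyCleaningRuleTest(BaseTest):
    project_id = 'my-test-project'  # must contain 'test'
    fq_table_names = ['my-test-project.rdr_staging.person']
    fq_sandbox_table_names = []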
Example #4
def main(raw_args=None):
    """
    Run a full RDR import.

    Assumes arguments are passed either via the command line or as a
    list of strings.
    """
    args = parse_rdr_args(raw_args)

    pipeline_logging.configure(level=logging.INFO,
                               add_console_handler=args.console_log)

    description = f'RDR DUMP loaded from {args.bucket} dated {args.export_date}'
    export_date = args.export_date.replace('-', '')
    new_dataset_name = f'rdr{export_date}'

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        args.run_as_email, SCOPES)

    client = bq.get_client(args.curation_project_id,
                           credentials=impersonation_creds)

    dataset_object = bq.define_dataset(client.project, new_dataset_name,
                                       description,
                                       {'export_date': args.export_date})
    client.create_dataset(dataset_object)

    create_rdr_tables(client, new_dataset_name, args.bucket)
    copy_vocab_tables(client, new_dataset_name, args.vocabulary)
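
A scripted run might look like the following sketch; the flag spellings are assumptions inferred from the attributes read off args above, not confirmed against parse_rdr_args:

# Hypothetical invocation; every flag and value here is a placeholder.
main([
    '--bucket', 'my-rdr-export-bucket',
    '--export_date', '2021-09-01',
    '--run_as_email', 'svc@my-project.iam.gserviceaccount.com',
    '--curation_project_id', 'my-curation-project',
    '--vocabulary', 'my-project.vocabulary',
    '--console_log',
])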
Example #5
    def test_define_dataset(self):
        self.assertRaises(RuntimeError, bq.define_dataset, None,
                          self.dataset_id, self.description, self.label_or_tag)
        self.assertRaises(RuntimeError, bq.define_dataset, '', self.dataset_id,
                          self.description, self.label_or_tag)
        self.assertRaises(RuntimeError, bq.define_dataset, self.project_id,
                          False, self.description, self.label_or_tag)
        self.assertRaises(RuntimeError, bq.define_dataset, self.project_id,
                          self.dataset_id, ' ', self.label_or_tag)
        self.assertRaises(RuntimeError, bq.define_dataset, self.project_id,
                          self.dataset_id, self.description, None)
        dataset = bq.define_dataset(self.project_id, self.dataset_id,
                                    self.description, self.label_or_tag)
        self.assertEqual(dataset.dataset_id, self.dataset_id)
Example #6
def create_dataset(project, dataset_id, description, tags, app_creds):
    """
    Create a dataset with the given parameters.

    :param project:  The project_id used to define the dataset.
    :param dataset_id: The string to name the dataset with.
    :param description: A string to use to describe the dataset.
    :param tags: The list of tags/labels to apply to the dataset.
    :param app_creds: Filepath to the credentials file used to create the dataset.
    """
    # Construct a full Dataset object to send to the API.
    dataset = bq.define_dataset(project, dataset_id, description, tags)

    client = get_client(project, app_creds)
    dataset = client.create_dataset(dataset, exists_ok=True)
    print(f"Created dataset {project}.{dataset_id}")
Example #7
def create_fitbit_datasets(client, release_tag):
    """
    Creates staging, sandbox, backup and clean datasets with descriptions and labels

    :param client: bq client
    :param release_tag: string of the form "YYYYqNrN"
    :return: dict of dataset names with keys 'clean', 'backup', 'staging', 'sandbox'
    """
    fitbit_datasets = {
        consts.CLEAN: f'{release_tag}_fitbit',
        consts.BACKUP: f'{release_tag}_fitbit_backup',
        consts.STAGING: f'{release_tag}_fitbit_staging',
        consts.SANDBOX: f'{release_tag}_fitbit_sandbox'
    }

    fitbit_desc = {
        consts.CLEAN:
            f'Cleaned version of {fitbit_datasets[consts.BACKUP]}',
        consts.BACKUP:
            f'Backup dataset during generation of {fitbit_datasets[consts.STAGING]}',
        consts.STAGING:
            f'Intermediary dataset to apply cleaning rules on {fitbit_datasets[consts.BACKUP]}',
        consts.SANDBOX:
            (f'Sandbox created for storing records affected by the '
             f'cleaning rules applied to {fitbit_datasets[consts.STAGING]}'),
    }

    for phase in fitbit_datasets:
        labels = {
            "phase": phase,
            "release_tag": release_tag,
            "de_identified": "false"
        }
        dataset_object = bq.define_dataset(client.project,
                                           fitbit_datasets[phase],
                                           fitbit_desc[phase], labels)
        client.create_dataset(dataset_object)
        LOGGER.info(
            f'Created dataset `{client.project}.{fitbit_datasets[phase]}`')

    return fitbit_datasets
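
Usage parallels Example #1; a minimal sketch with an illustrative release tag:

# Hypothetical usage; client construction as in the Example #1 sketch.
fitbit = create_fitbit_datasets(client, release_tag='2021q3r1')
# fitbit[consts.CLEAN] would then be '2021q3r1_fitbit'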
Example #8
    def test_define_dataset(self):
        # Tests if project_id is given
        self.assertRaises(TypeError, define_dataset, self.dataset_id,
                          self.description, self.label_or_tag)

        # Tests if dataset_id is given
        self.assertRaises(TypeError, define_dataset, self.project_id,
                          self.description, self.label_or_tag)

        # Tests if description is given
        self.assertRaises(TypeError, define_dataset, self.project_id,
                          self.dataset_id, self.label_or_tag)

        # Tests if no label or tag is given
        self.assertRaises(TypeError, define_dataset, self.project_id,
                          self.dataset_id, self.description)

        # Pre-conditions
        results = define_dataset(self.project_id, self.dataset_id,
                                 self.description, self.label_or_tag)

        # Post conditions
        self.assertIsInstance(results, bigquery.Dataset)
        self.assertEqual(results.labels, self.label_or_tag)
Example #9
def create_datasets(client, name, input_dataset, tier, release_tag):
    """
    Creates backup, staging, sandbox, and final datasets with the proper descriptions
    and tag/labels applied

    :param client: an instantiated bigquery client object
    :param name: the base name of the datasets to be created
    :param input_dataset: name of the input dataset
    :param tier: data tier, passed through from either a list or a command line argument
    :param release_tag: release tag, passed through from either a list or a command line argument
    :return: dict of created dataset names keyed by phase
    """

    if not client:
        raise RuntimeError("Please specify BigQuery client object")
    if not name:
        raise RuntimeError(
            "Please specify the base name of the datasets to be created")
    if not input_dataset:
        raise RuntimeError("Please specify the name of the input dataset")
    if not tier:
        raise RuntimeError(
            "Please specify the tier intended for the output datasets")
    if not release_tag:
        raise RuntimeError(
            "Please specify the release tag for the dataset in the format of YYYY#q#r"
        )

    # Construct names of datasets needed as part of the deid process
    final_dataset_id = name
    backup_dataset_id = f'{name}_{consts.BACKUP}'
    staging_dataset_id = f'{name}_{consts.STAGING}'
    sandbox_dataset_id = f'{name}_{consts.SANDBOX}'

    datasets = {
        consts.CLEAN: final_dataset_id,
        consts.BACKUP: backup_dataset_id,
        consts.STAGING: staging_dataset_id,
        consts.SANDBOX: sandbox_dataset_id
    }

    deid_datasets = [final_dataset_id, staging_dataset_id]

    # base labels and tags for the datasets
    base_labels_and_tags = {'release_tag': release_tag, 'data_tier': tier}

    description = f'dataset created from {input_dataset} for {tier}{release_tag} CDR run'

    # Creation of dataset objects and dataset label and description updates
    for phase, dataset_id in datasets.items():
        dataset_object = bq.define_dataset(client.project, dataset_id,
                                           description, base_labels_and_tags)
        client.create_dataset(dataset_object, exists_ok=True)
        dataset = bq.get_dataset(client.project, dataset_id)
        # deid datasets are labeled de-identified; the rest are not
        de_identified = 'true' if dataset_id in deid_datasets else 'false'
        new_labels = bq.update_labels_and_tags(dataset_id,
                                               base_labels_and_tags, {
                                                   'phase': phase,
                                                   'de-identified': de_identified
                                               })
        dataset.labels = new_labels
        dataset.description = f'{phase} {description}'
        client.update_dataset(dataset, ["labels", "description"])

    # Copy input dataset tables to backup and staging datasets
    tables = client.list_tables(input_dataset)
    for table in tables:
        backup_table = f'{backup_dataset_id}.{table.table_id}'
        staging_table = f'{staging_dataset_id}.{table.table_id}'
        client.copy_table(table, backup_table)
        client.copy_table(table, staging_table)

    return datasets
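
A sketch of driving this function; the dataset names, tier, and tag values are placeholders:

# Hypothetical usage; client as in the earlier sketches.
datasets = create_datasets(client,
                           name='2021q3r1_deid',
                           input_dataset='2021q3r1_combined',
                           tier='controlled',
                           release_tag='2021q3r1')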
Example #10
    # Create dataset with labels
    output_dataset_name = get_dataset_name(args.tier, args.release_tag,
                                           args.deid_stage)
    description = f'{args.deid_stage} dataset created from {args.src_dataset_id} for {args.tier}{args.release_tag} CDR run'
    labels = {
        'clean': 'yes' if args.deid_stage == 'clean' else 'no',
        'data_tier': args.tier.lower(),
        'release_tag': args.release_tag.lower()
    }

    LOGGER.info(
        f'Creating dataset {output_dataset_name} in {args.output_prod_project_id}...'
    )
    dataset_object = bq.define_dataset(args.output_prod_project_id,
                                       output_dataset_name, description,
                                       labels)
    client.create_dataset(dataset_object, exists_ok=False)

    # Copy tables from source to destination
    LOGGER.info(
        f'Copying tables from dataset {args.src_project_id}.{args.src_dataset_id} to {args.output_prod_project_id}.{output_dataset_name}...'
    )
    bq.copy_datasets(client, f'{args.src_project_id}.{args.src_dataset_id}',
                     f'{args.output_prod_project_id}.{output_dataset_name}')

    # Append extra columns to person table
    LOGGER.info('Appending extra columns to the person table...')
    update_person(client, args.output_prod_project_id, output_dataset_name)

    LOGGER.info('Completed successfully.')