Example #1
def createBigQuery():

    bigquery_name = 'quotedataset'
    bigquery_description = 'this is a quotedataset test'

    client = bigquery.Client()
    dataset_ref = client.dataset(bigquery_name)
    dataset = Dataset(dataset_ref)
    dataset.description = bigquery_description
    dataset = client.create_dataset(dataset)  # API request

    print('Dataset {} created.'.format(dataset.dataset_id))

    table_id = 'quotes_table'

    table_ref = dataset_ref.table(table_id)
    table = bigquery.Table(table_ref)

    table.schema = (
        bigquery.SchemaField('Quote', 'STRING', 'REPEATED'),
        bigquery.SchemaField('Author', 'STRING', 'REPEATED'),
        bigquery.SchemaField('Tags', 'STRING', 'REPEATED'),
    )

    table = client.create_table(table)
    print('Created table {} in dataset {}.'.format(table_id, bigquery_name))
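The snippets on this page are shown without their imports; everything they touch lives in the google-cloud-bigquery package. A minimal sketch of the imports that would back Example #1 and most of the examples below, plus a hypothetical call (it assumes application default credentials with permission to create datasets and tables):

from google.cloud import bigquery
from google.cloud.bigquery import (Dataset, DatasetReference, LoadJobConfig,
                                   SchemaField, Table, TableReference)

# Hypothetical invocation against the default project of the active credentials.
createBigQuery()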
Example #2
def upload_tweets():
    big_query_client = bigquery.Client.from_service_account_json('my-beam-project-b2834963a4ae.json')

    dataset_ref = big_query_client.dataset('Tweets')
    dataset = Dataset(dataset_ref)
    dataset.description = 'This represents tweets of trending topics'
    dataset = big_query_client.create_dataset(dataset)

    SCHEMA = [
        SchemaField('Tweets', 'STRING', mode='NULLABLE'),
    ]
    table_ref = big_query_client.dataset('Tweets').table('tabletweet')

    load_config = LoadJobConfig()
    load_config.skip_leading_rows = 0
    load_config.schema = SCHEMA
    load_config.allow_quoted_newlines = True
    load_config.ignore_unknown_values = False
    load_config.max_bad_records = 200


    with open('tweets.csv', 'rb') as readable:
        big_query_client.load_table_from_file(
            readable, table_ref, job_config=load_config)
    print('tweets file uploaded to big query')
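load_table_from_file returns a load job that completes asynchronously, so the function above may print before the data is actually in the table. A minimal sketch of waiting for the job and confirming the row count, reusing table_ref and load_config from above:

with open('tweets.csv', 'rb') as readable:
    load_job = big_query_client.load_table_from_file(
        readable, table_ref, job_config=load_config)
load_job.result()  # blocks until the load job finishes; raises on failure
loaded_table = big_query_client.get_table(table_ref)
print('{} rows loaded into tabletweet'.format(loaded_table.num_rows))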
Example #3
def load(project_id, bq_client, src_dataset_id, dst_dataset_id):
    """
    Transform safely loaded tables and store results in target dataset.

    :param project_id: identifies the BigQuery project
    :param bq_client: a BigQuery client object
    :param src_dataset_id: identifies the source dataset
    :param dst_dataset_id: identifies the destination dataset
    :return: List of BQ job_ids
    """
    dst_dataset = Dataset(f'{bq_client.project}.{dst_dataset_id}')
    dst_dataset.description = f'Vocabulary cleaned and loaded from {src_dataset_id}'
    dst_dataset.labels = {'type': 'vocabulary'}
    dst_dataset.location = "US"
    bq_client.create_dataset(dst_dataset, exists_ok=True)
    src_tables = list(bq_client.list_tables(dataset=src_dataset_id))

    job_config = QueryJobConfig()
    query_jobs = []
    for src_table in src_tables:
        schema = bq.get_table_schema(src_table.table_id)
        destination = f'{project_id}.{dst_dataset_id}.{src_table.table_id}'
        table = bq_client.create_table(Table(destination, schema=schema),
                                       exists_ok=True)
        job_config.destination = table
        query = SELECT_TPL.render(project_id=project_id,
                                  dataset_id=src_dataset_id,
                                  table=src_table.table_id,
                                  fields=schema)
        query_job = bq_client.query(query, job_config=job_config)
        LOGGER.info(f'table:{destination} job_id:{query_job.job_id}')
        query_jobs.append(query_job)
        query_job.result()
    return query_jobs
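SELECT_TPL, bq.get_table_schema and LOGGER are defined elsewhere in that project. Judging from the render() call, SELECT_TPL is a Jinja2-style template; a purely hypothetical stand-in that matches the keyword arguments used above could look like the sketch below (the real template presumably applies per-field cleaning rather than selecting columns verbatim):

from jinja2 import Template

SELECT_TPL = Template('''
SELECT
  {% for field in fields %}{{ field.name }}{{ "," if not loop.last else "" }}
  {% endfor %}
FROM `{{ project_id }}.{{ dataset_id }}.{{ table }}`
''')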
Example #4
def test_dataflow_event_big_query_success(sdc_builder, sdc_executor, gcp):
    """
    Create data using Google BigQuery client
    and then check if Google BigQuery origin receives them using wiretap, including a query successfully read event.

    The pipeline looks like:
        google_bigquery >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery',
                                                 type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY id'
    google_bigquery.set_attributes(query=query_str)

    wiretap = pipeline_builder.add_wiretap()
    events_wiretap = pipeline_builder.add_wiretap()

    google_bigquery >> wiretap.destination
    google_bigquery >= events_wiretap.destination

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...',
                    dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(
            Table(dataset_ref.table(table_name), schema=SCHEMA))
        errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT)
        assert not errors, 'Errors found when creating data using bigquery client'

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        assert len(wiretap.output_records) == len(ROWS_TO_INSERT),\
            f'Expected {len(ROWS_TO_INSERT)} records, received {len(wiretap.output_records)}'
        rows_from_wiretap = get_rows_from_wiretap(wiretap)
        assert rows_from_wiretap == ROWS_TO_INSERT

        # We have exactly one output record, check that it is a big-query-success event
        assert len(
            events_wiretap.output_records
        ) == 1, f'Expected 1 record, received {len(events_wiretap.output_records)}'
        event_record = events_wiretap.output_records[0]
        event_type = event_record.header.values['sdc.event.type']
        assert event_type == 'big-query-success', 'Received %s as event type (expected big-query-success)' % event_type
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example #5
def test_multiple_batch(sdc_builder, sdc_executor, gcp, number_batches):
    """
    Create data using Google BigQuery destination using different batch sizes
    and then check if it has been correctly created using BigQuery client.

    The pipeline looks like:
        dev_raw_data_source >> google_bigquery
    """
    number_records = 1000
    batch_size = number_records//number_batches

    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_data_generator = pipeline_builder.add_stage('Dev Data Generator')
    dev_data_generator.set_attributes(batch_size=batch_size,
                                      fields_to_generate=[
                                          {"type": "INTEGER", "field": "age"},
                                          {"type": "STRING", "field": "full_name"}
                                      ])

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage(name=DESTINATION_STAGE_NAME, type='destination')
    google_bigquery.set_attributes(dataset=dataset_name,
                                   table_name=table_name)

    dev_data_generator >> google_bigquery

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=SCHEMA))

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_output_records_count(number_records)
        sdc_executor.stop_pipeline(pipeline)

        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()

        assert len(data_from_bigquery) >= number_records

        history = sdc_executor.get_pipeline_history(pipeline)
        records = history.latest.metrics.counter('pipeline.batchInputRecords.counter').count

        assert len(data_from_bigquery) == records
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example #6
def test_data_type(sdc_builder, sdc_executor, gcp, data_type, data,
                   expected_data):
    """
    Create data using Google BigQuery client
    and then check if Google BigQuery origin receives them using wiretap with one data format each.

    The pipeline looks like:
        google_bigquery >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery',
                                                 type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name}'
    google_bigquery.set_attributes(query=query_str)

    wiretap = pipeline_builder.add_wiretap()

    google_bigquery >> wiretap.destination

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [
        SchemaField('id', 'INTEGER', mode='NULLABLE'),
        SchemaField('data', data_type, mode='NULLABLE')
    ]

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...',
                    dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(
            Table(dataset_ref.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, [(None, data)])
        assert not errors, 'Errors found when creating data using bigquery client'

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')

        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        assert len(
            wiretap.output_records
        ) == 1, f'Expected 1 record, received {len(wiretap.output_records)}'
        received_data = wiretap.output_records[0].field['data']

        assert received_data == expected_data
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example #7
def createDataset(datasetname):
    dataset_ref = bigquery_client.dataset(datasetname)

    dataset = Dataset(dataset_ref)
    dataset.location = 'US'

    dataset = bigquery_client.create_dataset(dataset)

    return
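create_dataset raises google.api_core.exceptions.Conflict when the dataset already exists. Newer releases of google-cloud-bigquery also accept exists_ok=True, which makes the call idempotent; a minimal variant sketch of the function above (the function name is made up here):

def createDatasetIfMissing(datasetname):
    # Return the existing dataset instead of raising Conflict if it is already there.
    dataset = Dataset(bigquery_client.dataset(datasetname))
    dataset.location = 'US'
    return bigquery_client.create_dataset(dataset, exists_ok=True)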
Example #8
def test_google_bigquery_origin_batch_handling(sdc_builder, sdc_executor, gcp):
    """Verify proper batch handling by the BigQuery origin.

    In this test, we write 8 records to BigQuery with a batch size of 3, verifying that each batch is of
    size 3 and that 8 total records are captured. The small numbers are used because of the limitations
    described in SDC-8765.
    """
    MAX_BATCH_SIZE = 3
    TOTAL_RECORDS = 8

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY number'

    pipeline_builder = sdc_builder.get_pipeline_builder()
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin')
    google_bigquery.set_attributes(query=query_str, max_batch_size_in_records=MAX_BATCH_SIZE)
    trash = pipeline_builder.add_stage('Trash')
    google_bigquery >> trash

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('number', 'STRING', mode='required'),
              SchemaField('value', 'STRING', mode='required')]

    data = [dict(number=str(i), value=get_random_string()) for i in range(TOTAL_RECORDS)]
    dataset = Dataset(bigquery_client.dataset(dataset_name))

    try:
        # Using Google bigquery client, create dataset, table and data inside table.
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset)
        table = bigquery_client.create_table(Table(dataset.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, data)
        assert not errors

        # Ceiling division is needed to capture all the complete batches along with the last partial one.
        snapshot = sdc_executor.capture_snapshot(pipeline,
                                                 start_pipeline=True,
                                                 batches=math.ceil(TOTAL_RECORDS / MAX_BATCH_SIZE)).snapshot

        # Assert that the batch size is respected in each batch, including the last one. For 8
        # records and a max batch size of 3, we expect batch sizes of 3, 3 and 2.
        for i, batch in enumerate(snapshot.snapshot_batches, start=1):
            expected_size = (MAX_BATCH_SIZE
                             if i * MAX_BATCH_SIZE <= TOTAL_RECORDS
                             else TOTAL_RECORDS % MAX_BATCH_SIZE)
            assert len(batch.stage_outputs[google_bigquery.instance_name].output) == expected_size

        all_records = [record.field
                       for batch in snapshot.snapshot_batches
                       for record in batch.stage_outputs[google_bigquery.instance_name].output]
        assert all_records == data
    finally:
        bigquery_client.delete_dataset(dataset, delete_contents=True)
Example #9
def test_google_bigquery_destination(sdc_builder, sdc_executor, gcp):
    """
    Send data to Google BigQuery from Dev Raw Data Source and
    confirm that the Google BigQuery destination successfully receives them using the Google BigQuery client.

    This is achieved by using a deduplicator, which ensures that there is only one ingest to Google BigQuery.
    The pipeline looks like:
        dev_raw_data_source >> record_deduplicator >> google_bigquery
                               record_deduplicator >> trash
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='DELIMITED',
                                       header_line='WITH_HEADER',
                                       raw_data='\n'.join(CSV_DATA_TO_INSERT))

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='destination')
    google_bigquery.set_attributes(dataset=dataset_name,
                                   table_name=table_name)

    record_deduplicator = pipeline_builder.add_stage('Record Deduplicator')
    trash = pipeline_builder.add_stage('Trash')

    dev_raw_data_source >> record_deduplicator >> google_bigquery
    record_deduplicator >> trash

    pipeline = pipeline_builder.build(title='Google BigQuery Destination').configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('full_name', 'STRING', mode='required'),
              SchemaField('age', 'INTEGER', mode='required')]
    dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

    try:
        logger.info('Creating dataset %s using Google BigQuery client ...', dataset_name)
        dataset = bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))

        logger.info('Starting BigQuery Destination pipeline and waiting for it to produce records ...')
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(1)

        logger.info('Stopping BigQuery Destination pipeline and getting the count of records produced in total ...')
        sdc_executor.stop_pipeline(pipeline)

        # Verify by reading records using Google BigQuery client
        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()
        logger.debug('read_data = {}'.format(data_from_bigquery))
        assert ROWS_EXPECTED == data_from_bigquery
    finally:
        bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example #10
def test_object_names_columns(sdc_builder, sdc_executor, gcp, column_name):
    """
    Create data using Google BigQuery destination with different column names
    and then check if it has been correctly created using BigQuery client.

    The pipeline looks like:
        dev_raw_data_source >> google_bigquery
    """
    data_to_insert = [(1, 'data')]
    rows_to_insert = [f'id,{column_name}'] + [','.join(str(element) for element in row) for row in data_to_insert]
    data_to_expect = [(1, 'data')]
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='DELIMITED',
                                       header_line='WITH_HEADER',
                                       stop_after_first_batch=True,
                                       raw_data='\n'.join(rows_to_insert))

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)

    google_bigquery = pipeline_builder.add_stage(name=DESTINATION_STAGE_NAME, type='destination')
    google_bigquery.set_attributes(dataset=dataset_name,
                                   table_name=table_name)

    dev_raw_data_source >> google_bigquery

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('id', 'INTEGER', mode='NULLABLE'),
              SchemaField(column_name, 'STRING', mode='NULLABLE')]

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()

        assert data_to_expect == data_from_bigquery
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example #11
def test_google_bigquery_origin(sdc_builder, sdc_executor, gcp):
    """
    Create data using Google BigQuery client
    and then check if Google BigQuery origin receives them using snapshot.

    The pipeline looks like:
        google_bigquery >> trash
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery',
                                                 type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY full_name'
    google_bigquery.set_attributes(query=query_str)

    trash = pipeline_builder.add_stage('Trash')
    google_bigquery >> trash
    pipeline = pipeline_builder.build(
        title='Google BigQuery').configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [
        SchemaField('full_name', 'STRING', mode='required'),
        SchemaField('age', 'INTEGER', mode='required')
    ]
    dataset_ref = Dataset(bigquery_client.dataset(dataset_name))
    try:
        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...',
                    dataset_name)
        dataset = bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(
            Table(dataset_ref.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT)
        assert errors == []

        # Start pipeline and verify correct rows are received using snapshot.
        logger.info('Starting pipeline and snapshot')
        snapshot = sdc_executor.capture_snapshot(pipeline,
                                                 start_pipeline=True).snapshot
        if sdc_executor.get_pipeline_status(pipeline) == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)
        rows_from_snapshot = [(record.value['value'][0]['value'],
                               int(record.value['value'][1]['value']))
                              for record in snapshot[google_bigquery].output]

        assert rows_from_snapshot == ROWS_TO_INSERT
    finally:
        bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example #12
def create_bq_dataset(dataset_name='price_data'):
    '''Create the dataset if it does not already exist.'''
    client = Client()
    datasets = [
        client.project + "." + i.dataset_id
        for i in list(client.list_datasets())
    ]
    if client.project + "." + dataset_name not in datasets:
        dataset = Dataset(f"{client.project}.{dataset_name}")
        dataset.location = "US"
        client.create_dataset(dataset)
    else:
        print("Dataset already exists")
Example #13
    def __create_dataset(self, data_set, data_set_description):
        """
        Creates a dataset in Google BigQuery.

        :param data_set: name of the dataset
        :type data_set: str

        :param data_set_description: short description of the dataset
        :type data_set_description: str
        """
        dataset_ref = self.client.dataset(data_set)
        dataset = Dataset(dataset_ref)
        dataset.description = data_set_description
        dataset = self.client.create_dataset(dataset)  # API request
Example #14
    def setUp(self):
        project_id = 'fake_project_id'
        dataset_id = 'fake_dataset_id'
        bucket_name = 'fake_bucket'
        dataset_ref = DatasetReference(project_id, dataset_id)
        self.dst_dataset = Dataset(dataset_ref)
        self.bq_client = mock.MagicMock()
        self.gcs_client = mock.MagicMock()
        self.bucket_name = bucket_name
        self.all_blobs = [
            Blob(f'{table}.csv', self.bucket_name)
            for table in common.VOCABULARY_TABLES
        ]
Example #15
def test_object_names_datasets(sdc_builder, sdc_executor, gcp, dataset_name):
    """
    Create data using Google BigQuery destination with different dataset names
    and then check if it has been correctly created using BigQuery client.

    The pipeline looks like:
        dev_raw_data_source >> google_bigquery
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='DELIMITED',
                                       header_line='WITH_HEADER',
                                       stop_after_first_batch=True,
                                       raw_data='\n'.join(CSV_DATA_TO_INSERT))

    table_name = get_random_string(string.ascii_letters, 5)
    # If tests fail for any reason, leftovers with equal names might lead to more errors
    dataset_name = f'{dataset_name}_{get_random_string(string.ascii_letters, 5)}'

    google_bigquery = pipeline_builder.add_stage(name=DESTINATION_STAGE_NAME, type='destination')
    google_bigquery.set_attributes(dataset=dataset_name,
                                   table_name=table_name)

    dev_raw_data_source >> google_bigquery

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=SCHEMA))

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()

        assert ROWS_EXPECTED == data_from_bigquery
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example #16
def test_object_names_datasets(sdc_builder, sdc_executor, gcp, dataset_name):
    """
    Create data using Google BigQuery client with specific table names
    and then check if Google BigQuery origin receives them using wiretap.

    The pipeline looks like:
        google_bigquery >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    table_name = get_random_string(string.ascii_letters, 5)
    dataset_name = f'{dataset_name}_{get_random_string(string.ascii_letters, 5)}'
    google_bigquery = pipeline_builder.add_stage('Google BigQuery',
                                                 type='origin')
    query_str = f"SELECT * FROM `{dataset_name}`.`{table_name}` ORDER BY id"
    google_bigquery.set_attributes(query=query_str)

    wiretap = pipeline_builder.add_wiretap()

    google_bigquery >> wiretap.destination

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...',
                    dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(
            Table(dataset_ref.table(table_name), schema=SCHEMA))
        errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT)
        assert not errors, 'Errors found when creating data using bigquery client'

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        assert len(wiretap.output_records) == len(ROWS_TO_INSERT),\
            f'Expected {len(ROWS_TO_INSERT)} records, received {len(wiretap.output_records)}'
        rows_from_wiretap = get_rows_from_wiretap(wiretap)
        assert rows_from_wiretap == ROWS_TO_INSERT
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example #17
    def create_external_table_hive_partitioning(
            bq_client: bigquery.Client, dataset: bigquery.Dataset,
            table_id: str, gcs_directory_path: str) -> bigquery.Table:
        """
            Creates an external table with AUTO hive partitioning in GCS
        :param bq_client: Client object to bigquery
        :param dataset: dataset object. Check 'get_or_create_dataset' method
        :param table_id: Table to be created
        :param gcs_directory_path: Directory of GCS with the data. For example:
            If you have a structure like this:
            "gs://bucket/images_metadata/source_id=abc/date=2018-02-20"
            You should pass:
            "******"
        :return: the created external table
        """
        table = bigquery.Table(dataset.table(table_id))

        external_config = bigquery.ExternalConfig(
            bigquery.SourceFormat.PARQUET)
        external_config.source_uris = [f"{gcs_directory_path}/*"]
        hive_part_opt = HivePartitioningOptions()
        hive_part_opt.mode = "AUTO"
        hive_part_opt.source_uri_prefix = gcs_directory_path

        external_config.hive_partitioning = hive_part_opt
        table.external_data_configuration = external_config
        table = bq_client.create_table(table, exists_ok=True)

        return table
Example #18
    def _create_brand_new_datasets(self, product_id):
        """
    Creates a new Google BigQuery dataset

    Args:
      product_id (str):
        A string representing the desired product_id 

    """
        dataset = Dataset(self.client.dataset(dataset_id=product_id))
        dataset.location = _DATASET_GEOLOCATION

        self.datasets[product_id]['datasets'] = (
            self.client.create_dataset(dataset))
        self._create_brand_new_tables(product_id=product_id)
        return
Example #19
    def prepare_table(self, dataset: bigquery.Dataset, table_name: str,
                      columns_schema: list,
                      incremental: bool) -> bigquery.TableReference:
        table_reference = dataset.table(table_name)
        table = bigquery.Table(table_reference, columns_schema)

        try:
            bq_table = self.bigquery_client.get_table(table_reference)
            table_exist = True
            if incremental:
                schema_mapper.is_table_definition_in_match_with_bigquery(
                    columns_schema, bq_table)
            else:
                self.bigquery_client.delete_table(table_reference)
                table_exist = False
        except bq_exceptions.NotFound:
            table_exist = False
        except bq_exceptions.BadRequest as err:
            message = 'Cannot create table %s: %s' % (table_reference,
                                                      str(err))
            raise UserException(message)

        if not table_exist:
            try:
                self.bigquery_client.create_table(table)
            except bq_exceptions.BadRequest as err:
                message = 'Cannot create table %s: %s' % (table_name, str(err))
                raise UserException(message)
        return table_reference
Example #20
    def test_table_lookup(self):
        # type: () -> None
        dataset_ref1 = DatasetReference('my_project', 'dataset1')
        table_ref1 = TableReference(dataset_ref1, 'table1')
        table1 = Table(table_ref1, _TEST_SCHEMA)

        # Trying to get the same dataset/table in another project doesn't work.
        with self.assertRaisesRegexp(NotFound, 'other_project'):
            self.bq_client.get_table(
                TableReference(DatasetReference('other_project', 'dataset1'),
                               'table1'))

        # Trying to get the table before the dataset exists doesn't work
        with self.assertRaisesRegexp(NotFound, 'dataset1'):
            self.bq_client.get_table(table_ref1)
        self.bq_client.create_dataset(Dataset(dataset_ref1))

        # Trying to get the table before the table exists doesn't work
        with self.assertRaises(NotFound):
            self.bq_client.get_table(table_ref1)

        self.bq_client.create_table(table1)

        # Assert the created table can be retrieved with the expected properties.
        table_found = self.bq_client.get_table(table_ref1)
        self.assertEqual(table_found.project, "my_project")
        self.assertEqual(table_found.dataset_id, "dataset1")
        self.assertEqual(table_found.table_id, "table1")
        six.assertCountEqual(self, table_found.schema, _TEST_SCHEMA)
Example #21
def load_views(bq: bigquery.Client, dataset: bigquery.Dataset,
               views: Dict[str, str]):
    """Load views for a test."""
    for table, view_query in views.items():
        view = bigquery.Table(dataset.table(table))
        view.view_query = view_query.format(project=dataset.project,
                                            dataset=dataset.dataset_id)
        bq.create_table(view)
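A hypothetical call, assuming an events table with a submission_timestamp column already exists in the target dataset; the {project} and {dataset} placeholders match the format() substitution performed above:

views = {
    'daily_counts': ('SELECT DATE(submission_timestamp) AS day, COUNT(*) AS n '
                     'FROM `{project}.{dataset}.events` GROUP BY day'),
    'latest_events': ('SELECT * FROM `{project}.{dataset}.events` '
                      'ORDER BY submission_timestamp DESC LIMIT 100'),
}
load_views(bq, dataset, views)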
Example #22
    def test_listing_datasets(self):
        # type: () -> None
        self.assertFalse(self.bq_client.list_datasets())
        self.assertFalse(self.bq_client.list_datasets('my_project'))
        self.bq_client.create_dataset(Dataset(DatasetReference('my_project', 'dataset1')))
        self.bq_client.create_dataset(Dataset(DatasetReference('my_project', 'dataset2')))
        self.bq_client.create_dataset(Dataset(DatasetReference('other_project', 'dataset3')))
        six.assertCountEqual(self,
                             [dataset.dataset_id for dataset in self.bq_client.list_datasets()],
                             ['dataset1', 'dataset2'])
        six.assertCountEqual(
                self,
                [dataset.dataset_id for dataset in self.bq_client.list_datasets('my_project')],
                ['dataset1', 'dataset2'])
        six.assertCountEqual(
                self,
                [dataset.dataset_id for dataset in self.bq_client.list_datasets('other_project')],
                ['dataset3'])
Example #23
def check_and_create_staging_dataset(dst_dataset_id, bucket_name, bq_client):
    """

    :param dst_dataset_id: final destination to load the vocabulary in BigQuery
    :param bucket_name: the location in GCS containing the vocabulary files
    :param bq_client: google bigquery client
    :return: staging dataset object
    """
    staging_dataset_id = f'{dst_dataset_id}_staging'
    staging_dataset = Dataset(f'{bq_client.project}.{staging_dataset_id}')
    try:
        bq_client.get_dataset(staging_dataset)
    except NotFound:
        staging_dataset.description = f'Vocabulary loaded from gs://{bucket_name}'
        staging_dataset.labels = {'type': 'vocabulary', 'phase': 'staging'}
        staging_dataset.location = "US"
        staging_dataset = bq_client.create_dataset(staging_dataset)
        LOGGER.info(f'Successfully created dataset {staging_dataset_id}')
    return staging_dataset
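Once the staged tables have been transformed into the destination dataset (as in the load() function of Example #3), a common follow-up is to drop the staging dataset again; a minimal sketch, assuming the same bq_client and LOGGER:

def drop_staging_dataset(bq_client, staging_dataset):
    # Delete the staging dataset together with any tables it still contains.
    bq_client.delete_dataset(staging_dataset,
                             delete_contents=True,
                             not_found_ok=True)
    LOGGER.info(f'Dropped staging dataset {staging_dataset.dataset_id}')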
Example #24
    def create_data_set(self, data_set_name):
        """
        :param data_set_name: str - The name of the dataset to be created
        :return: 0 indicates success
        """

        data_set_ref = self.client.dataset(data_set_name)
        data_set = Dataset(data_set_ref)
        data_set.description = ''
        data_set.location = 'EU'

        try:
            self.client.create_dataset(data_set)  # API request
            logging.info('Data set - ' + data_set_name +
                         ' successfully created')
        except Conflict:
            logging.info('Data set - ' + data_set_name + ' already exists')

        return 0
Example #25
    def test_listing_tables(self):
        # type: () -> None
        dataset_ref1 = DatasetReference('my_project', 'dataset1')
        self.bq_client.create_dataset(Dataset(dataset_ref1))
        self.bq_client.create_table(Table(TableReference(dataset_ref1, 'table1'), _TEST_SCHEMA))
        self.bq_client.create_table(Table(TableReference(dataset_ref1, 'table2'), []))
        six.assertCountEqual(
            self,
            [table_ref.table_id for table_ref in self.bq_client.list_tables(dataset_ref1)],
            ['table1', 'table2'])
Example #26
    def test_listing_tables_with_max(self):
        # type: () -> None
        dataset_ref1 = DatasetReference('my_project', 'dataset1')
        self.bq_client.create_dataset(Dataset(dataset_ref1))
        for i in range(10):
            self.bq_client.create_table(Table(TableReference(dataset_ref1, 'table{}'.format(i)),
                                              _TEST_SCHEMA))
        self.assertEqual(5, len(self.bq_client.list_tables(dataset_ref1, max_results=5)))
        self.assertEqual(10, len(self.bq_client.list_tables(dataset_ref1, max_results=20)))
        self.assertEqual(10, len(self.bq_client.list_tables(dataset_ref1)))
Example #27
    def test_delete_dataset_if_exists_WhenItIsNotEmpty(self):
        dataset_id = self.__generate_test_name("dataset")
        table_name = self.__generate_test_name("table")
        self.client.create_dataset_if_not_exists(dataset_id)
        self.__create_test_table(
            table_name,
            Dataset(DatasetReference(GCP_PROJECT_ID, dataset_id)).reference)

        self.client.delete_dataset_if_exists(dataset_id, delete_contents=True)

        self.assertFalse(self.client.dataset_exists(dataset_id))
Example #28
def ensure_dataset(project_id, dataset_id, location):
    client = bigquery.Client(project=project_id, location=location)

    dataset_ref = client.dataset(dataset_id)
    try:
        client.create_dataset(dataset_ref, exists_ok=True)
    except Exception:
        # attempt to run even if creation fails due to permissions etc.
        pass

    return client, Dataset(dataset_ref)
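A hypothetical call; the project, dataset and location values below are placeholders:

client, dataset = ensure_dataset('my-project', 'my_dataset', 'US')
print('Using dataset {} in project {}'.format(dataset.dataset_id, client.project))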
Example #29
def create_scratch_dataset(
        context: AbstractComputeExecutionContext) -> HcaScratchDatasetName:
    """
    Creates a staging dataset that will house records for update/insertion into the
    final TDR dataset
    :return: Name of the staging dataset
    """
    scratch_bq_project = context.resources.scratch_config.scratch_bq_project
    scratch_dataset_prefix = context.resources.scratch_config.scratch_dataset_prefix
    load_tag = context.resources.load_tag

    dataset_name = f"{scratch_bq_project}.{scratch_dataset_prefix}_{load_tag}"

    dataset = Dataset(dataset_name)
    dataset.default_table_expiration_ms = context.resources.scratch_config.scratch_table_expiration_ms

    bq_client = context.resources.bigquery_client
    bq_client.create_dataset(dataset)

    context.log.info(f"Created scratch dataset {dataset_name}")

    return HcaScratchDatasetName(dataset_name)
Example #30
    def create(self, dataset_id):
        """ Create a dataset in Google BigQuery

        Parameters
        ----------
        dataset_id : str
            Name of the dataset to be created
        """
        from google.cloud.bigquery import Dataset

        if self.exists(dataset_id):
            raise DatasetCreationError("Dataset {0} already "
                                       "exists".format(dataset_id))

        dataset = Dataset(self.client.dataset(dataset_id))

        if self.location is not None:
            dataset.location = self.location

        try:
            self.client.create_dataset(dataset)
        except self.http_error as ex:
            self.process_http_error(ex)