def create_external_table_hive_partitioning(
        bq_client: bigquery.Client, dataset: bigquery.Dataset,
        table_id: str, gcs_directory_path: str) -> bigquery.Table:
    """
    Create an external table with AUTO Hive partitioning over data stored in GCS.

    :param bq_client: BigQuery client object
    :param dataset: dataset object; see the 'get_or_create_dataset' method
    :param table_id: ID of the table to be created
    :param gcs_directory_path: GCS directory containing the data. For example,
        if you have a structure like
        "gs://bucket/images_metadata/source_id=abc/date=2018-02-20"
        you should pass:
        "******"
    :return: the created external table
    """
    table = bigquery.Table(dataset.table(table_id))

    external_config = bigquery.ExternalConfig(
        bigquery.SourceFormat.PARQUET)
    external_config.source_uris = [f"{gcs_directory_path}/*"]
    hive_part_opt = HivePartitioningOptions()
    hive_part_opt.mode = "AUTO"
    hive_part_opt.source_uri_prefix = gcs_directory_path

    external_config.hive_partitioning = hive_part_opt
    table.external_data_configuration = external_config
    table = bq_client.create_table(table, exists_ok=True)

    return table
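A minimal usage sketch for the helper above (not part of the original snippet); the client, dataset name, and GCS prefix are placeholders.

# Hypothetical usage: register an external table over Hive-partitioned Parquet files.
from google.cloud import bigquery

client = bigquery.Client()
dataset = client.get_dataset('images')  # assumed existing dataset
external_table = create_external_table_hive_partitioning(
    client, dataset, 'images_metadata',
    'gs://my-bucket/images_metadata')  # prefix above the partition key directories
print(f'Created {external_table.full_table_id}')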
Example 2
    def prepare_table(self, dataset: bigquery.Dataset, table_name: str,
                      columns_schema: list,
                      incremental: bool) -> bigquery.TableReference:
        table_reference = dataset.table(table_name)
        table = bigquery.Table(table_reference, columns_schema)

        try:
            bq_table = self.bigquery_client.get_table(table_reference)
            table_exist = True
            if incremental:
                schema_mapper.is_table_definition_in_match_with_bigquery(
                    columns_schema, bq_table)
            else:
                self.bigquery_client.delete_table(table_reference)
                table_exist = False
        except bq_exceptions.NotFound:
            table_exist = False
        except bq_exceptions.BadRequest as err:
            message = 'Cannot create table %s: %s' % (table_reference,
                                                      str(err))
            raise UserException(message)

        if not table_exist:
            try:
                self.bigquery_client.create_table(table)
            except bq_exceptions.BadRequest as err:
                message = 'Cannot create table %s: %s' % (table_name, str(err))
                raise UserException(message)
        return table_reference
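A hedged illustration of calling this method from the component that owns it; 'writer', the dataset object, and the schema below are assumptions, not part of the original code.

# Hypothetical usage (assumes 'writer' is an instance of the class defining prepare_table).
from google.cloud import bigquery

columns_schema = [
    bigquery.SchemaField('id', 'INTEGER'),
    bigquery.SchemaField('name', 'STRING'),
]
table_ref = writer.prepare_table(dataset, 'customers', columns_schema, incremental=True)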
def test_dataflow_event_big_query_success(sdc_builder, sdc_executor, gcp):
    """
    Create data using Google BigQuery client
    and then check that the Google BigQuery origin receives them using wiretap, including a 'big-query-success' event.

    The pipeline looks like:
        google_bigquery >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery',
                                                 type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY id'
    google_bigquery.set_attributes(query=query_str)

    wiretap = pipeline_builder.add_wiretap()
    events_wiretap = pipeline_builder.add_wiretap()

    google_bigquery >> wiretap.destination
    google_bigquery >= events_wiretap.destination

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...',
                    dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(
            Table(dataset_ref.table(table_name), schema=SCHEMA))
        errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT)
        assert not errors, 'Errors found when creating data using bigquery client'

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        assert len(wiretap.output_records) == len(ROWS_TO_INSERT),\
            f'Expected {len(ROWS_TO_INSERT)} records, received {len(wiretap.output_records)}'
        rows_from_wiretap = get_rows_from_wiretap(wiretap)
        assert rows_from_wiretap == ROWS_TO_INSERT

        # We expect exactly one event record; check that it is a big-query-success event
        assert len(
            events_wiretap.output_records
        ) == 1, f'Expected 1 record, received {len(events_wiretap.output_records)}'
        event_record = events_wiretap.output_records[0]
        event_type = event_record.header.values['sdc.event.type']
        assert event_type == 'big-query-success', 'Received %s as event type (expected big-query-success)' % event_type
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example 4
def load_views(bq: bigquery.Client, dataset: bigquery.Dataset,
               views: Dict[str, str]):
    """Load views for a test."""
    for table, view_query in views.items():
        view = bigquery.Table(dataset.table(table))
        view.view_query = view_query.format(project=dataset.project,
                                            dataset=dataset.dataset_id)
        bq.create_table(view)
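A hedged example of calling load_views; the view names and queries below are illustrative, not from the original test suite.

# Hypothetical usage: create two views whose queries are templated with project/dataset.
from google.cloud import bigquery

bq = bigquery.Client()
dataset = bq.get_dataset('test_dataset')  # assumed existing dataset
views = {
    'active_users': 'SELECT * FROM `{project}.{dataset}.users` WHERE active',
    'daily_counts': ('SELECT event_date, COUNT(*) AS n '
                     'FROM `{project}.{dataset}.events` GROUP BY event_date'),
}
load_views(bq, dataset, views)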
Example 5
def test_multiple_batch(sdc_builder, sdc_executor, gcp, number_batches):
    """
    Write data with the Google BigQuery destination using different batch sizes
    and then check that it has been correctly created using the BigQuery client.

    The pipeline looks like:
        dev_raw_data_source >> google_bigquery
    """
    number_records = 1000
    batch_size = number_records//number_batches

    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_data_generator = pipeline_builder.add_stage('Dev Data Generator')
    dev_data_generator.set_attributes(batch_size=batch_size,
                                      fields_to_generate=[
                                          {"type": "INTEGER", "field": "age"},
                                          {"type": "STRING", "field": "full_name"}
                                      ])

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage(name=DESTINATION_STAGE_NAME, type='destination')
    google_bigquery.set_attributes(dataset=dataset_name,
                                   table_name=table_name)

    dev_data_generator >> google_bigquery

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=SCHEMA))

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_output_records_count(number_records)
        sdc_executor.stop_pipeline(pipeline)

        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()

        # The generated data is random, so verify the row count against the
        # pipeline metrics below rather than comparing exact contents.
        assert len(data_from_bigquery) > 0

        history = sdc_executor.get_pipeline_history(pipeline)
        records = history.latest.metrics.counter('pipeline.batchInputRecords.counter').count

        assert len(data_from_bigquery) == records
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def test_data_type(sdc_builder, sdc_executor, gcp, data_type, data,
                   expected_data):
    """
    Create data using Google BigQuery client
    and then check that the Google BigQuery origin receives it via wiretap, one data type per test case.

    The pipeline looks like:
        google_bigquery >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery',
                                                 type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name}'
    google_bigquery.set_attributes(query=query_str)

    wiretap = pipeline_builder.add_wiretap()

    google_bigquery >> wiretap.destination

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [
        SchemaField('id', 'INTEGER', mode='NULLABLE'),
        SchemaField('data', data_type, mode='NULLABLE')
    ]

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...',
                    dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(
            Table(dataset_ref.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, [(None, data)])
        assert not errors, 'Errors found when creating data using bigquery client'

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')

        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        assert len(
            wiretap.output_records
        ) == 1, f'Expected 1 record, received {len(wiretap.output_records)}'
        received_data = wiretap.output_records[0].field['data']

        assert received_data == expected_data
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example 7
def test_google_bigquery_origin_batch_handling(sdc_builder, sdc_executor, gcp):
    """Verify proper batch handling by the BigQuery origin.

    In this test, we write 8 records to BigQuery with a batch size of 3, verifying that each batch is of
    size 3 and that 8 total records are captured. The small numbers are used because of the limitations
    described in SDC-8765.
    """
    MAX_BATCH_SIZE = 3
    TOTAL_RECORDS = 8

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY number'

    pipeline_builder = sdc_builder.get_pipeline_builder()
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin')
    google_bigquery.set_attributes(query=query_str, max_batch_size_in_records=MAX_BATCH_SIZE)
    trash = pipeline_builder.add_stage('Trash')
    google_bigquery >> trash

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('number', 'STRING', mode='required'),
              SchemaField('value', 'STRING', mode='required')]

    data = [dict(number=str(i), value=get_random_string()) for i in range(TOTAL_RECORDS)]
    dataset = Dataset(bigquery_client.dataset(dataset_name))

    try:
        # Using Google bigquery client, create dataset, table and data inside table.
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset)
        table = bigquery_client.create_table(Table(dataset.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, data)
        assert not errors

        # Ceiling division is needed to capture all the complete batches along with the last partial one.
        snapshot = sdc_executor.capture_snapshot(pipeline,
                                                 start_pipeline=True,
                                                 batches=math.ceil(TOTAL_RECORDS / MAX_BATCH_SIZE)).snapshot

        # Assert that the batch size is being respected in each batch (including the last).
        # For 8 records with a batch size of 3, we'd expect batch sizes of 3, 3, and 2.
        for i, batch in enumerate(snapshot.snapshot_batches, start=1):
            expected_size = (MAX_BATCH_SIZE
                             if i * MAX_BATCH_SIZE <= TOTAL_RECORDS
                             else TOTAL_RECORDS % MAX_BATCH_SIZE)
            assert len(batch.stage_outputs[google_bigquery.instance_name].output) == expected_size

        all_records = [record.field
                       for batch in snapshot.snapshot_batches
                       for record in batch.stage_outputs[google_bigquery.instance_name].output]
        assert all_records == data
    finally:
        bigquery_client.delete_dataset(dataset, delete_contents=True)
Example 8
def test_google_bigquery_destination(sdc_builder, sdc_executor, gcp):
    """
    Send data to Google BigQuery from Dev Raw Data Source and
    confirm that the Google BigQuery destination successfully receives it, using the Google BigQuery client.

    This is achieved by using a deduplicator, which ensures that there is only one ingest into Google BigQuery.
    The pipeline looks like:
        dev_raw_data_source >> record_deduplicator >> google_bigquery
                               record_deduplicator >> trash
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='DELIMITED',
                                       header_line='WITH_HEADER',
                                       raw_data='\n'.join(CSV_DATA_TO_INSERT))

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='destination')
    google_bigquery.set_attributes(dataset=dataset_name,
                                   table_name=table_name)

    record_deduplicator = pipeline_builder.add_stage('Record Deduplicator')
    trash = pipeline_builder.add_stage('Trash')

    dev_raw_data_source >> record_deduplicator >> google_bigquery
    record_deduplicator >> trash

    pipeline = pipeline_builder.build(title='Google BigQuery Destination').configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('full_name', 'STRING', mode='required'),
              SchemaField('age', 'INTEGER', mode='required')]
    dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

    try:
        logger.info('Creating dataset %s using Google BigQuery client ...', dataset_name)
        dataset = bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))

        logger.info('Starting BigQuery Destination pipeline and waiting for it to produce records ...')
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(1)

        logger.info('Stopping BigQuery Destination pipeline and getting the count of records produced in total ...')
        sdc_executor.stop_pipeline(pipeline)

        # Verify by reading records using Google BigQuery client
        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()
        logger.debug('read_data = {}'.format(data_from_bigquery))
        assert ROWS_EXPECTED == data_from_bigquery
    finally:
        bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example 9
def test_object_names_columns(sdc_builder, sdc_executor, gcp, column_name):
    """
    Write data with the Google BigQuery destination using different column names
    and then check that it has been correctly created using the BigQuery client.

    The pipeline looks like:
        dev_raw_data_source >> google_bigquery
    """
    data_to_insert = [(1, 'data')]
    rows_to_insert = [f'id,{column_name}'] + [','.join(str(element) for element in row) for row in data_to_insert]
    data_to_expect = [(1, 'data')]
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='DELIMITED',
                                       header_line='WITH_HEADER',
                                       stop_after_first_batch=True,
                                       raw_data='\n'.join(rows_to_insert))

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)

    google_bigquery = pipeline_builder.add_stage(name=DESTINATION_STAGE_NAME, type='destination')
    google_bigquery.set_attributes(dataset=dataset_name,
                                   table_name=table_name)

    dev_raw_data_source >> google_bigquery

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('id', 'INTEGER', mode='NULLABLE'),
              SchemaField(column_name, 'STRING', mode='NULLABLE')]

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()

        assert data_to_expect == data_from_bigquery
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example 10
def test_google_bigquery_origin(sdc_builder, sdc_executor, gcp):
    """
    Create data using Google BigQuery client
    and then check if Google BigQuery origin receives them using snapshot.

    The pipeline looks like:
        google_bigquery >> trash
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery',
                                                 type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY full_name'
    google_bigquery.set_attributes(query=query_str)

    trash = pipeline_builder.add_stage('Trash')
    google_bigquery >> trash
    pipeline = pipeline_builder.build(
        title='Google BigQuery').configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [
        SchemaField('full_name', 'STRING', mode='required'),
        SchemaField('age', 'INTEGER', mode='required')
    ]
    dataset_ref = Dataset(bigquery_client.dataset(dataset_name))
    try:
        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...',
                    dataset_name)
        dataset = bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(
            Table(dataset_ref.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT)
        assert errors == []

        # Start pipeline and verify correct rows are received using snapshot.
        logger.info('Starting pipeline and snapshot')
        snapshot = sdc_executor.capture_snapshot(pipeline,
                                                 start_pipeline=True).snapshot
        if sdc_executor.get_pipeline_status(pipeline) == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)
        rows_from_snapshot = [(record.value['value'][0]['value'],
                               int(record.value['value'][1]['value']))
                              for record in snapshot[google_bigquery].output]

        assert rows_from_snapshot == ROWS_TO_INSERT
    finally:
        bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example 11
def test_object_names_datasets(sdc_builder, sdc_executor, gcp, dataset_name):
    """
    Write data with the Google BigQuery destination using different dataset names
    and then check that it has been correctly created using the BigQuery client.

    The pipeline looks like:
        dev_raw_data_source >> google_bigquery
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='DELIMITED',
                                       header_line='WITH_HEADER',
                                       stop_after_first_batch=True,
                                       raw_data='\n'.join(CSV_DATA_TO_INSERT))

    table_name = get_random_string(string.ascii_letters, 5)
    # If tests fail for any reason, leftovers with equal names might lead to more errors
    dataset_name = f'{dataset_name}_{get_random_string(string.ascii_letters, 5)}'

    google_bigquery = pipeline_builder.add_stage(name=DESTINATION_STAGE_NAME, type='destination')
    google_bigquery.set_attributes(dataset=dataset_name,
                                   table_name=table_name)

    dev_raw_data_source >> google_bigquery

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=SCHEMA))

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()

        assert ROWS_EXPECTED == data_from_bigquery
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def test_object_names_datasets(sdc_builder, sdc_executor, gcp, dataset_name):
    """
    Create data using the Google BigQuery client with specific dataset names
    and then check that the Google BigQuery origin receives it using wiretap.

    The pipeline looks like:
        google_bigquery >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    table_name = get_random_string(string.ascii_letters, 5)
    dataset_name = f'{dataset_name}_{get_random_string(string.ascii_letters, 5)}'
    google_bigquery = pipeline_builder.add_stage('Google BigQuery',
                                                 type='origin')
    query_str = f"SELECT * FROM `{dataset_name}`.`{table_name}` ORDER BY id"
    google_bigquery.set_attributes(query=query_str)

    wiretap = pipeline_builder.add_wiretap()

    google_bigquery >> wiretap.destination

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...',
                    dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(
            Table(dataset_ref.table(table_name), schema=SCHEMA))
        errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT)
        assert not errors, 'Errors found when creating data using bigquery client'

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        assert len(wiretap.output_records) == len(ROWS_TO_INSERT),\
            f'Expected {len(ROWS_TO_INSERT)} records, received {len(wiretap.output_records)}'
        rows_from_wiretap = get_rows_from_wiretap(wiretap)
        assert rows_from_wiretap == ROWS_TO_INSERT
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example 13
def load_stage(dst_dataset: Dataset, bq_client: Client, bucket_name: str,
               gcs_client: storage.Client) -> List[LoadJob]:
    """
    Stage files from a bucket to a dataset

    :param dst_dataset: reference to destination dataset object
    :param bq_client: a BigQuery client object
    :param bucket_name: the location in GCS containing the vocabulary files
    :param gcs_client: a Cloud Storage client object
    :return: list of completed load jobs
    """
    blobs = list(gcs_client.list_blobs(bucket_name))

    table_blobs = [_filename_to_table_name(blob.name) for blob in blobs]
    missing_blobs = [
        table for table in VOCABULARY_TABLES if table not in table_blobs
    ]
    if missing_blobs:
        raise RuntimeError(
            f'Bucket {bucket_name} is missing files for tables {missing_blobs}'
        )

    load_jobs = []
    for blob in blobs:
        table_name = _filename_to_table_name(blob.name)
        # ignore any non-vocabulary files
        if table_name not in VOCABULARY_TABLES:
            continue
        destination = dst_dataset.table(table_name)
        safe_schema = safe_schema_for(table_name)
        job_config = LoadJobConfig()
        job_config.schema = safe_schema
        job_config.skip_leading_rows = 1
        job_config.field_delimiter = FIELD_DELIMITER
        job_config.max_bad_records = MAX_BAD_RECORDS
        job_config.source_format = 'CSV'
        job_config.quote_character = ''
        source_uri = f'gs://{bucket_name}/{blob.name}'
        load_job = bq_client.load_table_from_uri(source_uri,
                                                 destination,
                                                 job_config=job_config)
        LOGGER.info(f'table:{destination} job_id:{load_job.job_id}')
        load_jobs.append(load_job)
        load_job.result()
    return load_jobs
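A sketch of how load_stage might be invoked, assuming the surrounding module already defines VOCABULARY_TABLES and the helper functions; the bucket and dataset names are placeholders.

# Hypothetical usage: stage vocabulary CSV files from a GCS bucket into a dataset.
from google.cloud import bigquery, storage

bq_client = bigquery.Client()
gcs_client = storage.Client()
dst_dataset = bq_client.get_dataset('vocabulary')  # assumed destination dataset
jobs = load_stage(dst_dataset, bq_client, 'my-vocabulary-bucket', gcs_client)
print(f'{len(jobs)} vocabulary load jobs completed')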
Example 14
def load_tables(bq: bigquery.Client, dataset: bigquery.Dataset,
                tables: Iterable[Table]):
    """Load tables for a test."""
    for table in tables:
        destination = dataset.table(table.name)
        job_config = bigquery.LoadJobConfig(
            source_format=table.source_format,
            write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        )

        if table.schema is None:
            # autodetect schema if not provided
            job_config.autodetect = True
        else:
            job_config.schema = table.schema
            # look for time_partitioning_field in provided schema
            for field in job_config.schema:
                if field.description == "time_partitioning_field":
                    job_config.time_partitioning = bigquery.TimePartitioning(
                        field=field.name)
                    break  # stop because there can only be one time partitioning field

        if isinstance(table.source_path, str):
            with open(table.source_path, "rb") as file_obj:
                job = bq.load_table_from_file(file_obj,
                                              destination,
                                              job_config=job_config)
        else:
            file_obj = BytesIO()
            for row in load(*table.source_path):
                file_obj.write(
                    json.dumps(row, default=default_encoding).encode() + b"\n")
            file_obj.seek(0)
            job = bq.load_table_from_file(file_obj,
                                          destination,
                                          job_config=job_config)

        try:
            job.result()
        except BadRequest:
            # print the first 5 job errors for debugging
            for error in job.errors[:5]:
                print(error)
            raise
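The table descriptor consumed by load_tables is project-specific; below is a hedged sketch of what a caller might pass, assuming a simple namedtuple that exposes only the attributes the function reads and a file-based source_path.

# Hypothetical table descriptor and invocation (names here are assumptions, not the real type).
from collections import namedtuple
from google.cloud import bigquery

TableSpec = namedtuple('TableSpec', ['name', 'source_format', 'schema', 'source_path'])

bq = bigquery.Client()
dataset = bq.get_dataset('test_dataset')  # assumed existing dataset
tables = [
    TableSpec(name='events',
              source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
              schema=None,  # autodetect the schema from the file
              source_path='tests/data/events.ndjson'),
]
load_tables(bq, dataset, tables)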
Example 15
def create_bq_table(
    client: bigquery.Client,
    dataset: bigquery.Dataset,
    table_id: str,
    table_schema: List[bigquery.SchemaField],
    table_description: str = None,
) -> bigquery.Table:
    """
    Create empty table.
    """
    # TODO: validate 'table_id'.

    # note: the two-step construction of the table (reference first, then Table object) is not intuitive.
    table = bigquery.Table(dataset.table(table_id),
                           schema=table_schema)  # type: bigquery.Table
    table.description = table_description

    # API request
    return client.create_table(table)  # type: bigquery.Table
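For reference, a hedged call to create_bq_table; the schema, dataset, and table names are made up for illustration.

# Hypothetical usage: create an empty table from an explicit schema.
from google.cloud import bigquery

client = bigquery.Client()
dataset = client.get_dataset('staging')  # assumed existing dataset
schema = [
    bigquery.SchemaField('id', 'INTEGER', mode='REQUIRED'),
    bigquery.SchemaField('name', 'STRING', mode='NULLABLE'),
]
table = create_bq_table(client, dataset, 'customers', schema,
                        table_description='Demo table created from a schema list')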
Example 16
def persist_lines_stream(
    project_id,
    dataset_id,
    lines=None,
    validate_records=True,
    key_path=None,
):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = {}

    if key_path:
        credentials = service_account.Credentials.from_service_account_file(
            key_path,
            scopes=['https://www.googleapis.com/auth/cloud-platform'],
        )
        bigquery_client = bigquery.Client(credentials=credentials,
                                          project=project_id)
    else:
        bigquery_client = bigquery.Client(project=project_id)

    dataset_ref = bigquery_client.dataset(dataset_id)
    dataset = Dataset(dataset_ref)
    try:
        dataset = bigquery_client.create_dataset(
            Dataset(dataset_ref)) or Dataset(dataset_ref)
    except exceptions.Conflict:
        pass

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error('Unable to parse:\n{}'.format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    'A record for stream {} was encountered before a corresponding schema'
                    .format(msg.stream))

            schema = schemas[msg.stream]

            if validate_records:
                validate(msg.record, schema)

            errors[msg.stream] = bigquery_client.insert_rows_json(
                tables[msg.stream], [msg.record])
            rows[msg.stream] += 1

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            tables[table] = bigquery.Table(dataset.table(table),
                                           schema=build_schema(schemas[table]))
            rows[table] = 0
            errors[table] = None
            try:
                tables[table] = bigquery_client.create_table(tables[table])
            except exceptions.Conflict:
                pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception('Unrecognized message {}'.format(msg))

    for table in errors.keys():
        if not errors[table]:
            logger.info('Loaded {} row(s) into {}:{} ({})'.format(
                rows[table], dataset_id, table, tables[table].path))
            emit_state(state)
        else:
            logger.error(errors[table])

    return state
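A hedged sketch of driving the function above with Singer messages read from stdin, in the style of a Singer target entry point; the project, dataset, and key path are placeholders.

# Hypothetical entry point: stream stdin lines into BigQuery via persist_lines_stream.
import io
import sys

if __name__ == '__main__':
    input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    last_state = persist_lines_stream('my-gcp-project', 'raw_singer_data',
                                      lines=input_stream,
                                      validate_records=True,
                                      key_path='service-account.json')
    if last_state:
        emit_state(last_state)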
Example 17
def test_google_bigquery_origin(sdc_builder, sdc_executor, gcp):
    """
    Create data using Google BigQuery client
    and then check if Google BigQuery origin receives the data.

    The pipeline looks like:
        google_bigquery >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery',
                                                 type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY id'
    google_bigquery.set_attributes(query=query_str)

    wiretap = pipeline_builder.add_wiretap()

    google_bigquery >> wiretap.destination
    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [
        SchemaField('id', 'INTEGER', mode='NULLABLE'),
        SchemaField('name', 'STRING', mode='NULLABLE'),
        SchemaField('floatVal', 'FLOAT', mode='NULLABLE'),
        SchemaField('numericVal', 'NUMERIC', mode='NULLABLE'),
        SchemaField('booleanVal', 'BOOLEAN', mode='NULLABLE'),
        SchemaField('dateVal', 'DATE', mode='NULLABLE'),
        SchemaField('datetimeVal', 'DATETIME', mode='NULLABLE'),
        SchemaField('timestampVal', 'TIMESTAMP', mode='NULLABLE'),
        SchemaField('timeVal', 'TIME', mode='NULLABLE'),
        SchemaField('bytesVal', 'BYTES', mode='NULLABLE')
    ]

    dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

    try:
        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...',
                    dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(
            Table(dataset_ref.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT)
        assert not errors

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')

        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        def field_value(record, index):
            return list(record.field.values())[index].value

        rows_from_wiretap = []
        for record in wiretap.output_records:
            id_val = field_value(record, 0)
            float_val = field_value(record, 2)
            numeric_val = field_value(record, 3)
            date_val = field_value(record, 5)
            datetime_val = field_value(record, 6)
            timestamp_val = field_value(record, 7)
            time_val = field_value(record, 8)
            rows_from_wiretap.append(
                (int(id_val) if id_val is not None else None,
                 field_value(record, 1),
                 float(float_val) if float_val is not None else None,
                 str(numeric_val) if numeric_val is not None else None,
                 field_value(record, 4),
                 date_val.strftime('%Y-%m-%d') if date_val is not None else None,
                 datetime_val.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
                 if datetime_val is not None else None,
                 '{} UTC'.format(timestamp_val.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3])
                 if timestamp_val is not None else None,
                 time_val.strftime('%H:%M:%S.%f')[:-3] if time_val is not None else None,
                 field_value(record, 9)))

        assert rows_from_wiretap == ROWS_TO_INSERT
    finally:
        bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example 18
def persist_lines_stream(project_id, dataset_id, ensure_ascii, lines=None, validate_records=True, array_nodes=[],
                         force_to_string_fields=[]):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = collections.defaultdict(list)
    data_holder = []
    lines_read = False
    stream = None

    if flags.no_records:
        no_records = int(flags.no_records)
    else:
        logger.info('Number of records not specified. Setting to maximum: {}'.format(MAX_NO_RECORDS))
        no_records = MAX_NO_RECORDS

    if flags.data_location:
        bigquery_client = bigquery.Client(project=project_id, location=flags.data_location)
    else:
        bigquery_client = bigquery.Client(project=project_id)

    dataset_ref = bigquery_client.dataset(dataset_id)
    dataset = Dataset(dataset_ref)
    try:
        dataset = bigquery_client.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref)
    except exceptions.Conflict:
        pass

    payload_size = 0
    for line in lines:
        lines_read = True
        # skip SCHEMA messages (except for the initial one)
        if '{"anyOf": [{' in line:
            continue
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema".format(msg.stream))

            schema = schemas[msg.stream]

            if is_record_deleted(msg.record, SDC_DELETED_AT):
                continue

            if validate_records:
                validate(msg.record, schema)

            modified_record = handle_decimal_values(msg.record)
            modified_record = handle_empty_arrays(array_nodes, modified_record)
            modified_record = force_fields_to_string(force_to_string_fields, modified_record, ensure_ascii)

            item_size = getsize(modified_record)
            if payload_size + item_size >= MAX_PAYLOAD_SIZE:
                logger.info('Near max request size. Sending: {} records, payload size: {}.'.format(len(data_holder),
                                                                                                   payload_size))
                upload_res = bigquery_client.insert_rows_json(tables[msg.stream], data_holder)
                if upload_res:
                    logger.error('Upload error: {}'.format(upload_res))
                else:
                    rows[msg.stream] += len(data_holder)
                data_holder = []
                payload_size = 0
                data_holder.append(modified_record)
                payload_size += item_size
            else:
                if len(data_holder) >= no_records:
                    logger.info(
                        "Max request size not reached, max #records reached. Sending: {} records, payload size: {} bytes.".format(
                            len(data_holder), item_size + payload_size))
                    upload_res = bigquery_client.insert_rows_json(tables[msg.stream], data_holder)
                    if upload_res:
                        logger.error('Upload error: {}'.format(upload_res))
                    else:
                        rows[msg.stream] += len(data_holder)
                    data_holder = []
                    payload_size = 0
                data_holder.append(modified_record)
                payload_size += item_size

            stream = msg.stream

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table]))
            rows[table] = 0
            try:
                tables[table] = bigquery_client.create_table(tables[table])
            except exceptions.Conflict:
                pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    if len(data_holder) > 0 and lines_read and stream:
        logger.info(
            "Remaining records. Sending: {} records, payload size: {} bytes.".format(len(data_holder), payload_size))
        upload_res = bigquery_client.insert_rows_json(tables[stream], data_holder)
        if upload_res:
            logger.error('Upload error: {}'.format(upload_res))
        else:
            rows[stream] += len(data_holder)

    for table in errors.keys():
        if not errors[table]:
            logger.info('Loaded {} row(s) into {}:{} ({})'.format(rows[table], dataset_id, table, tables[table].path))
            emit_state(state)
        else:
            logger.error('Errors: {}'.format(errors[table]))

    return state
Example 19
def test_google_bigquery_destination_multiple_types(sdc_builder, sdc_executor,
                                                    gcp):
    """Simple big query destination test with INSERT operation.
    The pipeline inserts 1000 records of multiple types.
    A type converter is included to transform decimal to float.
    The pipeline should look like:
        dev_data_generator >> field_type_converter >> [google_bigquery, wiretap.destination]
    """

    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_data_generator = pipeline_builder.add_stage('Dev Data Generator')
    dev_data_generator.fields_to_generate = [{
        'field': 'field1',
        'type': 'STRING'
    }, {
        'field': 'field2',
        'type': 'DATETIME'
    }, {
        'field': 'field3',
        'type': 'INTEGER'
    }, {
        'field': 'field4',
        'precision': 10,
        'scale': 2,
        'type': 'DECIMAL'
    }, {
        'field': 'field5',
        'type': 'DOUBLE'
    }]
    batch_size = 1000
    dev_data_generator.set_attributes(delay_between_batches=0,
                                      batch_size=batch_size)

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery',
                                                 type='destination')
    google_bigquery.set_attributes(dataset=dataset_name,
                                   table_name=table_name,
                                   stage_on_record_error='TO_ERROR')

    # Create Field Type Converter
    conversions = [{'fields': ['/field4'], 'targetType': 'FLOAT'}]
    field_type_converter = pipeline_builder.add_stage('Field Type Converter')
    field_type_converter.set_attributes(
        conversion_method='BY_FIELD', field_type_converter_configs=conversions)

    wiretap = pipeline_builder.add_wiretap()

    dev_data_generator >> field_type_converter >> [
        google_bigquery, wiretap.destination
    ]

    pipeline = pipeline_builder.build()

    sdc_executor.add_pipeline(pipeline.configure_for_environment(gcp))

    # FLOAT64 is used because there is a bug with NUMERIC in the BigQuery client
    bigquery_client = gcp.bigquery_client
    schema = [
        SchemaField('field1', 'STRING', mode='required'),
        SchemaField('field2', 'DATETIME', mode='required'),
        SchemaField('field3', 'INTEGER', mode='required'),
        SchemaField('field4', 'FLOAT64', mode='required'),
        SchemaField('field5', 'FLOAT', mode='required')
    ]
    dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

    try:
        logger.info('Creating dataset %s using Google BigQuery client ...',
                    dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(
            Table(dataset_ref.table(table_name), schema=schema))

        logger.info(
            'Starting BigQuery Destination pipeline and waiting for it to produce records ...'
        )
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline,
                                              'input_record_count',
                                              1_000,
                                              timeout_sec=60)
        sdc_executor.stop_pipeline(pipeline)

        # Verify by reading records using Google BigQuery client
        data_from_bigquery = [row for row in bigquery_client.list_rows(table)]
        assert len(data_from_bigquery) > batch_size
        assert len(wiretap.error_records) == 0

        assert len(data_from_bigquery) == len(wiretap.output_records)
        assert all(
            element in data_from_bigquery for element in wiretap.output_records
        )
    finally:
        logger.info('Dropping table %s in Google Big Query database ...',
                    table_name)
        bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example 20
def test_data_type(sdc_builder, sdc_executor, gcp, origin_data_type, gcp_data_type, origin_data, gcp_expected_data):
    """
    Write data with the Google BigQuery destination using different data types
    and then check that it has been correctly created using the BigQuery client.

    The pipeline looks like:
    dev_raw_data_source >> field_type_converter >> google_bigquery
    """
    data_to_insert = [(1, origin_data)]
    data_to_expect = [(1, gcp_expected_data)]

    rows_to_insert = ['id,data'] + [','.join(str(element) for element in row) for row in data_to_insert]

    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='DELIMITED',
                                       header_line='WITH_HEADER',
                                       stop_after_first_batch=True,
                                       raw_data='\n'.join(rows_to_insert))

    field_type_converter = pipeline_builder.add_stage('Field Type Converter')
    field_type_converter.conversion_method = 'BY_FIELD'
    field_type_converter.field_type_converter_configs = [{
        'fields': ['/data'],
        'targetType': origin_data_type,
        'dataLocale': 'en,US',
        'dateFormat': 'YYYY_MM_DD_HH_MM_SS',
        'zonedDateTimeFormat': 'ISO_OFFSET_DATE_TIME',
        'scale': 38
    }]

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)

    google_bigquery = pipeline_builder.add_stage(name=DESTINATION_STAGE_NAME, type='destination')
    google_bigquery.set_attributes(dataset=dataset_name,
                                   table_name=table_name)

    dev_raw_data_source >> field_type_converter >> google_bigquery

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    # Create the column with the GCP data type and insert against it to check that the conversion works
    schema = [SchemaField('id', 'INTEGER', mode='NULLABLE'),
              SchemaField('data', gcp_data_type, mode='NULLABLE')]

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()

        assert data_to_expect == data_from_bigquery
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def test_multiple_batch(sdc_builder, sdc_executor, gcp, number_batches):
    """
    Create data using the Google BigQuery client and then read it back with the
    Google BigQuery origin in multiple batches, checking record and batch counts using wiretap.

    The pipeline looks like:
        google_bigquery >> wiretap
    """
    number_records = 100
    batch_size = number_records // number_batches

    pipeline_builder = sdc_builder.get_pipeline_builder()

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery',
                                                 type='origin')
    google_bigquery.set_attributes(max_batch_size_in_records=batch_size)
    query_str = f"SELECT * FROM {dataset_name}.{table_name} ORDER BY id"
    google_bigquery.set_attributes(query=query_str)

    wiretap = pipeline_builder.add_wiretap()

    google_bigquery >> wiretap.destination

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [
        SchemaField('id', 'INTEGER', mode='NULLABLE'),
        SchemaField('data', 'STRING', mode='NULLABLE')
    ]

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...',
                    dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(
            Table(dataset_ref.table(table_name), schema=schema))
        input_values = []
        for i in range(0, number_records):
            row = (i, get_random_string(string.ascii_letters, 5))
            input_values.append(row)
            errors = bigquery_client.insert_rows(table, [row])
            assert not errors, 'Errors found when creating data using bigquery client'

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        assert len(wiretap.output_records) == number_records,\
            f'Expected {number_records} record, received {len(wiretap.output_records)}'
        records = [record.field for record in wiretap.output_records]
        assert len(records) >= number_batches

        records_in_batches = sum(len(batch) for batch in wiretap.batches)
        assert len(records) == records_in_batches
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def test_google_bigquery_origin(sdc_builder, sdc_executor, gcp):
    """
    Create data using Google BigQuery client
    and then check if Google BigQuery origin receives them using snapshot.

    The pipeline looks like:
        google_bigquery >> trash
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery',
                                                 type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY id'
    google_bigquery.set_attributes(query=query_str)

    trash = pipeline_builder.add_stage('Trash')
    google_bigquery >> trash
    pipeline = pipeline_builder.build(
        title='Google BigQuery').configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [
        SchemaField('id', 'INTEGER', mode='NULLABLE'),
        SchemaField('name', 'STRING', mode='NULLABLE'),
        SchemaField('floatVal', 'FLOAT', mode='NULLABLE'),
        SchemaField('numericVal', 'NUMERIC', mode='NULLABLE'),
        SchemaField('booleanVal', 'BOOLEAN', mode='NULLABLE'),
        SchemaField('dateVal', 'DATE', mode='NULLABLE'),
        SchemaField('datetimeVal', 'DATETIME', mode='NULLABLE'),
        SchemaField('timestampVal', 'TIMESTAMP', mode='NULLABLE'),
        SchemaField('timeVal', 'TIME', mode='NULLABLE'),
        SchemaField('bytesVal', 'BYTES', mode='NULLABLE')
    ]

    dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

    try:
        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...',
                    dataset_name)
        dataset = bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(
            Table(dataset_ref.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT)
        assert not errors

        # Start pipeline and verify correct rows are received using snapshot.
        logger.info('Starting pipeline and snapshot')
        snapshot = sdc_executor.capture_snapshot(pipeline,
                                                 start_pipeline=True).snapshot
        if sdc_executor.get_pipeline_status(pipeline) == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)
        rows_from_snapshot = []
        for record in snapshot[google_bigquery].output:
            values = [field['value'] for field in record.value['value']]
            rows_from_snapshot.append(
                (int(values[0]) if values[0] is not None else None,
                 values[1],
                 float(values[2]) if values[2] is not None else None,
                 values[3],
                 values[4],
                 time.strftime('%Y-%m-%d', time.localtime(values[5] / 1000))
                 if values[5] is not None else None,
                 datetime.fromtimestamp(values[6] / 1000)
                 .strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
                 if values[6] is not None else None,
                 '{} UTC'.format(datetime.utcfromtimestamp(values[7] / 1000)
                                 .strftime('%Y-%m-%d %H:%M:%S.%f')[:-3])
                 if values[7] is not None else None,
                 datetime.fromtimestamp(values[8] / 1000)
                 .strftime('%H:%M:%S.%f')[:-3]
                 if values[8] is not None else None,
                 base64.b64decode(values[9]) if values[9] is not None else None))

        assert rows_from_snapshot == ROWS_TO_INSERT
    finally:
        bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example 23
def persist_lines_stream(config, lines=None, validate_records=True):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = {}

    # bigquery_client = bigquery.Client(project=project_id)
    service = service_account.Credentials.from_service_account_file(
        config['key_file_location'])

    bigquery_client = bigquery.Client(project=config['project_id'],
                                      credentials=service)

    dataset_ref = bigquery_client.dataset(config['dataset_id'])
    dataset = Dataset(dataset_ref)
    try:
        dataset = bigquery_client.create_dataset(
            Dataset(dataset_ref)) or Dataset(dataset_ref)
    except exceptions.Conflict:
        pass

    for line in lines:
        try:
            js = json.loads(line)
            # msg = singer.parse_message(line)
            if js['type'] == 'RECORD':
                msg = singer.messages.RecordMessage(stream=js.get('stream'),
                                                    record=js.get('record'),
                                                    version=js.get('version'),
                                                    time_extracted=None)
            else:
                msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(msg.stream))

            schema = schemas[msg.stream]

            if validate_records:
                validate(msg.record, schema)

            errors[msg.stream] = bigquery_client.insert_rows_json(
                tables[msg.stream], [msg.record])
            rows[msg.stream] += 1

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            tables[table] = bigquery.Table(dataset.table(table),
                                           schema=build_schema(schemas[table]))
            rows[table] = 0
            errors[table] = None
            try:
                tables[table] = bigquery_client.create_table(tables[table])
            except exceptions.Conflict:
                pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in errors.keys():
        if not errors[table]:
            logger.info('Loaded {} row(s) into {}:{} ({})'.format(
                rows[table], config['dataset_id'], table, tables[table].path))
            emit_state(state)
        else:
            logger.error('Errors: {}'.format(errors[table]))

    return state
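A note on the create-if-missing pattern above: newer google-cloud-bigquery releases accept exists_ok=True on create_dataset and create_table, which removes the need to catch exceptions.Conflict. A minimal sketch under that assumption (project, dataset and table names are placeholders):

from google.cloud import bigquery

client = bigquery.Client(project='my-project')                 # placeholder project id
dataset = client.create_dataset('my_dataset', exists_ok=True)  # no Conflict handling needed
table = bigquery.Table(dataset.table('my_table'),
                       schema=[bigquery.SchemaField('id', 'INTEGER')])
table = client.create_table(table, exists_ok=True)

errors = client.insert_rows_json(table, [{'id': 1}])
assert not errors                                               # empty list means success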
Ejemplo n.º 24
0
def persist_lines_stream(project_id,
                         dataset_id,
                         credentials=None,
                         lines=None,
                         validate_records=True,
                         collision_suffix=None,
                         current_batch=None):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = {}

    bigquery_client = bigquery.Client(project=project_id,
                                      credentials=credentials)
    buffered_singer_stream = BufferedSingerStream(bigquery_client)

    dataset_ref = bigquery_client.dataset(dataset_id)
    dataset = Dataset(dataset_ref)
    try:
        dataset = bigquery_client.create_dataset(
            Dataset(dataset_ref)) or Dataset(dataset_ref)
    except exceptions.Conflict:
        pass

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(msg.stream))

            schema = schemas[msg.stream]
            current_time = calendar.timegm(time.gmtime())

            if validate_records:
                try:
                    validate(msg.record, schema)
                except Exception:
                    logger.error("Unable to validate {}".format(msg.record))

            # logger.info("Got the following schema - {}".format(schema))
            # logger.info("got the following metadata - {}".format(key_properties[msg.stream]))

            j = simplejson.dumps(msg.record)
            jparsed = simplejson.loads(j, use_decimal=False)
            jparsedFormated = formatRecord(jparsed)

            if msg.version:
                jparsedFormated[SINGER_TABLE_VERSION] = str(msg.version)

            if msg.time_extracted and jparsedFormated.get(
                    SINGER_RECEIVED_AT) is None:
                jparsedFormated[SINGER_RECEIVED_AT] = str(msg.time_extracted)

            # if self.use_uuid_pk and record.get(singer.PK) is None:
            jparsedFormated[SINGER_PK] = buildPrimaryKey(
                jparsedFormated, key_properties[msg.stream])
            # logger.info("got the following primary key - {}".format(jparsedFormated[SINGER_PK]))

            jparsedFormated[SINGER_BATCHED_AT] = current_batch

            jparsedFormated[SINGER_SEQUENCE] = current_time

            # logger.info("Streaming for {}".format(jparsedFormated))
            buffered_singer_stream.add_record_message(tables[msg.stream],
                                                      jparsedFormated)
            # err = bigquery_client.insert_rows_json(tables[msg.stream], [jparsedFormated], ignore_unknown_values=True, skip_invalid_rows=False)
            # if len(err):
            #     logger.error("Error syncing object {} with formatted payload {} got the following errors {}".format(msg.stream, jparsedFormated, err))
            # errors[msg.stream] = err
            rows[msg.stream] += 1

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream

            # Some schemas arrive without 'properties'; just ignore them
            if 'properties' not in msg.schema:
                continue

            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            logger.info("Dealing with {}".format(table))
            logger.info("Table Schema Info - {}".format(dataset.table(table)))
            logger.info("Raw Schema Info - {}".format(schemas[table]))
            logger.info("Schema Info - {}".format(build_schema(
                schemas[table])))
            tables[table] = bigquery.Table(dataset.table(
                formatName(table, "table")),
                                           schema=build_schema(schemas[table],
                                                               initial=True))
            rows[table] = 0
            errors[table] = None
            try:
                logger.info("creating table {}".format(tables[table]))
                tables[table] = bigquery_client.create_table(tables[table])
            except exceptions.Conflict:
                pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    buffered_singer_stream.flush_buffer()

    for table in errors.keys():
        if not errors[table]:
            logging.info("Loaded {} row(s) into {}:{}".format(
                rows[table], dataset_id, table, tables[table].path))
            tableName = "{}.{}".format(dataset_id, table)
            sql = """
MERGE {} t
USING (
  SELECT row[OFFSET(0)].* FROM (
    SELECT ARRAY_AGG(t ORDER BY t._wly_batched_at DESC LIMIT 1) row
    FROM {} t
    GROUP BY t._wly_primary_key 
  ) 
) s
ON t._wly_primary_key = s._wly_primary_key and t._wly_batched_at = s._wly_batched_at
WHEN NOT MATCHED BY SOURCE THEN DELETE
WHEN NOT MATCHED BY TARGET THEN INSERT ROW
            """.format(tableName, tableName)
            logging.info(
                'Executing the final transformation into {}'.format(tableName))
            try:
                query_job = bigquery_client.query(sql)
                results = query_job.result()
                if results:
                    logger.info(
                        "Cleaning table {} with sql query {} got the following results {}"
                        .format(tableName, sql, results))
            except Exception as err:
                logger.error("Failed cleaning table {}: {}".format(tableName, err))

            emit_state(state)
        else:
            logger.error('Errors: {}'.format(errors[table]))

    return state
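The MERGE above deduplicates the streaming table: for each _wly_primary_key it keeps only the newest row by _wly_batched_at and deletes every older version. A minimal sketch of verifying that outcome after the MERGE has run, assuming the same _wly_primary_key column (the table name is a placeholder):

from google.cloud import bigquery

client = bigquery.Client()
table_name = 'my_dataset.my_table'   # placeholder
check_sql = """
SELECT _wly_primary_key, COUNT(*) AS versions
FROM `{}`
GROUP BY _wly_primary_key
HAVING COUNT(*) > 1
""".format(table_name)

duplicates = list(client.query(check_sql).result())
assert not duplicates, 'Found keys with more than one row: {}'.format(duplicates)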
Ejemplo n.º 25
0
def persist_lines_stream(project_id, dataset_id, lines=None):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = {}

    bigquery_client = bigquery.Client(project=project_id)

    dataset_ref = bigquery_client.dataset(dataset_id)
    dataset = Dataset(dataset_ref)
    try:
        dataset = bigquery_client.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref)
    except exceptions.Conflict:
        pass

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception("A record for stream {} was encountered before a corresponding schema".format(msg.stream))

            schema = schemas[msg.stream]

            validate(msg.record, schema)

            errors[msg.stream] = bigquery_client.insert_rows_json(tables[msg.stream], [msg.record])
            rows[msg.stream] += 1

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream 
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table]))
            rows[table] = 0
            errors[table] = None
            try:
                tables[table] = bigquery_client.create_table(tables[table])
            except exceptions.Conflict:
                pass
        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in errors.keys():
        if not errors[table]:
            print('Loaded {} row(s) into {}:{} ({})'.format(rows[table], dataset_id, table, tables[table].path))
        else:
            print('Errors:', errors[table])

    return state
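build_schema is referenced throughout these examples but its implementation is not shown here. A purely hypothetical sketch of what such a helper might do, mapping Singer JSON Schema properties to SchemaField objects (the name build_schema_sketch and the type mapping are assumptions, not the actual implementation):

from google.cloud.bigquery import SchemaField

JSON_TO_BQ_TYPE = {'string': 'STRING', 'integer': 'INTEGER',
                   'number': 'FLOAT', 'boolean': 'BOOLEAN', 'object': 'RECORD'}

def build_schema_sketch(json_schema):
    """Map top-level JSON Schema properties to BigQuery SchemaFields."""
    fields = []
    for name, prop in json_schema.get('properties', {}).items():
        # JSON Schema types may be a list such as ['null', 'string'];
        # 'null' in the list is treated as NULLABLE.
        types = prop.get('type', ['string'])
        if isinstance(types, str):
            types = [types]
        mode = 'NULLABLE' if 'null' in types else 'REQUIRED'
        bq_type = next((JSON_TO_BQ_TYPE[t] for t in types if t in JSON_TO_BQ_TYPE),
                       'STRING')
        fields.append(SchemaField(name, bq_type, mode=mode))
    return fields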
Ejemplo n.º 26
0
def test_google_bigquery_origin_batch_handling(sdc_builder, sdc_executor, gcp):
    """Verify proper batch handling by the BigQuery origin.

    In this test, we write 8 records to BigQuery with a batch size of 3, verifying that each batch is of
    size 3 and that 8 total records are captured. The small numbers are used because of the limitations
    described in SDC-8765.
    """
    MAX_BATCH_SIZE = 3
    TOTAL_RECORDS = 8

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY number'

    pipeline_builder = sdc_builder.get_pipeline_builder()
    google_bigquery = pipeline_builder.add_stage('Google BigQuery',
                                                 type='origin')
    google_bigquery.set_attributes(query=query_str,
                                   max_batch_size_in_records=MAX_BATCH_SIZE)

    wiretap = pipeline_builder.add_wiretap()

    google_bigquery >> wiretap.destination

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [
        SchemaField('number', 'STRING', mode='required'),
        SchemaField('value', 'STRING', mode='required')
    ]

    data = [
        dict(number=str(i), value=get_random_string())
        for i in range(TOTAL_RECORDS)
    ]
    dataset = Dataset(bigquery_client.dataset(dataset_name))

    try:
        # Using Google bigquery client, create dataset, table and data inside table.
        logger.info('Creating dataset %s using Google bigquery client ...',
                    dataset_name)
        bigquery_client.create_dataset(dataset)
        table = bigquery_client.create_table(
            Table(dataset.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, data)
        assert not errors

        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        history = sdc_executor.get_pipeline_history(pipeline)

        # Assert that the batch size is being respected in each batch (including the last). In this case,
        # we'd expect batch sizes of 3, 3, and 2.
        assert history.latest.metrics.counter(
            'pipeline.batchCount.counter').count == 3
        assert history.latest.metrics.histogram(
            'stage.GoogleBigQuery_01.outputRecords.histogramM5'
        )._data['count'] == 3
        assert history.latest.metrics.histogram(
            'stage.GoogleBigQuery_01.outputRecords.histogramM5'
        )._data['max'] == 3
        assert history.latest.metrics.histogram(
            'stage.GoogleBigQuery_01.outputRecords.histogramM5'
        )._data['min'] == 2

        records = [record.field for record in wiretap.output_records]
        assert len(records) == len(data)
        assert records == data
    finally:
        bigquery_client.delete_dataset(dataset, delete_contents=True)
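The batch expectations asserted above follow directly from the record count and batch size; a quick sanity check of the 3, 3, 2 breakdown:

import math

MAX_BATCH_SIZE = 3
TOTAL_RECORDS = 8
expected_batches = math.ceil(TOTAL_RECORDS / MAX_BATCH_SIZE)                # 3 batches
last_batch_size = TOTAL_RECORDS - MAX_BATCH_SIZE * (expected_batches - 1)   # 2 records
assert (expected_batches, last_batch_size) == (3, 2)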
Ejemplo n.º 27
0
class LoadVocabTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        print('**************************************************************')
        print(cls.__name__)
        print('**************************************************************')

    def setUp(self):
        project_id = 'fake_project_id'
        dataset_id = 'fake_dataset_id'
        bucket_name = 'fake_bucket'
        dataset_ref = DatasetReference(project_id, dataset_id)
        self.dst_dataset = Dataset(dataset_ref)
        self.bq_client = mock.MagicMock()
        self.gcs_client = mock.MagicMock()
        self.bucket_name = bucket_name
        self.all_blobs = [
            Blob(f'{table}.csv', self.bucket_name)
            for table in common.VOCABULARY_TABLES
        ]

    def test_get_release_date(self):
        release_date = datetime.date(2021, 2, 10)
        expected = '20210210'
        actual = load_vocab.get_release_date(release_date)
        self.assertEqual(expected, actual)

    def test_load_stage(self):
        # TODO check that extra files are skipped

        # the expected calls to load_table_from_uri
        # are made when all vocabulary files are present
        self.gcs_client.list_blobs.return_value = self.all_blobs
        load_vocab.load_stage(self.dst_dataset, self.bq_client,
                              self.bucket_name, self.gcs_client)
        mock_ltfu = self.bq_client.load_table_from_uri
        expected_calls = [(f'gs://{self.bucket_name}/{table}.csv',
                           self.dst_dataset.table(table))
                          for table in common.VOCABULARY_TABLES]
        actual_calls = [(source_uri, destination)
                        for (source_uri,
                             destination), _ in mock_ltfu.call_args_list]
        self.assertListEqual(expected_calls, actual_calls)

        # error is thrown when vocabulary files are missing
        expected_missing = [common.DOMAIN, common.CONCEPT_SYNONYM]
        incomplete_blobs = [
            Blob(f'{table}.csv', self.bucket_name)
            for table in common.VOCABULARY_TABLES
            if table not in expected_missing
        ]
        self.gcs_client.list_blobs.return_value = incomplete_blobs
        expected_msg = f'Bucket {self.bucket_name} is missing files for tables {expected_missing}'
        with self.assertRaises(RuntimeError) as c:
            load_vocab.load_stage(self.dst_dataset, self.bq_client,
                                  self.bucket_name, self.gcs_client)
        self.assertIsInstance(c.exception, RuntimeError)
        self.assertEqual(str(c.exception), expected_msg)
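For reference, the mocked load_table_from_uri calls above stand in for a real client call along these lines; the CSV job options and the 'concept' table are illustrative assumptions, not taken from load_vocab itself:

from google.cloud import bigquery

client = bigquery.Client()
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE)
load_job = client.load_table_from_uri(
    'gs://fake_bucket/concept.csv',                # hypothetical vocabulary file
    'fake_project_id.fake_dataset_id.concept',     # destination as a table id string
    job_config=job_config)
load_job.result()                                  # wait for the load job to finish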
Ejemplo n.º 28
0
def persist_lines_stream(project_id, dataset_id, lines=None, validate_records=True):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = {}

    bigquery_client = bigquery.Client(project=project_id)

    dataset_ref = bigquery_client.dataset(dataset_id)
    dataset = Dataset(dataset_ref)
    try:
        dataset = bigquery_client.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref)
    except exceptions.Conflict:
        pass

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema".format(msg.stream))

            schema = schemas[msg.stream]

            if validate_records:
                validate(msg.record, schema)

            msg.record = apply_string_conversions(msg.record, tables[msg.stream].schema, msg.stream)
            errors[msg.stream] = bigquery_client.insert_rows_json(tables[msg.stream], [msg.record],
                                                                  ignore_unknown_values=True)
            rows[msg.stream] += 1

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table]))
            rows[table] = 0
            errors[table] = None
            try:
                tables[table] = bigquery_client.create_table(tables[table])
            except exceptions.Conflict:
                pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in errors.keys():
        if not errors[table]:
            logger.info('Loaded {} row(s) into {}:{} ({})'.format(rows[table], dataset_id, table, tables[table].path))
            emit_state(state)
        else:
            logger.error('Errors: {}'.format(errors[table]))

    return state
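A note on the streaming-insert options used above: ignore_unknown_values drops record keys that are not in the table schema instead of rejecting the row, while skip_invalid_rows controls whether a single bad row fails the whole request. A minimal sketch (the table id and rows are placeholders; newer client versions accept the table as a plain string):

from google.cloud import bigquery

client = bigquery.Client()
errors = client.insert_rows_json(
    'my_project.my_dataset.my_table',                  # placeholder table id
    [{'id': 1, 'name': 'a', 'extra_field': 'dropped'}],
    ignore_unknown_values=True,                        # drop keys missing from the schema
    skip_invalid_rows=False)                           # one bad row fails the whole request
if errors:
    print('Row-level errors: {}'.format(errors))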
Ejemplo n.º 29
0
def persist_lines(project_id, dataset_id, table_id, lines):
    state = None
    schemas = {}
    key_properties = {}

    rows = []

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(msg.stream))

            schema = schemas[msg.stream]
            validate(msg.record, schema)

            bigquery_client = bigquery.Client(project=project_id)

            dataset_ref = bigquery_client.dataset(dataset_id)
            dataset = Dataset(dataset_ref)

            try:
                dataset = bigquery_client.create_dataset(
                    Dataset(dataset_ref)) or Dataset(dataset_ref)
            except exceptions.Conflict:
                pass

            table_ref = dataset.table(table_id)
            table_schema = build_schema(schema)

            table = bigquery.Table(table_ref, schema=table_schema)
            try:
                table = bigquery_client.create_table(table)
            except exceptions.Conflict:
                pass

            rows.append(msg.record)

            state = None
        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value
        elif isinstance(msg, singer.SchemaMessage):
            schemas[msg.stream] = msg.schema
            key_properties[msg.stream] = msg.key_properties
        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass
        else:
            raise Exception("Unrecognized message {}".format(msg))

    errors = bigquery_client.insert_rows_json(table, rows)

    if not errors:
        print('Loaded {} row(s) into {}:{}'.format(len(rows), dataset_id,
                                                   table_id))
    else:
        print('Errors:')
        pprint(errors)

    return state
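As an aside, because this variant buffers all records and writes them once at the end, a load job is a possible alternative to the streaming insert and avoids streaming quotas. A minimal sketch, assuming a newer google-cloud-bigquery that provides load_table_from_json (project, dataset and table names are placeholders):

from google.cloud import bigquery

client = bigquery.Client(project='my-project')          # placeholder project
table_id = 'my-project.my_dataset.my_table'             # placeholder table
rows = [{'id': 1, 'name': 'a'}, {'id': 2, 'name': 'b'}]

job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
    autodetect=True)                                     # or pass an explicit schema
load_job = client.load_table_from_json(rows, table_id, job_config=job_config)
load_job.result()                                        # wait for the job to complete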