def test_push_pull(sdc_builder, sdc_executor, gcp):
    """
    We plan to verify that the connector works fine with Dev Raw Data Source and Dev Data Generator, an example of pull
    and push strategies, so as we already verified Dev Raw Data Source, we will use Dev Data Generator here to complete
    the coverage.

    The pipeline looks like:
        dev_raw_data_source >> google_cloud_storage_destination
    """

    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client

    bucket_name = "stf_std_" + get_random_string(string.ascii_lowercase, 20)

    dev_data_generator = pipeline_builder.add_stage('Dev Data Generator')

    dev_data_generator.set_attributes(batch_size=1,
                                      fields_to_generate=[{
                                          'field': 'stringField',
                                          'type': 'STRING',
                                          'precision': 10,
                                          'scale': 2
                                      }])

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='destination')

    google_cloud_storage.set_attributes(bucket=bucket_name,
                                        common_prefix='gcs-test',
                                        partition_prefix='test',
                                        data_format='JSON')

    dev_data_generator >> google_cloud_storage

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Destination Test Push Pull'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)

        sdc_executor.start_pipeline(
            pipeline).wait_for_pipeline_output_records_count(20)
        sdc_executor.stop_pipeline(pipeline)

        history = sdc_executor.get_pipeline_history(pipeline)
        history_records = history.latest.metrics.counter(
            'stage.GoogleCloudStorage_01.outputRecords.counter').count

        blob_iter = created_bucket.list_blobs(prefix='gcs-test')
        blobs = [blob for blob in blob_iter]
        assert len(blobs) == history_records

    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
def test_data_format_log(sdc_builder, sdc_executor, gcp):
    """
    Read data from Google Cloud Storage using the Log data format.

    The pipeline looks like:
        google_cloud_storage_origin >> wiretap
    """
    MESSAGE = '200 [main] DEBUG org.StreamSets.Log4j unknown - This is a sample log message'
    EXPECTED_OUTPUT = {
        'category': 'org.StreamSets.Log4j',
        'message': 'This is a sample log message',
        'ndc': 'unknown',
        'relativetime': '200',
        'severity': 'DEBUG',
        'thread': 'main'
    }

    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='origin')

    bucket_name = get_random_string(string.ascii_lowercase, 20)

    google_cloud_storage.set_attributes(bucket=bucket_name,
                                        common_prefix='gcs-test',
                                        prefix_pattern='**/*.log',
                                        data_format='LOG',
                                        log_format='LOG4J')
    wiretap = pipeline_builder.add_wiretap()

    google_cloud_storage >> wiretap.destination

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Origin Data Format Log'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)
        blob = created_bucket.blob('gcs-test/sdc-test.log')
        blob.upload_from_string(MESSAGE)

        logger.info(
            'Starting GCS Origin pipeline and wait until the information is read ...'
        )
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline, 'input_record_count',
                                              1)
        sdc_executor.stop_pipeline(pipeline)

        assert [record.field
                for record in wiretap.output_records] == [EXPECTED_OUTPUT]
    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
def test_bucket_names(sdc_builder, sdc_executor, gcp, test_name, bucket_name):
    """
    Write data to Google Cloud Storage with different valid bucket names.

    The pipeline looks like:
        dev_raw_data_source >> google_cloud_storage_destination
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client

    data = [
        get_random_string(string.ascii_letters, length=100) for _ in range(10)
    ]

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')

    dev_raw_data_source.set_attributes(data_format='TEXT',
                                       stop_after_first_batch=True,
                                       raw_data='\n'.join(data))

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='destination')

    google_cloud_storage.set_attributes(bucket=bucket_name,
                                        common_prefix='gcs-test',
                                        partition_prefix='test',
                                        data_format='TEXT')

    dev_raw_data_source >> google_cloud_storage

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Destination Bucket Names {test_name}'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)

        logger.info(
            'Starting GCS Destination pipeline and waiting for it to produce records'
            ' and transition to finished...')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        blob_iter = created_bucket.list_blobs(max_results=1, prefix='gcs-test')
        blobs = [blob for blob in blob_iter]
        assert len(blobs) == 1
        blob = blobs[0]
        # Decode the byte array returned by storage client
        contents = blob.download_as_string().decode('ascii')
        # Strip out the lines which are empty (essentially the last line)
        lines = [line for line in contents.split('\n') if len(line) > 0]
        assert lines == data

    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
def test_data_format_delimited(sdc_builder, sdc_executor, gcp):
    """
    Write data to Google Cloud Storage using the Delimited data format.

    The pipeline looks like:
        dev_raw_data_source >> google_cloud_storage_destination
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client

    bucket_name = "stf_std_" + get_random_string(string.ascii_lowercase, 20)

    raw_data = 'Alex,Xavi,Tucu,Martin'
    EXPECTED_OUTPUT = 'Alex,Xavi,Tucu,Martin\r'
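    # The trailing '\r' is expected: the delimited generator terminates records with '\r\n', and the
    # verification below splits the blob contents on '\n', which leaves the '\r' on the line.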

    dev_raw_data_source = pipeline_builder.add_stage(
        'Dev Raw Data Source').set_attributes(data_format='DELIMITED',
                                              raw_data=raw_data,
                                              stop_after_first_batch=True)

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='destination')

    google_cloud_storage.set_attributes(bucket=bucket_name,
                                        common_prefix='gcs-test',
                                        partition_prefix='test',
                                        data_format='DELIMITED')

    dev_raw_data_source >> google_cloud_storage

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Destination Test Data Format Delimited'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)

        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        blob_iter = created_bucket.list_blobs(max_results=1, prefix='gcs-test')
        blobs = [blob for blob in blob_iter]
        assert len(blobs) == 1
        blob = blobs[0]
        # Decode the byte array returned by storage client
        contents = blob.download_as_string().decode('ascii')
        # Strip out the lines which are empty (essentially the last line)
        lines = [line for line in contents.split('\n') if len(line) > 0]
        assert len(lines) == 1
        assert lines[0] == EXPECTED_OUTPUT

    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
def test_data_format_protobuf(sdc_builder, sdc_executor, gcp):
    """
    Write data to Google Cloud Storage using the Protobuf data format.

    The pipeline looks like:
        dev_raw_data_source >> google_cloud_storage_destination
    """
    data = '{"first_name": "Martin","last_name": "Balzamo"}'
    expected = '\x11\x06Martin\x12\x07Balzamo'
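    # The expected value is the delimited protobuf encoding of the record (assuming the usual Contact
    # definition with first_name = 1 and last_name = 2): a 0x11 (17-byte) length prefix, then the two
    # length-delimited string fields. The 0x0A tag byte of first_name is missing here because the check
    # below splits the blob contents on '\n' (0x0A) before joining the lines back together.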

    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client
    bucket_name = "stf_std_" + get_random_string(string.ascii_lowercase, 20)

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')

    dev_raw_data_source.set_attributes(data_format='JSON',
                                       stop_after_first_batch=True,
                                       raw_data=data)

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='destination')
    google_cloud_storage.set_attributes(
        bucket=bucket_name,
        common_prefix='gcs-test',
        data_format='PROTOBUF',
        message_type='Contact',
        protobuf_descriptor_file=PROTOBUF_FILE_PATH)

    dev_raw_data_source >> google_cloud_storage

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Destination Data Format Protobuf'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)

        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        blob_iter = created_bucket.list_blobs(max_results=1, prefix='gcs-test')
        blobs = [blob for blob in blob_iter]
        assert len(blobs) == 1
        blob = blobs[0]
        # Decode the byte array returned by storage client
        contents = blob.download_as_string().decode('ascii')
        # Strip out the lines which are empty (essentially the last line)
        lines = [line for line in contents.split('\n') if len(line) > 0]
        result = ''.join(lines)
        assert result == expected
    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
def test_data_format_whole_file(sdc_builder, sdc_executor, gcp):
    """
    Read data from Google Cloud Storage using the Whole File data format.

    The pipeline looks like:
        google_cloud_storage_origin >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='origin')

    bucket_name = get_random_string(string.ascii_lowercase, 20)

    google_cloud_storage.set_attributes(bucket=bucket_name,
                                        common_prefix='gcs-test',
                                        prefix_pattern='**/*.txt',
                                        data_format='WHOLE_FILE')
    wiretap = pipeline_builder.add_wiretap()

    google_cloud_storage >> wiretap.destination

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Origin Data Format Whole File'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)
        data = [
            get_random_string(string.ascii_letters, 100) for _ in range(10)
        ]
        blob = created_bucket.blob('gcs-test/sdc-test.txt')
        blob.upload_from_string('\n'.join(data))

        logger.info(
            'Starting GCS Origin pipeline and wait until the information is read ...'
        )
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline, 'input_record_count',
                                              1)
        sdc_executor.stop_pipeline(pipeline)

        output_records = [record.field for record in wiretap.output_records]
        # Whole file mode only returns metadata, but no actual data, so we can only test that a single file was passed
        # and that it has the correct name
        assert len(output_records) == 1
        assert output_records[0]['fileInfo']['file'] == 'gcs-test/sdc-test.txt'

    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
def test_data_format_text(sdc_builder, sdc_executor, gcp):
    """
    Read data from Google Cloud Storage using the Text data format.

    The pipeline looks like:
        google_cloud_storage_origin >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='origin')

    bucket_name = get_random_string(string.ascii_lowercase, 20)

    google_cloud_storage.set_attributes(bucket=bucket_name,
                                        common_prefix='gcs-test',
                                        prefix_pattern='**/*.txt',
                                        data_format='TEXT')
    wiretap = pipeline_builder.add_wiretap()

    google_cloud_storage >> wiretap.destination

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Origin Data Format Text'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)
        data = [
            get_random_string(string.ascii_letters, 100) for _ in range(10)
        ]
        blob = created_bucket.blob('gcs-test/sdc-test.txt')
        blob.upload_from_string('\n'.join(data))

        logger.info(
            'Starting GCS Origin pipeline and wait until the information is read ...'
        )
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline, 'input_record_count',
                                              10)
        sdc_executor.stop_pipeline(pipeline)

        output_records = [
            record.field['text'] for record in wiretap.output_records
        ]

        assert len(data) == len(output_records)
        assert output_records == data
    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
def test_data_format_delimited(sdc_builder, sdc_executor, gcp, csv_parser):
    """
    Read data from Google Cloud Storage using the Delimited data format.

    The pipeline looks like:
        google_cloud_storage_origin >> wiretap
    """
    MESSAGE = 'Alex,Xavi,Tucu,Martin'
    EXPECTED_OUTPUT = {'0': 'Alex', '1': 'Xavi', '2': 'Tucu', '3': 'Martin'}

    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='origin')

    bucket_name = get_random_string(string.ascii_lowercase, 20)

    google_cloud_storage.set_attributes(bucket=bucket_name,
                                        common_prefix='gcs-test',
                                        prefix_pattern='**/*.del',
                                        data_format='DELIMITED',
                                        csv_parser=csv_parser)
    wiretap = pipeline_builder.add_wiretap()

    google_cloud_storage >> wiretap.destination

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Origin Data Format Delimited'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)
        blob = created_bucket.blob('gcs-test/sdc-test.del')
        blob.upload_from_string(MESSAGE)

        logger.info(
            'Starting GCS Origin pipeline and wait until the information is read ...'
        )
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline, 'input_record_count',
                                              1)
        sdc_executor.stop_pipeline(pipeline)

        assert [record.field
                for record in wiretap.output_records] == [EXPECTED_OUTPUT]
    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
def test_empty_objects(sdc_builder, sdc_executor, gcp):
    """
    Test that no records are generated when the bucket contains no data to read.

    The pipeline looks like:
        google_cloud_storage_origin >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='origin')

    bucket_name = get_random_string(string.ascii_lowercase, 20)

    google_cloud_storage.set_attributes(bucket=bucket_name,
                                        common_prefix='gcs-test',
                                        prefix_pattern='**/*.txt',
                                        data_format='TEXT')
    wiretap = pipeline_builder.add_wiretap()

    google_cloud_storage >> wiretap.destination

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Origin Empty').configure_for_environment(
            gcp)
    sdc_executor.add_pipeline(pipeline)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)
        blob = created_bucket.blob('gcs-test/sdc-test.txt')

        logger.info(
            'Starting GCS Origin pipeline and wait until the information is read ...'
        )
        sdc_executor.start_pipeline(pipeline)

        time.sleep(30)

        sdc_executor.stop_pipeline(pipeline)

        # Assert that no records were generated
        assert 0 == len(wiretap.output_records)

    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
def test_google_storage_error(sdc_builder, sdc_executor, gcp):
    """Ensure that the error stage for Google Storage works properly"""
    bucket_name = get_random_string(string.ascii_lowercase, 10)

    builder = sdc_builder.get_pipeline_builder()
    origin = builder.add_stage('Dev Raw Data Source')
    origin.data_format = 'TEXT'
    origin.stop_after_first_batch = True
    origin.raw_data = 'Hello!'

    error_target = builder.add_stage('To Error')

    origin >> error_target

    gcs = builder.add_error_stage('Write to Google Cloud Storage')
    gcs.bucket = bucket_name
    gcs.common_prefix = 'gcs-test-error'

    pipeline = builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    storage_client = gcp.storage_client
    created_bucket = gcp.retry_429(storage_client.create_bucket)(bucket_name)
    try:
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        blob_iter = created_bucket.list_blobs(max_results=1,
                                              prefix='gcs-test-error')
        blobs = [blob for blob in blob_iter]
        assert len(blobs) == 1
        blob = blobs[0]
        # Decode the byte array returned by storage client
        contents = blob.download_as_string()[1:].decode('ascii')
        logger.info(f"Loaded raw data: {contents}")
        sdc_json = json.loads(contents)

        assert sdc_json['value']['value']['text']['value'] == 'Hello!'
    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
def test_data_format_sdc_record(sdc_builder, sdc_executor, gcp):
    """
    Write data to Google Cloud Storage using the SDC Record data format.

    The pipeline looks like:
        dev_raw_data_source >> google_cloud_storage_destination
    """
    json_data = [{
        "field1": "abc",
        "field2": "def",
        "field3": "ghi"
    }, {
        "field1": "jkl",
        "field2": "mno",
        "field3": "pqr"
    }]
    raw_data = ''.join(json.dumps(record) for record in json_data)

    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client

    bucket_name = "stf_std_" + get_random_string(string.ascii_lowercase, 20)

    dev_raw_data_source = pipeline_builder.add_stage(
        'Dev Raw Data Source').set_attributes(data_format='JSON',
                                              raw_data=raw_data,
                                              stop_after_first_batch=True)

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='destination')

    google_cloud_storage.set_attributes(bucket=bucket_name,
                                        common_prefix='gcs-test',
                                        partition_prefix='test',
                                        data_format='SDC_JSON')

    dev_raw_data_source >> google_cloud_storage

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Destination Test Data Format SDC Record'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)

        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        # To verify that the SDC Record format has been successfully stored, we read the data from GCS using an auxiliary
        # pipeline. This pipeline is set to read data in SDC Record format.
        wiretap_records = read_messages_SDC_record_gcp(bucket_name,
                                                       sdc_builder,
                                                       sdc_executor, gcp)

        # We compare the results read by the GCS Origin pipeline and check that the data is equal to the original data stored
        assert len(wiretap_records) == len(json_data)
        assert wiretap_records[0].field == json_data[0]
        assert wiretap_records[1].field == json_data[1]

    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
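
# The read_messages_SDC_record_gcp helper is not shown in this excerpt. The sketch below is an assumption
# of what it does, mirroring the origin tests in this module: build an auxiliary pipeline that reads the
# bucket back with the GCS origin in SDC Record (SDC_JSON) format and return the wiretap records. The
# record count waited for (2) matches the two records written by the test above.
def read_messages_SDC_record_gcp(bucket_name, sdc_builder, sdc_executor, gcp):
    builder = sdc_builder.get_pipeline_builder()

    google_cloud_storage = builder.add_stage('Google Cloud Storage', type='origin')
    google_cloud_storage.set_attributes(bucket=bucket_name,
                                        common_prefix='gcs-test',
                                        prefix_pattern='**/*',
                                        data_format='SDC_JSON')
    wiretap = builder.add_wiretap()

    google_cloud_storage >> wiretap.destination

    pipeline = builder.build(
        title='Google Cloud Storage SDC Record Auxiliary Read Pipeline'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    # Read back the records written by the destination pipeline and return them for verification
    sdc_executor.start_pipeline(pipeline).wait_for_pipeline_output_records_count(2)
    sdc_executor.stop_pipeline(pipeline)

    return wiretap.output_records
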
def test_data_format_excel(sdc_builder, sdc_executor, gcp):
    """
    Read data from Google Cloud Storage using the Excel data format.

    The pipeline looks like:
        google_cloud_storage_origin >> wiretap
    """
    gcp_file_name = get_random_string(string.ascii_letters, 10)

    # Create the Excel file

    workbook = Workbook()
    sheet = workbook.add_sheet('0')

    colcount = 5
    rowcount = 10

    for col in range(colcount):
        for row in range(rowcount):
            sheet.write(row, col, 'TAB({row}, {col})'.format(row=row, col=col))

    workbook.save(gcp_file_name)

    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='origin')

    bucket_name = get_random_string(string.ascii_lowercase, 20)

    google_cloud_storage.set_attributes(bucket=bucket_name,
                                        common_prefix='gcs-test',
                                        prefix_pattern=gcp_file_name,
                                        data_format='EXCEL',
                                        excel_header_option='NO_HEADER')
    wiretap = pipeline_builder.add_wiretap()

    google_cloud_storage >> wiretap.destination

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Origin Data Format Excel'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)
        blob = created_bucket.blob('gcs-test/' + gcp_file_name)
        blob.upload_from_filename(gcp_file_name)

        logger.info(
            'Starting GCS Origin pipeline and wait until the information is read ...'
        )
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline, 'input_record_count',
                                              10)
        sdc_executor.stop_pipeline(pipeline)

        output_records = [record.field for record in wiretap.output_records]
        len_records = len(output_records)

        # Compare the values read from the output records with what was written to the workbook
        for row_res in range(len_records):
            for col_res in range(colcount):
                assert output_records[row_res][str(
                    col_res)] == "TAB({row}, {col})".format(row=row_res,
                                                            col=col_res)
    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
        os.remove(gcp_file_name)
def test_data_format_avro(sdc_builder, sdc_executor, gcp):
    """
    Read data from Google Cloud Storage using the Avro data format.

    The pipeline looks like:
        google_cloud_storage_origin >> wiretap
    """
    DATA = {
        'name': 'boss',
        'age': 60,
        'emails': ['*****@*****.**', '*****@*****.**'],
        'boss': None
    }
    SCHEMA = {
        'namespace': 'example.avro',
        'type': 'record',
        'name': 'Employee',
        'fields': [
            {'name': 'name', 'type': 'string'},
            {'name': 'age', 'type': 'int'},
            {'name': 'emails', 'type': {'type': 'array', 'items': 'string'}},
            {'name': 'boss', 'type': ['Employee', 'null']}
        ]
    }

    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='origin')

    bucket_name = get_random_string(string.ascii_lowercase, 20)

    google_cloud_storage.set_attributes(bucket=bucket_name,
                                        common_prefix='gcs-test',
                                        prefix_pattern='**/*.avro',
                                        data_format='AVRO',
                                        avro_schema=json.dumps(SCHEMA),
                                        avro_schema_location='SOURCE')
    wiretap = pipeline_builder.add_wiretap()

    google_cloud_storage >> wiretap.destination

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Origin Data Format Avro'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)
        blob = created_bucket.blob('gcs-test/sdc-test.avro')

        binary_stream = io.BytesIO()
        datum_writer = avro.io.DatumWriter(
            avro.schema.Parse(json.dumps(SCHEMA)))
        with avro.datafile.DataFileWriter(
                writer=binary_stream,
                datum_writer=datum_writer,
                writer_schema=avro.schema.Parse(
                    json.dumps(SCHEMA))) as data_file_writer:
            data_file_writer.append(DATA)
            data_file_writer.flush()
            raw_bytes = binary_stream.getvalue()

        blob.upload_from_string(raw_bytes)

        logger.info(
            'Starting GCS Origin pipeline and wait until the information is read ...'
        )
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline, 'input_record_count',
                                              1)
        sdc_executor.stop_pipeline(pipeline)

        assert [record.field for record in wiretap.output_records] == [DATA]
    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
def test_data_format_avro(sdc_builder, sdc_executor, gcp):
    """
    Write data to Google Cloud Storage using the Avro data format.

    The pipeline looks like:
        dev_raw_data_source >> google_cloud_storage_destination
    """
    DATA = {
        'name': 'boss',
        'age': 60,
        'emails': ['*****@*****.**', '*****@*****.**'],
        'boss': None
    }
    SCHEMA = {
        'namespace': 'example.avro',
        'type': 'record',
        'name': 'Employee',
        'fields': [
            {'name': 'name', 'type': 'string'},
            {'name': 'age', 'type': 'int'},
            {'name': 'emails', 'type': {'type': 'array', 'items': 'string'}},
            {'name': 'boss', 'type': ['Employee', 'null']}
        ]
    }

    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client

    bucket_name = "stf_std_" + get_random_string(string.ascii_lowercase, 20)

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')

    dev_raw_data_source.set_attributes(data_format='JSON',
                                       stop_after_first_batch=True,
                                       raw_data=json.dumps(DATA))

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='destination')

    google_cloud_storage.set_attributes(bucket=bucket_name,
                                        common_prefix='gcs-test',
                                        partition_prefix='test',
                                        data_format='AVRO',
                                        avro_schema=json.dumps(SCHEMA),
                                        avro_schema_location='INLINE')

    dev_raw_data_source >> google_cloud_storage

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Destination Data Format Avro'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)

        logger.info(
            'Starting GCS Destination pipeline and waiting for it to finish writing ...'
        )
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        # To verify that the Avro format has been successfully stored, we read the data from GCS using an auxiliary
        # pipeline. This pipeline is set to read data in Avro format.
        result = read_avro_data(bucket_name, sdc_builder, sdc_executor, gcp)

        # We compare the results read by the GCS Origin pipeline and check that the data is equal to the original data stored
        assert [record.field for record in result] == [DATA]
    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
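
# The read_avro_data helper is not shown in this excerpt. The sketch below is an assumption of what it
# does: read the Avro blobs back with the GCS origin and return the wiretap records. Since the Avro files
# written by the destination embed the schema, avro_schema_location='SOURCE' is assumed to be sufficient.
def read_avro_data(bucket_name, sdc_builder, sdc_executor, gcp):
    builder = sdc_builder.get_pipeline_builder()

    google_cloud_storage = builder.add_stage('Google Cloud Storage', type='origin')
    google_cloud_storage.set_attributes(bucket=bucket_name,
                                        common_prefix='gcs-test',
                                        prefix_pattern='**/*',
                                        data_format='AVRO',
                                        avro_schema_location='SOURCE')
    wiretap = builder.add_wiretap()

    google_cloud_storage >> wiretap.destination

    pipeline = builder.build(
        title='Google Cloud Storage Avro Auxiliary Read Pipeline'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    # One record was written by the destination test, so wait for a single record to be read back
    sdc_executor.start_pipeline(pipeline).wait_for_pipeline_output_records_count(1)
    sdc_executor.stop_pipeline(pipeline)

    return wiretap.output_records
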
def test_data_format_json(sdc_builder, sdc_executor, gcp, data_type):
    """
    Read data from Google Cloud Storage using the JSON data format.

    The pipeline looks like:
        google_cloud_storage_origin >> wiretap
    """
    # We map data_type to input data as well as the expected output.
    DATA_TYPE = {
        'ARRAY': ['Alex', 'Xavi'],
        'ARRAY_OF_OBJECTS': [{
            'Alex': 'Developer'
        }, {
            'Xavi': 'Developer'
        }],
        'OBJECT': {
            'Alex': 'Developer',
            'Xavi': 'Developer'
        }
    }
    data = json.dumps(DATA_TYPE[data_type])
    expected_output = DATA_TYPE[data_type]

    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='origin')

    bucket_name = get_random_string(string.ascii_lowercase, 20)

    google_cloud_storage.set_attributes(bucket=bucket_name,
                                        common_prefix='gcs-test',
                                        prefix_pattern='**/*.json',
                                        data_format='JSON')
    wiretap = pipeline_builder.add_wiretap()

    google_cloud_storage >> wiretap.destination

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Origin Data Format JSON'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)
        blob = created_bucket.blob('gcs-test/sdc-test.json')
        blob.upload_from_string(data)

        logger.info(
            'Starting GCS Origin pipeline and wait until the information is read ...'
        )
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline, 'input_record_count',
                                              1)
        sdc_executor.stop_pipeline(pipeline)

        assert [record.field
                for record in wiretap.output_records] == [expected_output]
    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
def test_dataflow_events(sdc_builder, sdc_executor, gcp):
    """
    Write data to Google Cloud Storage, capture the generated events with a wiretap and verify their content.

    The pipeline looks like:
        dev_raw_data_source >> google_cloud_storage_destination >= wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client

    bucket_name = "stf_std_" + get_random_string(string.ascii_lowercase, 20)

    data = [
        get_random_string(string.ascii_letters, length=100) for _ in range(10)
    ]

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')

    dev_raw_data_source.set_attributes(data_format='TEXT',
                                       stop_after_first_batch=True,
                                       raw_data='\n'.join(data))

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='destination')

    google_cloud_storage.set_attributes(
        bucket=bucket_name,
        common_prefix='gcs-test',
        partition_prefix='${YYYY()}/${MM()}/${DD()}/${hh()}/${mm()}',
        data_format='TEXT')

    wiretap = pipeline_builder.add_wiretap()

    dev_raw_data_source >> google_cloud_storage >= wiretap.destination

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Destination Dataflow Events'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        blob_iter = created_bucket.list_blobs(max_results=1, prefix='gcs-test')
        blobs = [blob for blob in blob_iter]
        assert len(blobs) == 1
        blob = blobs[0]
        # Decode the byte array returned by storage client
        contents = blob.download_as_string().decode('ascii')
        # Strip out the lines which are empty (essentially the last line)
        lines = [line for line in contents.split('\n') if len(line) > 0]
        assert lines == data

        # One event should be created
        records = wiretap.output_records
        assert len(records) == 1
        assert records[0].get_field_data('/bucket') == bucket_name
        assert records[0].get_field_data('/recordCount') == 10

    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
def test_multiple_batches(sdc_builder, sdc_executor, gcp):
    """
    Test that we can write multiple batches and the pipeline writes all the data exactly once.

    The pipeline looks like:
        dev_raw_data_source >> google_cloud_storage_destination
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client

    bucket_name = "stf_std_" + get_random_string(string.ascii_lowercase, 20)

    batch_size = 100
    batches = 10

    dev_data_generator = pipeline_builder.add_stage('Dev Data Generator')

    # From SDC 4.1.0 onwards, Dev Data Generator has a 'records_to_be_generated' configuration that could be used
    # to guarantee that exactly batch_size * batches records are generated, but it is not used here in order to
    # keep this test backwards compatible.
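    # If that configuration were used, the setup would look something like:
    #     dev_data_generator.set_attributes(records_to_be_generated=batch_size * batches)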
    dev_data_generator.set_attributes(batch_size=batch_size,
                                      fields_to_generate=[{
                                          'field': 'text',
                                          'type': 'LONG_SEQUENCE'
                                      }])

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='destination')

    google_cloud_storage.set_attributes(bucket=bucket_name,
                                        common_prefix='gcs-test',
                                        partition_prefix='test',
                                        data_format='TEXT')

    dev_data_generator >> google_cloud_storage

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Destination Test Multiple Batches'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)

        sdc_executor.start_pipeline(
            pipeline).wait_for_pipeline_output_records_count(batch_size *
                                                             batches)
        sdc_executor.stop_pipeline(pipeline)

        # The pipeline may have written more records than requested before it was stopped, so get the precise count from the metrics
        history = sdc_executor.get_pipeline_history(pipeline)
        records = history.latest.metrics.counter(
            'pipeline.batchInputRecords.counter').count
        logger.info(f"Detected {records} output records")
        # Sanity check
        assert records >= batch_size * batches, "Check that at least the required number of records have been generated"

        # GCS writes each batch to a different blob, so there should be at least 10 blobs with roughly 100 records each
        blob_iter = created_bucket.list_blobs(prefix='gcs-test')
        blobs = [blob for blob in blob_iter]
        assert len(blobs) >= batches, "GCS puts each batch in a different blob, so there are at least 10 blobs but " \
                                      "potentially more if incomplete batches were generated"
        result = []
        for blob in blobs:
            # Decode the byte array returned by storage client
            contents = blob.download_as_string().decode('ascii')
            # Strip out the lines which are empty (essentially the last line)
            lines = [line for line in contents.split('\n') if len(line) > 0]
            result.extend(lines)

        result.sort(key=float)
        assert result == [f'{i}' for i in range(0, records)], "Verify that every record has been written exactly once " \
                                                              "by checking the sequence"

    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
def test_google_cloud_storage_executor_change_object(sdc_builder, sdc_executor, gcp, with_bucket, with_object, with_el):
    """Test for Google Cloud Storage Executor stage, operation Change Object.
    """

    bucket_name_prefix = 'stf_gcse_b_'
    bucket_name = f'{bucket_name_prefix}{get_random_string(string.ascii_lowercase, 10)}'

    object_name_prefix = 'stf_gcse_o_'
    object_name = f'{object_name_prefix}{get_random_string(string.ascii_lowercase, 10)}'

    object_content = f'In a hole in the ground there lived a Hobbit'

    source_metadata = {'Dwarf':   'Gimli',
                       'Dunadan': 'Aragorn',
                       'Elf':     'Legolas',
                       'Hobbit':  'Frodo',
                       'Human':   'Boromir',
                       'Wizard':  'Gandalf'}
    source_object_metadata = {}
    i_s = 1
    for key in source_metadata:
        source_object_metadata[f'k{i_s:0>2d}'] = key
        source_object_metadata[f'v{i_s:0>2d}'] = source_metadata[key]
        i_s = i_s + 1
    target_metadata = {'Honinbo': 'Takemiya Masaki',
                       'Judan':   'Kato Masao',
                       'Kisei':   'Cho Chikun',
                       'Meijin':  'Ishida Yoshio'}
    target_object_metadata = {}
    i_t = 1
    for key in target_metadata:
        target_object_metadata[f'k{i_t:0>2d}'] = key
        target_object_metadata[f'v{i_t:0>2d}'] = target_metadata[key]
        i_t = i_t + 1

    raw_data_map = {'bucket': bucket_name,
                    'object': object_name,
                    'metadata': target_object_metadata}
    raw_data = json.dumps(raw_data_map)

    if with_el:
        bucket_specification = '${record:value("/bucket")}'
        object_specification = '${record:value("/object")}'
        metadata_specification = [{'key': '${record:value("/metadata/k01")}', 'value': '${record:value("/metadata/v01")}'},
                                  {'key': '${record:value("/metadata/k02")}', 'value': '${record:value("/metadata/v02")}'},
                                  {'key': '${record:value("/metadata/k03")}', 'value': '${record:value("/metadata/v03")}'},
                                  {'key': '${record:value("/metadata/k04")}', 'value': '${record:value("/metadata/v04")}'}]
    else:
        bucket_specification = bucket_name
        object_specification = object_name
        metadata_specification = []
        for key in target_metadata:
            metadata_specification.append({'key': key, 'value': target_metadata[key]})

    pipeline_name = f'{get_random_string(string.ascii_letters, 10)}'
    pipeline_title = f'Google Cloud Storage Test Pipeline - Change Object: {pipeline_name}'

    try:
        if with_bucket:
            logger.info(f'Creating temporary bucket {bucket_name}')
            bucket = gcp.retry_429(gcp.storage_client.create_bucket)(bucket_name)
            logger.info(f'Temporary bucket {bucket_name} successfully created')

            if with_object:
                logger.info(f'Creating temporary object {object_name}')
                existing_blob = bucket.blob(object_name)
                existing_blob.upload_from_string(object_content, content_type='text/plain')
                existing_blob.metadata = source_metadata
                existing_blob.patch()
                logger.info(f'Temporary object {object_name} successfully created')

        pipeline_builder = sdc_builder.get_pipeline_builder()

        dev_raw_data_source_origin = pipeline_builder.add_stage('Dev Raw Data Source')
        dev_raw_data_source_origin.set_attributes(data_format='JSON',
                                                  raw_data=raw_data,
                                                  stop_after_first_batch=True)

        google_cloud_storage_executor = pipeline_builder.add_stage('Google Cloud Storage', type='executor')
        google_cloud_storage_executor.set_attributes(task='CHANGE_OBJECT',
                                                     project_id=gcp.project_id,
                                                     bucket=bucket_specification,
                                                     object=object_specification,
                                                     metadata=metadata_specification)

        wiretap = pipeline_builder.add_wiretap()

        dev_raw_data_source_origin >> google_cloud_storage_executor >= wiretap.destination

        pipeline = pipeline_builder.build(title=pipeline_title).configure_for_environment(gcp)
        sdc_executor.add_pipeline(pipeline)

        if with_bucket and with_object:
            sdc_executor.start_pipeline(pipeline).wait_for_finished()

            assert len(wiretap.error_records) == 0, \
                'This execution mode was not expected to produce error records'
            assert len(wiretap.output_records) == 1, \
                'Only one input record, so exactly one output event record was expected'
            assert wiretap.output_records[0].header.values['sdc.event.type'] == 'gcs-object-changed', \
                'An event record signaling the object change was expected'

            existing_blob = bucket.get_blob(object_name)
            assert existing_blob is not None, f'The changed object ({object_name}) does not exist in this bucket'
            existing_object_content = existing_blob.download_as_string().decode('ascii')
            assert existing_object_content == object_content, 'Correct object but with unexpected contents'
            existing_object_metadata = collections.OrderedDict(sorted(existing_blob.metadata.items()))
            injected_object_metadata = collections.OrderedDict(sorted({**source_metadata, **target_metadata}.items()))
            assert existing_object_metadata == injected_object_metadata, 'Correct object but with unexpected metadata'
        else:
            sdc_executor.start_pipeline(pipeline).wait_for_finished()

            assert len(wiretap.error_records) == 1, \
                'This execution mode was expected to produce error records'
            assert len(wiretap.output_records) == 0, \
                'This execution mode was not expected to produce records'
            assert wiretap.error_records[0].header['errorCode'] == 'GCSE_014', \
                'This execution mode was expected to produce error GCSE_014: <Error changing GCS object>'
    finally:
        if with_bucket:
            logger.info(f'Deleting temporary bucket {bucket_name}')
            gcp.retry_429(bucket.delete)(force=True)
            logger.info(f'Temporary bucket {bucket_name} successfully deleted')
def test_multiple_batches(sdc_builder, sdc_executor, gcp):
    """
    Test that we can produce multiple batches and the pipeline processes all the data exactly once.

    The pipeline looks like:
        google_cloud_storage_origin >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='origin')

    bucket_name = get_random_string(string.ascii_lowercase, 20)

    max_batch_size = 100
    number_of_batches = 10

    google_cloud_storage.set_attributes(
        bucket=bucket_name,
        common_prefix='gcs-test',
        prefix_pattern='**/*.txt',
        data_format='TEXT',
        batch_wait_time_in_ms=20_000,
        max_batch_size_in_records=max_batch_size)
    wiretap = pipeline_builder.add_wiretap()

    google_cloud_storage >> wiretap.destination

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Origin Bucket Test Multiple Batches'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)
        data = [
            get_random_string(string.ascii_letters, 100)
            for _ in range(max_batch_size * number_of_batches)
        ]
        blob = created_bucket.blob('gcs-test/sdc-test.txt')
        blob.upload_from_string('\n'.join(data))

        logger.info(
            'Starting GCS Origin pipeline and wait until the information is read ...'
        )
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(
            pipeline, 'input_record_count', max_batch_size * number_of_batches)
        sdc_executor.stop_pipeline(pipeline)

        rows_from_wiretap = [
            record.field['text'] for record in wiretap.output_records
        ]

        assert len(rows_from_wiretap) == max_batch_size * number_of_batches
        assert len(data) == len(rows_from_wiretap)
        assert rows_from_wiretap == data
    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
def test_data_format_whole_file(sdc_builder, sdc_executor, gcp):
    """
    Write a file to the local filesystem with an auxiliary pipeline, then read it with a Directory origin
    and write it to GCS using the Whole File data format.

    The pipeline looks like:
        directory >> google_cloud_storage_destination
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client

    data = [
        get_random_string(string.ascii_letters, length=100) for _ in range(10)
    ]
    bucket_name = "stf_std_" + get_random_string(string.ascii_lowercase, 20)
    gcp_file = get_random_string(string.ascii_lowercase, 20)

    directory = pipeline_builder.add_stage('Directory', type='origin')
    directory.set_attributes(data_format='WHOLE_FILE',
                             file_name_pattern=f'{gcp_file}*',
                             file_name_pattern_mode='GLOB',
                             file_post_processing='DELETE',
                             files_directory=TMPOUT,
                             process_subdirectories=False,
                             read_order='TIMESTAMP')

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='destination')
    google_cloud_storage.set_attributes(bucket=bucket_name,
                                        common_prefix='gcs-test',
                                        data_format='WHOLE_FILE',
                                        file_name_expression=gcp_file)

    pipeline_finished_executor = pipeline_builder.add_stage(
        'Pipeline Finisher Executor')
    pipeline_finished_executor.set_attributes(
        stage_record_preconditions=["${record:eventType() == 'no-more-data'}"])

    directory >> google_cloud_storage
    directory >= pipeline_finished_executor

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Destination Test Data Format Whole File'
    ).configure_for_environment(gcp)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)

        write_whole_file_to_LocalFS(data, gcp_file, sdc_builder, sdc_executor,
                                    gcp)

        sdc_executor.add_pipeline(pipeline)
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        blob_iter = created_bucket.list_blobs(max_results=1, prefix='gcs-test')
        blobs = [blob for blob in blob_iter]
        assert len(blobs) == 1
        blob = blobs[0]
        # Decode the byte array returned by storage client
        contents = blob.download_as_string().decode('ascii')
        # Strip out the lines which are empty (essentially the last line)
        lines = [line for line in contents.split('\n') if len(line) > 0]
        assert lines == data
    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
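
# The write_whole_file_to_LocalFS helper is not shown in this excerpt. The sketch below is an assumption of
# what it does: write the test data to a file under TMPOUT whose name starts with gcp_file, so that the
# Directory origin above can pick it up with the f'{gcp_file}*' glob. The 'Local FS' stage and its
# directory_template/files_prefix attributes are assumptions of this sketch, not taken from the excerpt.
def write_whole_file_to_LocalFS(data, gcp_file, sdc_builder, sdc_executor, gcp):
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT',
                                       stop_after_first_batch=True,
                                       raw_data='\n'.join(data))

    local_fs = builder.add_stage('Local FS')
    local_fs.set_attributes(data_format='TEXT',
                            directory_template=TMPOUT,
                            files_prefix=gcp_file)

    dev_raw_data_source >> local_fs

    pipeline = builder.build(
        title='Local FS Auxiliary Write Pipeline'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)
    sdc_executor.start_pipeline(pipeline).wait_for_finished()
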
def test_data_format_protobuf(sdc_builder, sdc_executor, gcp):
    """
    Read data from Google Cloud Storage using the Protobuf data format.
    The file is created by a first pipeline (dev_raw_data_source >> local_fs)
    and moved to GCS by a second pipeline (local_fs >> gcp).

    The final pipeline looks like:
        google_cloud_storage_origin >> wiretap
    """

    message = '{"first_name": "Martin","last_name": "Balzamo"}'
    expected = json.loads(message)

    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='origin')

    bucket_name = get_random_string(string.ascii_lowercase, 20)

    google_cloud_storage.set_attributes(
        bucket=bucket_name,
        common_prefix='gcs-test',
        prefix_pattern='**/*',
        data_format='PROTOBUF',
        message_type='Contact',
        protobuf_descriptor_file=PROTOBUF_FILE_PATH)
    wiretap = pipeline_builder.add_wiretap()

    google_cloud_storage >> wiretap.destination

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Origin Data Format Protobuf'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)

        gcp_file = get_random_string(string.ascii_lowercase, 20)
        produce_messages_protobuf(gcp_file, sdc_builder, sdc_executor, message,
                                  gcp)
        move_directory_messages_protobuf_gcp(gcp_file, bucket_name,
                                             sdc_builder, sdc_executor, gcp)

        logger.info(
            'Starting GCS Origin pipeline and wait until the information is read ...'
        )
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline, 'input_record_count',
                                              1)
        sdc_executor.stop_pipeline(pipeline)

        assert len(wiretap.output_records) == 1
        assert wiretap.output_records[0].field == expected
    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
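
# Neither produce_messages_protobuf nor move_directory_messages_protobuf_gcp is shown in this excerpt. The
# sketches below are assumptions of what they do: the first pipeline serializes the JSON message to a
# protobuf file in a local staging directory, the second moves that file to the GCS bucket as a whole file.
# The 'Local FS' stage, its directory_template/files_prefix attributes and the TMPOUT staging directory are
# assumptions of this sketch.
def produce_messages_protobuf(gcp_file, sdc_builder, sdc_executor, message, gcp):
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='JSON',
                                       stop_after_first_batch=True,
                                       raw_data=message)

    local_fs = builder.add_stage('Local FS')
    local_fs.set_attributes(data_format='PROTOBUF',
                            message_type='Contact',
                            protobuf_descriptor_file=PROTOBUF_FILE_PATH,
                            directory_template=TMPOUT,
                            files_prefix=gcp_file)

    dev_raw_data_source >> local_fs

    pipeline = builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)
    sdc_executor.start_pipeline(pipeline).wait_for_finished()


def move_directory_messages_protobuf_gcp(gcp_file, bucket_name, sdc_builder, sdc_executor, gcp):
    builder = sdc_builder.get_pipeline_builder()

    directory = builder.add_stage('Directory', type='origin')
    directory.set_attributes(data_format='WHOLE_FILE',
                             file_name_pattern=f'{gcp_file}*',
                             file_name_pattern_mode='GLOB',
                             file_post_processing='DELETE',
                             files_directory=TMPOUT,
                             read_order='TIMESTAMP')

    google_cloud_storage = builder.add_stage('Google Cloud Storage', type='destination')
    google_cloud_storage.set_attributes(bucket=bucket_name,
                                        common_prefix='gcs-test',
                                        data_format='WHOLE_FILE',
                                        file_name_expression=gcp_file)

    pipeline_finisher = builder.add_stage('Pipeline Finisher Executor')
    pipeline_finisher.set_attributes(
        stage_record_preconditions=["${record:eventType() == 'no-more-data'}"])

    directory >> google_cloud_storage
    directory >= pipeline_finisher

    pipeline = builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)
    sdc_executor.start_pipeline(pipeline).wait_for_finished()
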
def test_data_format_sdc_record(sdc_builder, sdc_executor, gcp):
    """
    Read data from Google Cloud Storage using the SDC Record data format.
    The file is created by a first pipeline (dev_raw_data_source >> local_fs)
    and moved to GCS by a second pipeline (local_fs >> gcp).

    The final pipeline looks like:
        google_cloud_storage_origin >> wiretap
    """
    json_data = [{
        "field1": "abc",
        "field2": "def",
        "field3": "ghi"
    }, {
        "field1": "jkl",
        "field2": "mno",
        "field3": "pqr"
    }]
    raw_data = ''.join(json.dumps(record) for record in json_data)

    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='origin')

    bucket_name = get_random_string(string.ascii_lowercase, 20)

    google_cloud_storage.set_attributes(bucket=bucket_name,
                                        common_prefix='gcs-test',
                                        prefix_pattern='**/*',
                                        data_format='SDC_JSON')
    wiretap = pipeline_builder.add_wiretap()

    google_cloud_storage >> wiretap.destination

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Origin Data Format SDC Record'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)

        gcp_file = get_random_string(string.ascii_lowercase, 20)
        produce_messages_SDC_Record(gcp_file, sdc_builder, sdc_executor,
                                    raw_data, gcp)
        move_directory_messages_SDC_record_gcp(gcp_file, bucket_name,
                                               sdc_builder, sdc_executor, gcp)

        logger.info(
            'Starting GCS Origin pipeline and wait until the information is read ...'
        )
        sdc_executor.start_pipeline(
            pipeline).wait_for_pipeline_output_records_count(2)
        sdc_executor.stop_pipeline(pipeline)

        assert len(wiretap.output_records) == 2
        assert wiretap.output_records[0].field == json_data[0]
        assert wiretap.output_records[1].field == json_data[1]

    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
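
# produce_messages_SDC_Record and move_directory_messages_SDC_record_gcp are not shown in this excerpt.
# The sketch below assumes the producer writes the raw JSON to a local file in SDC Record (SDC_JSON)
# format; the mover is assumed to be analogous to move_directory_messages_protobuf_gcp sketched above
# (Directory whole-file origin >> GCS whole-file destination). The 'Local FS' stage, its attributes and
# TMPOUT are assumptions of this sketch.
def produce_messages_SDC_Record(gcp_file, sdc_builder, sdc_executor, raw_data, gcp):
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='JSON',
                                       stop_after_first_batch=True,
                                       raw_data=raw_data)

    local_fs = builder.add_stage('Local FS')
    local_fs.set_attributes(data_format='SDC_JSON',
                            directory_template=TMPOUT,
                            files_prefix=gcp_file)

    dev_raw_data_source >> local_fs

    pipeline = builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)
    sdc_executor.start_pipeline(pipeline).wait_for_finished()
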
def test_resume_offset(sdc_builder, sdc_executor, gcp):
    """
    Create a pipeline that reads half of the data from a Google Cloud Storage origin. Stop the pipeline,
    start it again and verify that it resumes from the correct offset.

    The pipeline looks like:
        google_cloud_storage_origin >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='origin')

    bucket_name = get_random_string(string.ascii_lowercase, 20)

    max_batch_size = 100
    number_of_batches = 10
    half_batches = 5

    google_cloud_storage.set_attributes(
        bucket=bucket_name,
        common_prefix='gcs-test',
        prefix_pattern='**/*.txt',
        data_format='TEXT',
        batch_wait_time_in_ms=20_000,
        max_batch_size_in_records=max_batch_size)
    wiretap = pipeline_builder.add_wiretap()

    google_cloud_storage >> wiretap.destination

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Origin Bucket Test Resume Offset'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)
        data1 = [
            get_random_string(string.ascii_letters, 100)
            for _ in range(max_batch_size * half_batches)
        ]
        blob = created_bucket.blob('gcs-test/sdc-test.txt')
        blob.upload_from_string('\n'.join(data1))

        logger.info(
            'Starting GCS Origin pipeline and wait until the information is read ...'
        )
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline, 'input_record_count',
                                              max_batch_size * half_batches)
        sdc_executor.stop_pipeline(pipeline)

        first_data_half = [
            record.field['text'] for record in wiretap.output_records
        ]
        wiretap.reset()

        data2 = [
            get_random_string(string.ascii_letters, 100)
            for _ in range(max_batch_size * half_batches)
        ]
        blob.upload_from_string('\n'.join(data2))

        logger.info(
            'Starting GCS Origin pipeline and wait until the information is read ...'
        )
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline, 'input_record_count',
                                              max_batch_size * half_batches)
        sdc_executor.stop_pipeline(pipeline)

        second_data_half = [
            record.field['text'] for record in wiretap.output_records
        ]

        joined_result = first_data_half + second_data_half

        assert len(first_data_half) + len(
            second_data_half) == max_batch_size * number_of_batches
        assert len(first_data_half) + len(second_data_half) == len(
            data1) + len(data2)
        assert joined_result == data1 + data2
    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)
def test_dataflow_events(sdc_builder, sdc_executor, gcp):
    """
    Upload data to Google Cloud Storage, read it back with the GCS origin and check that the no-more-data
    event is generated. The GCS origin currently generates no other event types.

    The pipeline looks like:
        google_cloud_storage_origin >> trash
        google_cloud_storage_origin >= wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    storage_client = gcp.storage_client

    google_cloud_storage = pipeline_builder.add_stage('Google Cloud Storage',
                                                      type='origin')

    bucket_name = get_random_string(string.ascii_lowercase, 20)

    google_cloud_storage.set_attributes(bucket=bucket_name,
                                        common_prefix='gcs-test',
                                        prefix_pattern='**/*.txt',
                                        data_format='TEXT')
    wiretap = pipeline_builder.add_wiretap()

    trash = pipeline_builder.add_stage("Trash")
    google_cloud_storage >> trash

    google_cloud_storage >= wiretap.destination

    pipeline = pipeline_builder.build(
        title=f'Google Cloud Storage Origin Test Dataflow Events'
    ).configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    try:
        created_bucket = gcp.retry_429(
            storage_client.create_bucket)(bucket_name)
        data = [
            get_random_string(string.ascii_letters, 100) for _ in range(10)
        ]
        blob = created_bucket.blob('gcs-test/sdc-test.txt')
        blob.upload_from_string('\n'.join(data))

        logger.info(
            'Starting GCS Origin pipeline and wait until the information is read ...'
        )
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline, 'input_record_count',
                                              10)
        sdc_executor.stop_pipeline(pipeline)

        # One event should be created
        records = wiretap.output_records
        assert len(records) == 1

        # Event is no-more-data
        assert records[0].header.values['sdc.event.type'] == 'no-more-data'
        assert records[0].field['record-count'] == 10
    finally:
        logger.info('Deleting bucket %s ...', created_bucket.name)
        gcp.retry_429(created_bucket.delete)(force=True)