Example #1
0
def test_sftp_origin(sdc_builder, sdc_executor, sftp):
    """Smoke test for the SFTP origin stage.

    A file is first created on the SFTP server, then read back by the SFTP
    origin and verified via a pipeline snapshot. Pipeline layout:
        sftp_ftp_client >> trash
    """
    file_name = get_random_string(string.ascii_letters, 10)
    expected_text = 'Hello World!'
    remote_path = os.path.join(sftp.path, file_name)
    sftp.put_string(remote_path, expected_text)

    pipeline_builder = sdc_builder.get_pipeline_builder()
    origin = pipeline_builder.add_stage(name='com_streamsets_pipeline_stage_origin_remote_RemoteDownloadDSource')
    origin.file_name_pattern = file_name
    origin.data_format = 'TEXT'

    trash = pipeline_builder.add_stage('Trash')
    origin >> trash

    pipeline = pipeline_builder.build('SFTP Origin Pipeline').configure_for_environment(sftp)
    sdc_executor.add_pipeline(pipeline)

    snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True).snapshot
    sdc_executor.stop_pipeline(pipeline)

    # Exactly one record should come through, carrying the file's text.
    assert len(snapshot[origin].output) == 1
    assert snapshot[origin].output[0].field['text'] == expected_text

    # Remove the file we placed on the SFTP server, always closing the
    # client and transport even if the removal fails.
    transport, sftp_client = sftp.client
    try:
        sftp_client.remove(remote_path)
    finally:
        sftp_client.close()
        transport.close()
def test_sftp_origin(sdc_builder, sdc_executor, sftp):
    """Smoke test for the SFTP origin stage.

    A file is first created on the SFTP server, then read back by the SFTP
    origin; the ingested record is verified with wiretap. Pipeline layout:
        sftp_ftp_client >> wiretap
    """
    file_name = get_random_string(string.ascii_letters, 10)
    expected_text = 'Hello World!'
    remote_path = os.path.join(sftp.path, file_name)
    sftp.put_string(remote_path, expected_text)

    pipeline_builder = sdc_builder.get_pipeline_builder()
    origin = pipeline_builder.add_stage(name='com_streamsets_pipeline_stage_origin_remote_RemoteDownloadDSource')
    origin.file_name_pattern = file_name
    origin.data_format = 'TEXT'

    wiretap = pipeline_builder.add_wiretap()
    origin >> wiretap.destination

    pipeline = pipeline_builder.build('SFTP Origin Pipeline').configure_for_environment(sftp)
    sdc_executor.add_pipeline(pipeline)

    # Run until the single record has been produced, then stop.
    sdc_executor.start_pipeline(pipeline).wait_for_pipeline_output_records_count(1)
    sdc_executor.stop_pipeline(pipeline)

    assert len(wiretap.output_records) == 1
    assert wiretap.output_records[0].field['text'] == expected_text

    # Remove the file we placed on the SFTP server, always closing the
    # client and transport even if the removal fails.
    transport, sftp_client = sftp.client
    try:
        sftp_client.remove(remote_path)
    finally:
        sftp_client.close()
        transport.close()
Example #3
0
def test_sftp_origin_whole_file_to_s3(sdc_builder, sdc_executor, sftp, aws):
    """This is a test for SDC-11273.  First, it creates a large (~6MB) file and puts it on the SFTP server.
    Then, it creates a pipeline with SFTP origin and S3 destination, with whole file format, and runs
    until the single record (file) is complete.  Then, it asserts the S3 bucket contents are correct.
    It passes only if the new option ("Disable Read Ahead Stream") is enabled.
    """
    sftp_file_name = get_random_string(string.ascii_letters, 10) + '.txt'
    raw_text_data = get_random_string(string.printable, 6000000)
    sftp.put_string(os.path.join(sftp.path, sftp_file_name), raw_text_data)

    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_BUCKET_PREFIX}/{sftp_file_name}'

    # Build the pipeline
    builder = sdc_builder.get_pipeline_builder()

    sftp_ftp_client = builder.add_stage(name='com_streamsets_pipeline_stage_origin_remote_RemoteDownloadDSource')
    sftp_ftp_client.file_name_pattern = sftp_file_name
    sftp_ftp_client.data_format = 'WHOLE_FILE'
    if Version(sdc_builder.version) >= Version('3.8.2'):
        # Disable read-ahead stream as workaround for sshj library issues (see SDC-11273).
        sftp_ftp_client.disable_read_ahead_stream = True

    s3_destination = builder.add_stage('Amazon S3', type='destination')
    s3_destination.file_name_expression = "${record:value('/fileInfo/filename')}"
    s3_destination.set_attributes(bucket=s3_bucket, data_format='WHOLE_FILE', partition_prefix=s3_key)

    sftp_ftp_client >> s3_destination

    sftp_to_s3_pipeline = builder.build().configure_for_environment(aws, sftp)
    sdc_executor.add_pipeline(sftp_to_s3_pipeline)

    client = aws.s3
    try:
        # start pipeline and run for one record (the file)
        sdc_executor.start_pipeline(sftp_to_s3_pipeline).wait_for_pipeline_output_records_count(1)
        sdc_executor.stop_pipeline(sftp_to_s3_pipeline)

        # assert record count to S3 the size of the objects put
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert len(list_s3_objs['Contents']) == 1

        # read data from S3 to assert contents
        s3_contents = [client.get_object(Bucket=s3_bucket, Key=s3_content['Key'])['Body'].read().decode().strip()
                       for s3_content in list_s3_objs['Contents']]

        # compare the S3 bucket contents against the original whole file contents
        assert s3_contents[0] == raw_text_data
    finally:
        # If the pipeline failed before writing anything, list_objects_v2 returns no
        # 'Contents' key; guard so cleanup does not raise KeyError and mask the real error.
        found_keys = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key).get('Contents', [])
        if found_keys:
            delete_keys = {'Objects': [{'Key': k['Key']} for k in found_keys]}
            client.delete_objects(Bucket=s3_bucket, Delete=delete_keys)
        # Delete the (large) test file from the SFTP server as well, matching the
        # cleanup done by the sibling SFTP tests in this file.
        transport, sftp_client = sftp.client
        try:
            sftp_client.remove(os.path.join(sftp.path, sftp_file_name))
        finally:
            sftp_client.close()
            transport.close()
def test_sftp_origin_open_files(sdc_builder, sdc_executor, sftp, ssh):
    """Test SFTP origin to see if it leaves any open files on the SSH server after its pipeline has processed records.
    We first create a file on SFTP server and have the SFTP origin stage read it. We then check if open files are left
    after the pipeline processes all data. The pipeline look like:
        sftp_ftp_client >> trash
    """
    file_name = get_random_string(string.ascii_letters, 10)
    file_contents = 'Hello World!'
    remote_path = os.path.join(sftp.path, file_name)
    logger.debug('Creating file at %s/%s on SFTP server ...', sftp.path, file_name)
    sftp.put_string(remote_path, file_contents)

    pipeline_builder = sdc_builder.get_pipeline_builder()
    origin = pipeline_builder.add_stage(
        name='com_streamsets_pipeline_stage_origin_remote_RemoteDownloadDSource'
    )
    origin.file_name_pattern = file_name
    origin.data_format = 'TEXT'

    trash = pipeline_builder.add_stage('Trash')
    origin >> trash

    pipeline = pipeline_builder.build(
        'SFTP Origin open file check Pipeline').configure_for_environment(sftp)
    sdc_executor.add_pipeline(pipeline)
    start_command = sdc_executor.start_pipeline(pipeline)

    ssh_client = ssh.client
    try:
        # Wait until the pipeline has processed all of the data.
        start_command.wait_for_pipeline_output_records_count(1)
        # With all records processed, the stage must not hold an open stream to
        # the remote file; a grep hit in lsof would mean exit status 0.
        _, ssh_stdout, _ = ssh_client.exec_command(
            f'lsof | grep {file_name}')
        assert ssh_stdout.channel.recv_exit_status() == 1
    finally:
        sdc_executor.stop_pipeline(pipeline)
        ssh_client.close()
        # Remove the file we placed on the SFTP server, always closing the
        # client and transport even if the removal fails.
        transport, sftp_client = sftp.client
        try:
            logger.debug('Removing file at %s/%s on SFTP server ...',
                         sftp.path, file_name)
            sftp_client.remove(remote_path)
        finally:
            sftp_client.close()
            transport.close()
def test_sftp_origin_whole_file_to_s3_no_read_permission(sdc_builder, sdc_executor, sftp):
    """This is a test for SDC-14867.  It creates a file with no read permissions and creates one more file
    with read permissions, when the pipeline runs we will start ingesting from the second file and first
    file is skipped and an error is reported. We also drop another file when the pipeline is running and
    see whether that is also picked up rightly.
     """
    prefix = get_random_string(string.ascii_letters, 5)

    sftp_file_name1 = f'{prefix}{get_random_string(string.ascii_letters, 10)}.txt'
    sftp_file_name2 = f'{prefix}{get_random_string(string.ascii_letters, 10)}.txt'
    sftp_file_name3 = f'{prefix}{get_random_string(string.ascii_letters, 10)}.txt'

    raw_text_data = get_random_string(string.printable, 1000)

    # First file is made unreadable (mode 0o000) so the origin must skip it.
    sftp.put_string(os.path.join(sftp.path, sftp_file_name1), raw_text_data)
    sftp.chmod(os.path.join(sftp.path, sftp_file_name1), 0o000)

    sftp.put_string(os.path.join(sftp.path, sftp_file_name2), raw_text_data)

    # Build the pipeline
    builder = sdc_builder.get_pipeline_builder()

    sftp_ftp_client = builder.add_stage('SFTP/FTP/FTPS Client', type='origin')
    sftp_ftp_client.set_attributes(file_name_pattern=f'{prefix}*',
                                   data_format='WHOLE_FILE',
                                   batch_wait_time_in_ms=10000,
                                   max_batch_size_in_records=1)

    trash = builder.add_stage('Trash')

    wiretap = builder.add_wiretap()

    sftp_ftp_client >> [wiretap.destination, trash]
    sftp_to_trash_pipeline = builder.build().configure_for_environment(sftp)
    sdc_executor.add_pipeline(sftp_to_trash_pipeline)
    try:
        # Run for a couple of batches, then drop a third file while the pipeline
        # is live to verify it is also picked up.
        start_command = sdc_executor.start_pipeline(sftp_to_trash_pipeline)
        start_command.wait_for_pipeline_batch_count(2)

        sftp.put_string(os.path.join(sftp.path, sftp_file_name3), raw_text_data)
        start_command.wait_for_pipeline_batch_count(5)

        error_msgs = sdc_executor.get_stage_errors(sftp_to_trash_pipeline, sftp_ftp_client)

        # Verify the stage error message reported for the unreadable file.
        assert 'REMOTE_DOWNLOAD_10' in [e.error_code for e in error_msgs]

        actual_records = [record.field['fileInfo']['filename'] for record in wiretap.output_records]
        sdc_executor.stop_pipeline(sftp_to_trash_pipeline)
        wiretap.reset()

        # Only the two readable files should have produced records, in order.
        assert [sftp_file_name2, sftp_file_name3] == actual_records
    finally:
        # Delete the test SFTP origin files we created
        transport, sftp_client = sftp.client
        try:
            # Restore read permission so the file can be removed.  Note this must be
            # the octal literal 0o700 — the decimal 700 previously used here sets
            # bogus permission bits (0o1274).  Done inside the try so the client and
            # transport are still closed if chmod itself fails.
            sftp_client.chmod(os.path.join(sftp.path, sftp_file_name1), 0o700)
            for sftp_file_name in [sftp_file_name1, sftp_file_name2, sftp_file_name3]:
                logger.debug('Removing file at %s/%s on SFTP server ...', sftp.path, sftp_file_name)
                sftp_client.remove(os.path.join(sftp.path, sftp_file_name))
        finally:
            sftp_client.close()
            transport.close()