def test_state__doesnt_emit_when_it_isnt_different_than_the_previous_emission(capsys):
    config = CONFIG.copy()
    config['max_batch_rows'] = 5
    config['batch_detection_threshold'] = 1
    rows = list(CatStream(100))
    target = Target()

    def test_stream():
        yield rows[0]
        for row in rows[slice(1, 21)]:
            yield row
        yield json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}})
        output = filtered_output(capsys)
        assert len(output) == 1

        for row in rows[slice(22, 99)]:
            yield row
        yield json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}})

        output = filtered_output(capsys)
        assert len(output) == 0

    target_tools.stream_to_target(test_stream(), target, config=config)

    output = filtered_output(capsys)
    assert len(output) == 0
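A minimal sketch of the filtered_output helper these tests rely on, assuming it simply captures stdout through pytest's capsys fixture and keeps the non-empty lines (i.e. the state values the target emitted); the real helper lives in the suite's utilities and may filter differently.

def filtered_output(capsys):
    # Hypothetical stand-in for the suite's filtered_output helper.
    captured = capsys.readouterr()
    return [line for line in captured.out.splitlines() if line.strip()]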
def test_state__emits_when_multiple_streams_are_registered_but_records_arrive_from_only_one(capsys):
    config = CONFIG.copy()
    config['max_batch_rows'] = 20
    config['batch_detection_threshold'] = 1
    cat_rows = list(CatStream(100))
    dog_rows = list(DogStream(50))
    target = Target()

    # Simulate one stream that yields many records alongside another that yields no records, and
    # ensure that only the first needs to be flushed before any state messages are emitted
    def test_stream():
        yield cat_rows[0]
        yield dog_rows[0]
        for row in cat_rows[slice(1, 5)]:
            yield row
        yield json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}})

        for row in cat_rows[slice(6, 25)]:
            yield row
        yield json.dumps({'type': 'STATE', 'value': {'test': 'state-2'}})

        # After some state messages, and with only one of the registered streams having hit the
        # batch size, the state message should be emitted, since the other stream has no
        # unflushed records yet
        assert len(target.calls['write_batch']) == 1
        output = filtered_output(capsys)
        assert len(output) == 1
        assert json.loads(output[0])['test'] == 'state-1'


    target_tools.stream_to_target(test_stream(), target, config=config)

    # The final state message should have been emitted once the remaining records were
    # flushed at the end of the stream, despite not filling one full batch
    output = filtered_output(capsys)
    assert len(output) == 1
    assert json.loads(output[0])['test'] == 'state-2'
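CatStream and DogStream are fixtures defined elsewhere in the suite. A rough sketch under that assumption: each yields one SCHEMA message followed by the requested number of RECORD messages, serialized as JSON lines, so rows[0] above is the schema and the remaining entries are records.

import json

class SketchStream:
    # Hypothetical stand-in for the CatStream/DogStream fixtures; the real
    # fixtures emit richer records and differ per stream.
    stream_name = 'cats'

    def __init__(self, n):
        self.n = n

    def __iter__(self):
        yield json.dumps({'type': 'SCHEMA',
                          'stream': self.stream_name,
                          'schema': {'type': 'object',
                                     'properties': {'id': {'type': 'integer'}}},
                          'key_properties': ['id']})
        for i in range(self.n):
            yield json.dumps({'type': 'RECORD',
                              'stream': self.stream_name,
                              'record': {'id': i}})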
def test_activate_version():
    config = CONFIG.copy()
    config['max_batch_rows'] = 20
    config['batch_detection_threshold'] = 11

    records = [{"type": "RECORD",
                "stream": "abc",
                "record": {},
                "version": 123}] * (config['batch_detection_threshold'] - 1)

    class TestStream(ListStream):
        stream = [
                     {"type": "SCHEMA",
                      "stream": "abc",
                      "schema": {
                          "type": "object",
                          "properties": {
                              'a': {'type': 'number'}}},
                      "key_properties": []}
                 ] + records + [
                     {'type': 'ACTIVATE_VERSION',
                      'stream': "abc",
                      'version': 123}
                 ] + records

    target = Target()

    target_tools.stream_to_target(TestStream(), target, config=config)

    rows_persisted = 0
    for call in target.calls['write_batch']:
        rows_persisted += call['records_count']

    expected_rows = (2 * len(records))
    assert rows_persisted == expected_rows
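ListStream, used as the base class of TestStream above, is another suite helper. A minimal sketch, assuming it wraps a class-level list of message dicts and yields each one as a JSON line:

import json

class ListStream:
    # Hypothetical sketch: subclasses override `stream` with a list of message dicts.
    stream = []

    def __iter__(self):
        for message in self.stream:
            yield json.dumps(message)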
Example 4
def main(config, input_stream=None):
    with psycopg2.connect(
            connection_factory=MillisLoggingConnection,
            host=config.get('postgres_host', 'localhost'),
            port=config.get('postgres_port', 5432),
            dbname=config.get('postgres_database'),
            user=config.get('postgres_username'),
            password=config.get('postgres_password'),
            sslmode=config.get('postgres_sslmode'),
            sslcert=config.get('postgres_sslcert'),
            sslkey=config.get('postgres_sslkey'),
            sslrootcert=config.get('postgres_sslrootcert'),
            sslcrl=config.get('postgres_sslcrl'),
            application_name=config.get('application_name', 'target-postgres'),
    ) as connection:
        postgres_target = PostgresTarget(
            connection,
            postgres_schema=config.get('postgres_schema', 'public'),
            logging_level=config.get('logging_level'),
            persist_empty_tables=config.get('persist_empty_tables'),
            add_upsert_indexes=config.get('add_upsert_indexes', True),
            before_run_sql=config.get('before_run_sql'),
            after_run_sql=config.get('after_run_sql'),
        )

        if input_stream:
            target_tools.stream_to_target(input_stream,
                                          postgres_target,
                                          config=config)
        else:
            target_tools.main(postgres_target)
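A hedged usage sketch: the keys read via config.get(...) above suggest a settings dict along these lines. The values are illustrative only; main(example_config) would then read Singer messages from stdin since no input_stream is supplied.

example_config = {
    'postgres_host': 'localhost',
    'postgres_port': 5432,
    'postgres_database': 'warehouse',     # hypothetical database name
    'postgres_username': 'singer',
    'postgres_password': 'secret',
    'postgres_schema': 'public',
    'persist_empty_tables': False,
    'add_upsert_indexes': True,
}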
Example 5
def main(config, input_stream=None):
    with psycopg2.connect(
            connection_factory=MillisLoggingConnection,
            host=config.get('redshift_host'),
            port=config.get('redshift_port', 5439),
            dbname=config.get('redshift_database'),
            user=config.get('redshift_username'),
            password=config.get('redshift_password')
    ) as connection:
        s3_config = config.get('target_s3')
        s3 = S3(s3_config.get('aws_access_key_id'),
                s3_config.get('aws_secret_access_key'),
                s3_config.get('aws_session_token'),
                s3_config.get('region_name'),
                s3_config.get('bucket'),
                s3_config.get('key_prefix'))

        redshift_target = RedshiftTarget(
            connection,
            s3,
            redshift_schema=config.get('redshift_schema', 'public'),
            logging_level=config.get('logging_level'),
            default_column_length=config.get('default_column_length', 1000),
            persist_empty_tables=config.get('persist_empty_tables')
        )

        if input_stream:
            target_tools.stream_to_target(
                input_stream, redshift_target, config=config)
        else:
            target_tools.main(redshift_target)
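The Redshift variant additionally expects a nested target_s3 block; a hypothetical config sketch based on the keys read above (all values are placeholders):

example_config = {
    'redshift_host': 'cluster.example.redshift.amazonaws.com',
    'redshift_port': 5439,
    'redshift_database': 'warehouse',
    'redshift_username': 'singer',
    'redshift_password': 'secret',
    'redshift_schema': 'public',
    'default_column_length': 1000,
    'target_s3': {
        'aws_access_key_id': 'placeholder-key-id',
        'aws_secret_access_key': 'placeholder-secret',
        'aws_session_token': None,
        'region_name': 'us-east-1',
        'bucket': 'staging-bucket',
        'key_prefix': 'target-redshift/',
    },
}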
Example 6
def main(config, input_stream=None):
    tunnel = None
    try:
        LOGGER.info(config)
        if config.get('use_ssh_tunnel'):
            LOGGER.info(
                f"use_ssh_tunnel is set to true; connecting to {config['redshift_host']}:{config['redshift_port']} via {config['ssh_jump_server']}:{config['ssh_jump_server_port']}"
            )
            tunnel = sshtunnel.open_tunnel(
                (config['ssh_jump_server'], int(
                    config['ssh_jump_server_port'])),
                ssh_username=config['ssh_username'],
                ssh_pkey=config['ssh_private_key_path'],
                ssh_private_key_password=config['ssh_private_key_password']
                if 'ssh_private_key_password' in config else None,
                remote_bind_address=(config['redshift_host'],
                                     int(config['redshift_port'])))
            tunnel.start()
            time.sleep(1)
            # rewrite the config to go through the tunnel
            config['redshift_host'] = '127.0.0.1'
            config['redshift_port'] = tunnel.local_bind_port
        else:
            LOGGER.debug(
                f"use_ssh_tunnel is not set or is false; connecting directly to {config['redshift_host']}:{config['redshift_port']}"
            )

        with psycopg2.connect(
                connection_factory=MillisLoggingConnection,
                host=config.get('redshift_host'),
                port=config.get('redshift_port', 5439),
                dbname=config.get('redshift_database'),
                user=config.get('redshift_username'),
                password=config.get('redshift_password')) as connection:
            s3_config = config.get('target_s3')
            s3 = S3(s3_config.get('aws_access_key_id'),
                    s3_config.get('aws_secret_access_key'),
                    s3_config.get('bucket'),
                    s3_config.get('key_prefix'),
                    aws_session_token=s3_config.get('aws_session_token'))

            redshift_target = RedshiftTarget(
                connection,
                s3,
                redshift_schema=config.get('redshift_schema', 'public'),
                logging_level=config.get('logging_level'),
                default_column_length=config.get('default_column_length',
                                                 1000),
                persist_empty_tables=config.get('persist_empty_tables'))

            if input_stream:
                target_tools.stream_to_target(input_stream,
                                              redshift_target,
                                              config=config)
            else:
                target_tools.main(redshift_target)

    finally:
        if tunnel is not None:
            tunnel.stop()
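The tunnelled variant reads several additional ssh_* keys before connecting; a hypothetical sketch of that extra configuration (hosts and paths are placeholders):

ssh_tunnel_config = {
    'use_ssh_tunnel': True,
    'ssh_jump_server': 'bastion.example.com',
    'ssh_jump_server_port': 22,
    'ssh_username': 'deploy',
    'ssh_private_key_path': '/home/deploy/.ssh/id_rsa',
    'ssh_private_key_password': None,   # optional, per the lookup in main() above
}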
def main(config, input_stream=None):
    with connect(user=config.get('snowflake_username'),
                 password=config.get('snowflake_password'),
                 role=config.get('snowflake_role'),
                 authenticator=config.get('snowflake_authenticator',
                                          'snowflake'),
                 account=config.get('snowflake_account'),
                 warehouse=config.get('snowflake_warehouse'),
                 database=config.get('snowflake_database'),
                 schema=config.get('snowflake_schema', 'PUBLIC'),
                 autocommit=False) as connection:
        s3_config = config.get('target_s3')

        s3 = None
        if s3_config:
            s3 = S3(s3_config.get('aws_access_key_id'),
                    s3_config.get('aws_secret_access_key'),
                    s3_config.get('bucket'), s3_config.get('key_prefix'))

        target = SnowflakeTarget(
            connection,
            s3=s3,
            logging_level=config.get('logging_level'),
            persist_empty_tables=config.get('persist_empty_tables'))

        if input_stream:
            target_tools.stream_to_target(input_stream, target, config=config)
        else:
            target_tools.main(target)
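All of these main() variants accept an optional input_stream; a hedged sketch of driving one with an in-memory list of Singer messages instead of stdin (the stream name and records are illustrative):

import json

messages = [
    json.dumps({'type': 'SCHEMA', 'stream': 'users',
                'schema': {'type': 'object',
                           'properties': {'id': {'type': 'integer'}}},
                'key_properties': ['id']}),
    json.dumps({'type': 'RECORD', 'stream': 'users', 'record': {'id': 1}}),
    json.dumps({'type': 'STATE', 'value': {'users': 1}}),
]
# main(config, input_stream=iter(messages))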
def test_state__capture_can_be_disabled(capsys):
    stream = [
        json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}}),
        json.dumps({'type': 'STATE', 'value': {'test': 'state-2'}})]

    target_tools.stream_to_target(stream, Target(), {'state_support': False})
    output = filtered_output(capsys)

    assert len(output) == 0
def test_loading__invalid__records__threshold():
    config = deepcopy(CONFIG)
    config['invalid_records_threshold'] = 10

    target = Target()

    with pytest.raises(singer_stream.SingerStreamError, match=r'.*.10*'):
        target_tools.stream_to_target(InvalidCatStream(20), target, config=config)

    assert len(target.calls['write_batch']) == 0
def test_state__capture(capsys):
    stream = [
        json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}}),
        json.dumps({'type': 'STATE', 'value': {'test': 'state-2'}})]

    target_tools.stream_to_target(stream, Target())
    output = filtered_output(capsys)

    assert len(output) == 2
    assert json.loads(output[0])['test'] == 'state-1'
    assert json.loads(output[1])['test'] == 'state-2'
def test_loading__invalid__records__disable():
    config = deepcopy(CONFIG)
    config['invalid_records_detect'] = False

    target = Target()

    target_tools.stream_to_target(InvalidCatStream(100), target, config=config)

    # Since all of the cat records were invalid, none could be persisted: a single
    # write_batch call is made, but it carries zero records
    assert len(target.calls['write_batch']) == 1
    assert target.calls['write_batch'][0]['records_count'] == 0
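InvalidCatStream is another suite fixture; a rough sketch under the assumption that it looks like CatStream but emits records that violate the declared schema, so every record fails validation:

import json

class SketchInvalidStream:
    # Hypothetical stand-in for InvalidCatStream: the declared schema expects an
    # integer 'id', but every record carries a string, so all records are invalid.
    def __init__(self, n):
        self.n = n

    def __iter__(self):
        yield json.dumps({'type': 'SCHEMA', 'stream': 'cats',
                          'schema': {'type': 'object',
                                     'properties': {'id': {'type': 'integer'}}},
                          'key_properties': ['id']})
        for _ in range(self.n):
            yield json.dumps({'type': 'RECORD', 'stream': 'cats',
                              'record': {'id': 'not-an-integer'}})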
def test_state__emits_most_recent_state_when_final_flush_occurs(capsys):
    config = CONFIG.copy()
    config['max_batch_rows'] = 20
    config['batch_detection_threshold'] = 1
    rows = list(CatStream(5))
    rows.append(json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}}))

    target_tools.stream_to_target(rows, Target(), config=config)

    # The final state message should have been emitted after the last records were loaded,
    # despite not reaching one full flushable batch
    output = filtered_output(capsys)
    assert len(output) == 1
    assert json.loads(output[0])['test'] == 'state-1'
def test_usage_stats():
    config = deepcopy(CONFIG)
    assert config['disable_collection']

    with patch.object(target_tools, '_async_send_usage_stats') as mock:
        target_tools.stream_to_target([], None, config=config)

        assert mock.call_count == 0

        config['disable_collection'] = False

        target_tools.stream_to_target([], None, config=config)

        assert mock.call_count == 1
def test_state__doesnt_emit_when_only_one_of_several_streams_is_flushing(
        capsys):
    config = CONFIG.copy()
    config['max_batch_rows'] = 20
    config['batch_detection_threshold'] = 1
    cat_rows = list(CatStream(100))
    dog_rows = list(DogStream(50))
    target = Target()

    # Simulate one stream that yields many records alongside another that yields few, and
    # ensure both need to be flushed before any state messages are emitted
    def test_stream():
        yield cat_rows[0]
        yield dog_rows[0]
        for row in cat_rows[slice(1, 5)]:
            yield row
        for row in dog_rows[slice(1, 5)]:
            yield row
        yield json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}})

        for row in cat_rows[slice(6, 45)]:
            yield row
        yield json.dumps({'type': 'STATE', 'value': {'test': 'state-2'}})

        for row in cat_rows[slice(46, 65)]:
            yield row
        yield json.dumps({'type': 'STATE', 'value': {'test': 'state-3'}})

        # After some state messages, but before the batch size has been hit for both streams,
        # no state messages should have been emitted
        assert len(target.calls['write_batch']) == 3
        output = filtered_output(capsys)
        assert output == []

        for row in dog_rows[slice(6, 25)]:
            yield row
        yield json.dumps({'type': 'STATE', 'value': {'test': 'state-4'}})

        # After the batch size has been hit and a write_batch call was made, the most recent
        # safe-to-emit state should have been emitted
        assert len(target.calls['write_batch']) == 4
        output = filtered_output(capsys)
        assert len(output) == 1
        assert json.loads(output[0])['value']['test'] == 'state-2'

    target_tools.stream_to_target(test_stream(), target, config=config)

    # The final state message should have been emitted after the last dog records were
    # loaded, despite not reaching one full flushable batch
    output = filtered_output(capsys)
    assert len(output) == 1
    assert json.loads(output[0])['value']['test'] == 'state-4'
Example 15
def main(config, input_stream=None):
    with psycopg2.connect(
            host=config.get('postgres_host', 'localhost'),
            port=config.get('postgres_port', 5432),
            dbname=config.get('postgres_database'),
            user=config.get('postgres_username'),
            password=config.get('postgres_password')) as connection:
        postgres_target = PostgresTarget(connection,
                                         postgres_schema=config.get(
                                             'postgres_schema', 'public'))

        if input_stream:
            target_tools.stream_to_target(input_stream,
                                          postgres_target,
                                          config=config)
        else:
            target_tools.main(postgres_target)
def test_state__emits_only_messages_when_all_records_before_have_been_flushed(
        capsys):
    config = CONFIG.copy()
    config['max_batch_rows'] = 20
    config['batch_detection_threshold'] = 1
    rows = list(CatStream(100))
    target = Target()

    def test_stream():
        yield rows[0]
        for row in rows[slice(1, 5)]:
            yield row
        yield json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}})
        for row in rows[slice(6, 10)]:
            yield row
        yield json.dumps({'type': 'STATE', 'value': {'test': 'state-2'}})
        for row in rows[slice(11, 15)]:
            yield row
        yield json.dumps({'type': 'STATE', 'value': {'test': 'state-3'}})

        # After some state messages, but before the batch size has been hit, no state
        # messages should have been emitted
        assert len(target.calls['write_batch']) == 0
        output = filtered_output(capsys)
        assert output == []

        for row in rows[slice(16, 25)]:
            yield row
        yield json.dumps({'type': 'STATE', 'value': {'test': 'state-4'}})

        # After the batch size has been hit and a write_batch call was made, the most recent
        # safe-to-emit state should have been emitted
        assert len(target.calls['write_batch']) == 1
        output = filtered_output(capsys)
        assert len(output) == 1
        assert json.loads(output[0])['value']['test'] == 'state-3'

        for row in rows[slice(26, 31)]:
            yield row

    target_tools.stream_to_target(test_stream(), target, config=config)

    # The final state message should have been outputted after the last records were loaded
    output = filtered_output(capsys)
    assert len(output) == 1
    assert json.loads(output[0])['value']['test'] == 'state-4'
Example 17
def main(config, input_stream=None):
    with psycopg2.connect(
            connection_factory=MillisLoggingConnection,
            host=config.get('postgres_host', 'localhost'),
            port=config.get('postgres_port', 5432),
            dbname=config.get('postgres_database'),
            user=config.get('postgres_username'),
            password=config.get('postgres_password')) as connection:
        postgres_target = PostgresTarget(
            connection,
            postgres_schema=config.get('postgres_schema', 'public'),
            logging_level=config.get('logging_level'),
            persist_empty_tables=config.get('persist_empty_tables'))

        if input_stream:
            target_tools.stream_to_target(input_stream,
                                          postgres_target,
                                          config=config)
        else:
            target_tools.main(postgres_target)
def test_record_with_multiple_of():
    values = [1, 1.0, 2, 2.0, 3, 7, 10.1]
    records = []
    for value in values:
        records.append({
            "type": "RECORD",
            "stream": "test",
            "record": {
                "multipleOfKey": value
            },
        })

    class TestStream(ListStream):
        stream = [{
            "type": "SCHEMA",
            "stream": "test",
            "schema": {
                "properties": {
                    "multipleOfKey": {
                        "type": "number",
                        "multipleOf": 1e-15
                    }
                }
            },
            "key_properties": []
        }] + records

    target = Target()

    target_tools.stream_to_target(TestStream(), target, config=CONFIG.copy())

    expected_rows = len(records)
    rows_persisted = 0
    for call in target.calls['write_batch']:
        rows_persisted += call['records_count']

    assert rows_persisted == expected_rows
def test_loading__invalid__records():
    with pytest.raises(singer_stream.SingerStreamError, match=r'.*'):
        target_tools.stream_to_target(InvalidCatStream(1), None, config=CONFIG)