Example #1
def test_upsert(db_cleanup):
    stream = CatStream(100)
    main(CONFIG, input_stream=stream)

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('cats'))
            assert cur.fetchone()[0] == 100
        assert_records(conn, stream.records, 'cats', 'id')

    stream = CatStream(100)
    main(CONFIG, input_stream=stream)

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('cats'))
            assert cur.fetchone()[0] == 100
        assert_records(conn, stream.records, 'cats', 'id')

    stream = CatStream(200)
    main(CONFIG, input_stream=stream)

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('cats'))
            assert cur.fetchone()[0] == 200
        assert_records(conn, stream.records, 'cats', 'id')
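
Every example here leans on a shared get_count_sql helper from the test utilities. Its definition is not shown, but given how it is called (table name in, a single count column out), a minimal sketch might look like this; how the table name gets qualified is an assumption:

def get_count_sql(table_name):
    # Hypothetical sketch: the real helper lives in the shared test
    # utilities and presumably qualifies the table with the configured
    # schema. The tests only need a query whose first column is the count.
    return 'SELECT COUNT(*) FROM {}'.format(table_name)
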
def test_deduplication_existing_new_rows(db_prep):
    stream = CatStream(100, nested_count=2)
    main(CONFIG, input_stream=stream)

    original_sequence = stream.sequence

    stream = CatStream(100, nested_count=2, sequence=original_sequence - 20)
    main(CONFIG, input_stream=stream)

    with connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('CATS'))
            table_count = cur.fetchone()[0]
            cur.execute(get_count_sql('CATS__ADOPTION__IMMUNIZATIONS'))
            nested_table_count = cur.fetchone()[0]

            cur.execute('''
                SELECT DISTINCT "_SDC_SEQUENCE"
                FROM {}.{}.{}
            '''.format(sql.identifier(CONFIG['snowflake_database']),
                       sql.identifier(CONFIG['snowflake_schema']),
                       sql.identifier('CATS')))
            sequences = cur.fetchall()

    assert table_count == 100
    assert nested_table_count == 200

    assert len(sequences) == 1
    assert sequences[0][0] == original_sequence
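
The Snowflake variant above builds SQL with a lowercase sql.identifier helper rather than psycopg2's sql.Identifier. Assuming it simply double-quotes a name so Snowflake treats it as a case-sensitive identifier, a sketch could be:

def identifier(name):
    # Hypothetical sketch of target-snowflake's helper: double-quote the
    # name so Snowflake preserves its exact case instead of upper-casing it.
    return '"{}"'.format(name)
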
Example #3
def test_loading__invalid__configuration__schema():
    stream = CatStream(1)
    stream.schema = deepcopy(stream.schema)
    stream.schema['schema']['type'] = 'invalid type for a JSON Schema'

    with pytest.raises(Exception, match=r'.*invalid JSON Schema instance.*'):
        main(CONFIG, input_stream=stream)
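
The matched message suggests the target validates each incoming SCHEMA message before doing any loading. A minimal sketch of that guard, assuming Draft 4 validation via the jsonschema package:

from jsonschema import Draft4Validator, SchemaError

def validate_schema(schema):
    # Hypothetical sketch: reject SCHEMA messages that are not themselves
    # valid JSON Schema documents, surfacing the text the test matches on.
    try:
        Draft4Validator.check_schema(schema)
    except SchemaError as error:
        raise Exception(
            'invalid JSON Schema instance: {}'.format(schema)) from error
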
Example #4
def test_multiple_batches_by_memory_upsert(db_cleanup):
    config = CONFIG.copy()
    config['max_batch_size'] = 1024
    config['batch_detection_threshold'] = 5

    stream = CatStream(100, nested_count=2)
    main(config, input_stream=stream)

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('cats'))
            assert cur.fetchone()[0] == 100
            cur.execute(get_count_sql('cats__adoption__immunizations'))
            assert cur.fetchone()[0] == 200
        assert_records(conn, stream.records, 'cats', 'id')

    stream = CatStream(100, nested_count=3)
    main(config, input_stream=stream)

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('cats'))
            assert cur.fetchone()[0] == 100
            cur.execute(get_count_sql('cats__adoption__immunizations'))
            assert cur.fetchone()[0] == 300
        assert_records(conn, stream.records, 'cats', 'id')
def test_deduplication_existing_new_rows(db_prep):
    stream = CatStream(100, nested_count=2)
    main(CONFIG, input_stream=stream)

    original_sequence = stream.sequence

    stream = CatStream(100, nested_count=2, sequence=original_sequence - 20)
    main(CONFIG, input_stream=stream)

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('cats'))
            table_count = cur.fetchone()[0]
            cur.execute(get_count_sql('cats__adoption__immunizations'))
            nested_table_count = cur.fetchone()[0]

            cur.execute(
                sql.SQL('SELECT DISTINCT _sdc_sequence FROM {}.{}').format(
                    sql.Identifier(CONFIG['redshift_schema']),
                    sql.Identifier('cats')))
            sequences = cur.fetchall()

    assert table_count == 100
    assert nested_table_count == 200

    assert len(sequences) == 1
    assert sequences[0][0] == original_sequence
def test_add_record_message():
    stream = CatStream(10)
    singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'],
                                         CATS_SCHEMA['schema'],
                                         CATS_SCHEMA['key_properties'])
    assert singer_stream.add_record_message(
        stream.generate_record_message()) is None
    assert not singer_stream.peek_invalid_records()
    assert [] == missing_sdc_properties(singer_stream)
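
missing_sdc_properties is another shared helper; the assertion reads as "no _sdc_* metadata property is absent from the buffered schema". A sketch under that assumption (the exact property set is a guess):

def missing_sdc_properties(singer_stream):
    # Hypothetical sketch: report which _sdc_* metadata properties the
    # buffered stream failed to inject into its schema.
    expected = {'_sdc_received_at', '_sdc_sequence',
                '_sdc_table_version', '_sdc_batched_at'}
    present = set(singer_stream.schema['properties'])
    return sorted(expected - present)
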
def test_loading__empty__enabled_config__repeatability(db_prep):
    config = CONFIG.copy()
    config['persist_empty_tables'] = True

    main(config, input_stream=CatStream(0))

    main(config, input_stream=CatStream(0))

    main(config, input_stream=CatStream(0))
def test_multiple_batches__old_records__by_rows():
    stream_oldest = CatStream(100, version=0)
    stream_middle_aged = CatStream(100, version=5)
    stream_latest = CatStream(100, version=10)

    singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'],
                                         CATS_SCHEMA['schema'],
                                         CATS_SCHEMA['key_properties'],
                                         max_rows=20)

    assert len(singer_stream.peek_buffer()) == 0

    while not singer_stream.buffer_full:
        singer_stream.add_record_message(
            stream_oldest.generate_record_message())

    assert len(singer_stream.peek_buffer()) == 20

    singer_stream.flush_buffer()

    assert len(singer_stream.peek_buffer()) == 0

    singer_stream.add_record_message(stream_latest.generate_record_message())

    assert len(singer_stream.peek_buffer()) == 1

    reasonable_cutoff = 1000
    while not singer_stream.buffer_full and reasonable_cutoff != 0:
        singer_stream.add_record_message(
            stream_middle_aged.generate_record_message())
        reasonable_cutoff -= 1

    assert reasonable_cutoff == 0
    assert len(singer_stream.peek_buffer()) == 1
    assert [] == missing_sdc_properties(singer_stream)
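
The loop above only reaches its cutoff because add_record_message silently drops records stamped with a table version older than the newest one seen. A sketch of that rule, with hypothetical attribute names:

def add_record_message(self, record_message):
    # Hypothetical sketch: track the highest version seen and ignore
    # records carrying an older version instead of buffering them.
    version = record_message.get('version')
    if version is not None:
        if self.max_version is not None and version < self.max_version:
            return None  # outdated record: drop it silently
        self.max_version = version
    self._buffer.append(record_message)
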
Example #9
def test_upsert__invalid__primary_key_change(db_cleanup):
    stream = CatStream(100)
    main(CONFIG, input_stream=stream)

    stream = CatStream(100)
    schema = deepcopy(stream.schema)
    schema['key_properties'].append('name')
    stream.schema = schema

    with pytest.raises(postgres.PostgresError, match=r'.*key_properties.*'):
        main(CONFIG, input_stream=stream)
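
For this error to surface, the target presumably compares the incoming key_properties against the ones the existing table was created with. A sketch of such a guard, with hypothetical names:

def check_key_properties(existing_keys, incoming_keys):
    # Hypothetical sketch: refuse to upsert once the declared primary-key
    # columns no longer match the table's original key_properties.
    if existing_keys is not None and set(existing_keys) != set(incoming_keys):
        raise postgres.PostgresError(
            'key_properties changed from {} to {}'.format(
                existing_keys, incoming_keys))
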
Example #10
def test_loading__invalid__column_type_change__pks__nullable():
    main(CONFIG, input_stream=CatStream(20))

    stream = CatStream(20)
    stream.schema = deepcopy(stream.schema)
    stream.schema['schema']['properties']['id'] = json_schema.make_nullable(
        stream.schema['schema']['properties']['id'])

    with pytest.raises(postgres.PostgresError,
                       match=r'.*key_properties. type change detected'):
        main(CONFIG, input_stream=stream)
def test_full_table_replication(db_prep):
    stream = CatStream(110, version=0, nested_count=3)
    main(CONFIG, input_stream=stream)

    with connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('CATS'))
            version_0_count = cur.fetchone()[0]
            cur.execute(get_count_sql('CATS__ADOPTION__IMMUNIZATIONS'))
            version_0_sub_count = cur.fetchone()[0]
        assert_records(conn, stream.records, 'CATS', 'ID', match_pks=True)

    assert version_0_count == 110
    assert version_0_sub_count == 330

    stream = CatStream(100, version=1, nested_count=3)
    main(CONFIG, input_stream=stream)

    with connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('CATS'))
            version_1_count = cur.fetchone()[0]
            cur.execute(get_count_sql('CATS__ADOPTION__IMMUNIZATIONS'))
            version_1_sub_count = cur.fetchone()[0]
        assert_records(conn, stream.records, 'CATS', 'ID', match_pks=True)

    assert version_1_count == 100
    assert version_1_sub_count == 300

    stream = CatStream(120, version=2, nested_count=2)
    main(CONFIG, input_stream=stream)

    with connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('CATS'))
            version_2_count = cur.fetchone()[0]
            cur.execute(get_count_sql('CATS__ADOPTION__IMMUNIZATIONS'))
            version_2_sub_count = cur.fetchone()[0]
        assert_records(conn, stream.records, 'CATS', 'ID', match_pks=True)

    assert version_2_count == 120
    assert version_2_sub_count == 240

    ## Test that an outdated version cannot overwrite
    stream = CatStream(314, version=1, nested_count=2)
    main(CONFIG, input_stream=stream)

    with connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('CATS'))
            older_version_count = cur.fetchone()[0]

    assert older_version_count == version_2_count
def test_deduplication_older_rows(db_prep):
    stream = CatStream(100,
                       nested_count=2,
                       duplicates=2,
                       duplicate_sequence_delta=-100)
    main(CONFIG, input_stream=stream)

    with connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('CATS'))
            table_count = cur.fetchone()[0]
            cur.execute(get_count_sql('CATS__ADOPTION__IMMUNIZATIONS'))
            nested_table_count = cur.fetchone()[0]

            cur.execute('''
                SELECT "_SDC_SEQUENCE"
                FROM {}.{}.{}
                WHERE "ID" in ({})
            '''.format(
                sql.identifier(CONFIG['snowflake_database']),
                sql.identifier(CONFIG['snowflake_schema']),
                sql.identifier('CATS'),
                ','.join(["'{}'".format(x)
                          for x in stream.duplicate_pks_used])))
            dup_cat_records = cur.fetchall()

    assert stream.record_message_count == 102
    assert table_count == 100
    assert nested_table_count == 200

    for record in dup_cat_records:
        assert record[0] == stream.sequence
def test_state__doesnt_emit_when_it_isnt_different_than_the_previous_emission(
        capsys):
    config = CONFIG.copy()
    config['max_batch_rows'] = 5
    config['batch_detection_threshold'] = 1
    rows = list(CatStream(100))
    target = Target()

    def test_stream():
        yield rows[0]
        for row in rows[slice(1, 21)]:
            yield row
        yield json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}})
        output = filtered_output(capsys)
        assert len(output) == 1

        for row in rows[slice(22, 99)]:
            yield row
        yield json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}})

        output = filtered_output(capsys)
        assert len(output) == 0

    target_tools.stream_to_target(test_stream(), target, config=config)

    output = filtered_output(capsys)
    assert len(output) == 0
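
This behaviour implies the target tools remember the last state they wrote and only emit when the new safe-to-emit state differs. A sketch with hypothetical attribute names:

import json
import sys

def emit_state(self, state):
    # Hypothetical sketch: skip emission when the state about to be
    # flushed is identical to the one most recently written to stdout.
    if state is None or state == self.last_emitted_state:
        return
    sys.stdout.write(json.dumps(state) + '\n')
    sys.stdout.flush()
    self.last_emitted_state = state
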
Example #14
def test_loading__simple(db_cleanup):
    stream = CatStream(100)
    main(CONFIG, input_stream=stream)

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_columns_sql('cats'))
            columns = cur.fetchall()

            assert set(columns) == {
                ('_sdc_batched_at', 'timestamp with time zone', 'YES'),
                ('_sdc_received_at', 'timestamp with time zone', 'YES'),
                ('_sdc_sequence', 'bigint', 'YES'),
                ('_sdc_table_version', 'bigint', 'YES'),
                ('adoption__adopted_on', 'timestamp with time zone', 'YES'),
                ('adoption__was_foster', 'boolean', 'YES'),
                ('age', 'bigint', 'YES'), ('id', 'bigint', 'NO'),
                ('name', 'text', 'NO'), ('pattern', 'text', 'YES')
            }

            cur.execute(get_columns_sql('cats__adoption__immunizations'))
            columns = cur.fetchall()

            assert set(columns) == {('_sdc_level_0_id', 'bigint', 'NO'),
                                    ('_sdc_sequence', 'bigint', 'YES'),
                                    ('_sdc_source_key_id', 'bigint', 'NO'),
                                    ('date_administered',
                                     'timestamp with time zone', 'YES'),
                                    ('type', 'text', 'YES')}

            cur.execute(get_count_sql('cats'))
            assert cur.fetchone()[0] == 100

        assert_records(conn, stream.records, 'cats', 'id')
def test_deduplication_newer_rows(db_prep):
    stream = CatStream(100, nested_count=3, duplicates=2)
    main(CONFIG, input_stream=stream)

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('cats'))
            table_count = cur.fetchone()[0]
            cur.execute(get_count_sql('cats__adoption__immunizations'))
            nested_table_count = cur.fetchone()[0]

            cur.execute(
                sql.SQL('SELECT _sdc_sequence FROM {}.{} WHERE id IN ({})').format(
                    sql.Identifier(CONFIG['redshift_schema']),
                    sql.Identifier('cats'),
                    sql.SQL(', ').join(
                        sql.Literal(pk) for pk in stream.duplicate_pks_used)))
            dup_cat_records = cur.fetchall()

    assert stream.record_message_count == 102
    assert table_count == 100
    assert nested_table_count == 300

    for record in dup_cat_records:
        assert record[0] == stream.sequence + 200
Example #16
def test_deduplication_older_rows(db_cleanup):
    stream = CatStream(100,
                       nested_count=2,
                       duplicates=2,
                       duplicate_sequence_delta=-100)
    main(CONFIG, input_stream=stream)

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('cats'))
            table_count = cur.fetchone()[0]
            cur.execute(get_count_sql('cats__adoption__immunizations'))
            nested_table_count = cur.fetchone()[0]

            cur.execute(
                'SELECT _sdc_sequence FROM cats WHERE id in ({})'.format(
                    ','.join(map(str, stream.duplicate_pks_used))))
            dup_cat_records = cur.fetchall()

    assert stream.record_message_count == 102
    assert table_count == 100
    assert nested_table_count == 200

    for record in dup_cat_records:
        assert record[0] == stream.sequence
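
Both dedup variants pivot on _sdc_sequence: within a batch, only the row with the greatest sequence per primary key survives. A sketch of the kind of windowed query a target could run against its staging table (the staging-table name is hypothetical):

DEDUPE_SQL = '''
    SELECT *
    FROM (SELECT *,
                 ROW_NUMBER() OVER (PARTITION BY id
                                    ORDER BY _sdc_sequence DESC) AS rn
          FROM cats_staging) ranked
    WHERE rn = 1
'''
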
def test_loading__empty__enabled_config(db_prep):
    config = CONFIG.copy()
    config['persist_empty_tables'] = True

    stream = CatStream(0)
    main(config, input_stream=stream)

    with connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            assert_columns_equal(
                cur, 'CATS',
                {('_SDC_BATCHED_AT', 'TIMESTAMP_TZ', 'YES'),
                 ('_SDC_RECEIVED_AT', 'TIMESTAMP_TZ', 'YES'),
                 ('_SDC_SEQUENCE', 'NUMBER', 'YES'),
                 ('_SDC_TABLE_VERSION', 'NUMBER', 'YES'),
                 ('_SDC_TARGET_SNOWFLAKE_CREATE_TABLE_PLACEHOLDER', 'BOOLEAN',
                  'YES'), ('ADOPTION__ADOPTED_ON', 'TIMESTAMP_TZ', 'YES'),
                 ('ADOPTION__WAS_FOSTER', 'BOOLEAN', 'YES'),
                 ('AGE', 'NUMBER', 'YES'), ('ID', 'NUMBER', 'NO'),
                 ('NAME', 'TEXT', 'NO'), ('PAW_SIZE', 'NUMBER', 'NO'),
                 ('PAW_COLOUR', 'TEXT', 'NO'),
                 ('FLEA_CHECK_COMPLETE', 'BOOLEAN', 'NO'),
                 ('PATTERN', 'TEXT', 'YES')})

            assert_columns_equal(
                cur, 'CATS__ADOPTION__IMMUNIZATIONS',
                {('_SDC_LEVEL_0_ID', 'NUMBER', 'NO'),
                 ('_SDC_SEQUENCE', 'NUMBER', 'YES'),
                 ('_SDC_SOURCE_KEY_ID', 'NUMBER', 'NO'),
                 ('_SDC_TARGET_SNOWFLAKE_CREATE_TABLE_PLACEHOLDER', 'BOOLEAN',
                  'YES'), ('DATE_ADMINISTERED', 'TIMESTAMP_TZ', 'YES'),
                 ('TYPE', 'TEXT', 'YES')})

            assert_count_equal(cur, 'CATS', 0)
def test_multiple_batches__by_memory():
    stream = CatStream(100)

    singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'],
                                         CATS_SCHEMA['schema'],
                                         CATS_SCHEMA['key_properties'],
                                         max_buffer_size=1024)

    assert len(singer_stream.peek_buffer()) == 0

    while not singer_stream.buffer_full:
        singer_stream.add_record_message(stream.generate_record_message())

    assert len(singer_stream.peek_buffer()) == 1
    assert [] == missing_sdc_properties(singer_stream)

    singer_stream.flush_buffer()

    assert len(singer_stream.peek_buffer()) == 0
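
A max_buffer_size of 1024 bytes filling after a single record implies buffer_full also watches the serialized size of buffered records, not just their count. A self-contained sketch of those internals (names are hypothetical):

class BufferSketch:
    # Hypothetical sketch of BufferedSingerStream's sizing behaviour.
    def __init__(self, max_rows, max_buffer_size):
        self.max_rows = max_rows
        self.max_buffer_size = max_buffer_size
        self._buffer = []
        self._size = 0

    def add(self, serialized_record):
        self._buffer.append(serialized_record)
        self._size += len(serialized_record)

    @property
    def buffer_full(self):
        # Full once either the row-count cap or the byte-size cap is hit.
        return (len(self._buffer) >= self.max_rows
                or self._size >= self.max_buffer_size)
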
def test_loading__new_non_null_column(db_prep):
    cat_count = 50
    main(CONFIG, input_stream=CatStream(cat_count))

    class NonNullStream(CatStream):
        def generate_record(self):
            record = CatStream.generate_record(self)
            record['id'] = record['id'] + cat_count
            return record

    non_null_stream = NonNullStream(cat_count)
    non_null_stream.schema = deepcopy(non_null_stream.schema)
    non_null_stream.schema['schema']['properties']['paw_toe_count'] = {
        'type': 'integer',
        'default': 5
    }

    main(CONFIG, input_stream=non_null_stream)

    with connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            assert_columns_equal(cur,
                                 'CATS',
                                 {
                                     ('_SDC_BATCHED_AT', 'TIMESTAMP_TZ', 'YES'),
                                     ('_SDC_RECEIVED_AT', 'TIMESTAMP_TZ', 'YES'),
                                     ('_SDC_SEQUENCE', 'NUMBER', 'YES'),
                                     ('_SDC_TABLE_VERSION', 'NUMBER', 'YES'),
                                     ('_SDC_TARGET_SNOWFLAKE_CREATE_TABLE_PLACEHOLDER', 'BOOLEAN', 'YES'),
                                     ('ADOPTION__ADOPTED_ON', 'TIMESTAMP_TZ', 'YES'),
                                     ('ADOPTION__WAS_FOSTER', 'BOOLEAN', 'YES'),
                                     ('AGE', 'NUMBER', 'YES'),
                                     ('ID', 'NUMBER', 'NO'),
                                     ('NAME', 'TEXT', 'NO'),
                                     ('PAW_SIZE', 'NUMBER', 'NO'),
                                     ('PAW_COLOUR', 'TEXT', 'NO'),
                                     ('PAW_TOE_COUNT', 'NUMBER', 'YES'),
                                     ('FLEA_CHECK_COMPLETE', 'BOOLEAN', 'NO'),
                                     ('PATTERN', 'TEXT', 'YES')
                                 })

            cur.execute('''
                SELECT {}, {} FROM {}.{}.{}
            '''.format(
                sql.identifier('ID'),
                sql.identifier('PAW_TOE_COUNT'),
                sql.identifier(CONFIG['snowflake_database']),
                sql.identifier(CONFIG['snowflake_schema']),
                sql.identifier('CATS')
            ))

            persisted_records = cur.fetchall()

            ## Assert that the persisted records split evenly: half predate
            ## the new non-null column, half carry it
            assert 2 * cat_count == len(persisted_records)
            assert cat_count == len([x for x in persisted_records if x[1] is None])
            assert cat_count == len([x for x in persisted_records if x[1] is not None])
Example #20
def test_full_table_replication(db_cleanup):
    stream = CatStream(110, version=0, nested_count=3)
    main(CONFIG, input_stream=stream)

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('cats'))
            version_0_count = cur.fetchone()[0]
            cur.execute(get_count_sql('cats__adoption__immunizations'))
            version_0_sub_count = cur.fetchone()[0]
        assert_records(conn, stream.records, 'cats', 'id', match_pks=True)

    assert version_0_count == 110
    assert version_0_sub_count == 330

    stream = CatStream(100, version=1, nested_count=3)
    main(CONFIG, input_stream=stream)

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('cats'))
            version_1_count = cur.fetchone()[0]
            cur.execute(get_count_sql('cats__adoption__immunizations'))
            version_1_sub_count = cur.fetchone()[0]
        assert_records(conn, stream.records, 'cats', 'id', match_pks=True)

    assert version_1_count == 100
    assert version_1_sub_count == 300

    stream = CatStream(120, version=2, nested_count=2)
    main(CONFIG, input_stream=stream)

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('cats'))
            version_2_count = cur.fetchone()[0]
            cur.execute(get_count_sql('cats__adoption__immunizations'))
            version_2_sub_count = cur.fetchone()[0]
        assert_records(conn, stream.records, 'cats', 'id', match_pks=True)

    assert version_2_count == 120
    assert version_2_sub_count == 240
Example #21
def test_nested_delete_on_parent(db_cleanup):
    stream = CatStream(100, nested_count=3)
    main(CONFIG, input_stream=stream)

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('cats__adoption__immunizations'))
            high_nested = cur.fetchone()[0]
        assert_records(conn, stream.records, 'cats', 'id')

    stream = CatStream(100, nested_count=2)
    main(CONFIG, input_stream=stream)

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('cats__adoption__immunizations'))
            low_nested = cur.fetchone()[0]
        assert_records(conn, stream.records, 'cats', 'id')

    assert low_nested < high_nested
def test_nested_delete_on_parent(db_prep):
    stream = CatStream(100, nested_count=3)
    main(CONFIG, input_stream=stream)

    with connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('CATS__ADOPTION__IMMUNIZATIONS'))
            high_nested = cur.fetchone()[0]
        assert_records(conn, stream.records, 'CATS', 'ID')

    stream = CatStream(100, nested_count=2)
    main(CONFIG, input_stream=stream)

    with connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('CATS__ADOPTION__IMMUNIZATIONS'))
            low_nested = cur.fetchone()[0]
        assert_records(conn, stream.records, 'CATS', 'ID')

    assert low_nested < high_nested
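
Both variants assert that re-upserting a parent with fewer nested rows shrinks the child table, which implies the target clears a parent's old child rows before reinserting the incoming ones. A sketch of that delete as plain SQL (the staging-table name is hypothetical):

NESTED_DELETE_SQL = '''
    DELETE FROM cats__adoption__immunizations
    WHERE _sdc_source_key_id IN (SELECT id FROM cats_staging)
'''
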
Example #23
def test_multiple_batches_by_memory(db_cleanup):
    with patch.object(postgres.PostgresTarget,
                      'write_batch',
                      side_effect=mocked_mock_write_batch) as mock_write_batch:
        config = CONFIG.copy()
        config['max_batch_size'] = 1024
        config['batch_detection_threshold'] = 5

        stream = CatStream(100)
        main(config, input_stream=stream)

        assert mock_write_batch.call_count == 21
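
mocked_mock_write_batch is not shown here; for the call count to be meaningful it only needs to drain the buffer without touching the database. A sketch under that assumption:

def mocked_mock_write_batch(stream_buffer):
    # Hypothetical sketch: stand in for PostgresTarget.write_batch by
    # flushing the buffered records and skipping all database work.
    stream_buffer.flush_buffer()
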
Example #24
def test_loading__new_non_null_column(db_cleanup):
    cat_count = 50
    main(CONFIG, input_stream=CatStream(cat_count))

    class NonNullStream(CatStream):
        def generate_record(self):
            record = CatStream.generate_record(self)
            record['id'] = record['id'] + cat_count
            return record

    non_null_stream = NonNullStream(cat_count)
    non_null_stream.schema = deepcopy(non_null_stream.schema)
    non_null_stream.schema['schema']['properties']['paw_toe_count'] = {
        'type': 'integer',
        'default': 5
    }

    main(CONFIG, input_stream=non_null_stream)

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_columns_sql('cats'))
            columns = cur.fetchall()

            assert set(columns) == {
                ('_sdc_batched_at', 'timestamp with time zone', 'YES'),
                ('_sdc_received_at', 'timestamp with time zone', 'YES'),
                ('_sdc_sequence', 'bigint', 'YES'),
                ('_sdc_table_version', 'bigint', 'YES'),
                ('adoption__adopted_on', 'timestamp with time zone', 'YES'),
                ('adoption__was_foster', 'boolean', 'YES'),
                ('age', 'bigint', 'YES'), ('id', 'bigint', 'NO'),
                ('name', 'text', 'NO'), ('paw_size', 'bigint', 'NO'),
                ('paw_colour', 'text', 'NO'),
                ('paw_toe_count', 'bigint', 'YES'),
                ('flea_check_complete', 'boolean', 'NO'),
                ('pattern', 'text', 'YES')
            }

            cur.execute(
                sql.SQL('SELECT {}, {} FROM {}').format(
                    sql.Identifier('id'), sql.Identifier('paw_toe_count'),
                    sql.Identifier('cats')))

            persisted_records = cur.fetchall()

            ## Assert that the persisted records split evenly: half predate
            ## the new non-null column, half carry it
            assert 2 * cat_count == len(persisted_records)
            assert cat_count == len(
                [x for x in persisted_records if x[1] is None])
            assert cat_count == len(
                [x for x in persisted_records if x[1] is not None])
def test_init__empty_key_properties():
    singer_stream = BufferedSingerStream(CATS_SCHEMA['stream'],
                                         CATS_SCHEMA['schema'], [])

    stream = CatStream(100)
    for _ in range(20):
        singer_stream.add_record_message(stream.generate_record_message())

    assert singer_stream
    assert [] == missing_sdc_properties(singer_stream)
    assert [SINGER_PK] == singer_stream.key_properties

    rows_missing_pk = []
    rows_checked = 0
    for r in singer_stream.get_batch():
        if not r[SINGER_PK]:
            rows_missing_pk.append(r)

        rows_checked += 1

    assert rows_checked > 1
    assert [] == rows_missing_pk
def test_state__emits_most_recent_state_when_final_flush_occurs(capsys):
    config = CONFIG.copy()
    config['max_batch_rows'] = 20
    config['batch_detection_threshold'] = 1
    rows = list(CatStream(5))
    rows.append(json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}}))

    target_tools.stream_to_target(rows, Target(), config=config)

    # The final state message should have been emitted after the last records
    # were loaded, despite not reaching one full flushable batch
    output = filtered_output(capsys)
    assert len(output) == 1
    assert json.loads(output[0])['value']['test'] == 'state-1'
def test_state__doesnt_emit_when_only_one_of_several_streams_is_flushing(
        capsys):
    config = CONFIG.copy()
    config['max_batch_rows'] = 20
    config['batch_detection_threshold'] = 1
    cat_rows = list(CatStream(100))
    dog_rows = list(DogStream(50))
    target = Target()

    # Simulate one stream that yields many records alongside another that
    # yields few, and ensure both must be flushed before any state messages
    # are emitted
    def test_stream():
        yield cat_rows[0]
        yield dog_rows[0]
        for row in cat_rows[slice(1, 5)]:
            yield row
        for row in dog_rows[slice(1, 5)]:
            yield row
        yield json.dumps({'type': 'STATE', 'value': {'test': 'state-1'}})

        for row in cat_rows[slice(6, 45)]:
            yield row
        yield json.dumps({'type': 'STATE', 'value': {'test': 'state-2'}})

        for row in cat_rows[slice(46, 65)]:
            yield row
        yield json.dumps({'type': 'STATE', 'value': {'test': 'state-3'}})

        # After some state messages, but before both streams have hit the
        # batch size, no state messages should have been emitted
        assert len(target.calls['write_batch']) == 3
        output = filtered_output(capsys)
        assert output == []

        for row in dog_rows[slice(6, 25)]:
            yield row
        yield json.dumps({'type': 'STATE', 'value': {'test': 'state-4'}})

        # After the batch size has been hit and a write_batch call was made,
        # the most recent safe-to-emit state should have been emitted
        assert len(target.calls['write_batch']) == 4
        output = filtered_output(capsys)
        assert len(output) == 1
        assert json.loads(output[0])['value']['test'] == 'state-2'

    target_tools.stream_to_target(test_stream(), target, config=config)

    # The final state message should have been emitted after the last dog
    # records were loaded, despite not reaching one full flushable batch
    output = filtered_output(capsys)
    assert len(output) == 1
    assert json.loads(output[0])['value']['test'] == 'state-4'
def test_upsert(db_prep):
    stream = CatStream(100)
    main(CONFIG, input_stream=stream)

    with connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            assert_count_equal(cur, 'CATS', 100)
        assert_records(conn, stream.records, 'CATS', 'ID')

    stream = CatStream(100)
    main(CONFIG, input_stream=stream)

    with connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            assert_count_equal(cur, 'CATS', 100)
        assert_records(conn, stream.records, 'CATS', 'ID')

    stream = CatStream(200)
    main(CONFIG, input_stream=stream)

    with connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            assert_count_equal(cur, 'CATS', 200)
        assert_records(conn, stream.records, 'CATS', 'ID')
Example #29
def test_loading__invalid__column_type_change__pks():
    main(CONFIG, input_stream=CatStream(20))

    class StringIdCatStream(CatStream):
        def generate_record(self):
            record = CatStream.generate_record(self)
            record['id'] = str(record['id'])
            return record

    stream = StringIdCatStream(20)
    stream.schema = deepcopy(stream.schema)
    stream.schema['schema']['properties']['id'] = {'type': 'string'}

    with pytest.raises(postgres.PostgresError,
                       match=r'.*key_properties. type change detected'):
        main(CONFIG, input_stream=stream)
def test_loading__simple__s3_staging(db_prep):
    stream = CatStream(100)
    main(S3_CONFIG, input_stream=stream)

    with connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            assert_columns_equal(cur,
                                 'CATS',
                                 {
                                     ('_SDC_BATCHED_AT', 'TIMESTAMP_TZ', 'YES'),
                                     ('_SDC_RECEIVED_AT', 'TIMESTAMP_TZ', 'YES'),
                                     ('_SDC_SEQUENCE', 'NUMBER', 'YES'),
                                     ('_SDC_TABLE_VERSION', 'NUMBER', 'YES'),
                                     ('_SDC_TARGET_SNOWFLAKE_CREATE_TABLE_PLACEHOLDER', 'BOOLEAN', 'YES'),
                                     ('ADOPTION__ADOPTED_ON', 'TIMESTAMP_TZ', 'YES'),
                                     ('ADOPTION__WAS_FOSTER', 'BOOLEAN', 'YES'),
                                     ('AGE', 'NUMBER', 'YES'),
                                     ('ID', 'NUMBER', 'NO'),
                                     ('NAME', 'TEXT', 'NO'),
                                     ('PAW_SIZE', 'NUMBER', 'NO'),
                                     ('PAW_COLOUR', 'TEXT', 'NO'),
                                     ('FLEA_CHECK_COMPLETE', 'BOOLEAN', 'NO'),
                                     ('PATTERN', 'TEXT', 'YES')
                                 })

            assert_columns_equal(cur,
                                 'CATS__ADOPTION__IMMUNIZATIONS',
                                 {
                                     ('_SDC_LEVEL_0_ID', 'NUMBER', 'NO'),
                                     ('_SDC_SEQUENCE', 'NUMBER', 'YES'),
                                     ('_SDC_SOURCE_KEY_ID', 'NUMBER', 'NO'),
                                     ('_SDC_TARGET_SNOWFLAKE_CREATE_TABLE_PLACEHOLDER', 'BOOLEAN', 'YES'),
                                     ('DATE_ADMINISTERED', 'TIMESTAMP_TZ', 'YES'),
                                     ('TYPE', 'TEXT', 'YES')
                                 })

            assert_count_equal(cur, 'CATS', 100)

        for record in stream.records:
            record['paw_size'] = 314159
            record['paw_colour'] = ''
            record['flea_check_complete'] = False

        assert_records(conn, stream.records, 'CATS', 'ID')
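
The S3 variant only changes the load path: rather than streaming rows over the wire, the target presumably stages serialized batch files in S3 and has Snowflake COPY them in. A sketch of what that load step could look like, with every name and credential hypothetical:

COPY_SQL = '''
    COPY INTO "TARGET_DB"."TARGET_SCHEMA"."CATS"
    FROM 's3://example-bucket/staged/cats-batch.csv'
    CREDENTIALS = (AWS_KEY_ID = 'example-key' AWS_SECRET_KEY = 'example-secret')
    FILE_FORMAT = (TYPE = CSV)
'''
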