Ejemplo n.º 1
0
def test_init():
    """Build a stream from the cats schema and sanity-check the result.

    The constructed stream must be truthy and `missing_sdc_properties`
    must return an empty list for it.
    """
    constructor_args = (CATS_SCHEMA['stream'],
                        CATS_SCHEMA['schema'],
                        CATS_SCHEMA['key_properties'])
    buffered = BufferedSingerStream(*constructor_args)

    assert buffered

    absent_properties = missing_sdc_properties(buffered)
    assert [] == absent_properties
Ejemplo n.º 2
0
def test_add_record_message__allOf__impossible_schema():
    """Records must be rejected once a `number` clause is added to `allOf`.

    A copy of SIMPLE_ALLOF_SCHEMA gets an extra ``{'type': ['number']}``
    entry appended to the `allOf` list of `allOfKey`. Both a string value
    and an integer value are then expected to raise SingerStreamError, to
    be recorded as invalid, and to leave the stream's count at zero.
    """
    stream_name = 'test'

    # Work on a copy so the shared module-level schema is not mutated.
    impossible_schema = deepcopy(SIMPLE_ALLOF_SCHEMA)
    all_of_clauses = impossible_schema['properties']['allOfKey']['allOf']
    all_of_clauses.append({'type': ['number']})

    buffered = BufferedSingerStream(stream_name, impossible_schema, [])

    for candidate in ('short', 314159):
        message = {
            'type': 'RECORD',
            'stream': stream_name,
            'record': {'allOfKey': candidate},
            'sequence': 0,
        }
        with pytest.raises(SingerStreamError):
            buffered.add_record_message(message)

    assert buffered.peek_invalid_records()
    assert buffered.count == 0
    assert [] == missing_sdc_properties(buffered)
Ejemplo n.º 3
0
def test_add_record_message__allOf__invalid_record():
    """A long string under `allOfKey` must be rejected by the allOf schema.

    Adding the record is expected to raise SingerStreamError, leave at
    least one entry in the invalid-record list, and not change the count.
    """
    stream_name = 'test'
    schema_copy = deepcopy(SIMPLE_ALLOF_SCHEMA)
    buffered = BufferedSingerStream(stream_name, schema_copy, [])

    oversized_message = {
        'type': 'RECORD',
        'stream': stream_name,
        'record': {
            'allOfKey':
            'this is a string which is much too long to be allowed'
        },
        'sequence': 0,
    }

    with pytest.raises(SingerStreamError):
        buffered.add_record_message(oversized_message)

    assert buffered.peek_invalid_records()
    assert buffered.count == 0
    assert [] == missing_sdc_properties(buffered)
Ejemplo n.º 4
0
def test_add_record_message__allOf():
    """Short strings under `allOfKey` are all accepted by the allOf schema.

    Every value is added as its own RECORD message; afterwards there must
    be no invalid records and the count must equal the number of values.
    """
    stream_name = 'test'
    schema_copy = deepcopy(SIMPLE_ALLOF_SCHEMA)
    buffered = BufferedSingerStream(stream_name, schema_copy, [])

    # Each entry is fewer than six characters long.
    short_values = ['hello', 'I', 'am', 'a set', 'of', 'short', 'strs']

    for text in short_values:
        message = {
            'type': 'RECORD',
            'stream': stream_name,
            'record': {'allOfKey': text},
            'sequence': 0,
        }
        buffered.add_record_message(message)

    assert not buffered.peek_invalid_records()
    assert buffered.count == len(short_values)
    assert [] == missing_sdc_properties(buffered)
Ejemplo n.º 5
0
def test_add_record_message__multipleOf_invalid_record():
    """Values 1 and 2 must be rejected under the invalid multipleOf schema.

    Each record is expected to raise SingerStreamError; afterwards the
    stream must report invalid records and a count of zero.
    """
    stream_name = 'test'
    schema_copy = deepcopy(SIMPLE_MULTIPLE_OF_INVALID_SCHEMA)
    buffered = BufferedSingerStream(stream_name, schema_copy, [])

    for rejected_value in (1, 2):
        message = {
            'type': 'RECORD',
            'stream': stream_name,
            'record': {'multipleOfKey': rejected_value},
            'sequence': 0,
            RAW_LINE_SIZE: 100,
        }
        with pytest.raises(SingerStreamError):
            buffered.add_record_message(message)

    assert buffered.peek_invalid_records()
    assert buffered.count == 0
def test_init__empty_key_properties():
    """With no key properties given, the stream must key on SINGER_PK.

    Twenty generated cat records are added to a stream constructed with an
    empty key-property list. The stream's `key_properties` must then equal
    ``[SINGER_PK]`` and every row of the batch must carry a truthy
    SINGER_PK value.
    """
    buffered = BufferedSingerStream(CATS_SCHEMA['stream'],
                                    CATS_SCHEMA['schema'], [])

    cat_source = CatStream(100)
    for _ in range(20):
        record_message = cat_source.generate_record_message()
        buffered.add_record_message(record_message)

    assert buffered
    assert [] == missing_sdc_properties(buffered)
    assert [SINGER_PK] == buffered.key_properties

    batch_rows = list(buffered.get_batch())
    rows_without_pk = [row for row in batch_rows if not row[SINGER_PK]]

    # Guard against the check passing trivially on an (almost) empty batch.
    assert len(batch_rows) > 1
    assert [] == rows_without_pk
Ejemplo n.º 7
0
def test_add_record_message__multipleOf():
    """Decimal values are all accepted under the valid multipleOf schema.

    Each literal is converted to `Decimal` and added as a RECORD message;
    afterwards there must be no invalid records and the count must equal
    the number of values added.
    """
    stream_name = 'test'
    schema_copy = deepcopy(SIMPLE_MULTIPLE_OF_VALID_SCHEMA)
    buffered = BufferedSingerStream(stream_name, schema_copy, [])

    decimal_literals = [
        '1', '2', '3', '4', '5', '1.1', '2.3', '1.23456789', '20', '100.1'
    ]

    for literal in decimal_literals:
        message = {
            'type': 'RECORD',
            'stream': stream_name,
            'record': {'multipleOfKey': Decimal(literal)},
            'sequence': 0,
            RAW_LINE_SIZE: 100,
        }
        buffered.add_record_message(message)

    assert not buffered.peek_invalid_records()
    assert buffered.count == len(decimal_literals)
def test_multiple_batches__by_memory():
    """Fill a stream capped at ``max_buffer_size=1024`` and then flush it.

    The buffer starts empty, must hold exactly one record once
    `buffer_full` becomes true, and must be empty again after
    `flush_buffer`.
    """
    cat_source = CatStream(100)

    buffered = BufferedSingerStream(CATS_SCHEMA['stream'],
                                    CATS_SCHEMA['schema'],
                                    CATS_SCHEMA['key_properties'],
                                    max_buffer_size=1024)

    assert len(buffered.peek_buffer()) == 0

    while not buffered.buffer_full:
        next_message = cat_source.generate_record_message()
        buffered.add_record_message(next_message)

    assert len(buffered.peek_buffer()) == 1
    assert [] == missing_sdc_properties(buffered)

    buffered.flush_buffer()

    assert len(buffered.peek_buffer()) == 0
def test_add_record_message__invalid_record__cross_threshold():
    """The third InvalidCatStream record must raise at a threshold of 3.

    The first two records are added without an exception propagating; the
    third add is expected to raise SingerStreamError. Afterwards the stream
    must report invalid records and a count of zero.
    """
    invalid_source = InvalidCatStream(10)

    buffered = BufferedSingerStream(CATS_SCHEMA['stream'],
                                    CATS_SCHEMA['schema'],
                                    CATS_SCHEMA['key_properties'],
                                    invalid_records_threshold=3)

    # Two records stay below the threshold and must not raise.
    for _ in range(2):
        buffered.add_record_message(invalid_source.generate_record_message())

    with pytest.raises(SingerStreamError):
        buffered.add_record_message(invalid_source.generate_record_message())

    assert buffered.peek_invalid_records()
    assert buffered.count == 0
    assert [] == missing_sdc_properties(buffered)
Ejemplo n.º 10
0
def _line_handler(state_tracker, target, invalid_records_detect,
                  invalid_records_threshold, max_batch_rows, max_batch_size,
                  line):
    """Parse one JSON message line and dispatch it on its `type` value.

    Handling per message type:

    * ``STATE`` - passed to ``state_tracker.handle_state_message``.
    * ``RECORD`` - passed to ``state_tracker.handle_record_message``
      together with its stream name.
    * ``ACTIVATE_VERSION`` - the stream is flushed through
      ``state_tracker.flush_stream`` and then
      ``target.activate_version`` is called with the stream buffer that
      was looked up before the flush.
    * ``SCHEMA`` - the schema is checked with
      ``json_schema.validation_errors``; a stream not yet known to
      ``state_tracker`` is created as a ``BufferedSingerStream`` and
      registered, while a known stream has ``update_schema`` called on it.

    Args:
        state_tracker: object exposing ``streams``, ``register_stream``,
            ``flush_stream``, ``handle_record_message`` and
            ``handle_state_message``.
        target: object exposing ``activate_version``.
        invalid_records_detect: forwarded to ``BufferedSingerStream``.
        invalid_records_threshold: forwarded to ``BufferedSingerStream``.
        max_batch_rows: when truthy, assigned to the new stream's
            ``max_rows``.
        max_batch_size: when truthy, assigned to the new stream's
            ``max_buffer_size``.
        line: the raw JSON text of a single message.

    Raises:
        json.decoder.JSONDecodeError: ``line`` is not valid JSON (logged,
            then re-raised).
        TargetError: a required key is missing, the schema is invalid, an
            ACTIVATE_VERSION names an unregistered stream, or the message
            type is unknown.
    """
    try:
        message = json.loads(line)
    except json.decoder.JSONDecodeError:
        LOGGER.error("Unable to parse JSON: {}".format(line))
        raise

    if 'type' not in message:
        raise TargetError('`type` is a required key: {}'.format(line))

    message_type = message['type']

    if message_type not in ('SCHEMA', 'RECORD', 'ACTIVATE_VERSION', 'STATE'):
        raise TargetError('Unknown message type {} in message {}'.format(
            message_type, line))

    # STATE is the only known type that does not name a stream.
    if message_type == 'STATE':
        state_tracker.handle_state_message(message)
        return

    if 'stream' not in message:
        raise TargetError('`stream` is a required key: {}'.format(line))

    stream_name = message['stream']

    if message_type == 'RECORD':
        state_tracker.handle_record_message(stream_name, message)
        return

    if message_type == 'ACTIVATE_VERSION':
        if 'version' not in message:
            raise TargetError('`version` is a required key: {}'.format(line))
        if stream_name not in state_tracker.streams:
            raise TargetError(
                'A ACTIVATE_VERSION for stream {} was encountered before a corresponding schema'
                .format(stream_name))

        # Look the buffer up first, then flush, then activate the version.
        stream_buffer = state_tracker.streams[stream_name]
        state_tracker.flush_stream(stream_name)
        target.activate_version(stream_buffer, message['version'])
        return

    # Only SCHEMA messages reach this point.
    if 'schema' not in message:
        raise TargetError('`schema` is a required key: {}'.format(line))

    schema = message['schema']

    schema_problems = json_schema.validation_errors(schema)
    if schema_problems:
        raise TargetError(
            '`schema` is an invalid JSON Schema instance: {}'.format(line),
            *schema_problems)

    # A missing `key_properties` entry is passed on as None.
    key_properties = message.get('key_properties')

    if stream_name in state_tracker.streams:
        state_tracker.streams[stream_name].update_schema(schema,
                                                         key_properties)
        return

    new_stream = BufferedSingerStream(
        stream_name,
        schema,
        key_properties,
        invalid_records_detect=invalid_records_detect,
        invalid_records_threshold=invalid_records_threshold)

    # Batch limits are only overridden when a truthy value was supplied.
    if max_batch_rows:
        new_stream.max_rows = max_batch_rows
    if max_batch_size:
        new_stream.max_buffer_size = max_batch_size

    state_tracker.register_stream(stream_name, new_stream)
def test_multiple_batches__old_records__by_memory():
    """Records older than the newest seen version must not grow the buffer.

    Steps:
      1. Fill the buffer with version-0 records until `buffer_full`, then
         flush it.
      2. Add a single version-10 record; the buffer must hold one record.
      3. Add up to 1000 version-5 records. The loop must run out of
         attempts rather than stop on `buffer_full`, and the buffer must
         still hold exactly one record.

    NOTE(review): presumably the stream discards records whose version is
    lower than the latest one it has seen - confirm against
    BufferedSingerStream.
    """
    oldest_source = CatStream(100, version=0)
    middle_source = CatStream(100, version=5)
    latest_source = CatStream(100, version=10)

    buffered = BufferedSingerStream(CATS_SCHEMA['stream'],
                                    CATS_SCHEMA['schema'],
                                    CATS_SCHEMA['key_properties'],
                                    max_buffer_size=32768)

    assert len(buffered.peek_buffer()) == 0

    while not buffered.buffer_full:
        oldest_message = oldest_source.generate_record_message()
        buffered.add_record_message(oldest_message)

    assert len(buffered.peek_buffer()) > 0
    assert [] == missing_sdc_properties(buffered)

    buffered.flush_buffer()

    assert len(buffered.peek_buffer()) == 0

    buffered.add_record_message(latest_source.generate_record_message())

    assert len(buffered.peek_buffer()) == 1

    # Bounded retry budget so the loop ends even though the buffer is
    # expected never to fill.
    attempts_left = 1000
    while not buffered.buffer_full and attempts_left != 0:
        middle_message = middle_source.generate_record_message()
        buffered.add_record_message(middle_message)
        attempts_left -= 1

    assert attempts_left == 0
    assert len(buffered.peek_buffer()) == 1
    assert [] == missing_sdc_properties(buffered)