def test_multiple_batches__old_records__by_rows():
    """Check row-based batching when records of an older version arrive late.

    A 20-row buffer is filled with version-0 records and flushed, then a
    single version-10 record is buffered. The test expects that version-5
    records added afterwards neither fill nor grow the buffer.
    """
    oldest_cats = CatStream(100, version=0)
    middle_aged_cats = CatStream(100, version=5)
    latest_cats = CatStream(100, version=10)

    buffered = BufferedSingerStream(
        CATS_SCHEMA['stream'],
        CATS_SCHEMA['schema'],
        CATS_SCHEMA['key_properties'],
        max_rows=20,
    )
    assert len(buffered.peek_buffer()) == 0

    # Fill the buffer up to its row limit using the oldest version.
    while not buffered.buffer_full:
        message = oldest_cats.generate_record_message()
        buffered.add_record_message(message)
    assert len(buffered.peek_buffer()) == 20

    buffered.flush_buffer()
    assert len(buffered.peek_buffer()) == 0

    # One record of the newest version lands in the now-empty buffer.
    buffered.add_record_message(latest_cats.generate_record_message())
    assert len(buffered.peek_buffer()) == 1

    # Bounded loop: the buffer is expected never to fill, so the attempt
    # counter is what terminates it.
    attempts_left = 1000
    while not (buffered.buffer_full or attempts_left == 0):
        buffered.add_record_message(middle_aged_cats.generate_record_message())
        attempts_left -= 1

    # Exhausting every attempt means the buffer never reported full.
    assert attempts_left == 0
    assert len(buffered.peek_buffer()) == 1
    assert missing_sdc_properties(buffered) == []
def test_state__doesnt_emit_when_it_isnt_different_than_the_previous_emission(capsys):
    """Check that an unchanged STATE message is not written out a second time.

    The same 'state-1' STATE message is streamed twice, with record rows in
    between. The test expects one line of filtered output after the first
    occurrence and none after the second or after the stream has ended.
    """
    config = CONFIG.copy()
    config.update({'max_batch_rows': 5, 'batch_detection_threshold': 1})
    rows = list(CatStream(100))
    target = Target()

    def state_message(label):
        # Serialized STATE line carrying the given label.
        return json.dumps({'type': 'STATE', 'value': {'test': label}})

    def test_stream():
        yield rows[0]
        yield from rows[1:21]
        yield state_message('state-1')
        # This code runs when the consumer asks for the next item, i.e. after
        # it has received the STATE message above.
        emitted = filtered_output(capsys)
        assert len(emitted) == 1

        yield from rows[22:99]
        yield state_message('state-1')

        # The repeated, identical state must not have produced new output.
        emitted = filtered_output(capsys)
        assert len(emitted) == 0

    target_tools.stream_to_target(test_stream(), target, config=config)

    # Nothing further is expected once the stream is exhausted.
    emitted = filtered_output(capsys)
    assert len(emitted) == 0
def test_state__emits_when_multiple_streams_are_registered_but_records_arrive_from_only_one(capsys):
    """Check STATE emission when only one of two registered streams sends records.

    The first cat and dog items are both streamed, but only further cat rows
    follow. The test expects 'state-1' to be emitted once a single batch has
    been written, and 'state-2' after the stream has ended.
    """
    config = CONFIG.copy()
    config.update({'max_batch_rows': 20, 'batch_detection_threshold': 1})
    cat_rows = list(CatStream(100))
    dog_rows = list(DogStream(50))
    target = Target()

    def state_message(label):
        # Serialized STATE line carrying the given label.
        return json.dumps({'type': 'STATE', 'value': {'test': label}})

    # Simulate one stream that yields a lot of records next to another that
    # yields no records, and ensure that only the first needs to be flushed
    # before any state messages are emitted.
    def test_stream():
        yield cat_rows[0]
        yield dog_rows[0]
        yield from cat_rows[1:5]
        yield state_message('state-1')

        yield from cat_rows[6:25]
        yield state_message('state-2')

        # After some state messages, with only one of the registered streams
        # having hit the batch size, the state message should be emitted, as
        # there are no unflushed records from the other stream yet.
        assert len(target.calls['write_batch']) == 1
        emitted = filtered_output(capsys)
        assert len(emitted) == 1
        assert json.loads(emitted[0])['test'] == 'state-1'

    target_tools.stream_to_target(test_stream(), target, config=config)

    # The final state message should have been output after the last records
    # were loaded, despite not reaching one full flushable batch.
    emitted = filtered_output(capsys)
    assert len(emitted) == 1
    assert json.loads(emitted[0])['test'] == 'state-2'
def test_add_record_message():
    """Check that adding one generated record returns None and leaves no invalid records."""
    cats = CatStream(10)
    buffered = BufferedSingerStream(
        CATS_SCHEMA['stream'],
        CATS_SCHEMA['schema'],
        CATS_SCHEMA['key_properties'],
    )

    result = buffered.add_record_message(cats.generate_record_message())

    assert result is None
    assert not buffered.peek_invalid_records()
    assert missing_sdc_properties(buffered) == []
def test_state__emits_most_recent_state_when_final_flush_occurs(capsys):
    """Check that a trailing STATE message is emitted when the stream ends.

    Five generated cat items followed by one STATE message are streamed with
    a 20-row batch limit. The test expects exactly one output line, carrying
    'state-1'.
    """
    config = CONFIG.copy()
    config.update({'max_batch_rows': 20, 'batch_detection_threshold': 1})

    trailing_state = {'type': 'STATE', 'value': {'test': 'state-1'}}
    messages = list(CatStream(5)) + [json.dumps(trailing_state)]

    target_tools.stream_to_target(messages, Target(), config=config)

    # The final state message should have been output after the last records
    # were loaded, despite not reaching one full flushable batch.
    emitted = filtered_output(capsys)
    assert len(emitted) == 1
    assert json.loads(emitted[0])['test'] == 'state-1'
def test_state__doesnt_emit_when_only_one_of_several_streams_is_flushing(
        capsys):
    """Check that STATE output waits until both streams have been flushed.

    Cat rows keep arriving while the dog stream holds a few rows. The test
    expects no state output after three write_batch calls, 'state-2' once a
    fourth call has happened, and 'state-4' after the stream has ended.
    """
    config = CONFIG.copy()
    config.update({'max_batch_rows': 20, 'batch_detection_threshold': 1})
    cat_rows = list(CatStream(100))
    dog_rows = list(DogStream(50))
    target = Target()

    def state_message(label):
        # Serialized STATE line carrying the given label.
        return json.dumps({'type': 'STATE', 'value': {'test': label}})

    # Simulate one stream that yields a lot of records next to another that
    # yields few records, and ensure both need to be flushed before any state
    # messages are emitted.
    def test_stream():
        yield cat_rows[0]
        yield dog_rows[0]
        yield from cat_rows[1:5]
        yield from dog_rows[1:5]
        yield state_message('state-1')

        yield from cat_rows[6:45]
        yield state_message('state-2')

        yield from cat_rows[46:65]
        yield state_message('state-3')

        # After some state messages, but before the batch size has been hit
        # for both streams, no state messages should have been emitted.
        assert len(target.calls['write_batch']) == 3
        assert filtered_output(capsys) == []

        yield from dog_rows[6:25]
        yield state_message('state-4')

        # After the batch size has been hit and a write_batch call was made,
        # the most recent safe-to-emit state should have been emitted.
        assert len(target.calls['write_batch']) == 4
        emitted = filtered_output(capsys)
        assert len(emitted) == 1
        assert json.loads(emitted[0])['value']['test'] == 'state-2'

    target_tools.stream_to_target(test_stream(), target, config=config)

    # The final state message should have been output after the last dog
    # records were loaded, despite not reaching one full flushable batch.
    emitted = filtered_output(capsys)
    assert len(emitted) == 1
    assert json.loads(emitted[0])['value']['test'] == 'state-4'
def test_state__emits_only_messages_when_all_records_before_have_been_flushed(
        capsys):
    """Check that a STATE message is emitted only after the rows before it are written.

    Three STATE messages are streamed before any write_batch call is
    recorded, and the test expects no output at that point. After the first
    write_batch call it expects 'state-3', and after the stream has ended
    'state-4'.
    """
    config = CONFIG.copy()
    config.update({'max_batch_rows': 20, 'batch_detection_threshold': 1})
    rows = list(CatStream(100))
    target = Target()

    def state_message(label):
        # Serialized STATE line carrying the given label.
        return json.dumps({'type': 'STATE', 'value': {'test': label}})

    def test_stream():
        yield rows[0]
        yield from rows[1:5]
        yield state_message('state-1')

        yield from rows[6:10]
        yield state_message('state-2')

        yield from rows[11:15]
        yield state_message('state-3')

        # After some state messages, but before the batch size has been hit,
        # no state messages should have been emitted.
        assert len(target.calls['write_batch']) == 0
        assert filtered_output(capsys) == []

        yield from rows[16:25]
        yield state_message('state-4')

        # After the batch size has been hit and a write_batch call was made,
        # the most recent safe-to-emit state should have been emitted.
        assert len(target.calls['write_batch']) == 1
        emitted = filtered_output(capsys)
        assert len(emitted) == 1
        assert json.loads(emitted[0])['value']['test'] == 'state-3'

        yield from rows[26:31]

    target_tools.stream_to_target(test_stream(), target, config=config)

    # The final state message should have been output after the last records
    # were loaded.
    emitted = filtered_output(capsys)
    assert len(emitted) == 1
    assert json.loads(emitted[0])['value']['test'] == 'state-4'
def test_multiple_batches__by_memory():
    """Check size-based batching with max_buffer_size=10.

    Records are added until the buffer reports full. The test expects that
    to happen with exactly one record buffered, and the buffer to be empty
    again after flushing.
    """
    cats = CatStream(100)
    buffered = BufferedSingerStream(
        CATS_SCHEMA['stream'],
        CATS_SCHEMA['schema'],
        CATS_SCHEMA['key_properties'],
        max_buffer_size=10,
    )
    assert len(buffered.peek_buffer()) == 0

    while not buffered.buffer_full:
        message = cats.generate_record_message()
        buffered.add_record_message(message)

    assert len(buffered.peek_buffer()) == 1
    assert missing_sdc_properties(buffered) == []

    buffered.flush_buffer()
    assert len(buffered.peek_buffer()) == 0
def test_init__empty_key_properties():
    """Check a stream constructed with an empty key_properties list.

    The test expects key_properties to equal [singer.PK] and every row
    returned by get_batch() to hold a truthy value under singer.PK.
    """
    buffered = BufferedSingerStream(
        CATS_SCHEMA['stream'],
        CATS_SCHEMA['schema'],
        [],
    )
    cats = CatStream(100)
    for _ in range(20):
        buffered.add_record_message(cats.generate_record_message())

    assert buffered
    assert missing_sdc_properties(buffered) == []
    assert buffered.key_properties == [singer.PK]

    # rows_checked keeps the 1-based index of the last row seen, and stays 0
    # if the batch is empty.
    rows_checked = 0
    rows_missing_pk = []
    for rows_checked, record in enumerate(buffered.get_batch(), start=1):
        if not record[singer.PK]:
            rows_missing_pk.append(record)

    assert rows_checked > 1
    assert rows_missing_pk == []