def test_read_bulk_smaller_than_number_of_docs_and_multiple_clients(self):
    """A 5-document client slice with bulk size 3 yields bulks of 3 and 2 docs."""
    docs = ['{"key": "value%d"}' % i for i in range(1, 8)]
    chunk = 3
    # this client is only assigned the first 5 documents of the file
    doc_slice = params.Slice(io.StringAsFileSource, 0, 5)
    metadata = params.GenerateActionMetaData("test_index", "test_type", conflicting_ids=None)
    reader = params.IndexDataReader(docs,
                                    batch_size=chunk,
                                    bulk_size=chunk,
                                    file_source=doc_slice,
                                    action_metadata=metadata,
                                    index_name="test_index",
                                    type_name="test_type")
    # 5 docs at bulk size 3 -> bulks of 3 and 2 documents; each doc contributes
    # two lines (action meta-data line + document line)
    self.assert_bulks_sized(reader, [3, 2], [6, 4])
def test_generate_action_meta_data_without_id_conflicts(self):
    """Without conflicting ids, the action meta-data carries no ``_id`` field."""
    generator = params.GenerateActionMetaData("test_index", "test_type", conflicting_ids=None)
    expected = '{"index": {"_index": "test_index", "_type": "test_type"}}'
    self.assertEqual(expected, next(generator))
def test_read_bulk_with_offset(self):
    """Starting at offset 3 emits a single bulk containing only the remaining docs."""
    docs = ['{"key": "value%d"}' % i for i in range(1, 6)]
    offset = 3
    chunk = 50
    doc_slice = params.Slice(io.StringAsFileSource, offset, len(docs))
    metadata = params.GenerateActionMetaData("test_index", "test_type", conflicting_ids=None)
    reader = params.IndexDataReader(docs,
                                    batch_size=chunk,
                                    bulk_size=chunk,
                                    file_source=doc_slice,
                                    action_metadata=metadata,
                                    index_name="test_index",
                                    type_name="test_type")
    remaining = len(docs) - offset
    # one bulk with all remaining docs; line count doubles for the meta-data lines
    self.assert_bulks_sized(reader, [remaining], [remaining * 2])
def test_read_bulk_smaller_than_number_of_docs_and_multiple_clients(self):
    """A 5-document client slice with bulk size 3 yields two bulks.

    NOTE(review): this method has the same name as an earlier test in this
    class, so at class-definition time it shadows that one and only this
    version runs. The two variants also disagree on what
    ``assert_bulks_sized`` counts (docs vs. lines) — confirm which version
    is current and remove the stale one.
    """
    data = [
        '{"key": "value1"}',
        '{"key": "value2"}',
        '{"key": "value3"}',
        '{"key": "value4"}',
        '{"key": "value5"}',
        '{"key": "value6"}',
        '{"key": "value7"}',
    ]
    bulk_size = 3
    # only 5 documents to index for this client
    # fixed: the sibling tests consistently use io.StringAsFileSource; the bare
    # name here relied on a direct import that the rest of the file does not use
    source = params.Slice(io.StringAsFileSource, 0, 5)
    am_handler = params.GenerateActionMetaData("test_index", "test_type", conflicting_ids=None)
    reader = params.IndexDataReader(data,
                                    batch_size=bulk_size,
                                    bulk_size=bulk_size,
                                    file_source=source,
                                    action_metadata=am_handler,
                                    index_name="test_index",
                                    type_name="test_type")
    # always double the amount as one line contains the data and one line contains the index command
    expected_bulk_sizes = [6, 4]
    self.assert_bulks_sized(reader, expected_bulk_sizes)
def create_reader(bulk_size):
    """Build a MetadataIndexDataReader that reads the whole static source in
    bulks of ``bulk_size``."""
    action_metadata = params.GenerateActionMetaData(index_name="test-idx", type_name=None)
    # cover the entire source: slice from the start to effectively-infinite length
    full_slice = params.Slice(StaticSource, 0, sys.maxsize)
    return params.MetadataIndexDataReader(data_file="bogus",
                                          batch_size=bulk_size,
                                          bulk_size=bulk_size,
                                          file_source=full_slice,
                                          action_metadata=action_metadata,
                                          index_name="test-idx",
                                          type_name=None)
def test_read_bulk_larger_than_number_of_docs(self):
    """When bulk size exceeds the document count, everything lands in one bulk."""
    docs = ['{"key": "value%d"}' % i for i in range(1, 6)]
    chunk = 50
    doc_slice = params.Slice(io.StringAsFileSource, 0, len(docs))
    metadata = params.GenerateActionMetaData("test_index", "test_type", conflicting_ids=None)
    reader = params.IndexDataReader(docs,
                                    batch_size=chunk,
                                    bulk_size=chunk,
                                    file_source=doc_slice,
                                    action_metadata=metadata,
                                    index_name="test_index",
                                    type_name="test_type")
    # one bulk; twice as many lines as docs (meta-data line + document line each)
    self.assert_bulks_sized(reader, [len(docs) * 2])
def test_generate_action_meta_data_with_id_conflicts(self):
    """Conflicting ids are interleaved: sequential first, then pseudo-random draws."""
    # pairs consumed by the stubbed rand: first value == 3 -> draw a "random"
    # id, second value is the index of that "random" id
    pseudo_random_sequence = iter([3, 1, 3, 3, 3, 2, 0, 3, 0])
    generator = params.GenerateActionMetaData("test_index", "test_type",
                                              conflicting_ids=[100, 200, 300, 400],
                                              rand=lambda x, y: next(pseudo_random_sequence))
    # expected id order: the first id is always sequential (100); the next three
    # come from the "random" indices (200, 400, 300); a leading 0 in the
    # sequence falls back to the next sequential id (200); then random again (100)
    for expected_id in ("100", "200", "400", "300", "200", "100"):
        self.assertEqual(
            '{"index": {"_index": "test_index", "_type": "test_type", "_id": "%s"}}' % expected_id,
            next(generator))