def test__ensure_list_no_pandas(self):
    """Check ``ElasticBuffer._ensure_list`` for non-pandas inputs.

    A single dict is expected to come back wrapped in a list and a list is
    expected to come back equal to itself; a string and a set are expected
    to raise ``ValueError``. Every table entry runs inside its own
    ``subTest`` so a failing case is reported by name and does not prevent
    the remaining cases from running.
    """

    class TestCase:
        """One table entry: the input and the list expected back."""

        def __init__(self, docs_in, expected_docs):
            self.docs_in = docs_in
            self.expected_docs = expected_docs

    tests_success = {
        'dict': TestCase(
            docs_in=self.docs[0],
            expected_docs=[self.docs[0]],
        ),
        'list': TestCase(
            docs_in=self.docs,
            expected_docs=self.docs,
        ),
    }
    tests_fail = {
        'string input': TestCase(
            docs_in='docs',
            expected_docs=None,
        ),
        'set input': TestCase(
            docs_in={'docs'},
            expected_docs=None,
        ),
    }

    for test_name, test in tests_success.items():
        with self.subTest(test_name):
            docs_out = ElasticBuffer._ensure_list(test.docs_in)
            self.assertListEqual(docs_out, test.expected_docs, test_name)

    for test_name, test in tests_fail.items():
        with self.subTest(test_name):
            with self.assertRaises(ValueError, msg=test_name):
                ElasticBuffer._ensure_list(test.docs_in)
def __init__(self, n_success=0, bulk_errs=None, side_effect=None):
    """Build a pre-populated buffer and record the desired mock outcome.

    The buffer's ``_buffer`` and ``_oldest_doc_timestamp`` are set directly
    from the ``TestElasticBuffer`` class fixtures. ``return_value`` is the
    ``(n_success, bulk_errs)`` pair and ``side_effect`` is stored as given;
    presumably both are applied to a mock by the enclosing test (not
    visible from here).
    """
    buffer = ElasticBuffer()
    buffer._buffer = TestElasticBuffer.docs
    buffer._oldest_doc_timestamp = TestElasticBuffer.timestamp

    self.eb = buffer
    self.return_value = (n_success, bulk_errs)
    self.side_effect = side_effect
def test__to_file(self, mocked_file):
    """Check that ``_to_file`` dumps the buffer as newline-delimited JSON.

    The dump file is expected to be opened once, in ``'w'`` mode, at
    ``<dump_dir>/<ClassName>_buffer_dump_<timestamp>``, and ``write`` is
    expected to be called once per document with its JSON serialization
    followed by a newline.
    """
    dump_dir = '/tmp'
    buffer = ElasticBuffer(dump_dir=dump_dir)
    dump_name = f'{buffer.__class__.__name__}_buffer_dump_{self.timestamp}'
    expected_path = os.path.join(dump_dir, dump_name)

    buffer.add(self.docs)
    buffer._to_file(timestamp=self.timestamp)

    mocked_file.assert_called_once_with(expected_path, 'w')

    # Calling the mock again hands back the same file-handle mock that the
    # code under test wrote to.
    write_mock = mocked_file().write
    self.assertEqual(
        write_mock.call_count,
        len(self.docs),
        'write should be called once for every document',
    )

    expected_lines = []
    for doc in self.docs:
        expected_lines.append(json.dumps(doc) + '\n')

    # Each recorded call is an (args, kwargs) pair; take the first
    # positional argument of every write.
    written_lines = []
    for recorded_call in write_mock.call_args_list:
        written_lines.append(recorded_call[0][0])

    self.assertListEqual(
        written_lines,
        expected_lines,
        'write should be called with each document (json serialized and newline)',
    )
def test_context_error(self, mock_flush):
    """Check flush behaviour when the ``with`` block exits via an exception.

    ``mock_flush`` is configured to raise ``ElasticBufferFlushError``. When
    no flush is expected, the body raises ``ValueError`` itself and flush
    must not be called on exit. When adding documents is expected to
    trigger a flush, the flush error propagates and flush must have been
    called exactly once (not again on exit).

    Every table entry runs inside its own ``subTest`` so a failing case is
    reported by name and does not prevent the remaining cases from running.
    """

    class TestCase:
        """One table entry: documents to add, buffer size, expected flushes."""

        def __init__(self, n_docs, buffer_size, n_expected_flush_calls):
            self.n_docs = n_docs
            self.buffer_size = buffer_size
            self.n_expected_flush_calls = n_expected_flush_calls

    tests = {
        'flush is not called on exit due to exception with empty buffer':
            TestCase(
                n_docs=0,
                buffer_size=10,
                n_expected_flush_calls=0,
            ),
        'flush is not called on exit due to exception with populated buffer':
            TestCase(
                n_docs=5,
                buffer_size=10,
                n_expected_flush_calls=0,
            ),
        'flush is called once when buffer is full but not again on exit':
            TestCase(
                n_docs=5,
                buffer_size=2,
                n_expected_flush_calls=1,
            ),
    }

    default_err = ValueError
    for test_name, test in tests.items():
        with self.subTest(test_name):
            mock_flush.reset_mock()
            mock_flush.side_effect = ElasticBufferFlushError

            # A flush during add raises the flush error before the body's
            # own exception can be reached.
            if test.n_expected_flush_calls > 0:
                err = ElasticBufferFlushError
            else:
                err = default_err

            docs = [self.docs[0]] * test.n_docs
            with self.assertRaises(err, msg=test_name):
                with ElasticBuffer(size=test.buffer_size) as eb:
                    eb.add(docs)
                    # only raised when eb.add does not result in an Exception
                    raise default_err()

            self.assertEqual(
                mock_flush.call_count,
                test.n_expected_flush_calls,
                test_name,
            )
def test_context_success(self, mock_flush):
    """Check flush behaviour when the ``with`` block exits normally.

    Flush is expected to be called once on exit (even with an empty
    buffer), plus once more when adding documents fills the buffer.
    ``mock_flush`` delegates to the buffer's ``_clear_buffer`` so the
    buffer is emptied whenever the mocked flush runs.

    Every table entry runs inside its own ``subTest`` so a failing case is
    reported by name and does not prevent the remaining cases from running.
    """

    class TestCase:
        """One table entry: documents to add, buffer size, expected flushes."""

        def __init__(self, n_docs, buffer_size, n_expected_flush_calls):
            self.n_docs = n_docs
            self.buffer_size = buffer_size
            self.n_expected_flush_calls = n_expected_flush_calls

    tests = {
        'flush is called on exit with empty buffer':
            TestCase(
                n_docs=0,
                buffer_size=10,
                n_expected_flush_calls=1,
            ),
        'flush is called on exit with populated buffer':
            TestCase(
                n_docs=5,
                buffer_size=10,
                n_expected_flush_calls=1,
            ),
        'flush is called once when buffer is full and once on exit':
            TestCase(
                n_docs=5,
                buffer_size=2,
                n_expected_flush_calls=2,
            ),
    }

    for test_name, test in tests.items():
        with self.subTest(test_name):
            mock_flush.reset_mock()
            docs = ['a'] * test.n_docs

            with ElasticBuffer(size=test.buffer_size) as eb:
                # Rebind per buffer: the side effect must clear *this*
                # instance's buffer.
                mock_flush.side_effect = eb._clear_buffer
                eb.add(docs)

            self.assertEqual(
                mock_flush.call_count,
                test.n_expected_flush_calls,
                test_name,
            )
def __init__(
    self,
    documents,
    documents_timestamp,
    expected_buffer,
    expected_oldest_doc_timestamp,
    expected_flush_called,
    buffer_size=10,
    # used for multiple adds
    more_documents=None,
    more_documents_timestamp=None,
):
    """Store one table entry and create the buffer it runs against.

    All arguments are stored under attributes of the same name, except
    that ``more_documents`` defaults to a fresh empty list when omitted
    and ``buffer_size`` is only used to size the ``ElasticBuffer`` kept
    in ``self.eb``.
    """
    # First add and the state expected afterwards.
    self.documents = documents
    self.documents_timestamp = documents_timestamp
    self.expected_buffer = expected_buffer
    self.expected_oldest_doc_timestamp = expected_oldest_doc_timestamp
    self.expected_flush_called = expected_flush_called

    # Optional second add.
    if more_documents is None:
        more_documents = []
    self.more_documents = more_documents
    self.more_documents_timestamp = more_documents_timestamp

    self.eb = ElasticBuffer(size=buffer_size)
def test_flush_success(self, mock_bulk):
    """Check that a successful flush sends the buffer to bulk and resets state.

    ``mock_bulk`` reports every document as inserted with no errors. After
    ``flush`` the documents must have been passed as the second positional
    argument to bulk, the buffer must be empty and the oldest-document
    timestamp must be ``None``.
    """
    mock_bulk.return_value = (len(self.docs), [])

    buffer = ElasticBuffer()
    buffer._buffer = self.docs
    buffer._oldest_doc_timestamp = self.timestamp

    buffer.flush()

    # assert contents of buffer were passed to bulk
    positional_args, _kwargs = mock_bulk.call_args
    _first_arg, called_docs = positional_args
    self.assertListEqual(
        called_docs,
        self.docs,
        'contents of buffer should have been passed to bulk',
    )

    # assert state was cleared
    self.assertListEqual(
        buffer._buffer,
        [],
        'buffer should be empty after successful insert',
    )
    self.assertIsNone(
        buffer._oldest_doc_timestamp,
        'timestamp should be None after successful insert',
    )
def __init__(self, n_items):
    """Create a buffer holding ``n_items`` placeholder entries.

    When ``n_items`` is positive the buffer's ``_buffer`` is replaced with
    that many ``'a'`` strings; otherwise the freshly constructed buffer is
    left untouched.
    """
    buffer = ElasticBuffer()
    if n_items > 0:
        buffer._buffer = ['a'] * n_items

    self.n_items = n_items
    self.eb = buffer
def __init__(self, buf, timestamp):
    """Create a buffer, optionally seeding its contents and timestamp.

    ``buf`` and ``timestamp`` are only applied when truthy; falsy values
    leave the corresponding attribute of the new buffer at its initial
    value.
    """
    buffer = ElasticBuffer()
    if buf:
        buffer._buffer = buf
    if timestamp:
        buffer._oldest_doc_timestamp = timestamp

    self.eb = buffer
def __init__(self, oldest_doc_timestamp, timestamp, expected=None):
    """Store one table entry and a buffer with a preset oldest timestamp.

    ``timestamp`` and ``expected`` are kept as given (how they are used is
    up to the enclosing test, which is not visible here);
    ``oldest_doc_timestamp`` is written straight onto the new buffer's
    ``_oldest_doc_timestamp``.
    """
    buffer = ElasticBuffer()
    buffer._oldest_doc_timestamp = oldest_doc_timestamp

    self.eb = buffer
    self.timestamp = timestamp
    self.expected = expected
def test__ensure_list_with_pandas(self):
    """Check ``ElasticBuffer._ensure_list`` for pandas Series and DataFrames.

    Series cases are built from the ``'c'`` value of every fixture
    document. The expectations encode that each row becomes a dict keyed
    by the series name (``0`` when unnamed), that a named index is added
    as an extra key, and that DataFrames (with or without ``'c'`` as a
    named index) convert back to the original documents.

    Every table entry runs inside its own ``subTest`` so a failing case is
    reported by name and does not prevent the remaining cases from running.
    """
    series_list = [doc['c'] for doc in self.docs]
    first_item = series_list[0]

    class TestCase:
        """One table entry: the pandas input and the list expected back."""

        def __init__(self, docs_in, expected_docs):
            self.docs_in = docs_in
            self.expected_docs = expected_docs

    tests = {
        'series':
            TestCase(
                docs_in=pd.Series(series_list),
                expected_docs=[{0: item} for item in series_list],
            ),
        'named series':
            TestCase(
                docs_in=pd.Series(series_list).rename('my_name'),
                expected_docs=[{'my_name': item} for item in series_list],
            ),
        'series with named index':
            TestCase(
                docs_in=pd.Series(series_list).rename_axis('my_axis', axis=0),
                expected_docs=[
                    {0: item, 'my_axis': i}
                    for i, item in enumerate(series_list)
                ],
            ),
        'named series with named index':
            TestCase(
                docs_in=pd.Series(series_list).rename_axis(
                    'my_axis', axis=0).rename('my_name'),
                expected_docs=[
                    {'my_name': item, 'my_axis': i}
                    for i, item in enumerate(series_list)
                ],
            ),
        'series with single row':
            TestCase(
                docs_in=pd.Series(first_item),
                expected_docs=[{0: first_item}],
            ),
        'named series with single row':
            TestCase(
                docs_in=pd.Series(first_item).rename('my_name'),
                expected_docs=[{'my_name': first_item}],
            ),
        'named series with single row with named index':
            TestCase(
                docs_in=pd.Series(first_item).rename_axis(
                    'my_index', axis=0).rename('my_name'),
                expected_docs=[{'my_name': first_item, 'my_index': 0}],
            ),
        'dataframe':
            TestCase(
                docs_in=pd.DataFrame(self.docs),
                expected_docs=self.docs,
            ),
        'dataframe with named index':
            TestCase(
                docs_in=pd.DataFrame(self.docs).set_index('c'),
                expected_docs=self.docs,
            ),
        'dataframe with single row':
            TestCase(
                docs_in=pd.DataFrame(self.docs[0], index=[0]),
                expected_docs=[self.docs[0]],
            ),
        'dataframe with single row and named index':
            TestCase(
                docs_in=pd.DataFrame(self.docs[0], index=[0]).set_index('c'),
                expected_docs=[self.docs[0]],
            ),
    }

    for test_name, test in tests.items():
        with self.subTest(test_name):
            docs_out = ElasticBuffer._ensure_list(test.docs_in)
            self.assertListEqual(docs_out, test.expected_docs, test_name)
def test__apply_metadata_funcs(self):
    """Check that ``_apply_metadata_funcs`` adds one key per metadata func.

    Metadata functions are handed to ``ElasticBuffer`` as keyword
    arguments. The expectations encode that each document gains a key
    named after the keyword whose value is the function applied to that
    document, and that documents are unchanged when no functions are
    configured.

    Every table entry runs inside its own ``subTest`` so a failing case is
    reported by name and does not prevent the remaining cases from running.
    """

    class TestCase:
        """One table entry: input docs, metadata funcs and expected docs."""

        def __init__(self, docs_in, metadata_funcs, expected_docs):
            self.docs_in = docs_in
            self.metadata_funcs = metadata_funcs
            self.expected_docs = expected_docs

    def _index(doc):
        # Constant index name regardless of the document.
        return 'my-index'

    def _id(doc):
        # Derive the id from the document's own values.
        return sum(doc.values())

    tests = {
        'no metadata funcs':
            TestCase(
                docs_in=[
                    {'a': 1, 'b': 2},
                    {'a': 8, 'b': 9},
                ],
                metadata_funcs={},
                expected_docs=[
                    {'a': 1, 'b': 2},
                    {'a': 8, 'b': 9},
                ],
            ),
        'single metadata func':
            TestCase(
                docs_in=[
                    {'a': 1, 'b': 2},
                    {'a': 8, 'b': 9},
                ],
                metadata_funcs={
                    '_index': _index,
                },
                expected_docs=[
                    {'a': 1, 'b': 2, '_index': 'my-index'},
                    {'a': 8, 'b': 9, '_index': 'my-index'},
                ],
            ),
        'multiple metadata funcs':
            TestCase(
                docs_in=[
                    {'a': 1, 'b': 2},
                    {'a': 8, 'b': 9},
                ],
                metadata_funcs={
                    '_index': _index,
                    '_id': _id,
                },
                expected_docs=[
                    {'a': 1, 'b': 2, '_index': 'my-index', '_id': 3},
                    {'a': 8, 'b': 9, '_index': 'my-index', '_id': 17},
                ],
            ),
    }

    for test_name, test in tests.items():
        with self.subTest(test_name):
            eb = ElasticBuffer(**test.metadata_funcs)
            docs_out = eb._apply_metadata_funcs(test.docs_in)
            self.assertListEqual(docs_out, test.expected_docs, test_name)
def test_flush_empty_buffer(self, mock_bulk):
    """Check that flushing a freshly created buffer never calls bulk."""
    fresh_buffer = ElasticBuffer()

    fresh_buffer.flush()

    mock_bulk.assert_not_called()