def test_get_content_multiple_fields_merge(num_rows):
    """Extract ``'embedding'`` and ``'text'`` together and check both results.

    Every document is built from the same keyword arguments: one random
    ``(num_rows, embed_size)`` embedding and the literal text ``'text'``.
    ``_extract_docs`` must hand back a list holding one ``np.ndarray`` per
    requested field, each with ``batch_size`` entries.
    """
    fields = ['embedding', 'text']
    batch_size = 10
    embed_size = 20

    # Built once, so all documents share identical field values.
    kwargs = {}
    for field in fields:
        if field == 'embedding':
            kwargs[field] = np.random.random((num_rows, embed_size))
        else:
            kwargs[field] = 'text'

    documents = [Document(**kwargs) for _ in range(batch_size)]
    docs = DocumentSet(documents)

    # The second return value is not inspected by this test.
    contents, _ = docs._extract_docs(*fields)

    assert len(contents) == len(fields)
    assert isinstance(contents, list)

    embedding_batch, text_batch = contents
    assert isinstance(embedding_batch, np.ndarray)
    assert isinstance(text_batch, np.ndarray)

    for extracted in contents:
        assert len(extracted) == batch_size

    assert embedding_batch.shape == (batch_size, num_rows, embed_size)
    assert text_batch.shape == (batch_size, )
def test_batching_encode_text(encoder):
    """Encode the texts extracted from 15 documents and check the shape.

    ``encoder`` is supplied from outside this block (its definition is not
    visible here); the test only requires that ``encode`` returns something
    whose ``shape`` is ``(15, 10)``.
    """
    documents = [Document(text=f'text-{idx}') for idx in range(15)]
    docs = DocumentSet(documents)

    # Only the extracted contents are needed; the second value is discarded.
    texts, _ = docs._extract_docs('text')

    embeds = encoder.encode(texts)
    expected_shape = (15, 10)
    assert embeds.shape == expected_shape
def test_batching_text_one_argument(crafter):
    """Craft a batch of extracted texts and check each result's ``'text'``.

    ``crafter`` is supplied from outside this block (its definition is not
    visible here). Each crafted result is expected to expose the input text
    with a ``-crafted`` suffix under the ``'text'`` key, in input order.

    NOTE(review): another function in this file is defined with this exact
    name (the variant taking ``segmenter``); the later definition would
    shadow this one at import time -- confirm and rename one of them.
    """
    documents = [Document(text=f'text-{idx}') for idx in range(15)]
    docs = DocumentSet(documents)

    # Only the extracted contents are needed; the second value is discarded.
    texts, _ = docs._extract_docs('text')

    crafted_docs = crafter.craft(texts)
    for idx, crafted in enumerate(crafted_docs):
        expected_text = f'text-{idx}-crafted'
        assert crafted['text'] == expected_text
def test_batching_encode_blob(encoder):
    """Encode the blobs extracted from 15 documents and check the shape.

    Each document carries its own random ``(10, 20)`` blob. ``encoder`` is
    supplied from outside this block (its definition is not visible here);
    the test only requires that ``encode`` returns something whose ``shape``
    is ``(15, 10)``.
    """
    documents = []
    for _ in range(15):
        documents.append(Document(blob=np.random.random((10, 20))))
    docs = DocumentSet(documents)

    # Only the extracted contents are needed; the second value is discarded.
    blob, _ = docs._extract_docs('blob')

    embeds = encoder.encode(blob)
    expected_shape = (15, 10)
    assert embeds.shape == expected_shape
def test_batching_blob_one_argument(crafter):
    """Craft a batch of extracted blobs and check each result's ``'blob'``.

    Document ``i`` carries a ``2 x 5`` array filled with ``i``. ``crafter``
    is supplied from outside this block (its definition is not visible
    here); each crafted result must expose an equal array under ``'blob'``,
    in input order.
    """
    documents = [
        Document(blob=np.array([[idx] * 5, [idx] * 5])) for idx in range(15)
    ]
    docs = DocumentSet(documents)

    # Only the extracted contents are needed; the second value is discarded.
    blobs, _ = docs._extract_docs('blob')

    crafted_docs = crafter.craft(blobs)
    for idx, crafted in enumerate(crafted_docs):
        expected_blob = np.array([[idx, idx, idx, idx, idx],
                                  [idx, idx, idx, idx, idx]])
        np.testing.assert_equal(crafted['blob'], expected_blob)
def test_batching_text_one_argument(segmenter):
    """Segment a batch of extracted texts and check every produced chunk.

    ``segmenter`` is supplied from outside this block (its definition is not
    visible here). For input ``i`` the test expects exactly three chunks
    whose ``'text'`` values are ``text-i-chunk-0`` .. ``text-i-chunk-2``.

    NOTE(review): another function in this file is defined with this exact
    name (the variant taking ``crafter``); within one module the later
    definition shadows the earlier one -- confirm and rename one of them.
    """
    documents = [Document(text=f'text-{idx}') for idx in range(15)]
    docs = DocumentSet(documents)

    # Only the extracted contents are needed; the second value is discarded.
    texts, _ = docs._extract_docs('text')

    chunks_sets = segmenter.segment(texts)
    for doc_idx, chunks in enumerate(chunks_sets):
        assert len(chunks) == 3
        for chunk_idx, chunk in enumerate(chunks):
            expected_text = f'text-{doc_idx}-chunk-{chunk_idx}'
            assert chunk['text'] == expected_text
def test_batching_text_multi(crafter):
    """Craft with two extracted fields (``'text'`` and ``'id'``) at once.

    The per-field results of ``_extract_docs`` are unpacked into positional
    arguments of ``crafter.craft``. ``crafter`` is supplied from outside
    this block (its definition is not visible here); each crafted result
    must carry both values with a ``-crafted`` suffix, in input order.
    """
    documents = [
        Document(text=f'text-{idx}', id=f'id-{idx}') for idx in range(15)
    ]
    docs = DocumentSet(documents)

    required_keys = ['text', 'id']
    # Only the extracted contents are needed; the second value is discarded.
    text_ids, _ = docs._extract_docs(*required_keys)

    crafted_docs = crafter.craft(*text_ids)
    for idx, crafted in enumerate(crafted_docs):
        expected_text = f'text-{idx}-crafted'
        expected_id = f'id-{idx}-crafted'
        assert crafted['text'] == expected_text
        assert crafted['id'] == expected_id
def test_get_content(num_rows, field):
    """Extract a single array-valued field and check the stacked result.

    ``batch_size`` documents share one random ``(num_rows, embed_size)``
    array under ``field``; one extra ``Document()`` with no fields set is
    appended afterwards. The extracted contents must be an ``np.ndarray``
    of shape ``(batch_size, num_rows, embed_size)``, i.e. the extra empty
    document must not add a row.
    """
    batch_size, embed_size = 10, 20
    payload = np.random.random((num_rows, embed_size))

    docs = DocumentSet(
        [Document(**{field: payload}) for _ in range(batch_size)])
    # Extra document without the field; the shape assertion below still
    # expects only ``batch_size`` rows.
    docs.append(Document())

    # The second return value is not inspected by this test.
    contents, _ = docs._extract_docs(field)

    assert isinstance(contents, np.ndarray)
    expected_shape = (batch_size, num_rows, embed_size)
    assert contents.shape == expected_shape
def test_batching_mix_multi(crafter):
    """Craft with a text field and an array field extracted together.

    Document ``i`` carries ``text-i`` and a length-5 embedding filled with
    ``i``. ``crafter`` is supplied from outside this block (its definition
    is not visible here); each crafted result must expose the unchanged
    text under ``'text'`` and an equal array under ``'embedding'``.
    """
    documents = []
    for idx in range(15):
        documents.append(
            Document(text=f'text-{idx}',
                     embedding=np.array([idx, idx, idx, idx, idx])))
    docs = DocumentSet(documents)

    required_keys = ['text', 'embedding']
    # Only the extracted contents are needed; the second value is discarded.
    field_batches, _ = docs._extract_docs(*required_keys)

    crafted_docs = crafter.craft(*field_batches)
    for idx, crafted in enumerate(crafted_docs):
        assert crafted['text'] == f'text-{idx}'
        expected_embedding = np.array([idx, idx, idx, idx, idx])
        np.testing.assert_equal(crafted['embedding'], expected_embedding)
def test_get_content_bytes_fields(field):
    """Extract a single bytes-valued field and check the resulting array.

    Ten documents each store ``b'bytes'`` under ``field``. The extracted
    contents must be a one-dimensional ``np.ndarray`` with one entry per
    document, every entry comparing equal to ``b'bytes'``.
    """
    batch_size = 10
    payload = b'bytes'

    docs = DocumentSet(
        [Document(**{field: payload}) for _ in range(batch_size)])

    # The second return value is not inspected by this test.
    contents, _ = docs._extract_docs(field)

    assert contents.shape == (batch_size, )
    assert len(contents) == batch_size
    assert isinstance(contents, np.ndarray)
    for extracted in contents:
        assert extracted == b'bytes'
def test_batching_blob_multi(crafter):
    """Craft with two array fields (``'blob'`` and ``'embedding'``) at once.

    Document ``i`` carries a ``2 x 5`` blob and a length-5 embedding, both
    filled with ``i``. ``crafter`` is supplied from outside this block (its
    definition is not visible here); each crafted result must expose equal
    arrays under ``'blob'`` and ``'embedding'``, in input order.
    """
    documents = []
    for idx in range(15):
        row = [idx, idx, idx, idx, idx]
        documents.append(
            Document(
                blob=np.array([row, row]),
                embedding=np.array(row),
            ))
    docs = DocumentSet(documents)

    required_keys = ['blob', 'embedding']
    # Only the extracted contents are needed; the second value is discarded.
    field_batches, _ = docs._extract_docs(*required_keys)

    crafted_docs = crafter.craft(*field_batches)
    for idx, crafted in enumerate(crafted_docs):
        expected_blob = np.array([[idx, idx, idx, idx, idx],
                                  [idx, idx, idx, idx, idx]])
        expected_embedding = np.array([idx, idx, idx, idx, idx])
        np.testing.assert_equal(crafted['blob'], expected_blob)
        np.testing.assert_equal(crafted['embedding'], expected_embedding)
def test_get_content_multiple_fields_text(fields):
    """Extract several text fields together and check each result array.

    Every document stores ``text-<field>`` under each name in ``fields``.
    ``_extract_docs`` must return a list with one one-dimensional
    ``np.ndarray`` per field, each holding ``batch_size`` entries. The test
    indexes the first two results, so ``fields`` needs at least two names.
    """
    batch_size = 10

    kwargs = {}
    for name in fields:
        kwargs[name] = f'text-{name}'

    documents = [Document(**kwargs) for _ in range(batch_size)]
    docs = DocumentSet(documents)

    # The second return value is not inspected by this test.
    contents, _ = docs._extract_docs(*fields)

    assert len(contents) == len(fields)
    assert isinstance(contents, list)
    assert isinstance(contents[0], np.ndarray)
    assert isinstance(contents[1], np.ndarray)

    expected_shape = (batch_size, )
    for extracted in contents:
        assert len(extracted) == batch_size
        assert extracted.shape == expected_shape