def _create_test_data_message(counter=0):
    """Wrap a single-document data request in a ``Message``.

    The document's text is ``str(counter)``. Only the first request produced
    by ``request_generator`` is used.
    """
    payload = DocumentArray([Document(text=str(counter))])
    first_request = list(request_generator('/', payload))[0]
    return Message(None, first_request, 'test', '123')
def input_docs():
    """Return a ``DocumentArray`` holding 50 default-constructed documents."""
    blank_docs = [Document() for _ in range(50)]
    return DocumentArray(blank_docs)
def foo(self, docs: DocumentArray, **kwargs):
    """Append a document whose text is ``str(self.shard_id)`` and return ``docs``."""
    shard_marker = Document(text=str(self.shard_id))
    docs.append(shard_marker)
    return docs
def da_and_dam(N):
    """Return a ``(DocumentArray, DocumentArrayMemmap)`` pair, each built via ``empty(N)``."""
    return DocumentArray.empty(N), DocumentArrayMemmap.empty(N)
def __init__(self, *args, **kwargs):
    """Initialise the parent class, a named logger and an empty document store."""
    super().__init__(*args, **kwargs)
    self.logger = JinaLogger('IndexExecutor')
    self._docs = DocumentArray()
def docs(self):
    """Return a ``DocumentArray`` built from the output of ``random_docs(10)``."""
    generated = list(random_docs(10))
    return DocumentArray(generated)
def test_set_embeddings_multi_kind(array):
    """Assigning ``array`` to ``embeddings`` of a 10-document array must not raise."""
    blank_docs = [Document() for _ in range(10)]
    doc_array = DocumentArray(blank_docs)
    doc_array.embeddings = array
def test_da_get_embeddings_slice():
    """``_get_embeddings`` on a slice matches slicing the ``embedding`` attributes."""
    doc_array = DocumentArray(random_docs(100))
    from_attributes = doc_array.get_attributes('embedding')[10:20]
    from_slice = doc_array._get_embeddings(slice(10, 20))
    np.testing.assert_almost_equal(from_attributes, from_slice)
def docarray_for_cache():
    """Return a ``DocumentArray`` with two documents, ids ``1`` and ``'2'``."""
    cache_docs = DocumentArray()
    cache_docs.extend([Document(id=1), Document(id='2')])
    return cache_docs
def test_traversal_path():
    """Traversal paths given as a list are accepted; a bare string raises ``ValueError``."""
    doc_array = DocumentArray([Document() for _ in range(6)])
    assert len(doc_array) == 6

    doc_array.traverse_flat(['r'])
    with pytest.raises(ValueError):
        doc_array.traverse_flat('r')

    # The ``traverse`` check runs twice in a row with identical expectations.
    for _attempt in range(2):
        doc_array.traverse(['r'])
        with pytest.raises(ValueError):
            for _item in doc_array.traverse('r'):
                pass
def test_da_get_embeddings():
    """``embeddings`` matches the per-document ``embedding`` attributes."""
    doc_array = DocumentArray(random_docs(100))
    from_attributes = doc_array.get_attributes('embedding')
    np.testing.assert_almost_equal(from_attributes, doc_array.embeddings)
def docarray(docs):
    """Wrap ``docs`` in a ``DocumentArray``."""
    wrapped = DocumentArray(docs)
    return wrapped
def test_delete_by_id(docarray: DocumentArray, document_factory):
    """Deleting by document id drops the appended document from the array."""
    doc = document_factory.create(4, 'test 4')
    docarray.append(doc)
    # Capture the id up front so the check below does not depend on ``doc``
    # after it has been removed from the array.
    deleted_id = doc.id
    del docarray[deleted_id]
    assert len(docarray) == 3
    # The previous ``docarray == docarray`` compared the array with itself and
    # could never fail; verify instead that the deleted id is really gone.
    assert all(remaining.id != deleted_id for remaining in docarray)
def fake_reduce(self, **kwargs):
    """Return a one-document array whose only document has id ``'fake_document'``."""
    placeholder = Document(id='fake_document')
    return DocumentArray([placeholder])
def _create_test_data_message(counter=0):
    """Return the first request generated for one document with text ``str(counter)``."""
    payload = DocumentArray([Document(text=str(counter))])
    generated_requests = list(request_generator('/', payload))
    return generated_requests[0]
def test_blobs_getter_da():
    """``blobs`` matches the per-document ``blob`` attributes."""
    random_blobs = np.random.random((100, 10, 10))
    doc_array = DocumentArray([Document(blob=item) for item in random_blobs])
    assert len(doc_array) == 100
    from_attributes = doc_array.get_attributes('blob')
    np.testing.assert_almost_equal(from_attributes, doc_array.blobs)
def foo(self, docs, **kwargs):
    """Increment the call counter and return it as the text of a single document.

    Every even-numbered call sleeps for 0.1 seconds before returning.
    """
    self._count += 1
    call_number = self._count
    is_even_call = call_number % 2 == 0
    if is_even_call:
        time.sleep(0.1)
    reply = Document(text=str(call_number))
    return DocumentArray([reply])
def index(self, docs: DocumentArray, **kwargs):
    """Insert the ``tags`` of every document in ``docs`` into ``self.db``."""
    all_tags = docs.get_attributes('tags')
    self.db.insert_multiple(all_tags)
def __init__(self, *args, **kwargs):
    """Initialise the parent class, a named logger and an empty document store."""
    super().__init__(*args, **kwargs)
    self.logger = JinaLogger('KeyValueDBMSIndexer')
    self._docs = DocumentArray()
def filter(self, docs: DocumentArray, **kwargs):
    """Return the documents on traversal path ``'c'`` whose MIME type is ``text/plain``."""
    candidates = docs.traverse_flat(['c'])
    return DocumentArray(doc for doc in candidates if doc.mime_type == 'text/plain')
def segmenter_doc_array():
    """Return two documents tagged with a caption and an image file name."""
    captions_and_images = (('hello', '1.jpg'), ('world', '2.jpg'))
    return DocumentArray(
        [
            Document(tags={'caption': caption, 'image': image})
            for caption, image in captions_and_images
        ]
    )
# NOTE(review): presumably a method of DummyEvaluator1, whose class header is
# outside this span — confirm.
def craft(self, docs, *args, **kwargs):
    """Append the id of the first document to ``<tmp_dir>/<self.tag>.txt``.

    ``tmp_dir`` is read from the ``TEST_EVAL_FLOW_TMPDIR`` environment
    variable. Always returns ``None``.
    """
    tmp_dir = os.environ.get('TEST_EVAL_FLOW_TMPDIR')
    # Append mode: repeated calls accumulate ids in the same file.
    with open(f'{tmp_dir}/{self.tag}.txt', 'a') as fp:
        fp.write(f'{docs[0].id}')
    return None


# Subclass of DummyEvaluator1 that only overrides ``tag``.
class DummyEvaluator2(DummyEvaluator1):
    tag = 2


# Subclass of DummyEvaluator1 that only overrides ``tag``.
class DummyEvaluator3(DummyEvaluator1):
    tag = 3


# A single random document, materialised from ``random_docs(1)``.
docs = DocumentArray([x for x in random_docs(1)])
params = ['HANG', 'REMOVE', 'COLLECT']


def validate(ids, expect):
    """Assert that ``<tmp_dir>/<id>.txt`` exists (or not) for every id in ``ids``.

    ``ids`` must be non-empty. For each id the existence of the file must
    equal ``expect``; when ``expect`` is truthy the file must also be
    non-empty. ``tmp_dir`` is read from ``TEST_EVAL_FLOW_TMPDIR``.
    """
    assert len(ids) > 0
    for j in ids:
        tmp_dir = os.environ.get('TEST_EVAL_FLOW_TMPDIR')
        fname = f'{tmp_dir}/{j}.txt'
        assert os.path.exists(fname) == expect
        if expect:
            with open(fname) as fp:
                assert fp.read() != ''


# Decorates a definition that begins after this span.
@pytest.fixture
def foo(self, **kwargs):
    """Return a ``DocumentArray`` of two default-constructed documents."""
    pair = [Document() for _ in range(2)]
    return DocumentArray(pair)
class CrudIndexer(Executor):
    """In-memory indexer supporting index, update, delete and cosine search.

    Documents are kept in a ``DocumentArray`` that is loaded from
    ``<workspace>/docs`` on construction (when that path exists) and saved
    back to the same location on ``close``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.logger = JinaLogger('CrudIndexer')
        self._docs = DocumentArray()
        self._dump_location = os.path.join(self.metas.workspace, 'docs')
        if os.path.exists(self._dump_location):
            self._docs = DocumentArray.load(self._dump_location)
            self.logger.info(f'Loaded {len(self._docs)} from {self._dump_location}')
        else:
            self.logger.info(f'No data found at {self._dump_location}')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Add ``docs`` to the stored documents."""
        self._docs.extend(docs)

    @requests(on='/update')
    def update(self, docs: 'DocumentArray', **kwargs):
        """Replace stored documents that share an id with ``docs``.

        Implemented as a delete followed by an index, so documents whose id
        was not stored before are simply added.
        """
        self.delete(docs)
        self.index(docs)

    def close(self) -> None:
        """Save the stored documents to the dump location."""
        self.logger.info(f'Dumping {len(self._docs)} to {self._dump_location}')
        self._docs.save(self._dump_location)

    @requests(on='/delete')
    def delete(self, docs: 'DocumentArray', **kwargs):
        """Remove every stored document whose id appears in ``docs``."""
        # TODO we can do del _docs[d.id] once
        # tests.unit.types.arrays.test_documentarray.test_delete_by_id is fixed
        # A set gives O(1) membership tests instead of scanning a list per doc.
        ids_to_delete = {d.id for d in docs}
        idx_to_delete = [
            i for i, doc in enumerate(self._docs) if doc.id in ids_to_delete
        ]
        # Indices are collected in ascending order; delete from the back so
        # the positions still to be removed are not shifted.
        for i in reversed(idx_to_delete):
            del self._docs[i]

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs):
        """Attach the ``top_k`` nearest stored documents to each query as matches.

        Distances come from ``_cosine``; each match is a copy of the stored
        document with ``scores['cosine']`` set to ``1 - distance``.
        ``top_k`` is read from ``parameters`` and defaults to 1.
        """
        # Nothing to match when there are no queries or the index is empty;
        # ``np.stack`` would raise ``ValueError`` on an empty sequence.
        if not docs or not self._docs:
            return
        top_k = int(parameters.get('top_k', 1))
        a = np.stack(docs.get_attributes('embedding'))
        b = np.stack(self._docs.get_attributes('embedding'))
        q_emb = _ext_A(_norm(a))
        d_emb = _ext_B(_norm(b))
        dists = _cosine(q_emb, d_emb)
        idx, dist = self._get_sorted_top_k(dists, top_k)
        for _q, _ids, _dists in zip(docs, idx, dist):
            for _id, _dist in zip(_ids, _dists):
                # Copy so the score is not written onto the stored document.
                d = Document(self._docs[int(_id)], copy=True)
                d.scores['cosine'] = 1 - _dist
                _q.matches.append(d)

    @staticmethod
    def _get_sorted_top_k(
        dist: 'np.ndarray', top_k: int
    ) -> Tuple['np.ndarray', 'np.ndarray']:
        """Return per-row indices and values of the ``top_k`` smallest distances.

        :param dist: 2-D array of distances, one row per query.
        :param top_k: number of columns to keep per row.
        :return: ``(idx, dist)``, both sorted by ascending distance per row.
        """
        if top_k >= dist.shape[1]:
            # Asking for at least as many results as there are columns:
            # a full sort is all that is needed.
            idx = dist.argsort(axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx, axis=1)
        else:
            # Partition first to pick the ``top_k`` smallest in any order,
            # then sort only that subset.
            idx_ps = dist.argpartition(kth=top_k, axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx_ps, axis=1)
            idx_fs = dist.argsort(axis=1)
            idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
            dist = np.take_along_axis(dist, idx_fs, axis=1)

        return idx, dist
def __init__(self, **kwargs):
    """Initialise the parent class and start with an empty document store."""
    super().__init__(**kwargs)
    empty_store = DocumentArray()
    self._docs = empty_store
def __init__(self, *args, **kwargs):
    """Load documents from ``self.save_path`` when it exists, else start empty."""
    super().__init__(*args, **kwargs)
    has_saved_docs = os.path.exists(self.save_path)
    self._docs = (
        DocumentArray.load(self.save_path) if has_saved_docs else DocumentArray()
    )
def test_pca_plot_generated(embeddings, tmpdir):
    """``visualize`` creates a file at the requested output path."""
    file_path = os.path.join(tmpdir, 'pca_plot.png')
    doc_array = DocumentArray([Document(embedding=vector) for vector in embeddings])
    doc_array.visualize(output=file_path)
    assert os.path.exists(file_path)
def status(self, **kwargs):
    """Return the ids of all docs, stored under ``tags['ids']`` of one document."""
    all_ids = self.docs[:, 'id']
    summary = Document(tags={'ids': all_ids})
    return DocumentArray(summary)
def no_polling(self, docs: DocumentArray, **kwargs):
    """Append a document with text ``'added'`` to ``docs`` and return ``docs``."""
    marker = Document(text='added')
    docs.append(marker)
    return docs
def test_input_lines_with_empty_filepath_and_lines():
    """``from_lines`` with neither ``lines`` nor ``filepath`` raises ``ValueError``."""
    with pytest.raises(ValueError):
        generated_docs = DocumentArray.from_lines(lines=None, filepath=None)
        # Iterate in case the error is only raised lazily on consumption.
        for _doc in generated_docs:
            pass