class KeyValueIndexer(Executor):
    """Stores whole documents and enriches query matches with their payloads."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Warm-start from disk when a previous run left a saved index behind.
        self._docs = (
            DocumentArray.load(self.save_path)
            if os.path.exists(self.save_path)
            else DocumentArray()
        )

    @property
    def save_path(self):
        """Location of the persisted index inside the executor workspace."""
        if not os.path.exists(self.workspace):
            os.makedirs(self.workspace)
        return os.path.join(self.workspace, 'kv.json')

    def close(self):
        """Persist the accumulated documents on shutdown."""
        self._docs.save(self.save_path)

    @requests(on='/index')
    def index(self, docs: DocumentArray, **kwargs):
        """Append the incoming documents to the in-memory store."""
        self._docs.extend(docs)

    @requests(on='/search')
    def query(self, docs: DocumentArray, **kwargs):
        """Fill every match with the stored document referenced by its parent_id."""
        for doc in docs:
            for match in doc.matches:
                stored = self._docs[match.parent_id]
                match.MergeFrom(stored)
class EmbeddingIndexer(Executor):
    """Indexes document embeddings and serves cosine-similarity search."""

    def __init__(self, index_file_name: str, **kwargs):
        super().__init__(**kwargs)
        self.index_file_name = index_file_name
        # Resume from a previously persisted index when one exists on disk.
        self._docs = (
            DocumentArray.load(self.save_path)
            if os.path.exists(self.save_path)
            else DocumentArray()
        )

    @property
    def save_path(self):
        """Location of the persisted index inside the executor workspace."""
        if not os.path.exists(self.workspace):
            os.makedirs(self.workspace)
        return os.path.join(self.workspace, self.index_file_name)

    def close(self):
        """Persist the stored embeddings on shutdown."""
        self._docs.save(self.save_path)

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs) -> DocumentArray:
        """Store an id+embedding copy of each incoming doc; pass docs through."""
        slim_copies = DocumentArray()
        for doc in docs:
            slim_copies.append(Document(id=doc.id, embedding=doc.embedding))
        self._docs.extend(slim_copies)
        return docs

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs) \
            -> DocumentArray:
        """Attach the top-k closest stored embeddings as matches to each query."""
        query_emb = _ext_A(_norm(np.stack(docs.get_attributes('embedding'))))
        stored_emb = _ext_B(_norm(np.stack(self._docs.get_attributes('embedding'))))
        dists = _cosine(query_emb, stored_emb)
        top_k = int(parameters.get('top_k', 5))
        assert top_k > 0
        idx, dist = self._get_sorted_top_k(dists, top_k)
        for query, row_ids, row_dists in zip(docs, idx, dist):
            for match_idx, match_dist in zip(row_ids, row_dists):
                match = Document(self._docs[int(match_idx)], copy=True)
                match.score.value = 1 - match_dist
                # parent_id records the position of the match in the store.
                match.parent_id = int(match_idx)
                query.matches.append(match)
        return docs

    @staticmethod
    def _get_sorted_top_k(dist: 'np.array', top_k: int) -> Tuple['np.ndarray', 'np.ndarray']:
        """Row-wise indices and values of the top_k smallest distances, ascending."""
        if top_k >= dist.shape[1]:
            # k covers every column: a plain full sort is simplest.
            idx = dist.argsort(axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx, axis=1)
        else:
            # Partition down to k candidates first, then sort only those.
            idx_ps = dist.argpartition(kth=top_k, axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx_ps, axis=1)
            idx_fs = dist.argsort(axis=1)
            idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
            dist = np.take_along_axis(dist, idx_fs, axis=1)
        return idx, dist
def da_and_dam():
    """Return a (DocumentArray, DocumentArrayMemmap) pair, each with 100 random docs."""
    da = DocumentArray()
    da.extend(random_docs(100))
    dam = DocumentArrayMemmap()
    dam.extend(random_docs(100))
    return da, dam
def on_done(response, final_da: DocumentArray):
    """Timestamp every response doc in its tags and collect them into *final_da*."""
    received = response.docs
    for doc in received:
        doc.tags['on_done'] = time.time()
        print(
            f'in on_done {doc.id}, time: {readable_time_from(doc.tags["on_done"])}',
            flush=True,
        )
    final_da.extend(received)
def on_done(response, final_da: DocumentArray):
    """Log the request id, timestamp each doc, and gather the docs into *final_da*."""
    print(f' receiving response {response._pb_body.request_id}')
    for doc in response.docs:
        stamp = time.time()
        doc.tags['on_done'] = stamp
        print(
            f'in on_done {doc.id}, time: {readable_time_from(doc.tags["on_done"])}, {doc.tags["on_done"]}',
            flush=True,
        )
    final_da.extend(response.docs)
class DummyMockConnectionPool:
    """Test double for a connection pool: answers requests with fake executor replies."""

    def send_requests_once(
        self,
        requests,
        deployment: str,
        head: bool,
        endpoint: str = None,
        timeout: float = 1.0,
        retries: int = -1,
    ) -> asyncio.Task:
        """Return an asyncio.Task resolving to ``(response_msg, {})``.

        Only the first request in *requests* is handled; *head* must be truthy.
        ``deployment`` selects the simulated executor behavior.
        """
        assert head
        request = requests[0]
        # Lazily create the store shared across all calls on this instance.
        if not hasattr(self, '_docs'):
            self._docs = DocumentArray()

        async def _compute_response():
            response_msg = copy.deepcopy(request)
            exec_endpoint = request.header.exec_endpoint
            new_docs = DocumentArray()
            # Simulated network/processing latency for every call.
            await asyncio.sleep(0.1)
            if deployment == 'indexer-executor':
                if exec_endpoint == '/index':
                    # Blocking sleep simulates slow indexing work.
                    time.sleep(0.1)
                    self._docs.extend(request.docs)
                else:
                    # Search path: reply with one doc carrying all stored ids.
                    docs = response_msg.docs
                    docs.clear()
                    docs.extend(
                        DocumentArray(
                            Document(tags={'ids': self._docs[:, 'id']})))
                    response_msg.data.docs = docs
                return response_msg
            else:
                if deployment == 'slow-executor':
                    await asyncio.sleep(SLOW_EXECUTOR_SLEEP_TIME)
                # Copy every doc and stamp the processing time into its tags.
                for doc in request.docs:
                    new_doc = Document(doc, copy=True)
                    new_doc.tags['executor'] = time.time()
                    print(
                        f'in {deployment}, {new_doc.id} => time: {readable_time_from(new_doc.tags["executor"])}, {new_doc.tags["executor"]}',
                        flush=True,
                    )
                    new_docs.append(new_doc)
                docs = response_msg.docs
                docs.clear()
                docs.extend(new_docs)
                response_msg.data.docs = docs
                return response_msg

        async def task_wrapper():
            response_msg = await _compute_response()
            # Second tuple element mimics metadata returned by the real pool.
            return response_msg, {}

        return asyncio.create_task(task_wrapper())
class DBMSExecutor(Executor):
    """Minimal in-memory document store that can dump its contents to disk."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._docs = DocumentArray()
        self.logger = JinaLogger('IndexExecutor')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', *args, **kwargs):
        """Append the incoming documents to the store."""
        self._docs.extend(docs)

    @requests(on='/dump')
    def dump(self, parameters, *args, **kwargs):
        """Persist the store to the path given in ``parameters['dump_path']``."""
        self._docs.save(parameters['dump_path'])
class DummyCSRSparseIndexEncoder(Executor):
    """Dummy encoder/indexer that attaches sparse COO embeddings for tests."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.docs = DocumentArray()

    @requests(on='/index')
    def encode(self, docs: 'DocumentArray', *args, **kwargs) -> Any:
        """Give every doc a sparse embedding built from its content, then store it."""
        for doc in docs:
            doc.embedding = sparse.coo_matrix(doc.content)
        self.docs.extend(docs)

    @requests(on='/search')
    def query(self, docs: 'DocumentArray', parameters, *args, **kwargs):
        """Return the first top_k stored docs as matches for every query."""
        top_k = int(parameters['top_k'])
        for doc in docs:
            doc.matches = self.docs[:top_k]
class KeyValueDBMSIndexer(Executor):
    """Key-value style indexer that supports dumping its store to disk."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._docs = DocumentArray()
        self.logger = JinaLogger('KeyValueDBMSIndexer')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', *args, **kwargs):
        """Append the incoming documents to the store."""
        self._docs.extend(docs)

    # TODO endpoint in tests.distributed.test_remote_flow_dump_rolling_update.test_dump_dbms_remote.test_dump_dbms_remote
    # ends up being http://0.0.0.0:9000/post/dump
    @requests(on='/dump')
    def dump(self, parameters, *args, **kwargs):
        """Persist the store to ``parameters['dump_path']``."""
        # TODO: maybe put some logic for shards here
        self._docs.save(parameters['dump_path'])
class DummyCSRSparseIndexEncoder(Executor):
    """Dummy encoder that stores docs with scipy CSR sparse embeddings."""

    # Marker consumed by the surrounding tests to select the sparse type.
    embedding_cls_type = 'scipy_csr'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.docs = DocumentArray()
        # Maps doc id -> a single-row sparse matrix taken from its embedding.
        self.vectors = {}

    # NOTE(review): endpoint is 'index' without a leading slash, unlike the
    # '/index' convention used by the sibling executors — presumably
    # intentional for this test; confirm against the caller.
    @requests(on='index')
    def encode(self, docs: 'DocumentArray', *args, **kwargs) -> Any:
        """Store *docs*, attach a CSR embedding per doc, and cache one row each."""
        self.docs.extend(docs)
        # NOTE(review): iterates ALL stored docs (not only the new batch) and
        # feeds the running index i to getrow(i); for i beyond the matrix's
        # row count getrow would raise — looks fragile, verify intent.
        for i, doc in enumerate(self.docs):
            doc.embedding = sparse.csr_matrix(doc.content)
            self.vectors[doc.id] = doc.embedding.getrow(i)

    @requests(on='search')
    def query(self, parameters, *args, **kwargs):
        """Return the first top_k stored docs plus a dummy ascending distance row."""
        top_k = parameters['top_k']
        # NOTE(review): 'doc' is fetched but never used — confirm with caller.
        doc = parameters['doc']
        distances = [item for item in range(0, min(top_k, len(self.docs)))]
        return [self.docs[:top_k]], np.array([distances])
class MyIndexer(Executor):
    """Simple indexer: stores documents and serves 1-NN cosine search."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._docs = DocumentArray()

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Append the incoming documents to the in-memory store."""
        self._docs.extend(docs)

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', **kwargs):
        """Attach the single closest stored document as a match to each query."""
        query_emb = _ext_A(_norm(np.stack(docs.get_attributes('embedding'))))
        stored_emb = _ext_B(_norm(np.stack(self._docs.get_attributes('embedding'))))
        dists = _cosine(query_emb, stored_emb)
        idx, dist = self._get_sorted_top_k(dists, 1)
        for query, row_ids, row_dists in zip(docs, idx, dist):
            for match_idx, match_dist in zip(row_ids, row_dists):
                match = Document(self._docs[int(match_idx)], copy=True)
                match.score.value = 1 - match_dist
                query.matches.append(match)

    @staticmethod
    def _get_sorted_top_k(dist: 'np.array', top_k: int) -> Tuple['np.ndarray', 'np.ndarray']:
        """Row-wise indices and values of the top_k smallest distances, ascending."""
        if top_k >= dist.shape[1]:
            # k covers every column: a plain full sort is simplest.
            idx = dist.argsort(axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx, axis=1)
        else:
            # Partition down to k candidates first, then sort only those.
            idx_ps = dist.argpartition(kth=top_k, axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx_ps, axis=1)
            idx_fs = dist.argsort(axis=1)
            idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
            dist = np.take_along_axis(dist, idx_fs, axis=1)
        return idx, dist
class CrudIndexer(Executor):
    """Indexer with index/update/delete/search endpoints and disk persistence."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.logger = JinaLogger('CrudIndexer')
        self._docs = DocumentArray()
        self._dump_location = os.path.join(self.metas.workspace, 'docs')
        if os.path.exists(self._dump_location):
            # Warm start: pick up whatever a previous run persisted.
            self._docs = DocumentArray.load(self._dump_location)
            self.logger.info(
                f'Loaded {len(self._docs)} from {self._dump_location}')
        else:
            self.logger.info(f'No data found at {self._dump_location}')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Append the incoming documents to the store."""
        self._docs.extend(docs)

    @requests(on='/update')
    def update(self, docs: 'DocumentArray', **kwargs):
        """Replace stored docs by deleting then re-indexing the given ones."""
        self.delete(docs)
        self.index(docs)

    def close(self) -> None:
        """Persist the store on shutdown."""
        self.logger.info(f'Dumping {len(self._docs)} to {self._dump_location}')
        self._docs.save(self._dump_location)

    @requests(on='/delete')
    def delete(self, docs: 'DocumentArray', **kwargs):
        """Remove every stored document whose id appears in *docs*."""
        # TODO we can do del _docs[d.id] once
        # tests.unit.types.arrays.test_documentarray.test_delete_by_id is fixed
        targets = {d.id for d in docs}
        positions = [
            i for i, stored in enumerate(self._docs) if stored.id in targets
        ]
        # Walk backwards so earlier positions remain valid while deleting.
        for i in reversed(positions):
            del self._docs[i]

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs):
        """Attach the top_k closest stored docs (cosine score) to every query."""
        top_k = int(parameters.get('top_k', 1))
        query_emb = _ext_A(_norm(np.stack(docs.get_attributes('embedding'))))
        stored_emb = _ext_B(_norm(np.stack(self._docs.get_attributes('embedding'))))
        dists = _cosine(query_emb, stored_emb)
        idx, dist = self._get_sorted_top_k(dists, top_k)
        for query, row_ids, row_dists in zip(docs, idx, dist):
            for match_idx, match_dist in zip(row_ids, row_dists):
                match = Document(self._docs[int(match_idx)], copy=True)
                match.scores['cosine'] = 1 - match_dist
                query.matches.append(match)

    @staticmethod
    def _get_sorted_top_k(dist: 'np.array', top_k: int) -> Tuple['np.ndarray', 'np.ndarray']:
        """Row-wise indices and values of the top_k smallest distances, ascending."""
        if top_k >= dist.shape[1]:
            # k covers every column: a plain full sort is simplest.
            idx = dist.argsort(axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx, axis=1)
        else:
            # Partition down to k candidates first, then sort only those.
            idx_ps = dist.argpartition(kth=top_k, axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx_ps, axis=1)
            idx_fs = dist.argsort(axis=1)
            idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
            dist = np.take_along_axis(dist, idx_fs, axis=1)
        return idx, dist
def merge(self, docs_matrix: DocumentArray, **kwargs):
    """Flatten a matrix of DocumentArrays into one combined DocumentArray."""
    combined = DocumentArray()
    for row in docs_matrix:
        combined.extend(row)
    return combined
def search(self, docs: DocumentArray, **kwargs):
    """Replace the request docs in place with the full stored index."""
    docs.clear()
    docs.extend(self._docs)
def docarray_for_cache():
    """Fixture: DocumentArray holding one int-id doc and one str-id doc."""
    return DocumentArray([Document(id=1), Document(id='2')])
def test_none_extend():
    """extend(None) must be a no-op and leave the array length unchanged."""
    da = DocumentArray([Document() for _ in range(100)])
    before = len(da)
    da.extend(None)
    assert len(da) == before == 100