class KeyValueIndexer(Executor):
    """Lookup indexer that stores full Documents and enriches search matches.

    Persists its DocumentArray to ``<workspace>/kv.json`` on ``close`` and
    reloads it on construction when that file already exists.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Restore previously persisted documents, if any.
        if os.path.exists(self.save_path):
            self._docs = DocumentArray.load(self.save_path)
        else:
            self._docs = DocumentArray()

    @property
    def save_path(self):
        """Path of the JSON file used for persistence; creates the workspace on demand."""
        # exist_ok=True is race-free, unlike a separate exists()/makedirs() pair.
        os.makedirs(self.workspace, exist_ok=True)
        return os.path.join(self.workspace, 'kv.json')

    def close(self):
        """Persist the indexed documents before shutdown."""
        self._docs.save(self.save_path)

    @requests(on='/index')
    def index(self, docs: DocumentArray, **kwargs):
        """Append incoming documents to the in-memory store."""
        self._docs.extend(docs)

    @requests(on='/search')
    def query(self, docs: DocumentArray, **kwargs):
        """Enrich each match with the stored document addressed by its ``parent_id``."""
        for doc in docs:
            for match in doc.matches:
                extracted_doc = self._docs[match.parent_id]
                # Merge the stored fields onto the (sparse) match in place.
                match.MergeFrom(extracted_doc)
class EmbeddingIndexer(Executor):
    """Vector indexer: stores (id, embedding) pairs and serves cosine-similarity search.

    :param index_file_name: file name (inside the workspace) used for persistence
    """

    def __init__(self, index_file_name: str, **kwargs):
        super().__init__(**kwargs)
        self.index_file_name = index_file_name
        # Restore previously persisted embeddings, if any.
        if os.path.exists(self.save_path):
            self._docs = DocumentArray.load(self.save_path)
        else:
            self._docs = DocumentArray()

    @property
    def save_path(self):
        """Persistence path; creates the workspace directory on demand."""
        # exist_ok=True is race-free, unlike a separate exists()/makedirs() pair.
        os.makedirs(self.workspace, exist_ok=True)
        return os.path.join(self.workspace, self.index_file_name)

    def close(self):
        """Persist the indexed embeddings before shutdown."""
        self._docs.save(self.save_path)

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs) -> DocumentArray:
        """Store a lightweight (id, embedding) copy of every incoming document."""
        embedding_docs = DocumentArray()
        for doc in docs:
            embedding_docs.append(Document(id=doc.id, embedding=doc.embedding))
        self._docs.extend(embedding_docs)
        return docs

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs) \
            -> DocumentArray:
        """Attach the ``top_k`` cosine-nearest stored docs as matches of each query.

        :param parameters: may contain ``top_k`` (default 5); must be positive
        :raises ValueError: if ``top_k`` is not positive
        """
        a = np.stack(docs.get_attributes('embedding'))
        b = np.stack(self._docs.get_attributes('embedding'))
        q_emb = _ext_A(_norm(a))
        d_emb = _ext_B(_norm(b))
        dists = _cosine(q_emb, d_emb)
        top_k = int(parameters.get('top_k', 5))
        # Explicit check instead of `assert`: asserts are stripped under -O,
        # and top_k comes from user-supplied request parameters.
        if top_k <= 0:
            raise ValueError(f'top_k must be positive, got {top_k}')
        idx, dist = self._get_sorted_top_k(dists, top_k)
        for _q, _ids, _dists in zip(docs, idx, dist):
            for _id, _dist in zip(_ids, _dists):
                doc = Document(self._docs[int(_id)], copy=True)
                # Convert cosine distance into a similarity-style score.
                doc.score.value = 1 - _dist
                # NOTE(review): parent_id holds the positional index into the
                # store, not a Document id — downstream lookup must match this.
                doc.parent_id = int(_id)
                _q.matches.append(doc)
        return docs

    @staticmethod
    def _get_sorted_top_k(dist: 'np.array', top_k: int) -> Tuple['np.ndarray', 'np.ndarray']:
        """Return (indices, distances) of the ``top_k`` smallest entries per row, ascending."""
        if top_k >= dist.shape[1]:
            # Fewer candidates than requested: a full sort covers everything.
            idx = dist.argsort(axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx, axis=1)
        else:
            # argpartition selects the top_k cheaply; only those get fully sorted.
            idx_ps = dist.argpartition(kth=top_k, axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx_ps, axis=1)
            idx_fs = dist.argsort(axis=1)
            idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
            dist = np.take_along_axis(dist, idx_fs, axis=1)
        return idx, dist
class DBMSExecutor(Executor):
    """Minimal in-memory document store exposing index and dump endpoints."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._docs = DocumentArray()
        self.logger = JinaLogger('IndexExecutor')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', *args, **kwargs):
        """Append the incoming documents to the in-memory store."""
        self._docs.extend(docs)

    @requests(on='/dump')
    def dump(self, parameters, *args, **kwargs):
        """Serialize every stored document to ``parameters['dump_path']``."""
        self._docs.save(parameters['dump_path'])
class KeyValueDBMSIndexer(Executor):
    """In-memory key-value DBMS indexer with index and dump endpoints."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._docs = DocumentArray()
        self.logger = JinaLogger('KeyValueDBMSIndexer')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', *args, **kwargs):
        """Add the received documents to the in-memory store."""
        self._docs.extend(docs)

    # TODO endpoint in tests.distributed.test_remote_flow_dump_rolling_update.test_dump_dbms_remote.test_dump_dbms_remote
    # ends up being http://0.0.0.0:9000/post/dump
    @requests(on='/dump')
    def dump(self, parameters, *args, **kwargs):
        """Write all stored documents to the location given in the request."""
        target = parameters['dump_path']
        # TODO: maybe put some logic for shards here
        self._docs.save(target)
class CrudIndexer(Executor):
    """Simple indexer supporting index/update/delete/search with disk persistence."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.logger = JinaLogger('CrudIndexer')
        self._docs = DocumentArray()
        self._dump_location = os.path.join(self.metas.workspace, 'docs')
        # Reload any previously dumped documents.
        if os.path.exists(self._dump_location):
            self._docs = DocumentArray.load(self._dump_location)
            self.logger.info(
                f'Loaded {len(self._docs)} from {self._dump_location}')
        else:
            self.logger.info(f'No data found at {self._dump_location}')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Append incoming documents to the store."""
        self._docs.extend(docs)

    @requests(on='/update')
    def update(self, docs: 'DocumentArray', **kwargs):
        """Replace stored documents by deleting and re-indexing them."""
        self.delete(docs)
        self.index(docs)

    def close(self) -> None:
        """Persist the store to disk on shutdown."""
        self.logger.info(f'Dumping {len(self._docs)} to {self._dump_location}')
        self._docs.save(self._dump_location)

    @requests(on='/delete')
    def delete(self, docs: 'DocumentArray', **kwargs):
        """Remove every stored document whose id appears in ``docs``."""
        # TODO we can do del _docs[d.id] once
        # tests.unit.types.arrays.test_documentarray.test_delete_by_id is fixed
        # Use a set for O(1) membership tests instead of scanning a list per doc.
        ids_to_delete = {d.id for d in docs}
        idx_to_delete = [
            i for i, doc in enumerate(self._docs) if doc.id in ids_to_delete
        ]
        # Delete from the back so earlier indices stay valid.
        for i in sorted(idx_to_delete, reverse=True):
            del self._docs[i]

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs):
        """Attach the ``top_k`` cosine-nearest stored docs as matches of each query.

        :param parameters: may contain ``top_k`` (default 1)
        """
        top_k = int(parameters.get('top_k', 1))
        a = np.stack(docs.get_attributes('embedding'))
        b = np.stack(self._docs.get_attributes('embedding'))
        q_emb = _ext_A(_norm(a))
        d_emb = _ext_B(_norm(b))
        dists = _cosine(q_emb, d_emb)
        idx, dist = self._get_sorted_top_k(dists, top_k)
        for _q, _ids, _dists in zip(docs, idx, dist):
            for _id, _dist in zip(_ids, _dists):
                d = Document(self._docs[int(_id)], copy=True)
                # Convert cosine distance into a similarity-style score.
                d.scores['cosine'] = 1 - _dist
                _q.matches.append(d)

    @staticmethod
    def _get_sorted_top_k(dist: 'np.array', top_k: int) -> Tuple['np.ndarray', 'np.ndarray']:
        """Return (indices, distances) of the ``top_k`` smallest entries per row, ascending."""
        if top_k >= dist.shape[1]:
            # Fewer candidates than requested: a full sort covers everything.
            idx = dist.argsort(axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx, axis=1)
        else:
            # argpartition selects the top_k cheaply; only those get fully sorted.
            idx_ps = dist.argpartition(kth=top_k, axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx_ps, axis=1)
            idx_fs = dist.argsort(axis=1)
            idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
            dist = np.take_along_axis(dist, idx_fs, axis=1)
        return idx, dist