def encode(self, docs: DocumentArray, **kwargs):
    # Batch the image blobs, extract features, pool them into fixed-size
    # vectors, and attach the result as embeddings.
    content = np.stack(docs.get_attributes('blob'))
    _input = torch.from_numpy(content.astype('float32'))
    _features = self._get_features(_input).detach()
    _features = _features.numpy()
    _features = self._get_pooling(_features)
    docs.embeddings = _features
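# The _get_features and _get_pooling helpers above are not shown. A minimal
# sketch of what they could look like, assuming a frozen torchvision backbone
# and spatial mean-pooling (backbone choice and pooling are assumptions, not
# the original implementation):
import numpy as np
import torch
import torchvision.models as models


class _SketchImageEncoder:
    def __init__(self):
        # hypothetical backbone; the original model is unspecified
        self.model = models.mobilenet_v2(pretrained=True).features

    def _get_features(self, content: torch.Tensor) -> torch.Tensor:
        return self.model(content)

    def _get_pooling(self, feature_map: np.ndarray) -> np.ndarray:
        # collapse the spatial dimensions: (N, C, H, W) -> (N, C)
        return feature_map.mean(axis=(2, 3))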
import os
from typing import Dict, Tuple

import numpy as np
from jina import Document, DocumentArray, Executor, requests


class EmbeddingIndexer(Executor):
    """Persists id/embedding pairs and serves cosine-similarity search."""

    def __init__(self, index_file_name: str, **kwargs):
        super().__init__(**kwargs)
        self.index_file_name = index_file_name
        if os.path.exists(self.save_path):
            self._docs = DocumentArray.load(self.save_path)
        else:
            self._docs = DocumentArray()

    @property
    def save_path(self):
        if not os.path.exists(self.workspace):
            os.makedirs(self.workspace)
        return os.path.join(self.workspace, self.index_file_name)

    def close(self):
        # Persist the index when the Executor shuts down.
        self._docs.save(self.save_path)

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs) -> DocumentArray:
        # Keep the index lean: store only id and embedding per Document.
        embedding_docs = DocumentArray()
        for doc in docs:
            embedding_docs.append(Document(id=doc.id, embedding=doc.embedding))
        self._docs.extend(embedding_docs)
        return docs

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs) -> DocumentArray:
        # _norm, _ext_A, _ext_B and _cosine are local helpers (sketched below).
        a = np.stack(docs.get_attributes('embedding'))
        b = np.stack(self._docs.get_attributes('embedding'))
        q_emb = _ext_A(_norm(a))
        d_emb = _ext_B(_norm(b))
        dists = _cosine(q_emb, d_emb)
        top_k = int(parameters.get('top_k', 5))
        assert top_k > 0
        idx, dist = self._get_sorted_top_k(dists, top_k)
        for _q, _ids, _dists in zip(docs, idx, dist):
            for _id, _dist in zip(_ids, _dists):
                doc = Document(self._docs[int(_id)], copy=True)
                doc.score.value = 1 - _dist
                doc.parent_id = int(_id)
                _q.matches.append(doc)
        return docs

    @staticmethod
    def _get_sorted_top_k(dist: 'np.array', top_k: int) -> Tuple['np.ndarray', 'np.ndarray']:
        # Full argsort when top_k covers all candidates; otherwise argpartition
        # to pre-select the top_k, then sort only those.
        if top_k >= dist.shape[1]:
            idx = dist.argsort(axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx, axis=1)
        else:
            idx_ps = dist.argpartition(kth=top_k, axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx_ps, axis=1)
            idx_fs = dist.argsort(axis=1)
            idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
            dist = np.take_along_axis(dist, idx_fs, axis=1)
        return idx, dist
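# The cosine helpers used by the indexers (_norm, _ext_A, _ext_B, _cosine)
# are not part of these snippets. The following sketch reconstructs the
# extended-matrix trick used in the Jina examples; treat it as an assumption
# about the missing code rather than the verbatim original.
import numpy as np


def _norm(A):
    # L2-normalize each row.
    return A / np.linalg.norm(A, ord=2, axis=1, keepdims=True)


def _ext_A(A):
    # Embed A as [1, A, A**2] so a single matmul yields squared distances.
    nA, dim = A.shape
    A_ext = np.ones((nA, dim * 3))
    A_ext[:, dim:2 * dim] = A
    A_ext[:, 2 * dim:] = A ** 2
    return A_ext


def _ext_B(B):
    # Embed B as [B**2, -2B, 1], the counterpart of _ext_A.
    nB, dim = B.shape
    B_ext = np.ones((dim * 3, nB))
    B_ext[:dim] = (B ** 2).T
    B_ext[dim:2 * dim] = -2.0 * B.T
    return B_ext


def _cosine(A_norm_ext, B_norm_ext):
    # For unit vectors ||a - b||^2 = 2 - 2*cos(a, b), so halving the squared
    # distance gives the cosine distance 1 - cos(a, b).
    return A_norm_ext.dot(B_norm_ext).clip(min=0) / 2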
def encode(self, docs: DocumentArray, **kwargs):
    if docs is None:
        return
    images = np.stack(docs.get_attributes('blob'))
    images = self._maybe_move_channel_axis(images)
    _input = torch.from_numpy(images)
    features = self._get_features(_input).detach()
    features = self._get_pooling(features.numpy())
    for doc, embed in zip(docs, features):
        doc.embedding = embed
    return docs
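# _maybe_move_channel_axis is not shown. A plausible sketch, assuming it
# converts channels-last batches to the channels-first layout PyTorch expects
# (the shape heuristic below is an assumption):
import numpy as np


def _maybe_move_channel_axis(self, images: 'np.ndarray') -> 'np.ndarray':
    if images.ndim == 4 and images.shape[-1] in (1, 3):
        # (N, H, W, C) -> (N, C, H, W)
        images = np.moveaxis(images, -1, 1)
    return images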
def encode(self, docs: 'DocumentArray', *args, **kwargs):
    # Encode only the plain-text chunks of each document.
    chunks = DocumentArray(
        list(
            filter(
                lambda d: d.mime_type == 'text/plain',
                docs.traverse_flat(['c']),
            )
        )
    )
    texts = chunks.get_attributes('text')
    with torch.no_grad():
        if not self.tokenizer.pad_token:
            self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            # len(self.tokenizer) counts added special tokens; .vocab may not.
            self.model.resize_token_embeddings(len(self.tokenizer))
        input_tokens = self.tokenizer(
            texts,
            max_length=self.max_length,
            padding='longest',
            truncation=True,
            return_tensors='pt',
        )
        input_tokens = {
            k: v.to(torch.device('cpu')) for k, v in input_tokens.items()
        }
        outputs = getattr(self.model, self.embedding_fn_name)(**input_tokens)
        if isinstance(outputs, torch.Tensor):
            return outputs.cpu().numpy()
        hidden_states = outputs.hidden_states
    embeds = self._compute_embedding(hidden_states, input_tokens)
    for doc, embed in zip(chunks, embeds):
        doc.embedding = embed
    return chunks
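# _compute_embedding is not shown above. A plausible method sketch, assuming
# mean-pooling of the last hidden state over non-padding tokens (both the
# layer choice and the pooling strategy are assumptions):
import torch


def _compute_embedding(self, hidden_states, input_tokens):
    layer = hidden_states[-1]  # last transformer layer, (batch, seq, dim)
    mask = input_tokens['attention_mask'].unsqueeze(-1).expand_as(layer).float()
    summed = (layer * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)  # avoid division by zero
    return (summed / counts).cpu().numpy()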
import os
from typing import Optional

import numpy as np
from jina import Document, DocumentArray, Executor, requests
from jina.logging.logger import JinaLogger


class CompoundQueryExecutor(Executor):
    def __init__(self, dump_path: Optional[str] = None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.logger = JinaLogger('CompoundQueryExecutor')
        self._dump_path = dump_path
        if self._dump_path is not None and os.path.exists(self._dump_path):
            self._docs = DocumentArray.load(self._dump_path)
        else:
            self._docs = DocumentArray()

    @staticmethod
    def _get_sorted_top_k(dist: 'np.array', top_k: int):
        # Same top-k selection as in the other indexers: full argsort when
        # top_k covers every candidate, argpartition plus a local sort otherwise.
        if top_k >= dist.shape[1]:
            idx = dist.argsort(axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx, axis=1)
        else:
            idx_ps = dist.argpartition(kth=top_k, axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx_ps, axis=1)
            idx_fs = dist.argsort(axis=1)
            idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
            dist = np.take_along_axis(dist, idx_fs, axis=1)
        return idx, dist

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', parameters, **kwargs):
        if len(self._docs) > 0:
            # _norm, _ext_A, _ext_B and _cosine: see the helper sketch above.
            a = np.stack(docs.get_attributes('embedding'))
            b = np.stack(self._docs.get_attributes('embedding'))
            q_emb = _ext_A(_norm(a))
            d_emb = _ext_B(_norm(b))
            dists = _cosine(q_emb, d_emb)
            idx, dist = self._get_sorted_top_k(dists, int(parameters['top_k']))
            for _q, _ids, _dists in zip(docs, idx, dist):
                for _id, _dist in zip(_ids, _dists):
                    d = Document(self._docs[int(_id)], copy=True)
                    d.scores['cosine'] = 1 - _dist
                    _q.matches.append(d)
from typing import Tuple

import numpy as np
from jina import Document, DocumentArray, Executor, requests


class MyIndexer(Executor):
    """Simple in-memory indexer with cosine-similarity search."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._docs = DocumentArray()

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        self._docs.extend(docs)

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', **kwargs):
        a = np.stack(docs.get_attributes('embedding'))
        b = np.stack(self._docs.get_attributes('embedding'))
        q_emb = _ext_A(_norm(a))
        d_emb = _ext_B(_norm(b))
        dists = _cosine(q_emb, d_emb)
        # Only the single best match is attached to each query.
        idx, dist = self._get_sorted_top_k(dists, 1)
        for _q, _ids, _dists in zip(docs, idx, dist):
            for _id, _dist in zip(_ids, _dists):
                d = Document(self._docs[int(_id)], copy=True)
                d.score.value = 1 - _dist
                _q.matches.append(d)

    @staticmethod
    def _get_sorted_top_k(dist: 'np.array', top_k: int) -> Tuple['np.ndarray', 'np.ndarray']:
        if top_k >= dist.shape[1]:
            idx = dist.argsort(axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx, axis=1)
        else:
            idx_ps = dist.argpartition(kth=top_k, axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx_ps, axis=1)
            idx_fs = dist.argsort(axis=1)
            idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
            dist = np.take_along_axis(dist, idx_fs, axis=1)
        return idx, dist
import os
from typing import Dict, Tuple

import numpy as np
from jina import Document, DocumentArray, Executor, requests
from jina.logging.logger import JinaLogger


class CrudIndexer(Executor):
    """Simple indexer supporting index, update, delete, and search."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.logger = JinaLogger('CrudIndexer')
        self._docs = DocumentArray()
        self._dump_location = os.path.join(self.metas.workspace, 'docs')
        if os.path.exists(self._dump_location):
            self._docs = DocumentArray.load(self._dump_location)
            self.logger.info(f'Loaded {len(self._docs)} from {self._dump_location}')
        else:
            self.logger.info(f'No data found at {self._dump_location}')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        self._docs.extend(docs)

    @requests(on='/update')
    def update(self, docs: 'DocumentArray', **kwargs):
        # An update is a delete followed by a re-index.
        self.delete(docs)
        self.index(docs)

    def close(self) -> None:
        self.logger.info(f'Dumping {len(self._docs)} to {self._dump_location}')
        self._docs.save(self._dump_location)

    @requests(on='/delete')
    def delete(self, docs: 'DocumentArray', **kwargs):
        # TODO we can do del _docs[d.id] once
        # tests.unit.types.arrays.test_documentarray.test_delete_by_id is fixed
        ids_to_delete = [d.id for d in docs]
        idx_to_delete = []
        for i, doc in enumerate(self._docs):
            if doc.id in ids_to_delete:
                idx_to_delete.append(i)
        # Delete from the back so earlier positions stay valid.
        for i in sorted(idx_to_delete, reverse=True):
            del self._docs[i]

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs):
        top_k = int(parameters.get('top_k', 1))
        a = np.stack(docs.get_attributes('embedding'))
        b = np.stack(self._docs.get_attributes('embedding'))
        q_emb = _ext_A(_norm(a))
        d_emb = _ext_B(_norm(b))
        dists = _cosine(q_emb, d_emb)
        idx, dist = self._get_sorted_top_k(dists, top_k)
        for _q, _ids, _dists in zip(docs, idx, dist):
            for _id, _dist in zip(_ids, _dists):
                d = Document(self._docs[int(_id)], copy=True)
                d.scores['cosine'] = 1 - _dist
                _q.matches.append(d)

    @staticmethod
    def _get_sorted_top_k(dist: 'np.array', top_k: int) -> Tuple['np.ndarray', 'np.ndarray']:
        if top_k >= dist.shape[1]:
            idx = dist.argsort(axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx, axis=1)
        else:
            idx_ps = dist.argpartition(kth=top_k, axis=1)[:, :top_k]
            dist = np.take_along_axis(dist, idx_ps, axis=1)
            idx_fs = dist.argsort(axis=1)
            idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
            dist = np.take_along_axis(dist, idx_fs, axis=1)
        return idx, dist
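# A minimal usage sketch for CrudIndexer (an illustration, not part of the
# original snippets; assumes Jina 2.x Flow semantics and 10-dim embeddings):
import numpy as np
from jina import Document, Flow

f = Flow().add(uses=CrudIndexer)
with f:
    f.post(on='/index', inputs=Document(id='a', embedding=np.random.random(10)))
    f.post(on='/update', inputs=Document(id='a', embedding=np.random.random(10)))
    resp = f.post(
        on='/search',
        inputs=Document(embedding=np.random.random(10)),
        parameters={'top_k': 1},
        return_results=True,
    )
    f.post(on='/delete', inputs=Document(id='a'))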
import numpy as np
from jina import Document, DocumentArray

# `random_docs` is a local test helper (defined elsewhere in the test suite)
# that yields Documents with random embeddings.


def test_texts_getter_da():
    da = DocumentArray([Document(text='hello') for _ in range(100)])
    assert len(da.texts) == 100
    assert da.texts == da.get_attributes('text')


def test_tags_getter_da():
    da = DocumentArray([Document(tags={'a': 2, 'c': 'd'}) for _ in range(100)])
    assert len(da.tags) == 100
    assert da.tags == da.get_attributes('tags')


def test_blobs_getter_da():
    blobs = np.random.random((100, 10, 10))
    da = DocumentArray([Document(blob=blob) for blob in blobs])
    assert len(da) == 100
    np.testing.assert_almost_equal(da.get_attributes('blob'), da.blobs)


def test_embeddings_getter_da():
    embeddings = np.random.random((100, 10))
    da = DocumentArray([Document(embedding=emb) for emb in embeddings])
    assert len(da) == 100
    np.testing.assert_almost_equal(da.get_attributes('embedding'), da.embeddings)


def test_da_get_embeddings_slice():
    da = DocumentArray(random_docs(100))
    np.testing.assert_almost_equal(
        da.get_attributes('embedding')[10:20], da._get_embeddings(slice(10, 20))
    )


def test_da_get_embeddings():
    da = DocumentArray(random_docs(100))
    np.testing.assert_almost_equal(da.get_attributes('embedding'), da.embeddings)
def index(self, docs: DocumentArray, **kwargs):
    # Persist each Document's tags dict as one row in the database.
    self.db.insert_multiple(docs.get_attributes('tags'))
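# self.db is not defined in this snippet. A sketch of the assumed setup using
# TinyDB, whose insert_multiple takes a list of dicts (the class name and
# db_path parameter are hypothetical):
from jina import DocumentArray, Executor, requests
from tinydb import TinyDB


class TagsIndexer(Executor):  # hypothetical name
    def __init__(self, db_path: str = 'tags_db.json', **kwargs):
        super().__init__(**kwargs)
        self.db = TinyDB(db_path)

    @requests(on='/index')
    def index(self, docs: DocumentArray, **kwargs):
        self.db.insert_multiple(docs.get_attributes('tags'))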