def _find(self, query, progress=None):
    # embed the query text with the same encoder used for the corpus
    query_vec = Vectors(
        self._encoder.encode(prepare_docs([query], nlp=self._nlp)))
    query_vec = query_vec.normalized
    query_vec = query_vec.astype(np.float32)

    if query_vec.shape[0] != 1:
        raise RuntimeError("query produced more than one embedding")

    # if the index is L2-only, apply the same query-side augmentation
    # that was applied to the corpus vectors at build time
    if self._ip_to_l2:
        faiss_query_vec = augment_xq(query_vec)
    else:
        faiss_query_vec = query_vec

    distance, index = self._index.search(
        faiss_query_vec, query.options["max_matches"])

    matches = []
    for d, i in zip(distance[0], index[0]):
        if i < 0:  # faiss pads missing results with -1
            break
        doc_index, sent_index = self._unpack_index(i)
        matches.append(self._make_match(query, d, doc_index, sent_index))

    return matches
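# Sketch (illustration only, not part of the class): shapes per the
# faiss API. Index.search returns two (n_queries, k) arrays; when fewer
# than k neighbours exist, the id array is padded with -1, which is why
# the loop above stops at the first negative id:
def _sketch_faiss_hits(index, query_vec, k):
    distance, ids = index.search(query_vec, k)
    return [(d, i) for d, i in zip(distance[0], ids[0]) if i >= 0]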
def _find(self, query, progress=None):
    query_vec = Vectors(
        self._encoder.encode(prepare_docs([query], nlp=self._nlp)))

    if query_vec.unmodified.shape[0] != 1:
        raise RuntimeError("query produced more than one embedding")

    # brute-force similarity of the query against every corpus vector
    sim = np.zeros(
        (self._corpus_vec.unmodified.shape[0], 1),
        dtype=query_vec.unmodified.dtype)
    self._vector_sim.compute(self._corpus_vec, query_vec, sim)

    sim = sim.reshape(-1)
    sim[np.isnan(sim)] = -np.inf  # never rank undefined similarities

    index = np.argsort(sim)
    max_matches = query.options["max_matches"]

    # walk the best max_matches candidates in descending order
    matches = []
    for i in reversed(index[-max_matches:]):
        if sim[i] < 0:
            break
        doc_index, sent_index = self._unpack_index(i)
        matches.append(
            self._make_match(query, sim[i], doc_index, sent_index))

    return matches
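# Sketch (illustration only, not part of the class): the top-k pattern
# used above. np.argsort sorts ascending, so the best max_matches hits
# sit at the end of the index array and reversed() yields them
# best-first. For large corpora, np.argpartition selects the same set
# in O(n) instead of O(n log n):
def _sketch_top_k(sim, k):
    k = min(k, sim.shape[0])
    top = np.argpartition(sim, -k)[-k:]      # unordered top-k indices
    return top[np.argsort(sim[top])[::-1]]   # sorted best-first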
def transformed(k, v):
    v = Vectors(np.vstack(v))
    embedding = emb_by_name[k]
    if embedding.transform:
        v = embedding.transform.apply(v)
    return ProxyVectorsRef(v)
def __init__(self, partition, embedding, span_sim, nlp,
             vector_sim, vectors=None):
    super().__init__(partition, embedding, span_sim, nlp)
    self._vector_sim = vector_sim

    if vectors is not None:
        corpus_vec = Vectors(vectors)
    else:
        corpus_vec = Vectors(
            self._encoder.encode(self._session.documents, pbar=True))

    self._corpus_vec = corpus_vec
def __init__(self, partition, embedding, span_sim, nlp,
             vectors=None, faiss_description='Flat'):
    import faiss

    super().__init__(partition, embedding, span_sim, nlp)

    if vectors is not None:
        corpus_vec = vectors
    else:
        corpus_vec = Vectors(
            self._encoder.encode(self._session.documents, pbar=True))

    corpus_vec = corpus_vec.normalized
    corpus_vec = corpus_vec.astype(np.float32)

    # https://github.com/facebookresearch/faiss/wiki/Faiss-indexes
    # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
    # https://github.com/facebookresearch/faiss/wiki/The-index-factory
    # e.g. "Flat", "PCA128,Flat", "LSH"

    # non-Flat index types here only support L2, so convert the
    # inner-product problem into an equivalent L2 problem by
    # augmenting the corpus vectors
    self._ip_to_l2 = faiss_description.split(",")[-1] != "Flat"

    if self._ip_to_l2:
        corpus_vec = augment_xb(corpus_vec)
        metric = faiss.METRIC_L2
    else:
        metric = faiss.METRIC_INNER_PRODUCT

    n_dims = corpus_vec.shape[-1]
    index = faiss.index_factory(n_dims, faiss_description, metric)

    index.train(corpus_vec)
    index.add(corpus_vec)

    self._index = index
    self._corpus_vec = corpus_vec
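# Sketch (illustration only, not part of the class): the MIPS-to-L2
# augmentation that augment_xb / augment_xq are assumed to implement
# above. Appending one coordinate makes L2 distance on the augmented
# vectors a monotone function of inner product on the originals, so
# L2-only index types can still answer inner-product queries:
#   ||q_aug - x_aug||^2 = ||q||^2 + phi - 2 <q, x>
def _sketch_mips_to_l2(xb, xq):
    norms_sq = (xb ** 2).sum(axis=1)
    phi = norms_sq.max()
    # database side: append sqrt(phi - ||x||^2)
    xb_aug = np.hstack([xb, np.sqrt(phi - norms_sq)[:, None]])
    # query side: append 0
    xq_aug = np.hstack([xq, np.zeros((xq.shape[0], 1), dtype=xq.dtype)])
    return xb_aug.astype(np.float32), xq_aug.astype(np.float32)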
def similarity(self, token_sim, a, b):
    from vectorian.corpus.document import Token

    out = np.zeros((1, 1), dtype=np.float32)

    if token_sim.is_modifier:
        # evaluate all operands recursively, then let the modifier
        # combine the resulting column of scores
        x = np.zeros((len(token_sim.operands), 1), dtype=np.float32)
        for i, op in enumerate(token_sim.operands):
            x[i] = self.similarity(op, a, b)
        token_sim(x, out)
    else:
        encoder = self.to_encoder(token_sim.embedding)

        if encoder.is_static:
            # static embeddings only need the token text
            if isinstance(a, Token):
                a = a.text
            if isinstance(b, Token):
                b = b.text
            va = Vectors([encoder.word_vec(a)])
            vb = Vectors([encoder.word_vec(b)])
        elif encoder.is_contextual:
            # contextual embeddings are looked up per token occurrence
            if not isinstance(a, Token):
                raise ValueError(f"expected a Token, got {a}")
            if not isinstance(b, Token):
                raise ValueError(f"expected a Token, got {b}")
            with a.doc.contextual_embeddings[encoder.name].open() as vec:
                va = Vectors([vec.unmodified[a.index]])
            with b.doc.contextual_embeddings[encoder.name].open() as vec:
                vb = Vectors([vec.unmodified[b.index]])
        else:
            raise ValueError(
                f"encoder {encoder} is neither static nor contextual")

        token_sim.similarity(va, vb, out)

    return out[0, 0]
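# Sketch (hypothetical class, not the library API): a token_sim
# modifier as consumed by the recursion above. It receives an
# (n_operands, 1) column of operand scores in x and writes one
# combined score into out:
class _SketchMaxModifier:
    is_modifier = True

    def __init__(self, operands):
        self.operands = operands

    def __call__(self, x, out):
        # e.g. keep the best score across all operand similarities
        out[0, 0] = x.max()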
def __init__(self, query, vocab, nlp):
    self._query = query
    self._vocab = vocab

    doc = nlp(self.text_str)

    # FIXME gather contextual_embeddings actually used in this query
    # by analyzing query
    contextual_embeddings = collections.defaultdict(list)
    for encoder in self._query.index.session.encoders.values():
        if encoder.is_contextual:
            v = encoder.encode([doc])
            assert v.shape[0] == 1
            contextual_embeddings[encoder.name].append(v[0])

    contextual_embeddings = dict(
        (k, np.vstack(v)) for k, v in contextual_embeddings.items())

    tokens = doc.to_json()["tokens"]

    for token_attr in ('pos', 'tag'):  # FIXME token_filter
        mask = self._mask(tokens, f'{token_attr}_filter', token_attr)
        if mask is not None:
            # keep the token list and the embedding rows in sync
            tokens = [t for t, m in zip(tokens, mask) if m]
            contextual_embeddings = dict(
                (k, v[mask, :]) for k, v in contextual_embeddings.items())

    token_mask = np.zeros((len(tokens),), dtype=bool)
    token_table = TokenTable(self.text_str, self.index.session.normalizers)

    for i, t in enumerate(tokens):
        token_mask[i] = token_table.add(t)

    contextual_embeddings = dict(
        (k, v[token_mask, :]) for k, v in contextual_embeddings.items())

    self._contextual_embeddings = dict(
        (k, ProxyVectorsRef(Vectors(v)))
        for k, v in contextual_embeddings.items())

    query = core.Query(
        self.index, self._vocab, self._contextual_embeddings)
    query.initialize(token_table.to_dict(), **self._query.options)

    self._compiled = query
    self._tokens = self._compiled.tokens
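# Sketch (illustration only): the masking invariant maintained above.
# Whenever a filter drops tokens, the same boolean mask must be applied
# row-wise to every contextual embedding matrix, so that token i keeps
# owning embedding row i:
def _sketch_filter_tokens(tokens, embeddings, mask):
    mask = np.asarray(mask, dtype=bool)
    kept_tokens = [t for t, m in zip(tokens, mask) if m]
    kept_embeddings = dict(
        (k, v[mask, :]) for k, v in embeddings.items())
    return kept_tokens, kept_embeddings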
def corpus_vec(self):
    return Vectors(self._corpus_vec)