Example #1
    def _find(self, query, progress=None):
        query_vec = Vectors(
            self._encoder.encode(prepare_docs([query], nlp=self._nlp)))
        query_vec = query_vec.normalized
        query_vec = query_vec.astype(np.float32)

        if query_vec.shape[0] != 1:
            raise RuntimeError("query produced more than one embedding")

        if self._ip_to_l2:
            faiss_query_vec = augment_xq(query_vec)
        else:
            faiss_query_vec = query_vec

        distance, index = self._index.search(faiss_query_vec,
                                             query.options["max_matches"])

        matches = []
        for d, i in zip(distance[0], index[0]):
            if i < 0:  # faiss pads missing results with index -1
                break

            doc_index, sent_index = self._unpack_index(i)

            matches.append(self._make_match(query, d, doc_index, sent_index))

        return matches
Example #2
    def _find(self, query, progress=None):
        query_vec = Vectors(
            self._encoder.encode(prepare_docs([query], nlp=self._nlp)))

        if query_vec.unmodified.shape[0] != 1:
            raise RuntimeError("query produced more than one embedding")

        sim = np.zeros((self._corpus_vec.unmodified.shape[0], 1),
                       dtype=query_vec.unmodified.dtype)

        self._vector_sim.compute(self._corpus_vec, query_vec, sim)

        sim = sim.reshape(-1)
        sim[np.isnan(sim)] = -np.inf  # rank NaN scores last
        index = np.argsort(sim)

        max_matches = query.options["max_matches"]

        matches = []
        for i in reversed(index[-max_matches:]):
            if sim[i] < 0:
                break

            doc_index, sent_index = self._unpack_index(i)

            matches.append(
                self._make_match(query, sim[i], doc_index, sent_index))

        return matches
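The brute-force variant above delegates scoring to self._vector_sim.compute, which fills the preallocated sim buffer with one score per corpus vector. A minimal sketch of the kind of kernel this could be, assuming plain cosine similarity over NumPy arrays (the function name and signature are hypothetical, not part of the library):

    import numpy as np

    def cosine_sim_into(corpus, query, out):
        # corpus: (n, d), query: (1, d), out: (n, 1)
        # Write the cosine similarity of each corpus row against the
        # single query vector into the preallocated output buffer.
        corpus_norm = np.linalg.norm(corpus, axis=1, keepdims=True)
        query_norm = np.linalg.norm(query, axis=1, keepdims=True)
        out[:] = (corpus @ query.T) / (corpus_norm * query_norm.T)

Zero-length vectors would yield NaN scores here, which the caller already handles by pushing them to the bottom of the ranking via sim[np.isnan(sim)] = -np.inf.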
Example #3
        def transformed(k, v):
            v = Vectors(np.vstack(v))

            embedding = emb_by_name[k]
            if embedding.transform:
                v = embedding.transform.apply(v)

            return ProxyVectorsRef(v)
Example #4
    def __init__(self,
                 partition,
                 embedding,
                 span_sim,
                 nlp,
                 vector_sim,
                 vectors=None):
        super().__init__(partition, embedding, span_sim, nlp)

        self._vector_sim = vector_sim

        if vectors is not None:
            corpus_vec = Vectors(vectors)
        else:
            corpus_vec = Vectors(
                self._encoder.encode(self._session.documents, pbar=True))

        self._corpus_vec = corpus_vec
Example #5
    def __init__(self,
                 partition,
                 embedding,
                 span_sim,
                 nlp,
                 vectors=None,
                 faiss_description='Flat'):
        import faiss

        super().__init__(partition, embedding, span_sim, nlp)

        if vectors is not None:
            corpus_vec = vectors
        else:
            corpus_vec = Vectors(
                self._encoder.encode(self._session.documents, pbar=True))
            corpus_vec = corpus_vec.normalized

        corpus_vec = corpus_vec.astype(np.float32)

        # https://github.com/facebookresearch/faiss/wiki/Faiss-indexes
        # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
        # https://github.com/facebookresearch/faiss/wiki/The-index-factory
        # e.g. ""Flat", "PCA128,Flat", "LSH"

        # index types other than plain "Flat" might not support
        # METRIC_INNER_PRODUCT, so emulate it via the L2 augmentation trick
        self._ip_to_l2 = faiss_description.split(",")[-1] != "Flat"

        if self._ip_to_l2:
            corpus_vec = augment_xb(corpus_vec)
            metric = faiss.METRIC_L2
        else:
            metric = faiss.METRIC_INNER_PRODUCT

        n_dims = corpus_vec.shape[-1]

        index = faiss.index_factory(n_dims, faiss_description, metric)
        index.train(corpus_vec)
        index.add(corpus_vec)

        self._index = index
        self._corpus_vec = corpus_vec
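The augment_xb and augment_xq helpers used above appear to follow the standard reduction from maximum-inner-product search to L2 search: every database vector gains an extra component sqrt(phi - ||x||^2), where phi is the largest squared norm in the database, and every query gains a zero component, so that ||q' - x'||^2 = ||q||^2 + phi - 2<q, x> and the nearest L2 neighbours are exactly the highest-inner-product matches. A minimal sketch of that transformation, assuming plain NumPy arrays rather than the library's Vectors wrapper:

    import numpy as np

    def augment_xb(xb, phi=None):
        # Append sqrt(phi - ||x||^2) to every database vector, where phi
        # defaults to the maximum squared norm over the database.
        norms = (xb ** 2).sum(axis=1)
        if phi is None:
            phi = norms.max()
        extra = np.sqrt(np.maximum(phi - norms, 0)).reshape(-1, 1)
        return np.hstack((xb, extra))

    def augment_xq(xq):
        # Append a zero component to every query vector.
        return np.hstack((xq, np.zeros((xq.shape[0], 1), dtype=xq.dtype)))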
Example #6
    def similarity(self, token_sim, a, b):
        from vectorian.corpus.document import Token

        out = np.zeros((1, 1), dtype=np.float32)
        if token_sim.is_modifier:
            x = np.zeros((len(token_sim.operands), 1), dtype=np.float32)
            for i, op in enumerate(token_sim.operands):
                x[i] = self.similarity(op, a, b)
            token_sim(x, out)

        else:
            encoder = self.to_encoder(token_sim.embedding)
            if encoder.is_static:
                if isinstance(a, Token):
                    a = a.text
                if isinstance(b, Token):
                    b = b.text

                va = Vectors([encoder.word_vec(a)])
                vb = Vectors([encoder.word_vec(b)])

            elif encoder.is_contextual:
                if not isinstance(a, Token):
                    raise ValueError(f"expected a Token, got {a}")
                if not isinstance(b, Token):
                    raise ValueError(f"expected a Token, got {b}")

                with a.doc.contextual_embeddings[encoder.name].open() as vec:
                    va = Vectors([vec.unmodified[a.index]])

                with b.doc.contextual_embeddings[encoder.name].open() as vec:
                    vb = Vectors([vec.unmodified[b.index]])
            else:
                raise ValueError(f"unsupported encoder type: {encoder}")

            token_sim.similarity(va, vb, out)

        return out[0, 0]
Example #7
    def __init__(self, query, vocab, nlp):
        self._query = query
        self._vocab = vocab

        doc = nlp(self.text_str)

        # FIXME gather contextual_embeddings actually used in this query
        # by analyzing query

        contextual_embeddings = collections.defaultdict(list)
        for encoder in self._query.index.session.encoders.values():
            if encoder.is_contextual:
                v = encoder.encode([doc])
                assert v.shape[0] == 1
                contextual_embeddings[encoder.name].append(v[0])

        contextual_embeddings = dict(
            (k, np.vstack(v)) for k, v in contextual_embeddings.items())

        tokens = doc.to_json()["tokens"]

        for token_attr in ('pos', 'tag'):  # FIXME token_filter
            mask = self._mask(tokens, f'{token_attr}_filter', token_attr)
            if mask is not None:
                tokens = [t for t, m in zip(tokens, mask) if m]
                contextual_embeddings = dict(
                    (k, v[mask, :]) for k, v in contextual_embeddings.items())

        token_mask = np.zeros((len(tokens),), dtype=bool)
        token_table = TokenTable(self.text_str, self.index.session.normalizers)
        for i, t in enumerate(tokens):
            token_mask[i] = token_table.add(t)

        contextual_embeddings = dict(
            (k, v[token_mask, :]) for k, v in contextual_embeddings.items())

        self._contextual_embeddings = dict(
            (k, ProxyVectorsRef(Vectors(v)))
            for k, v in contextual_embeddings.items())

        query = core.Query(self.index, self._vocab,
                           self._contextual_embeddings)

        query.initialize(token_table.to_dict(), **self._query.options)

        self._compiled = query
        self._tokens = self._compiled.tokens
Example #8
    def corpus_vec(self):
        return Vectors(self._corpus_vec)