Example #1
class FastWordCentroidRetrieval(BaseEstimator, RetriEvalMixin):

    """Docstring for FastWordCentrodRetrieval. """

    def __init__(self, embedding, analyzer='word', matching=None, name="FWCD",
                 n_jobs=1, use_idf=True):
        """TODO: to be defined1. """
        self.name = name
        self.matching = Matching(**dict(matching)) if matching else None
        self.vect = EmbeddedVectorizer(embedding, analyzer=analyzer, norm='l2',
                                       use_idf=use_idf)
        self.nn = NearestNeighbors(n_jobs=n_jobs, metric='cosine',
                                   algorithm='brute')

    def fit(self, X_raw, y=None):
        cents = self.vect.fit_transform(X_raw)
        # print("Largest singular value: {:.2f}".format(
        #     np.linalg.norm(cents, ord=2)))
        # cents = all_but_the_top(cents, 1)
        # print("Largest singular value: {:.2f}".format(
        #     np.linalg.norm(cents, ord=2)))
        # print("Renormalizing")
        # normalize(cents, copy=False)
        # print("Largest singular value: {:.2f}".format(
        #     np.linalg.norm(cents, ord=2)))
        self.centroids = cents
        print("Fit centroids shape:", self.centroids.shape)

        self._y = np.asarray(y)  # asarray so fancy indexing works in query()
        if self.matching:
            self.matching.fit(X_raw)
        else:
            self.nn.fit(cents)

    def query(self, query, k=None, indices=None):
        centroids = self.centroids

        if k is None:
            k = centroids.shape[0]

        q_centroid = self.vect.transform([query])

        if self.matching:
            ind = self.matching.predict(query)
            centroids, labels = centroids[ind], self._y[ind]
            n_ret = min(k, centroids.shape[0])
            if n_ret == 0:
                return []
            self.nn.fit(centroids)
        elif indices:
            centroids, labels = centroids[indices], self._y[indices]
            n_ret = min(k, centroids.shape[0])
            if n_ret == 0:
                return []
            self.nn.fit(centroids)
        else:
            labels = self._y
            n_ret = k

        ind = self.nn.kneighbors(q_centroid, n_neighbors=n_ret,
                                 return_distance=False)[0]

        return labels[ind]
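A minimal usage sketch for FastWordCentroidRetrieval (not part of the original example). It assumes the surrounding module provides EmbeddedVectorizer and Matching, and that `embedding` is a gensim-style keyed-vectors object; the corpus, labels, and query below are illustrative placeholders.

import numpy as np

docs = ["the quick brown fox", "the lazy dog", "brown foxes jump"]
labels = np.array(["d1", "d2", "d3"])

# `embedding` must be whatever EmbeddedVectorizer accepts, e.g. gensim
# KeyedVectors (an assumption; the example above does not show it)
retrieval = FastWordCentroidRetrieval(embedding, analyzer='word', use_idf=True)
retrieval.fit(docs, labels)
print(retrieval.query("brown fox", k=2))  # the two closest documents by WCD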
Example #2
class WordCentroidRetrieval(BaseEstimator, RetriEvalMixin):
    """
    Retrieval Model based on Word Centroid Distance
    """
    def __init__(self,
                 embedding,
                 analyzer,
                 name="WCD",
                 n_jobs=1,
                 normalize=True,
                 verbose=0,
                 oov=None,
                 matching=True,
                 **kwargs):
        self.name = name
        self._embedding = embedding
        self._normalize = normalize
        self._oov = oov
        self.verbose = verbose
        self.n_jobs = n_jobs
        self._neighbors = NearestNeighbors(**kwargs)

        self._analyzer = analyzer

        if matching is True:
            self._matching = Matching()
        elif matching is False or matching is None:
            self._matching = None
        else:
            self._matching = Matching(**dict(matching))

    def _compute_centroid(self, words):
        if len(words) == 0:  # no words left; fall back to the OOV vector (could also return zeros)
            return self._embedding[self._oov]
        E = self._embedding
        embedded_words = np.vstack([E[word] for word in words])
        centroid = np.mean(embedded_words, axis=0).reshape(1, -1)
        return centroid

    def fit(self, docs, labels):
        E, analyze = self._embedding, self._analyzer

        analyzed_docs = (analyze(doc) for doc in docs)
        # out-of-vocabulary words should not contribute to the centroid

        filtered_docs = (filter_vocab(E, d, self._oov) for d in analyzed_docs)
        centroids = np.vstack([self._compute_centroid(doc) for doc in
                               filtered_docs])  # materializes all centroids
        if self.verbose > 0:
            print("Centroids shape:", centroids.shape)
        if self._normalize:
            normalize(centroids, norm='l2', copy=False)

        self._y = np.asarray(labels)

        self._centroids = centroids

        if self._matching:
            self._matching.fit(docs)
        else:
            # if we don't do matching, it's enough to fit nearest neighbors
            # on all centroids once, before query time
            self._neighbors.fit(centroids)
        return self

    def query(self, query, k=None, return_distance=False):
        if k is None:
            k = len(self._centroids)
        E, analyze, nn = self._embedding, self._analyzer, self._neighbors
        tokens = analyze(query)
        words = filter_vocab(E, tokens, self._oov)
        query_centroid = self._compute_centroid(words)
        if self._normalize:
            query_centroid = normalize(query_centroid, norm='l2', copy=False)
        if self.verbose > 0:
            print("Analyzed query", words)
            # print("Centered (normalized) query shape", query_centroid.shape)

        if self._matching:
            matched = self._matching.predict(query)
            centroids, labels = self._centroids[matched], self._y[matched]
            if len(centroids) == 0:
                return []  # nothing to fit here
            nn.fit(centroids)
            # k <= number of matched documents
            n_ret = min(k, len(matched))
        else:
            labels = self._y
            n_ret = k

        # nn was either fit on the fly above or precomputed in fit()
        dist, ind = nn.kneighbors(query_centroid, n_neighbors=n_ret,
                                  return_distance=True)

        dist, ind = dist[0], ind[0]  # we only had one query in the first place

        if return_distance:
            return labels[ind], dist
        else:
            return labels[ind]
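The core of the model above is the word-centroid computation in _compute_centroid plus cosine ranking. Below is a self-contained sketch of that idea using a toy dict embedding and plain numpy; all names and vectors are illustrative, not from the original code.

import numpy as np

toy_embedding = {
    "fast": np.array([1.0, 0.0]),
    "car":  np.array([0.8, 0.2]),
    "slow": np.array([0.0, 1.0]),
    "tree": np.array([0.1, 0.9]),
}

def centroid(tokens):
    # mean of the in-vocabulary word vectors, as in _compute_centroid above
    vecs = [toy_embedding[t] for t in tokens if t in toy_embedding]
    return np.mean(vecs, axis=0)

docs = [["fast", "car"], ["slow", "tree"]]
cents = np.vstack([centroid(d) for d in docs])
cents /= np.linalg.norm(cents, axis=1, keepdims=True)  # l2-normalize rows

q = centroid(["fast"])
q /= np.linalg.norm(q)
ranking = np.argsort(-(cents @ q))  # cosine = dot product after normalization
print(ranking)  # [0 1]: "fast car" ranks first for the query "fast"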
Example #3
class Doc2VecRetrieval(BaseEstimator, RetriEvalMixin):
    def __init__(self,
                 analyzer=None, matching=None,
                 name=None,
                 verbose=0,
                 n_epochs=10,
                 alpha=0.25,
                 min_alpha=0.05,
                 n_jobs=4,
                 **kwargs):
        # self.model = model
        self.alpha = alpha
        self.min_alpha = min_alpha
        self.verbose = verbose
        self.name = "paragraph-vectors" if name is None else name

        if matching is True:
            self._matching = Matching()
        elif matching is False or matching is None:
            self._matching = None
        else:
            self._matching = Matching(**dict(matching))

        self.analyzer = analyzer
        self.model = Doc2Vec(alpha=alpha,
                             min_alpha=alpha,
                             size=500,
                             window=8,
                             min_count=1,
                             sample=1e-5,
                             workers=n_jobs,
                             negative=20,
                             dm=0, dbow_words=1,  # words only with dm!=0?
                             dm_mean=0,  # unused when in concat mode
                             dm_concat=1,
                             dm_tag_count=1
                             )
        self.n_epochs = n_epochs
        self._neighbors = NearestNeighbors(**kwargs)

    def fit(self, docs, y):
        assert len(docs) == len(y)
        model = self.model
        n_epochs = self.n_epochs
        verbose = self.verbose
        decay = (self.alpha - self.min_alpha) / n_epochs
        X = [TaggedDocument(self.analyzer(doc), [label])
             for doc, label in zip(docs, y)]

        if verbose > 0:
            print("First 3 tagged documents:\n", X[:3])
            print("Training doc2vec model")
        # optionally, one could intersect the vocabulary with pretrained
        # word vectors before training
        model.build_vocab(X)
        for epoch in range(n_epochs):
            if verbose:
                print("Doc2Vec: Epoch {} of {}.".format(epoch + 1, n_epochs))
            # train(X) without epochs/total_examples targets the older gensim
            # API; recent versions require both (see the sketch further below)
            model.train(X)
            model.alpha -= decay  # apply global decay
            model.min_alpha = model.alpha  # but no decay inside one epoch

        if verbose > 0:
            print("Finished.")
            print("model:", self.model)

        if self._matching:
            self._matching.fit(docs)
        else:
            # if we dont do matching, its enough to fit a nearest neighbors on
            # all centroids before query time
            dvs = np.asarray([model.docvecs[tag] for tag in y])
            self._neighbors.fit(dvs)

        self._y = np.asarray(y)  # asarray so fancy indexing works in query()

        return self

    def query(self, query, k=None):
        model, matching = self.model, self._matching
        nn, analyze = self._neighbors, self.analyzer
        verbose = self.verbose
        if k is None:
            k = len(self._y)
        if matching:
            matched = matching.predict(query)
            if verbose > 0:
                print("Matched:", matched)
            tags = self._y[matched]
            dvs = np.asarray([model.docvecs[tag] for tag in tags])
            n_ret = min(k, len(matched))
            if n_ret == 0:
                return []
            nn.fit(dvs)
        else:
            tags = self._y
            n_ret = k
            # NearestNeighbors are already fit

        if verbose > 0:
            print(len(tags), "documents matched.")
        q = analyze(query)
        qv = model.infer_vector(q).reshape(1, -1)
        ind = nn.kneighbors(qv, n_neighbors=n_ret, return_distance=False)[0]
        y = tags[ind]
        return y
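The training loop above (train(X) plus a manual alpha decay) targets an older gensim API. For reference, a rough sketch of the same DBOW setup on current gensim (>= 4.0), where `size` became `vector_size` and train() takes epochs explicitly; the corpus and hyperparameters here are illustrative.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

corpus = [TaggedDocument(["quick", "brown", "fox"], ["d1"]),
          TaggedDocument(["lazy", "dog"], ["d2"])]

model = Doc2Vec(vector_size=50, window=8, min_count=1, dm=0, dbow_words=1,
                negative=5, workers=1)
model.build_vocab(corpus)
# epochs= replaces the manual per-epoch loop with its global alpha decay
model.train(corpus, total_examples=model.corpus_count, epochs=10)

query_vector = model.infer_vector(["brown", "fox"]).reshape(1, -1)
doc_vector = model.dv["d1"]  # `docvecs` was renamed `dv` in gensim 4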