import numpy as np
from sklearn.base import BaseEstimator
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Matching, EmbeddedVectorizer, RetriEvalMixin and filter_vocab are
# package-local helpers and are assumed to be in scope.


class FastWordCentroidRetrieval(BaseEstimator, RetriEvalMixin):
    """Fast Word Centroid Distance retrieval.

    Represents each document by the (optionally idf-weighted) centroid of
    its word embeddings and retrieves the nearest centroids by cosine
    distance.
    """

    def __init__(self, embedding, analyzer='word', matching=None,
                 name="FWCD", n_jobs=1, use_idf=True):
        self.name = name
        self.matching = Matching(**dict(matching)) if matching else None
        self.vect = EmbeddedVectorizer(embedding, analyzer=analyzer,
                                       norm='l2', use_idf=use_idf)
        self.nn = NearestNeighbors(n_jobs=n_jobs, metric='cosine',
                                   algorithm='brute')

    def fit(self, X_raw, y=None):
        cents = self.vect.fit_transform(X_raw)
        # Optional post-processing (removing the top principal component
        # via `all_but_the_top` and re-normalizing) was explored here but
        # is disabled.
        self.centroids = cents
        self._y = np.asarray(y)
        if self.matching:
            self.matching.fit(X_raw)
        else:
            # without term matching, the neighbors index can be built once
            self.nn.fit(cents)
        return self

    def query(self, query, k=None, indices=None):
        centroids = self.centroids
        if k is None:
            k = centroids.shape[0]
        q_centroid = self.vect.transform([query])
        if self.matching:
            # restrict the candidate set to documents with a term match
            ind = self.matching.predict(query)
            centroids, labels = centroids[ind], self._y[ind]
            n_ret = min(k, centroids.shape[0])
            if n_ret == 0:
                return []
            self.nn.fit(centroids)
        elif indices is not None:
            # caller-supplied candidate subset
            centroids, labels = centroids[indices], self._y[indices]
            n_ret = min(k, centroids.shape[0])
            if n_ret == 0:
                return []
            self.nn.fit(centroids)
        else:
            labels = self._y
            n_ret = k
        ind = self.nn.kneighbors(q_centroid, n_neighbors=n_ret,
                                 return_distance=False)[0]
        return labels[ind]
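
# Illustrative usage sketch (not part of the original module): builds a toy
# index and queries it. Assumes `embedding` is a gensim KeyedVectors-like
# object whose vocabulary covers the toy corpus and which EmbeddedVectorizer
# accepts directly, as in the constructor above.
def _demo_fast_wcd(embedding):
    docs = ["the cat sat on the mat",
            "the dog chased the cat",
            "a quick brown fox"]
    labels = np.array(["d1", "d2", "d3"])
    fwcd = FastWordCentroidRetrieval(embedding, matching=None, use_idf=True)
    fwcd.fit(docs, y=labels)
    # labels of the 2 documents whose idf-weighted embedding centroids are
    # closest (in cosine distance) to the query centroid
    return fwcd.query("cat on the mat", k=2)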
class WordCentroidRetrieval(BaseEstimator, RetriEvalMixin):
    """Retrieval model based on the Word Centroid Distance.

    Documents and queries are embedded as the mean of their word vectors;
    retrieval ranks documents by the proximity of their centroids to the
    query centroid.
    """

    def __init__(self, embedding, analyzer, name="WCD", n_jobs=1,
                 normalize=True, verbose=0, oov=None, matching=True,
                 **kwargs):
        self.name = name
        self._embedding = embedding
        self._normalize = normalize
        self._oov = oov
        self.verbose = verbose
        self.n_jobs = n_jobs
        self._neighbors = NearestNeighbors(**kwargs)
        self._analyzer = analyzer
        if matching is True:
            self._matching = Matching()
        elif matching is False or matching is None:
            self._matching = None
        else:
            self._matching = Matching(**dict(matching))

    def _compute_centroid(self, words):
        if len(words) == 0:
            # no in-vocabulary words left; fall back to the OOV vector
            # (returning zeros would be an alternative)
            return self._embedding[self._oov]
        E = self._embedding
        embedded_words = np.vstack([E[word] for word in words])
        return np.mean(embedded_words, axis=0).reshape(1, -1)

    def fit(self, docs, labels):
        E, analyze = self._embedding, self._analyzer
        analyzed_docs = (analyze(doc) for doc in docs)
        # out-of-vocabulary words do not contribute to the centroid
        filtered_docs = (filter_vocab(E, d, self._oov) for d in analyzed_docs)
        centroids = np.vstack([self._compute_centroid(doc)
                               for doc in filtered_docs])
        if self.verbose > 0:
            print("Centroids shape:", centroids.shape)
        if self._normalize:
            normalize(centroids, norm='l2', copy=False)
        self._y = np.asarray(labels)
        self._centroids = centroids
        if self._matching:
            self._matching.fit(docs)
        else:
            # without matching, it suffices to fit the nearest-neighbors
            # index on all centroids once, before query time
            self._neighbors.fit(centroids)
        return self

    def query(self, query, k=None, return_distance=False):
        if k is None:
            k = len(self._centroids)
        E, analyze, nn = self._embedding, self._analyzer, self._neighbors
        tokens = analyze(query)
        words = filter_vocab(E, tokens, self._oov)
        query_centroid = self._compute_centroid(words)
        if self._normalize:
            query_centroid = normalize(query_centroid, norm='l2', copy=False)
        if self.verbose > 0:
            print("Analyzed query:", words)
        if self._matching:
            matched = self._matching.predict(query)
            centroids, labels = self._centroids[matched], self._y[matched]
            if len(centroids) == 0:
                return []  # nothing to rank
            nn.fit(centroids)
            n_ret = min(k, len(matched))  # cannot return more than matched
        else:
            labels = self._y
            n_ret = k
            # neighbors index was already fit on all centroids in fit()
        dist, ind = nn.kneighbors(query_centroid, n_neighbors=n_ret,
                                  return_distance=True)
        dist, ind = dist[0], ind[0]  # only a single query
        if return_distance:
            return labels[ind], dist
        return labels[ind]
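
# Illustrative usage sketch (not part of the original module): a plain
# str.split stands in for the analyzer, and `embedding` is again assumed to
# behave like a gensim KeyedVectors (supporting `word in E` and `E[word]`),
# which is what _compute_centroid and filter_vocab rely on. Extra kwargs
# are forwarded to sklearn's NearestNeighbors.
def _demo_wcd(embedding):
    docs = ["the cat sat on the mat", "the dog chased the cat"]
    wcd = WordCentroidRetrieval(embedding, analyzer=str.split,
                                matching=False, metric='cosine',
                                algorithm='brute')
    wcd.fit(docs, labels=["d1", "d2"])
    # returns (labels, distances) for the 2 nearest document centroids
    return wcd.query("cat on the mat", k=2, return_distance=True)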
class Doc2VecRetrieval(BaseEstimator, RetriEvalMixin):
    """Retrieval model based on gensim paragraph vectors (Doc2Vec)."""

    def __init__(self, analyzer=None, matching=None, name=None, verbose=0,
                 n_epochs=10, alpha=0.25, min_alpha=0.05, n_jobs=4,
                 **kwargs):
        self.alpha = alpha
        self.min_alpha = min_alpha
        self.verbose = verbose
        self.name = "paragraph-vectors" if name is None else name
        if matching is True:
            self._matching = Matching()
        elif matching is False or matching is None:
            self._matching = None
        else:
            self._matching = Matching(**dict(matching))
        self.analyzer = analyzer
        # parameter names follow the pre-4.0 gensim API
        # (`size` is `vector_size` in gensim >= 4)
        self.model = Doc2Vec(alpha=alpha,
                             min_alpha=alpha,
                             size=500,
                             window=8,
                             min_count=1,
                             sample=1e-5,
                             workers=n_jobs,
                             negative=20,
                             dm=0,           # distributed bag of words
                             dbow_words=1,   # also train word vectors
                             dm_mean=0,      # unused in concat mode
                             dm_concat=1,
                             dm_tag_count=1)
        self.n_epochs = n_epochs
        self._neighbors = NearestNeighbors(**kwargs)

    def fit(self, docs, y):
        assert len(docs) == len(y)
        model = self.model
        n_epochs = self.n_epochs
        verbose = self.verbose
        decay = (self.alpha - self.min_alpha) / n_epochs
        X = [TaggedDocument(self.analyzer(doc), [label])
             for doc, label in zip(docs, y)]
        if verbose > 0:
            print("First 3 tagged documents:\n", X[:3])
            print("Training doc2vec model")
        model.build_vocab(X)
        for epoch in range(n_epochs):
            if verbose:
                print("Doc2Vec: Epoch {} of {}.".format(epoch + 1, n_epochs))
            # gensim >= 1.0 additionally requires
            # model.train(X, total_examples=len(X), epochs=1)
            model.train(X)
            model.alpha -= decay  # apply global learning-rate decay,
            model.min_alpha = model.alpha  # but no decay within one epoch
        if verbose > 0:
            print("Finished.")
            print("model:", self.model)
        self._y = np.asarray(y)
        if self._matching:
            self._matching.fit(docs)
        else:
            # without matching, it suffices to fit the nearest-neighbors
            # index on all document vectors once, before query time
            dvs = np.asarray([model.docvecs[tag] for tag in y])
            self._neighbors.fit(dvs)
        return self

    def query(self, query, k=None):
        model, matching = self.model, self._matching
        nn, analyze = self._neighbors, self.analyzer
        verbose = self.verbose
        if k is None:
            k = len(self._y)
        if matching:
            matched = matching.predict(query)
            if verbose > 0:
                print("Matched:", matched)
            tags = self._y[matched]
            n_ret = min(k, len(matched))
            if n_ret == 0:
                return []
            dvs = np.asarray([model.docvecs[tag] for tag in tags])
            nn.fit(dvs)
        else:
            tags = self._y
            n_ret = k
            # neighbors index was already fit in fit()
        if verbose > 0:
            print(len(tags), "documents matched.")
        qv = model.infer_vector(analyze(query)).reshape(1, -1)
        ind = nn.kneighbors(qv, n_neighbors=n_ret,
                            return_distance=False)[0]
        return tags[ind]
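
# Illustrative usage sketch (not part of the original module): trains the
# paragraph-vector model on a toy corpus and retrieves by inferred query
# vector. Assumes pre-4.0 gensim semantics (see the constructor comment),
# where the document tags double as retrieval labels. Extra kwargs are
# forwarded to sklearn's NearestNeighbors.
def _demo_doc2vec():
    docs = ["the cat sat on the mat", "the dog chased the cat"]
    pv = Doc2VecRetrieval(analyzer=str.split, matching=False,
                          n_epochs=2, metric='cosine', algorithm='brute')
    pv.fit(docs, y=np.array(["d1", "d2"]))
    return pv.query("cat on the mat", k=1)  # tag of the closest document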