def __getitem__(self, query):
    """Get similarities of document `query` to all documents in the corpus.

    **or**

    If `query` is a corpus (iterable of documents), return a matrix of similarities
    of all query documents vs. all corpus documents. Using this type of batch query
    is more efficient than computing the similarities one document after another.
    """
    is_corpus, query = utils.is_corpus(query)
    if self.normalize:
        # self.normalize only works if the input is a plain gensim vector/corpus (as
        # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix
        # as well, but in that case assume tricks are happening and don't normalize
        # anything (self.normalize has no effect).
        if matutils.ismatrix(query):
            logger.warning("non-gensim input must already come normalized")
        else:
            if is_corpus:
                query = [matutils.unitvec(v) for v in query]
            else:
                query = matutils.unitvec(query)
    result = self.get_similarities(query)

    if self.num_best is None:
        return result

    # if the input query was a corpus (=more documents), compute the top-n
    # most similar for each document in turn
    if matutils.ismatrix(result):
        return [matutils.full2sparse_clipped(v, self.num_best) for v in result]
    else:
        # otherwise, return top-n of the single input document
        return matutils.full2sparse_clipped(result, self.num_best)
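# Hedged usage sketch (illustrative, not part of the original source): shows the two
# query modes of __getitem__ above. `index` is assumed to be a gensim similarity index
# (e.g. similarities.MatrixSimilarity) and `corpus` a gensim-style corpus of sparse vectors.
def _example_index_query(index, corpus):
    sims_one = index[corpus[0]]  # similarities of one document vs. every corpus document
    sims_all = index[corpus]     # batch query: one row of similarities per query document
    return sims_one, sims_all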
def similarity(self, d1, d2):
    """
    Compute cosine similarity between two docvecs in the trained set, specified by
    int index or string tag. (TODO: Accept vectors of out-of-training-set docs, as
    if from inference.)
    """
    return dot(matutils.unitvec(self[d1]), matutils.unitvec(self[d2]))
def n_similarity(self, ds1, ds2):
    """
    Compute cosine similarity between two sets of docvecs from the trained set,
    specified by int index or string tag. (TODO: Accept vectors of
    out-of-training-set docs, as if from inference.)
    """
    v1 = [self[doc] for doc in ds1]
    v2 = [self[doc] for doc in ds2]
    return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0)))
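# Hedged usage sketch (illustrative, not part of the original source): `model` is assumed
# to be a trained gensim Doc2Vec model whose documents were tagged 'doc_a'..'doc_d' during
# training; any int index or string tag seen in training would work the same way.
def _example_docvec_similarity(model):
    pair = model.docvecs.similarity('doc_a', 'doc_b')                            # one pair of docvecs
    sets = model.docvecs.n_similarity(['doc_a', 'doc_b'], ['doc_c', 'doc_d'])    # set vs. set (mean vectors)
    return pair, sets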
def similarity(self, w1, w2):
    """
    Compute cosine similarity between two words.

    Example::

      >>> trained_model.similarity('woman', 'man')
      0.73723527

      >>> trained_model.similarity('woman', 'woman')
      1.0

    """
    return dot(matutils.unitvec(self[w1]), matutils.unitvec(self[w2]))
def most_similar(self, positive=[], negative=[], topn=10):
    """
    Find the top-N most similar words. Positive words contribute positively towards the
    similarity, negative words negatively.

    This method computes cosine similarity between a simple mean of the projection
    weight vectors of the given words, and corresponds to the `word-analogy` and
    `distance` scripts in the original word2vec implementation.

    Example::

      >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])
      [('queen', 0.50882536), ...]

    """
    self.init_sims()

    if isinstance(positive, basestring) and not negative:
        # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
        positive = [positive]

    # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
    positive = [(word, 1.0) if isinstance(word, basestring) else word for word in positive]
    negative = [(word, -1.0) if isinstance(word, basestring) else word for word in negative]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if word in self.vocab:
            mean.append(weight * matutils.unitvec(self.syn0[self.vocab[word].index]))
            all_words.add(self.vocab[word].index)
        else:
            raise KeyError("word '%s' not in vocabulary" % word)
    if not mean:
        raise ValueError("cannot compute similarity with no input")
    mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

    dists = dot(self.syn0norm, mean)
    if not topn:
        return dists
    best = argsort(dists)[::-1][:topn + len(all_words)]
    # ignore (don't return) words from the input
    result = [(self.index2word[sim], dists[sim]) for sim in best if sim not in all_words]
    return result[:topn]
def doesnt_match(self, words):
    """
    Which word from the given list doesn't go with the others?

    Example::

      >>> trained_model.doesnt_match("breakfast cereal dinner lunch".split())
      'cereal'

    """
    words = [word for word in words if word in self.vocab]  # filter out OOV words
    logger.debug("using words %s" % words)
    if not words:
        raise ValueError("cannot select a word from an empty list")
    vectors = vstack(matutils.unitvec(self.syn0[self.vocab[word].index]) for word in words).astype(REAL)
    mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL)
    dists = dot(vectors, mean)
    return sorted(zip(dists, words))[0][1]
def sent_vec_similarity(self, sent_id1, sent_id2):
    """
    Compute cosine similarity between two sentences. `sent_id1` and `sent_id2` are
    the indices of the sentences in the training file.

    Example::

      >>> trained_model.sent_vec_similarity(sent_id1, sent_id1)
      1.0

      >>> trained_model.sent_vec_similarity(sent_id1, sent_id3)
      0.73

    """
    return dot(matutils.unitvec(self.sents[self.sent_no_hash[sent_id1]]),
               matutils.unitvec(self.sents[self.sent_no_hash[sent_id2]]))
def doesnt_match(self, words):
    """
    Which word from the given list doesn't go with the others?

    Example::

      >>> trained_model.doesnt_match("breakfast cereal dinner lunch".split())
      'cereal'

    """
    self.init_sims()

    used_words = [word for word in words if word in self]
    if len(used_words) != len(words):
        ignored_words = set(words) - set(used_words)
        logger.warning("vectors for words %s are not present in the model, ignoring these words", ignored_words)
    if not used_words:
        raise ValueError("cannot select a word from an empty list")
    vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
    mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL)
    dists = dot(vectors, mean)
    return sorted(zip(dists, used_words))[0][1]
def similarity(self, sent1, sent2):
    """
    Compute cosine similarity between two sentences. `sent1` and `sent2` are
    the indices of the sentences in the training file.

    Example::

      >>> trained_model.similarity(0, 0)
      1.0

      >>> trained_model.similarity(1, 3)
      0.73

    """
    return dot(matutils.unitvec(self.sents[sent1]), matutils.unitvec(self.sents[sent2]))
def most_similar(self, positive=[], negative=[], topn=10):
    """
    Find the top-N most similar words. Positive words contribute positively towards the
    similarity, negative words negatively.

    This method computes cosine similarity between a simple mean of the projection
    weight vectors of the given words, and corresponds to the `word-analogy` and
    `distance` scripts in the original word2vec implementation.

    Example::

      >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])
      [('queen', 0.50882536), ...]

    """
    self.init_sims()

    if isinstance(positive, basestring) and not negative:
        # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
        positive = [positive]

    # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
    positive = [(word, 1.0) if isinstance(word, basestring) else word for word in positive]
    negative = [(word, -1.0) if isinstance(word, basestring) else word for word in negative]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if word in self.vocab:
            mean.append(weight * matutils.unitvec(self.syn0[self.vocab[word].index]))
            all_words.add(self.vocab[word].index)
        else:
            raise KeyError("word '%s' not in vocabulary" % word)
    if not mean:
        raise ValueError("cannot compute similarity with no input")
    mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

    dists = dot(self.syn0norm, mean)
    if not topn:
        return dists
    best = argsort(dists)[::-1][:topn + len(all_words)]
    # ignore (don't return) words from the input
    result = [(self.index2word[sim], dists[sim]) for sim in best if sim not in all_words]
    return result[:topn]
def n_similarity(self, ws1, ws2):
    """
    Compute cosine similarity between two sets of words.

    Example::

      >>> trained_model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])
      0.61540466561049689

      >>> trained_model.n_similarity(['restaurant', 'japanese'], ['japanese', 'restaurant'])
      1.0000000000000004

      >>> trained_model.n_similarity(['sushi'], ['restaurant']) == trained_model.similarity('sushi', 'restaurant')
      True

    """
    v1 = [self[word] for word in ws1]
    v2 = [self[word] for word in ws2]
    return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0)))
def most_similar(self, positive=[], negative=[], topn=10, clip_start=0, clip_end=None):
    """
    Find the top-N most similar docvecs known from training. Positive docs contribute
    positively towards the similarity, negative docs negatively.

    This method computes cosine similarity between a simple mean of the projection
    weight vectors of the given docs. Docs may be specified as vectors, integer indexes
    of trained docvecs, or if the documents were originally presented with string tags,
    by the corresponding tags.

    The `clip_start` and `clip_end` parameters allow limiting results to a particular
    contiguous range of the underlying doctag_syn0norm vectors. (This may be useful if
    the ordering there was chosen to be significant, such as more popular tag IDs in
    lower indexes.)
    """
    self.init_sims()
    clip_end = clip_end or len(self.doctag_syn0norm)

    if isinstance(positive, string_types + integer_types) and not negative:
        # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
        positive = [positive]

    # add weights for each doc, if not already present; default to 1.0 for positive and -1.0 for negative docs
    positive = [
        (doc, 1.0) if isinstance(doc, string_types + (ndarray,) + integer_types) else doc
        for doc in positive
    ]
    negative = [
        (doc, -1.0) if isinstance(doc, string_types + (ndarray,) + integer_types) else doc
        for doc in negative
    ]

    # compute the weighted average of all docs
    all_docs, mean = set(), []
    for doc, weight in positive + negative:
        if isinstance(doc, ndarray):
            mean.append(weight * doc)
        elif doc in self.doctags or doc < self.count:
            mean.append(weight * self.doctag_syn0norm[self._int_index(doc)])
            all_docs.add(self._int_index(doc))
        else:
            raise KeyError("doc '%s' not in trained set" % doc)
    if not mean:
        raise ValueError("cannot compute similarity with no input")
    mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

    dists = dot(self.doctag_syn0norm[clip_start:clip_end], mean)
    if not topn:
        return dists
    best = matutils.argsort(dists, topn=topn + len(all_docs), reverse=True)
    # ignore (don't return) docs from the input
    result = [(self.index_to_doctag(sim), float(dists[sim])) for sim in best if sim not in all_docs]
    return result[:topn]
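# Hedged usage sketch (illustrative, not part of the original source): assumes a trained
# gensim Doc2Vec model with a document tagged 'doc_a'. The clip_start/clip_end arguments
# restrict the search to a contiguous slice of the trained doctag vectors, e.g. the
# first 10000 tags when tag IDs were assigned in order of popularity.
def _example_docvec_most_similar(model):
    top_all = model.docvecs.most_similar(positive=['doc_a'], topn=5)
    top_clipped = model.docvecs.most_similar(positive=['doc_a'], topn=5, clip_start=0, clip_end=10000)
    return top_all, top_clipped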
def n_similarity(self, ws1, ws2):
    """
    Compute cosine similarity between two sets of words.

    Example::

      >>> trained_model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])
      0.61540466561049689

      >>> trained_model.n_similarity(['restaurant', 'japanese'], ['japanese', 'restaurant'])
      1.0000000000000004

      >>> trained_model.n_similarity(['sushi'], ['restaurant']) == trained_model.similarity('sushi', 'restaurant')
      True

    """
    if not (len(ws1) and len(ws2)):
        raise ZeroDivisionError('At least one of the passed lists is empty.')
    v1 = [self[word] for word in ws1]
    v2 = [self[word] for word in ws2]
    return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0)))
def doesnt_match(self, docs):
    """
    Which doc from the given list doesn't go with the others?

    (TODO: Accept vectors of out-of-training-set docs, as if from inference.)
    """
    self.init_sims()

    docs = [doc for doc in docs if doc in self.doctags or 0 <= doc < self.count]  # filter out unknowns
    logger.debug("using docs %s" % docs)
    if not docs:
        raise ValueError("cannot select a doc from an empty list")
    vectors = vstack(self.doctag_syn0norm[self._int_index(doc)] for doc in docs).astype(REAL)
    mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL)
    dists = dot(vectors, mean)
    return sorted(zip(dists, docs))[0][1]
def object_predict(self, subject, relations, topn=10, chunksize=1000):
    """
    Find the top-N entities most similar to (subj * rel-1 * rel-2 * ... * rel-k).
    """
    self.init_sims_entity()
    if subject not in self.vocab:
        raise ValueError('entity %s not in vocabulary' % subject)

    start_time, next_report = time.time(), [1.0]
    if isinstance(relations, string_types):
        relations = [relations]

    # chain the relation matrices: m = rel-1 * rel-2 * ... * rel-k
    m = ones(self.rel_mat[0].shape)
    for rel in relations:
        if rel in self.vocab_rel:
            m = dot(m, self.rel_mat[self.vocab_rel[rel].index])
    # project the subject vector through the chained relations and normalize
    obj = dot(self.syn0[self.vocab[subject].index], m)
    obj = matutils.unitvec(obj).astype(REAL)

    jobs = Queue(maxsize=2 * self.workers)
    results = Queue()
    entities_count = [0]
    lock = threading.Lock()

    def worker_compute_dists():
        # each worker ranks one chunk of candidate entities against `obj`
        while True:
            start = jobs.get()
            if start is None:
                break
            end = min(start + chunksize, len(self.vocab))
            candidates = self.syn0norm[xrange(start, end)]
            dists = dot(candidates, obj)
            # topn = topn or end - start
            best = argsort(dists)[::-1][:topn + 1]
            sub_index = self.vocab[subject].index
            result = [(self.index2entity_name[sim + start], float(dists[sim]))
                      for sim in best if sim + start != sub_index]
            results.put(result)
            # elapsed = time.time() - start_time
            # with lock:
            #     entities_count[0] += end - start + 1
            #     if elapsed > next_report[0]:
            #         logger.info("PROGRESS: at %.2f%% entities, %.0f entities/s" %
            #                     (100 * entities_count[0] / len(self.vocab),
            #                      entities_count[0] / elapsed if elapsed else 0.0))
            #         next_report[0] = elapsed + 1.0

    workers = [threading.Thread(target=worker_compute_dists) for _ in range(self.workers)]
    for thread in workers:
        thread.daemon = True
        thread.start()

    group_num = len(self.vocab) / chunksize + (1 if len(self.vocab) % chunksize != 0 else 0)
    for start in xrange(group_num):
        jobs.put(start * chunksize)
    for _ in xrange(self.workers):
        jobs.put(None)  # sentinel: tell each worker to exit
    for thread in workers:
        thread.join()

    # merge the per-chunk candidate lists and keep the overall top-N
    candidates = []
    while not results.empty():
        result = results.get_nowait()
        candidates.extend(result)
    return sorted(candidates, key=lambda item: item[1], reverse=True)[:topn]
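# Hedged usage sketch (illustrative, not part of the original source): assumes `model`
# exposes object_predict as defined above, and that 'paris' is an entity and
# 'capital_of' / 'located_in' are relations present in its vocabularies. A single
# relation may be passed as a string; a list chains several relation matrices.
def _example_object_predict(model):
    single_hop = model.object_predict('paris', 'capital_of', topn=5)
    multi_hop = model.object_predict('paris', ['capital_of', 'located_in'], topn=5)
    return single_hop, multi_hop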
def init_sims(self):
    if getattr(self, 'syn0norm', None) is None:
        logger.info("precomputing L2-norms of word weight vectors")
        self.syn0norm = vstack(matutils.unitvec(vec) for vec in self.syn0).astype(REAL)
def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None, indexer=None):
    """
    Find the top-N most similar words. Positive words contribute positively towards the
    similarity, negative words negatively.

    This method computes cosine similarity between a simple mean of the projection
    weight vectors of the given words and the vectors for each word in the model.
    The method corresponds to the `word-analogy` and `distance` scripts in the original
    word2vec implementation.

    If topn is False, most_similar returns the vector of similarity scores.

    `restrict_vocab` is an optional integer which limits the range of vectors which
    are searched for most-similar values. For example, restrict_vocab=10000 would
    only check the first 10000 word vectors in the vocabulary order. (This may be
    meaningful if you've sorted the vocabulary by descending frequency.)

    Example::

      >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])
      [('queen', 0.50882536), ...]

    """
    self.init_sims()

    if isinstance(positive, string_types) and not negative:
        # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
        positive = [positive]

    # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
    positive = [
        (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word
        for word in positive
    ]
    negative = [
        (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word
        for word in negative
    ]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * self.word_vec(word, use_norm=True))
            if word in self.vocab:
                all_words.add(self.vocab[word].index)
    if not mean:
        raise ValueError("cannot compute similarity with no input")
    mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

    if indexer is not None:
        return indexer.most_similar(mean, topn)

    limited = self.syn0norm if restrict_vocab is None else self.syn0norm[:restrict_vocab]
    dists = dot(limited, mean)
    if not topn:
        return dists
    best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
    # ignore (don't return) words from the input
    result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
    return result[:topn]
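# Hedged usage sketch (illustrative, not part of the original source): assumes a trained
# gensim word-vector model with a frequency-sorted vocabulary, as in the docstring above.
# restrict_vocab limits candidates to the first N vectors; topn=False returns the raw
# vector of similarity scores instead of a ranked list.
def _example_restricted_most_similar(model):
    top = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=10, restrict_vocab=10000)
    scores = model.most_similar(positive=['king'], topn=False)  # full array of similarities
    return top, scores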