def similarity(self, w1, w2):
    """
    Compute cosine similarity between two words.

    Example::

      >>> trained_model.similarity('woman', 'man')
      0.73723527

      >>> trained_model.similarity('woman', 'woman')
      1.0

    """
    return dot(utils.unitvec(self[w1]), utils.unitvec(self[w2]))
@classmethod
def from_text(cls, fname, vocabUnicodeSize=78):
    """
    Create a WordVectors class based on a word2vec text file

    Parameters
    ----------
    fname : path to file
    vocabUnicodeSize : maximum string length for vocabulary entries (78, by default)

    Returns
    -------
    WordVectors instance
    """
    with open(fname, encoding='ISO-8859-1') as fin:
        header = fin.readline()
        vocab_size, vector_size = map(int, header.split())

        vocab = np.empty(vocab_size, dtype='<U%s' % vocabUnicodeSize)
        vectors = np.empty((vocab_size, vector_size), dtype=np.float64)
        for i, line in enumerate(fin):
            parts = line.strip().split(' ')
            word = parts[0]
            vector = np.array(parts[1:], dtype=np.float64)
            vocab[i] = word
            vectors[i] = unitvec(vector)  # store L2-normalized vectors
    return cls(vocab=vocab, vectors=vectors)
@classmethod
def from_binary(cls, fname, vocabUnicodeSize=78):
    """
    Create a WordVectors class based on a word2vec binary file

    Parameters
    ----------
    fname : path to file
    vocabUnicodeSize : maximum string length for vocabulary entries (78, by default)

    Returns
    -------
    WordVectors instance
    """
    with open(fname, 'rb') as fin:
        header = fin.readline()
        vocab_size, vector_size = map(int, header.split())

        vocab = np.empty(vocab_size, dtype='<U%s' % vocabUnicodeSize)
        vectors = np.empty((vocab_size, vector_size), dtype=np.float64)
        binary_len = np.dtype(np.float32).itemsize * vector_size
        for i in range(vocab_size):
            # read the word: characters up to the first space
            word = ''
            while True:
                ch = fin.read(1).decode('ISO-8859-1')
                if ch == ' ':
                    break
                word += ch
            vocab[i] = word

            # read the vector: vector_size little-endian float32 values
            vector = np.frombuffer(fin.read(binary_len), dtype=np.float32)
            vectors[i] = unitvec(vector)  # store L2-normalized vectors
            fin.read(1)  # newline
    return cls(vocab=vocab, vectors=vectors)
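# Minimal usage sketch (not part of the original code): load a pre-trained model
# with the classmethods above. The file name 'vectors.bin' is an assumed example,
# and `vocab`/`vectors` are the arrays passed to cls(...) above.
#
#     model = WordVectors.from_binary('vectors.bin')   # or WordVectors.from_text('vectors.txt')
#     print(model.vocab[:10])        # first ten vocabulary entries
#     print(model.vectors.shape)     # (vocab_size, vector_size), rows are unit-normalized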
def most_similar(self, positive=[], negative=[], topn=10):
    """
    Find the top-N most similar words. Positive words contribute positively towards the
    similarity, negative words negatively.

    This method computes cosine similarity between a simple mean of the projection
    weight vectors of the given words, and corresponds to the `word-analogy` and
    `distance` scripts in the original word2vec implementation.

    Example::

      >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])
      [('queen', 0.50882536), ...]

    """
    self.init_sims()

    if isinstance(positive, str) and not negative:
        # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
        positive = [positive]

    # add weights for each word, if not already present;
    # default to 1.0 for positive and -1.0 for negative words
    positive = [(word, 1.0) if isinstance(word, str) else word for word in positive]
    negative = [(word, -1.0) if isinstance(word, str) else word for word in negative]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if word in self.vocab:
            mean.append(weight * utils.unitvec(self.syn0[self.vocab[word].index]))
            all_words.add(self.vocab[word].index)
        else:
            raise KeyError("word '%s' not in vocabulary" % word)
    if not mean:
        raise ValueError("cannot compute similarity with no input")
    mean = utils.unitvec(array(mean).mean(axis=0)).astype(REAL)

    dists = dot(self.syn0norm, mean)
    if not topn:
        return dists
    best = argsort(dists)[::-1][:topn + len(all_words)]
    # ignore (don't return) words from the input
    result = [(self.index2word[sim], dists[sim]) for sim in best if sim not in all_words]
    return result[:topn]
def doesnt_match(self, words):
    """
    Which word from the given list doesn't go with the others?

    Example::

      >>> trained_model.doesnt_match("breakfast cereal dinner lunch".split())
      'cereal'

    """
    words = [word for word in words if word in self.vocab]  # filter out OOV words
    logger.debug("using words %s" % words)
    if not words:
        raise ValueError("cannot select a word from an empty list")
    # which word vector representation is furthest away from the mean?
    vectors = vstack([utils.unitvec(self.syn0[self.vocab[word].index]) for word in words]).astype(REAL)
    mean = utils.unitvec(vectors.mean(axis=0)).astype(REAL)
    dists = dot(vectors, mean)
    return sorted(zip(dists, words))[0][1]
def store_vector(self, v, data=None):
    # Write out the current batch once it is full
    if self._next_vector == self._batch_size:
        self.flush()

    # Store the vector and data locally
    self._data.append(data)
    i = self._next_vector
    self._matrix[i, :] = v
    self._norm_matrix[i, :] = utils.unitvec(v)  # also keep an L2-normalized copy
    self._next_vector += 1
parser.add_argument('-p', '--positive', required=True,
                    help='positive word seeds file')
parser.add_argument('-n', '--negative', required=True,
                    help='negative word seeds file')
parser.add_argument('-r', '--ratio', required=False, type=float, default=1.0,
                    help='sample ratio')
parser.add_argument('-k', '--number', required=False, type=int, default=10,
                    help='number of components to keep')
parser.add_argument('-c', '--components', required=True,
                    help='output principal components')
parser.add_argument('-s', '--similarity', required=False, default="cosine",
                    help='similarity metric: cosine, dot')
parser.add_argument('-i', '--incomponents', required=False,
                    help='input subspace components (.npy)')
args = parser.parse_args()

vsm = utils.VSM(args.type, args.model, args.incomponents)

# keep only seed words that are present in the vector space model
positive_words = set(line.strip() for line in codecs.open(args.positive, 'rb', 'utf8')
                     if line.strip() in vsm)
negative_words = set(line.strip() for line in codecs.open(args.negative, 'rb', 'utf8')
                     if line.strip() in vsm)

vsm_array = vsm.get_array(list(positive_words) + list(negative_words))
X = stack(vsm_array)
if args.similarity == "cosine":
    for i in range(X.shape[0]):
        X[i] = utils.unitvec(X[i])

pca = PCA(n_components=args.number)
pca.fit(shuffle(X, n_samples=int(len(vsm_array) * args.ratio)))
print('explained variance ratio: %s' % str(pca.explained_variance_ratio_))

# flip each component so that, on average, it points towards the positive seed words
for i in range(args.number):
    positive_sum = 0
    for x in X[0:len(positive_words)]:
        positive_sum += dot(pca.components_[i], x)
    if positive_sum < 0:
        pca.components_[i] = -pca.components_[i]

save(args.components, pca.components_)
parser.add_argument('-s', '--similarity', required=False, default="cosine",
                    help='similarity metric: cosine, dot')
args = parser.parse_args()

vsm = utils.VSM(args.type, args.model)

# collect difference vectors for every distinct word pair present in the model
words = set()
pair_dist = {}
for line in codecs.open(args.pairs, 'rb', 'utf8'):
    segs = line.strip().split("\t")
    if (segs[0], segs[1]) not in pair_dist and segs[0] in vsm and segs[1] in vsm:
        dist = vsm[segs[0]] - vsm[segs[1]]
        pair_dist[(segs[0], segs[1])] = dist
        pair_dist[(segs[1], segs[0])] = -dist
        words.add(segs[0])
        words.add(segs[1])
print("%d distinct pairs were found (%d word types)." % (len(pair_dist) // 2, len(words)))

if args.similarity == "cosine":
    for key in pair_dist:
        pair_dist[key] = utils.unitvec(pair_dist[key])

pca = PCA(n_components=args.number)
pca.fit(shuffle(list(pair_dist.values()),
                n_samples=int(len(pair_dist) * args.ratio)))
print('explained variance ratio: %s' % str(pca.explained_variance_ratio_))
save(args.components, pca.components_)
parser.add_argument('-d', '--debiasing', required=False, type=bool, default=False,
                    help='debiasing: True/False')
parser.add_argument('-e', '--neutral', required=False, default="at",
                    help='neutral word')
parser.add_argument('-s', '--similarity', required=False, default="cosine",
                    help='similarity metric: cosine, dot')
parser.add_argument('-i', '--incomponents', required=False,
                    help='input subspace components (.npy)')
args = parser.parse_args()

vsm = utils.VSM(args.type, args.model, args.incomponents)
u = load(args.coefficient)[0]

if args.similarity == "cosine":
    utils.print_lexical_scores(args.vocabulary, args.debiasing, args.neutral, vsm,
                               lambda token: dot(u, utils.unitvec(vsm[token])))
elif args.similarity == "dot":
    utils.print_lexical_scores(args.vocabulary, args.debiasing, args.neutral, vsm,
                               lambda token: dot(u, vsm[token]))