Example #1
    def similarity(self, w1, w2):
        """
        Compute cosine similarity between two words.

        Example::

          >>> trained_model.similarity('woman', 'man')
          0.73723527

          >>> trained_model.similarity('woman', 'woman')
          1.0

        """
        return dot(utils.unitvec(self[w1]), utils.unitvec(self[w2]))
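All of these examples lean on a `unitvec` helper. A minimal sketch of what it presumably does (L2-normalize a vector, in the spirit of gensim's `matutils.unitvec`; this is an illustration, not the library code):

    import numpy as np

    def unitvec(vec):
        # Scale a vector to unit length; leave an all-zero vector unchanged.
        norm = np.linalg.norm(vec)
        return vec / norm if norm > 0 else vec

    # Cosine similarity of two unit vectors is just their dot product.
    a, b = np.array([1.0, 2.0, 3.0]), np.array([2.0, 1.0, 0.5])
    cosine = np.dot(unitvec(a), unitvec(b))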
Example #2
    def from_text(cls, fname, vocabUnicodeSize=78):
        """
        Create a WordVectors class based on a word2vec text file

        Parameters
        ----------
        fname : str
            Path to the word2vec text file.
        vocabUnicodeSize : int, optional
            Maximum unicode length reserved for each vocabulary word (default 78).

        Returns
        -------
        WordVectors instance
        """
        with open(fname, encoding='ISO-8859-1') as fin:
            header = fin.readline()
            vocab_size, vector_size = map(int, header.split())

            vocab = np.empty(vocab_size, dtype='<U%s' % vocabUnicodeSize)
            vectors = np.empty((vocab_size, vector_size), dtype=np.float64)
            for i, line in enumerate(fin):
                line = line.strip()
                parts = line.split(' ')
                word = parts[0]
                vector = np.array(parts[1:], dtype=np.float64)
                vocab[i] = word
                vectors[i] = unitvec(vector)

        return cls(vocab=vocab, vectors=vectors)
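The loader above expects word2vec's plain-text format: a header line `vocab_size vector_size`, then one word per line followed by its space-separated components. A small sketch that writes such a toy file so the reader can be exercised (file name and values are illustrative):

    import numpy as np

    toy = {'king': [0.1, 0.2, 0.3], 'queen': [0.1, 0.25, 0.28]}
    with open('toy_vectors.txt', 'w', encoding='ISO-8859-1') as fout:
        fout.write('%d %d\n' % (len(toy), 3))          # header: vocab_size vector_size
        for word, vec in toy.items():
            fout.write(word + ' ' + ' '.join('%f' % x for x in vec) + '\n')

    # model = WordVectors.from_text('toy_vectors.txt')  # rows come back unit-normalized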
Example #3
    def from_binary(cls, fname, vocabUnicodeSize=78):
        """
        Create a WordVectors class based on a word2vec binary file

        Parameters
        ----------
        fname : str
            Path to the word2vec binary file.
        vocabUnicodeSize : int, optional
            Maximum unicode length reserved for each vocabulary word (default 78).

        Returns
        -------
        WordVectors instance
        """
        with open(fname, 'rb') as fin:
            header = fin.readline()
            vocab_size, vector_size = map(int, header.split())

            vocab = np.empty(vocab_size, dtype='<U%s' % vocabUnicodeSize)
            vectors = np.empty((vocab_size, vector_size), dtype=np.float64)
            binary_len = np.dtype(np.float32).itemsize * vector_size
            for i in range(vocab_size):
                # read word: characters up to the separating space
                word = ''
                while True:
                    ch = fin.read(1).decode('ISO-8859-1')
                    if ch == ' ':
                        break
                    word += ch
                vocab[i] = word

                # read vector: vector_size float32 values
                vector = np.frombuffer(fin.read(binary_len), dtype=np.float32)
                vectors[i] = unitvec(vector)
                fin.read(1)  # newline

        return cls(vocab=vocab, vectors=vectors)
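The binary reader assumes the layout word2vec produces with `-binary 1`: the same ASCII header, then for each entry the word, a space, `vector_size` float32 values, and a trailing newline byte. A hedged sketch that writes a matching toy file (names and values are illustrative):

    import numpy as np

    toy = {'king': [0.1, 0.2, 0.3], 'queen': [0.1, 0.25, 0.28]}
    with open('toy_vectors.bin', 'wb') as fout:
        fout.write(('%d %d\n' % (len(toy), 3)).encode('ISO-8859-1'))
        for word, vec in toy.items():
            fout.write(word.encode('ISO-8859-1') + b' ')
            fout.write(np.asarray(vec, dtype=np.float32).tobytes())
            fout.write(b'\n')

    # model = WordVectors.from_binary('toy_vectors.bin')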
Example #4
    def most_similar(self, positive=[], negative=[], topn=10):
        """
        Find the top-N most similar words. Positive words contribute positively towards the
        similarity, negative words negatively.

        This method computes cosine similarity between a simple mean of the projection
        weight vectors of the given words and the vectors for each word in the model.
        The method corresponds to the `word-analogy` and `distance` scripts in the
        original word2vec implementation.

        Example::

          >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])
          [('queen', 0.50882536), ...]

        """
        self.init_sims()

        if isinstance(positive, str) and not negative:
            # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
            positive = [positive]

        # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
        positive = [(word, 1.0) if isinstance(word, str) else word for word in positive]
        negative = [(word, -1.0) if isinstance(word, str) else word for word in negative]

        # compute the weighted average of all words
        all_words, mean = set(), []
        for word, weight in positive + negative:
            if word in self.vocab:
                mean.append(weight * utils.unitvec(self.syn0[self.vocab[word].index]))
                all_words.add(self.vocab[word].index)
            else:
                raise KeyError("word '%s' not in vocabulary" % word)
        if not mean:
            raise ValueError("cannot compute similarity with no input")
        mean = utils.unitvec(array(mean).mean(axis=0)).astype(REAL)

        dists = dot(self.syn0norm, mean)
        if not topn:
            return dists
        best = argsort(dists)[::-1][:topn + len(all_words)]
        # ignore (don't return) words from the input
        result = [(self.index2word[sim], dists[sim]) for sim in best if sim not in all_words]
        return result[:topn]
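Stripped of the class plumbing, the computation is: unit-normalize the input vectors, average them with +1/-1 weights, re-normalize the mean, then rank every row of the normalized embedding matrix by its dot product with that mean. A minimal numpy sketch under those assumptions (the function and argument names here are illustrative, not the library's):

    import numpy as np

    def most_similar_sketch(syn0norm, vocab, index2word, positive, negative, topn=10):
        # syn0norm: (vocab_size, dim) matrix of unit-length rows; vocab: word -> row index
        idx = [vocab[w] for w in positive + negative]
        signs = [1.0] * len(positive) + [-1.0] * len(negative)
        mean = np.mean([s * syn0norm[i] for s, i in zip(signs, idx)], axis=0)
        mean /= np.linalg.norm(mean)
        dists = syn0norm @ mean                      # cosine similarity to every word
        best = np.argsort(dists)[::-1]
        return [(index2word[i], float(dists[i])) for i in best if i not in idx][:topn]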
Example #5
    def doesnt_match(self, words):
        """
        Which word from the given list doesn't go with the others?

        Example::

          >>> trained_model.doesnt_match("breakfast cereal dinner lunch".split())
          'cereal'

        """
        words = [word for word in words if word in self.vocab]  # filter out OOV words
        logger.debug("using words %s" % words)
        if not words:
            raise ValueError("cannot select a word from an empty list")
        # which word vector representation is furthest away from the mean?
        vectors = vstack([utils.unitvec(self.syn0[self.vocab[word].index]) for word in words]).astype(REAL)
        mean = utils.unitvec(vectors.mean(axis=0)).astype(REAL)
        dists = dot(vectors, mean)
        return sorted(zip(dists, words))[0][1]
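The same idea with plain numpy: normalize each vector, take the re-normalized mean of the group, and return the word whose cosine similarity to that mean is smallest. A sketch assuming `vectors` is a list of 1-D arrays aligned with `words`:

    import numpy as np

    def doesnt_match_sketch(vectors, words):
        V = np.vstack([v / np.linalg.norm(v) for v in vectors])
        mean = V.mean(axis=0)
        mean /= np.linalg.norm(mean)
        return words[int(np.argmin(V @ mean))]   # furthest from the group mean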
Example #6
    def store_vector(self, v, data=None):
        if self._next_vector == self._batch_size:
            self.flush()

        # Store the vector and data locally
        self._data.append(data)
        i = self._next_vector
        self._matrix[i, :] = v
        self._norm_matrix[i, :] = utils.unitvec(v)
        self._next_vector += 1
 parser.add_argument('-p', '--positive', required=True, help='positive word seeds file')
 parser.add_argument('-n', '--negative', required=True, help='negative word seeds file')
 parser.add_argument('-r', '--ratio', required=False, type=float, default=1.0, help='sample ratio')
 parser.add_argument('-k', '--number', required=False, type=int, default=10, help='number of components to keep')
 parser.add_argument('-c', '--components', required=True, help='output principal components')
 parser.add_argument('-s', '--similarity', required=False, default="cosine", help='similarity metric: cosine, dot')
 parser.add_argument('-i', '--incomponents', required=False, help='input subspace components (.npy)')
 args = parser.parse_args()
 
 vsm = utils.VSM(args.type, args.model, args.incomponents)
 positive_words = set(line.strip() for line in codecs.open(args.positive,'rb','utf8') if line.strip() in vsm)
 negative_words = set(line.strip() for line in codecs.open(args.negative,'rb','utf8') if line.strip() in vsm)
 vsm_array = vsm.get_array(list(positive_words)+list(negative_words))
 X = stack(vsm_array)
 
 if args.similarity == "cosine":
     for i in range(X.shape[0]):
         X[i] = utils.unitvec(X[i])
 
 pca = PCA(n_components=args.number)
 pca.fit(shuffle(X, n_samples=int(len(vsm_array)*args.ratio)))
 print('explained variance ratio: %s' % str(pca.explained_variance_ratio_))
 
 for i in range(args.number):
     positive_sum = 0
     for x in X[0:len(positive_words)]:
         positive_sum += dot(pca.components_[i], x)
     if positive_sum < 0:
         pca.components_[i] = -pca.components_[i]
 
 save(args.components, pca.components_)
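In outline, the script above stacks the seed-word vectors (positive seeds first), optionally unit-normalizes the rows for cosine geometry, fits PCA on a shuffled sample, flips any principal component that points away from the positive seeds, and saves the components. A condensed sketch of those steps with scikit-learn (function and argument names here are illustrative):

    import numpy as np
    from sklearn.decomposition import PCA

    def fit_seed_components(pos_vecs, neg_vecs, n_components=10, cosine=True):
        X = np.vstack([pos_vecs, neg_vecs]).astype(np.float64)
        if cosine:
            X /= np.linalg.norm(X, axis=1, keepdims=True)    # unit-normalize rows
        pca = PCA(n_components=n_components).fit(X)
        for i in range(n_components):
            # orient each component toward the positive seed words
            if (X[:len(pos_vecs)] @ pca.components_[i]).sum() < 0:
                pca.components_[i] = -pca.components_[i]
        return pca.components_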
Example #9
                        required=False,
                        default="cosine",
                        help='similarity metric: cosine, dot')
    args = parser.parse_args()

    vsm = utils.VSM(args.type, args.model)
    words = set()
    pair_dist = {}
    for line in codecs.open(args.pairs, 'rb', 'utf8'):
        segs = line.strip().split("\t")
        if (segs[0], segs[1]
            ) not in pair_dist and segs[0] in vsm and segs[1] in vsm:
            dist = vsm[segs[0]] - vsm[segs[1]]
            pair_dist[(segs[0], segs[1])] = dist
            pair_dist[(segs[1], segs[0])] = -dist
            words.add(segs[0])
            words.add(segs[1])
    print "%d distinct pairs were found (%d word types)." % (len(pair_dist) /
                                                             2, len(words))

    if args.similarity == "cosine":
        for key in pair_dist:
            pair_dist[key] = utils.unitvec(pair_dist[key])

    pca = PCA(n_components=args.number)
    pca.fit(
        shuffle(list(pair_dist.values()),
                n_samples=int(len(pair_dist) * args.ratio)))
    print('explained variance ratio: %s' % str(pca.explained_variance_ratio_))

    save(args.components, pca.components_)
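Here the PCA is fit not to the word vectors themselves but to pair difference vectors, with each pair contributing both `a - b` and `b - a`, optionally unit-normalized for cosine. A minimal sketch under those assumptions (`pairs` as a list of `(vec_a, vec_b)` tuples is illustrative):

    import numpy as np
    from sklearn.decomposition import PCA

    def fit_pair_components(pairs, n_components=10, cosine=True):
        diffs = []
        for a, b in pairs:
            d = a - b
            diffs.extend([d, -d])            # both directions, as in the script above
        X = np.vstack(diffs).astype(np.float64)
        if cosine:
            X /= np.linalg.norm(X, axis=1, keepdims=True)
        return PCA(n_components=n_components).fit(X).components_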
Example #10
    parser.add_argument('-d',
                        '--debiasing',
                        required=False,
                        action='store_true',
                        default=False,
                        help='debiasing: pass the flag to enable')
    parser.add_argument('-e',
                        '--neutral',
                        required=False,
                        default="at",
                        help='neutral word')
    parser.add_argument('-s',
                        '--similarity',
                        required=False,
                        default="cosine",
                        help='similarity metric: cosine, dot')
    parser.add_argument('-i',
                        '--incomponents',
                        required=False,
                        help='input subspace components (.npy)')
    args = parser.parse_args()

    vsm = utils.VSM(args.type, args.model, args.incomponents)
    u = load(args.coefficient)[0]

    if args.similarity == "cosine":
        utils.print_lexical_scores(args.vocabulary, args.debiasing, args.neutral, vsm,
                                   lambda token: dot(u, utils.unitvec(vsm[token])))
    elif args.similarity == "dot":
        utils.print_lexical_scores(args.vocabulary, args.debiasing, args.neutral, vsm,
                                   lambda token: dot(u, vsm[token]))
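The final step scores each vocabulary token by projecting its vector onto the loaded coefficient `u`, either after unit-normalizing the token vector (cosine) or as a raw dot product. A hedged sketch of that scoring loop (the `vectors` mapping and function name are illustrative):

    import numpy as np

    def score_tokens(u, vectors, tokens, cosine=True):
        scores = {}
        for token in tokens:
            v = vectors[token]
            if cosine:
                v = v / np.linalg.norm(v)    # cosine: project the unit vector onto u
            scores[token] = float(np.dot(u, v))
        return scores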