Example 1
    def most_similar_cosmul(self, positive=[], negative=[], topn=10):
        """
        Find the top-N most similar words, using the multiplicative combination objective
        proposed by Omer Levy and Yoav Goldberg in [4]_. Positive words still contribute
        positively towards the similarity, negative words negatively, but with less
        susceptibility to one large distance dominating the calculation.

        In the common analogy-solving case of two positive and one negative example,
        this method is equivalent to the "3CosMul" objective (equation (4)) of Levy and Goldberg.

        Additional positive or negative examples contribute to the numerator or denominator,
        respectively – a potentially sensible but untested extension of the method. (With
        a single positive example, rankings will be the same as in the default most_similar.)

        Example::

          >>> trained_model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london'])
          [(u'iraq', 0.8488819003105164), ...]

        .. [4] Omer Levy and Yoav Goldberg. Linguistic Regularities in Sparse and Explicit Word Representations, 2014.

        """
        self.init_sims()

        if isinstance(positive, string_types) and not negative:
            # allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog'])
            positive = [positive]

        all_words = set([
            self.vocab[word].index for word in positive + negative
            if not isinstance(word, ndarray) and word in self.vocab
        ])

        positive = [
            self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word
            for word in positive
        ]
        negative = [
            self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word
            for word in negative
        ]

        if not positive:
            raise ValueError("cannot compute similarity with no input")

        # equation (4) of Levy & Goldberg "Linguistic Regularities...",
        # with distances shifted to [0,1] per footnote (7)
        pos_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in positive]
        neg_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in negative]
        dists = prod(pos_dists, axis=0) / (prod(neg_dists, axis=0) + 0.000001)

        if not topn:
            return dists
        best = matutils.argsort(dists,
                                topn=topn + len(all_words),
                                reverse=True)
        # ignore (don't return) words from the input
        result = [(self.index2word[sim], float(dists[sim])) for sim in best
                  if sim not in all_words]
        return result[:topn]
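
To make the shifted-cosine arithmetic of equation (4) concrete, here is a minimal, self-contained NumPy sketch of the 3CosMul score on toy unit vectors. The `cosmul_scores` helper, the tiny vocabulary, and the random vectors are illustrative assumptions, not part of the snippet above, and unlike the method above this sketch does not filter the input words out of the results.

import numpy as np

def cosmul_scores(vectors, positive, negative, eps=1e-6):
    """Score every row of `vectors` with the 3CosMul objective (cosines shifted to [0, 1])."""
    pos = [(1 + vectors @ p) / 2 for p in positive]
    neg = [(1 + vectors @ n) / 2 for n in negative]
    return np.prod(pos, axis=0) / (np.prod(neg, axis=0) + eps)

# toy, already L2-normalised "word" vectors (made-up data)
rng = np.random.default_rng(0)
vocab = ['iraq', 'france', 'river', 'baghdad', 'england', 'london']
vectors = rng.normal(size=(len(vocab), 8))
vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)
idx = {w: i for i, w in enumerate(vocab)}

scores = cosmul_scores(vectors,
                       positive=[vectors[idx['baghdad']], vectors[idx['england']]],
                       negative=[vectors[idx['london']]])
for i in np.argsort(-scores)[:3]:
    print(vocab[i], float(scores[i]))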
Example 2
def find_nn(i, syn0norm):
    """Return the 10 nearest neighbours of row `i` in the L2-normalised matrix `syn0norm`."""
    topn = 10
    # cosine similarity of row i against every row (rows are unit length)
    dists = dot(syn0norm, syn0norm[i])
    if not topn:
        # unreachable while topn is hard-coded above; kept from the most_similar pattern
        return dists
    # take one extra candidate because the query row itself ranks first
    best = matutils.argsort(dists, topn=topn + 1, reverse=True)
    # skip the query row; return (row index, similarity) pairs
    result = [(sim, float(dists[sim])) for sim in best[1:]]
    return result[:topn]
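
A short usage sketch for `find_nn`, assuming it is defined in the same session and that NumPy's `dot` and gensim's `matutils` are imported as below; the toy matrix is made-up data standing in for a real `syn0norm`.

import numpy as np
from numpy import dot          # find_nn expects `dot` in scope
from gensim import matutils    # and gensim's matutils.argsort

# toy stand-in for syn0norm: 50 unit-length vectors of dimension 20 (made-up data)
rng = np.random.default_rng(1)
syn0norm = rng.normal(size=(50, 20))
syn0norm /= np.linalg.norm(syn0norm, axis=1, keepdims=True)

neighbours = find_nn(3, syn0norm)   # ten (row index, cosine similarity) pairs
print(neighbours[0])                # nearest row to row 3 and its score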
Example 3
def find_nn_idx(i, syn0norm, result_idxes):
    """Like find_nn, but also writes the neighbour row indexes into the caller-supplied `result_idxes`."""
    topn = 10
    # cosine similarity of row i against every row (rows are unit length)
    dists = dot(syn0norm, syn0norm[i])
    if not topn:
        return dists
    # take one extra candidate because the query row itself ranks first
    best = matutils.argsort(dists, topn=topn + 1, reverse=True)
    # skip the query row
    result = best[1:]
    # fill the caller's array in place; rebinding the parameter would not reach the caller
    result_idxes[:] = result[:topn]
    return result[:topn]
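
Under the same assumptions as the previous sketch, the variant below pre-allocates the output index array so that `find_nn_idx` can fill it in place; the data is again made up for illustration.

import numpy as np

rng = np.random.default_rng(2)
syn0norm = rng.normal(size=(50, 20))
syn0norm /= np.linalg.norm(syn0norm, axis=1, keepdims=True)

result_idxes = np.empty(10, dtype=np.int64)   # caller-owned buffer for the neighbour indexes
find_nn_idx(7, syn0norm, result_idxes)
print(result_idxes)                           # indexes of the ten rows nearest to row 7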
Example 4
    def most_similar(self, positive=[], negative=[], topn=10, clip_start=0, clip_end=None):
        """
        Find the top-N most similar docvecs known from training. Positive docs contribute
        positively towards the similarity, negative docs negatively.

        This method computes cosine similarity between a simple mean of the projection
        weight vectors of the given docs. Docs may be specified as vectors, integer indexes
        of trained docvecs, or if the documents were originally presented with string tags,
        by the corresponding tags.

        The 'clip_start' and 'clip_end' arguments allow limiting results to a particular contiguous
        range of the underlying doctag_syn0norm vectors. (This may be useful if the ordering
        there was chosen to be significant, such as more popular tag IDs in lower indexes.)
        """
        self.init_sims()
        clip_end = clip_end or len(self.doctag_syn0norm)

        if isinstance(positive, string_types + integer_types) and not negative:
            # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
            positive = [positive]

        # add weights for each doc, if not already present; default to 1.0 for positive and -1.0 for negative docs
        positive = [
            (doc, 1.0) if isinstance(doc, string_types + (ndarray,) + integer_types)
            else doc for doc in positive
        ]
        negative = [
            (doc, -1.0) if isinstance(doc, string_types + (ndarray,) + integer_types)
            else doc for doc in negative
        ]

        # compute the weighted average of all docs
        all_docs, mean = set(), []
        for doc, weight in positive + negative:
            if isinstance(doc, ndarray):
                mean.append(weight * doc)
            elif doc in self.doctags or doc < self.count:
                mean.append(weight * self.doctag_syn0norm[self._int_index(doc)])
                all_docs.add(self._int_index(doc))
            else:
                raise KeyError("doc '%s' not in trained set" % doc)
        if not mean:
            raise ValueError("cannot compute similarity with no input")
        mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

        dists = dot(self.doctag_syn0norm[clip_start:clip_end], mean)
        if not topn:
            return dists
        best = matutils.argsort(dists, topn=topn + len(all_docs), reverse=True)
        # ignore (don't return) docs from the input
        result = [(self.index_to_doctag(sim), float(dists[sim])) for sim in best if sim not in all_docs]
        return result[:topn]
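
A brief usage sketch: it assumes a trained gensim Doc2Vec model bound to a variable `model`, with documents tagged by string tags such as 'doc_42'; both names are illustrative and not taken from the snippet above.

# docs can be given as string tags, integer indexes, or raw vectors
sims = model.docvecs.most_similar(positive=['doc_42'], topn=5)

# restrict candidates to the first 1000 doctag vectors via clip_start/clip_end
sims_head = model.docvecs.most_similar(positive=[0], clip_start=0, clip_end=1000)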
Example 5
    def accuracy(self,
                 questions,
                 restrict_vocab=30000,
                 most_similar=most_similar,
                 case_insensitive=True):
        """
        Compute accuracy of the model. `questions` is a filename where lines are
        4-tuples of words, split into sections by ": SECTION NAME" lines.
        See questions-words.txt in https://storage.googleapis.com/google-model-archive-source/v2/model.google.com/word2vec/source-archive.zip for an example.

        The accuracy is reported (=printed to log and returned as a list) for each
        section separately, plus there's one aggregate summary at the end.

        Use `restrict_vocab` to ignore all questions containing a word not in the first `restrict_vocab`
        words (default 30,000). This may be meaningful if you've sorted the vocabulary by descending frequency.
        If `case_insensitive` is True, the first `restrict_vocab` words are selected first, and
        case normalization is performed afterwards.

        Use `case_insensitive` to convert all words in questions and vocab to their uppercase form before
        evaluating the accuracy (default True). Useful in case of case-mismatch between training tokens
        and question words. In case of multiple case variants of a single word, the vector for the first
        occurrence (also the most frequent if vocabulary is sorted) is taken.

        This method corresponds to the `compute-accuracy` script of the original C word2vec.

        """
        ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
        # reversed() so that, after uppercasing, the first (most frequent) case variant wins
        ok_vocab = (dict((w.upper(), v) for w, v in reversed(ok_vocab))
                    if case_insensitive else dict(ok_vocab))

        sections, section = [], None
        for line_no, line in enumerate(utils.smart_open(questions)):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            line = utils.to_unicode(line)
            if line.startswith(': '):
                # a new section starts => store the old section
                if section:
                    sections.append(section)
                    self.log_accuracy(section)
                section = {
                    'section': line.lstrip(': ').strip(),
                    'correct': [],
                    'incorrect': []
                }
            else:
                if not section:
                    raise ValueError(
                        "missing section header before line #%i in %s" %
                        (line_no, questions))
                try:
                    if case_insensitive:
                        a, b, c, expected = [
                            word.upper() for word in line.split()
                        ]
                    else:
                        a, b, c, expected = [word for word in line.split()]
                except ValueError:
                    # a line that does not split into exactly four words is malformed
                    logger.info("skipping invalid line #%i in %s",
                                line_no, questions)
                    continue
                if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                    logger.debug("skipping line #%i with OOV words: %s",
                                 line_no, line.strip())
                    continue

                original_vocab = self.vocab
                self.vocab = ok_vocab
                ignore = set([a, b, c])  # input words to be ignored
                predicted = None
                # find the most likely prediction, ignoring OOV words and input words
                sims = most_similar(self,
                                    positive=[b, c],
                                    negative=[a],
                                    topn=False,
                                    restrict_vocab=restrict_vocab)
                self.vocab = original_vocab
                for index in matutils.argsort(sims, reverse=True):
                    predicted = (self.index2word[index].upper()
                                 if case_insensitive else self.index2word[index])
                    if predicted in ok_vocab and predicted not in ignore:
                        if predicted != expected:
                            logger.debug("%s: expected %s, predicted %s",
                                         line.strip(), expected, predicted)
                        break
                if predicted == expected:
                    section['correct'].append((a, b, c, expected))
                else:
                    section['incorrect'].append((a, b, c, expected))
        if section:
            # store the last section, too
            sections.append(section)
            self.log_accuracy(section)

        total = {
            'section': 'total',
            'correct': sum((s['correct'] for s in sections), []),
            'incorrect': sum((s['incorrect'] for s in sections), []),
        }
        self.log_accuracy(total)
        sections.append(total)
        return sections
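
To illustrate the questions file layout that `accuracy` expects, a hedged sketch follows: each section starts with a ': SECTION NAME' line and each question is a four-word analogy line. The file name and the `model` variable are placeholders for a real trained model.

# questions.txt (same layout as word2vec's questions-words.txt):
#
#   : capital-common-countries
#   Athens Greece Baghdad Iraq
#   Athens Greece Berlin Germany
#   : family
#   boy girl brother sister

sections = model.accuracy('questions.txt', restrict_vocab=30000, case_insensitive=True)
total = sections[-1]        # the aggregate 'total' section is appended last
print(len(total['correct']), len(total['incorrect']))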
Example 6
    def most_similar(self,
                     positive=[],
                     negative=[],
                     topn=10,
                     restrict_vocab=None,
                     indexer=None):
        """
        Find the top-N most similar words. Positive words contribute positively towards the
        similarity, negative words negatively.

        This method computes cosine similarity between a simple mean of the projection
        weight vectors of the given words and the vectors for each word in the model.
        The method corresponds to the `word-analogy` and `distance` scripts in the original
        word2vec implementation.

        If topn is False, most_similar returns the vector of similarity scores.

        `restrict_vocab` is an optional integer which limits the range of vectors which
        are searched for most-similar values. For example, restrict_vocab=10000 would
        only check the first 10000 word vectors in the vocabulary order. (This may be
        meaningful if you've sorted the vocabulary by descending frequency.)

        Example::

          >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])
          [('queen', 0.50882536), ...]

        """
        self.init_sims()

        if isinstance(positive, string_types) and not negative:
            # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
            positive = [positive]

        # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
        positive = [
            (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word
            for word in positive
        ]
        negative = [
            (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word
            for word in negative
        ]

        # compute the weighted average of all words
        all_words, mean = set(), []
        for word, weight in positive + negative:
            if isinstance(word, ndarray):
                mean.append(weight * word)
            else:
                mean.append(weight * self.word_vec(word, use_norm=True))
                if word in self.vocab:
                    all_words.add(self.vocab[word].index)
        if not mean:
            raise ValueError("cannot compute similarity with no input")
        mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

        if indexer is not None:
            return indexer.most_similar(mean, topn)

        limited = self.syn0norm if restrict_vocab is None else self.syn0norm[:restrict_vocab]
        dists = dot(limited, mean)
        if not topn:
            return dists
        best = matutils.argsort(dists,
                                topn=topn + len(all_words),
                                reverse=True)
        # ignore (don't return) words from the input
        result = [(self.index2word[sim], float(dists[sim])) for sim in best
                  if sim not in all_words]
        return result[:topn]
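
The docstring above describes the core technique as ranking every vector by cosine similarity against the unit-normalised mean of the weighted query vectors. A minimal, self-contained NumPy sketch of that idea on made-up data follows; `mean_vector_ranking` is an illustrative helper, not gensim API.

import numpy as np

def mean_vector_ranking(vectors, positive_idx, negative_idx):
    """Rank rows of a unit-normalised matrix by cosine similarity to the mean query vector."""
    query = [vectors[i] for i in positive_idx] + [-vectors[i] for i in negative_idx]
    mean = np.mean(query, axis=0)
    mean /= np.linalg.norm(mean)        # unit-normalise the combined query
    dists = vectors @ mean              # cosine similarity against every row
    return np.argsort(-dists)           # best matches first

# toy unit vectors standing in for syn0norm (made-up data)
rng = np.random.default_rng(3)
vectors = rng.normal(size=(100, 16))
vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)

order = mean_vector_ranking(vectors, positive_idx=[5, 9], negative_idx=[2])
print(order[:10])   # rows 5, 9 and 2 would still need filtering, as the method above does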