Example #1
    def __getitem__(self, query):
        """Get similarities of document `query` to all documents in the corpus.

        **or**

        If `query` is a corpus (iterable of documents), return a matrix of similarities
        of all query documents vs. all corpus documents. Using this type of batch
        query is more efficient than computing the similarities one document after
        another.
        """
        is_corpus, query = utils.is_corpus(query)
        if self.normalize:
            # self.normalize only works if the input is a plain gensim vector/corpus (as
            # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix
            # as well, but in that case assume tricks are happening and don't normalize
            # anything (self.normalize has no effect).
            if matutils.ismatrix(query):
                logger.warning("non-gensim input must already come normalized")
            else:
                if is_corpus:
                    query = [matutils.unitvec(v) for v in query]
                else:
                    query = matutils.unitvec(query)
        result = self.get_similarities(query)

        if self.num_best is None:
            return result

        # if the input query was a corpus (=more documents), compute the top-n
        # most similar for each document in turn
        if matutils.ismatrix(result):
            return [matutils.full2sparse_clipped(v, self.num_best) for v in result]
        else:
            # otherwise, return top-n of the single input document
            return matutils.full2sparse_clipped(result, self.num_best)
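A minimal usage sketch of the single-vs-batch behaviour above, assuming a standard gensim index such as MatrixSimilarity (the toy corpus and variable names are hypothetical):

from gensim import corpora, models, similarities

# toy pre-tokenized corpus (hypothetical)
texts = [["human", "computer", "interaction"], ["graph", "trees"], ["graph", "minors", "survey"]]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(bow_corpus)
index = similarities.MatrixSimilarity(tfidf[bow_corpus], num_features=len(dictionary))

sims = index[tfidf[bow_corpus[0]]]  # single query -> 1-D array of similarities
batch = index[tfidf[bow_corpus]]    # whole corpus -> matrix, one row per query document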
Example #2
    def similarity(self, d1, d2):
        """
        Compute cosine similarity between two docvecs in the trained set, specified by int index or
        string tag. (TODO: Accept vectors of out-of-training-set docs, as if from inference.)

        """
        return dot(matutils.unitvec(self[d1]), matutils.unitvec(self[d2]))
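A hedged usage sketch against gensim's legacy Doc2Vec interface (corpus, tags and hyperparameters below are illustrative assumptions):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

texts = [["human", "computer", "interaction"], ["graph", "trees", "survey"]]
docs = [TaggedDocument(words=text, tags=[i]) for i, text in enumerate(texts)]
model = Doc2Vec(docs, vector_size=50, min_count=1, epochs=20)

print(model.docvecs.similarity(0, 1))  # cosine similarity of two trained docvecs, by index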
Example #3
    def n_similarity(self, ds1, ds2):
        """
        Compute cosine similarity between two sets of docvecs from the trained set, specified by int
        index or string tag. (TODO: Accept vectors of out-of-training-set docs, as if from inference.)

        """
        v1 = [self[doc] for doc in ds1]
        v2 = [self[doc] for doc in ds2]
        return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0)))
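The return expression is simply the cosine of the two set centroids; a self-contained numpy sketch of the same arithmetic:

import numpy as np

def set_cosine(vecs1, vecs2):
    # mean each set, unit-normalize both means, then take their dot product
    m1 = np.asarray(vecs1).mean(axis=0)
    m2 = np.asarray(vecs2).mean(axis=0)
    m1 /= np.linalg.norm(m1)
    m2 /= np.linalg.norm(m2)
    return float(np.dot(m1, m2))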
Example #4
    def similarity(self, w1, w2):
        """
        Compute cosine similarity between two words.

        Example::

          >>> trained_model.similarity('woman', 'man')
          0.73723527

          >>> trained_model.similarity('woman', 'woman')
          1.0

        """
        return dot(matutils.unitvec(self[w1]), matutils.unitvec(self[w2]))
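unitvec followed by dot is just the cosine formula; a minimal numpy equivalent for two raw vectors:

import numpy as np

def cosine(u, v):
    # cos(u, v) = u . v / (|u| * |v|)
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))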
Example #5
    def sent_vec_similarity(self, sent_id1, sent_id2):
        """
        Compute cosine similarity between two sentences. `sent_id1` and `sent_id2` are
        the sentence ids from the training file.

        Example::

          >>> trained_model.sent_vec_similarity(sent_id1, sent_id1)
          1.0

          >>> trained_model.sent_vec_similarity(sent_id1, sent_id3)
          0.73

        """
        return dot(matutils.unitvec(self.sents[self.sent_no_hash[sent_id1]]),
                   matutils.unitvec(self.sents[self.sent_no_hash[sent_id2]]))
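This variant comes from a sentence-vector fork of word2vec: sent_no_hash maps a sentence id to the row of self.sents holding its vector. A hedged standalone sketch of the same lookup-then-cosine pattern (all names hypothetical):

import numpy as np

sents = np.random.rand(3, 100)                          # one row per trained sentence vector
sent_no_hash = {'sent_0': 0, 'sent_1': 1, 'sent_2': 2}  # sentence id -> row index

def sent_vec_similarity(id1, id2):
    v1, v2 = sents[sent_no_hash[id1]], sents[sent_no_hash[id2]]
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))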
Example #6
    def doesnt_match(self, words):
        """
        Which word from the given list doesn't go with the others?

        Example::

          >>> trained_model.doesnt_match("breakfast cereal dinner lunch".split())
          'cereal'

        """
        self.init_sims()

        used_words = [word for word in words if word in self]
        if len(used_words) != len(words):
            ignored_words = set(words) - set(used_words)
            logger.warning(
                "vectors for words %s are not present in the model, ignoring these words",
                ignored_words)
        if not used_words:
            raise ValueError("cannot select a word from an empty list")
        vectors = vstack([self.word_vec(word, use_norm=True) for word in used_words]).astype(REAL)
        mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL)
        dists = dot(vectors, mean)
        return sorted(zip(dists, used_words))[0][1]
Example #7
    def similarity(self, sent1, sent2):
        """
        Compute cosine similarity between two sentences. `sent1` and `sent2` are
        the sentence indexes in the training file.

        Example::

          >>> trained_model.similarity(0, 0)
          1.0

          >>> trained_model.similarity(1, 3)
          0.73

        """
        return dot(matutils.unitvec(self.sents[sent1]),
                   matutils.unitvec(self.sents[sent2]))
Example #8
    def doesnt_match(self, words):
        """
        Which word from the given list doesn't go with the others?

        Example::

          >>> trained_model.doesnt_match("breakfast cereal dinner lunch".split())
          'cereal'

        """
        words = [word for word in words if word in self.vocab]  # filter out OOV words
        logger.debug("using words %s", words)
        if not words:
            raise ValueError("cannot select a word from an empty list")
        vectors = vstack([matutils.unitvec(self.syn0[self.vocab[word].index]) for word in words]).astype(REAL)
        mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL)
        dists = dot(vectors, mean)
        return sorted(zip(dists, words))[0][1]
Example #9
    def most_similar(self, positive=[], negative=[], topn=10):
        """
        Find the top-N most similar words. Positive words contribute positively towards the
        similarity, negative words negatively.

        This method computes cosine similarity between a simple mean of the projection
        weight vectors of the given words, and corresponds to the `word-analogy` and
        `distance` scripts in the original word2vec implementation.

        Example::

          >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])
          [('queen', 0.50882536), ...]

        """
        self.init_sims()

        if isinstance(positive, string_types) and not negative:
            # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
            positive = [positive]

        # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
        positive = [(word, 1.0) if isinstance(word, string_types) else word for word in positive]
        negative = [(word, -1.0) if isinstance(word, string_types) else word for word in negative]

        # compute the weighted average of all words
        all_words, mean = set(), []
        for word, weight in positive + negative:
            if word in self.vocab:
                mean.append(weight * matutils.unitvec(self.syn0[self.vocab[word].index]))
                all_words.add(self.vocab[word].index)
            else:
                raise KeyError("word '%s' not in vocabulary" % word)
        if not mean:
            raise ValueError("cannot compute similarity with no input")
        mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

        dists = dot(self.syn0norm, mean)
        if not topn:
            return dists
        best = argsort(dists)[::-1][:topn + len(all_words)]
        # ignore (don't return) words from the input
        result = [(self.index2word[sim], dists[sim]) for sim in best if sim not in all_words]
        return result[:topn]
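The weighted-mean construction is what makes the classic analogy queries work: positive=['woman', 'king'], negative=['man'] scores every vocabulary vector against the unit mean of +woman +king -man. A toy numpy sketch of that core computation (random stand-in vectors, hypothetical vocabulary):

import numpy as np

vocab = ['king', 'queen', 'man', 'woman']
vectors = np.random.rand(4, 50)
vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)  # unit rows, like syn0norm

query = vectors[3] + vectors[0] - vectors[2]  # +woman +king -man
query /= np.linalg.norm(query)
dists = vectors.dot(query)                    # cosine against every word
print(vocab[int(np.argmax(dists))])           # best match (input words not excluded here)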
Example #10
    def n_similarity(self, ws1, ws2):
        """
        Compute cosine similarity between two sets of words.

        Example::

          >>> trained_model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])
          0.61540466561049689

          >>> trained_model.n_similarity(['restaurant', 'japanese'], ['japanese', 'restaurant'])
          1.0000000000000004

          >>> trained_model.n_similarity(['sushi'], ['restaurant']) == trained_model.similarity('sushi', 'restaurant')
          True

        """
        v1 = [self[word] for word in ws1]
        v2 = [self[word] for word in ws2]
        return dot(matutils.unitvec(array(v1).mean(axis=0)),
                   matutils.unitvec(array(v2).mean(axis=0)))
Example #11
    def most_similar(self, positive=[], negative=[], topn=10, clip_start=0, clip_end=None):
        """
        Find the top-N most similar docvecs known from training. Positive docs contribute
        positively towards the similarity, negative docs negatively.

        This method computes cosine similarity between a simple mean of the projection
        weight vectors of the given docs. Docs may be specified as vectors, integer indexes
        of trained docvecs, or if the documents were originally presented with string tags,
        by the corresponding tags.

        The `clip_start` and `clip_end` parameters limit results to a particular contiguous
        range of the underlying doctag_syn0norm vectors. (This may be useful if the ordering
        there was chosen to be significant, such as more popular tag IDs in lower indexes.)
        """
        self.init_sims()
        clip_end = clip_end or len(self.doctag_syn0norm)

        if isinstance(positive, string_types + integer_types) and not negative:
            # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
            positive = [positive]

        # add weights for each doc, if not already present; default to 1.0 for positive and -1.0 for negative docs
        positive = [
            (doc, 1.0) if isinstance(doc, string_types + (ndarray,) + integer_types)
            else doc for doc in positive
        ]
        negative = [
            (doc, -1.0) if isinstance(doc, string_types + (ndarray,) + integer_types)
            else doc for doc in negative
        ]

        # compute the weighted average of all docs
        all_docs, mean = set(), []
        for doc, weight in positive + negative:
            if isinstance(doc, ndarray):
                mean.append(weight * doc)
            elif doc in self.doctags or doc < self.count:
                mean.append(weight * self.doctag_syn0norm[self._int_index(doc)])
                all_docs.add(self._int_index(doc))
            else:
                raise KeyError("doc '%s' not in trained set" % doc)
        if not mean:
            raise ValueError("cannot compute similarity with no input")
        mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

        dists = dot(self.doctag_syn0norm[clip_start:clip_end], mean)
        if not topn:
            return dists
        best = matutils.argsort(dists, topn=topn + len(all_docs), reverse=True)
        # ignore (don't return) docs from the input
        result = [(self.index_to_doctag(sim), float(dists[sim])) for sim in best if sim not in all_docs]
        return result[:topn]
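A hedged call sketch, assuming the legacy model.docvecs interface this snippet belongs to (tag names hypothetical); clip_start/clip_end restrict scoring to a slice of the trained docvecs:

# full search over all trained docvecs
model.docvecs.most_similar(positive=['doc_42'], topn=5)

# only score the first 10000 docvecs, e.g. if popular tags were assigned low indexes
model.docvecs.most_similar(positive=['doc_42'], topn=5, clip_start=0, clip_end=10000)

# mixed query: an inferred raw vector plus a negative tag
model.docvecs.most_similar(positive=[model.infer_vector(['new', 'text'])], negative=['doc_7'])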
Example #12
    def n_similarity(self, ws1, ws2):
        """
        Compute cosine similarity between two sets of words.

        Example::

          >>> trained_model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])
          0.61540466561049689

          >>> trained_model.n_similarity(['restaurant', 'japanese'], ['japanese', 'restaurant'])
          1.0000000000000004

          >>> trained_model.n_similarity(['sushi'], ['restaurant']) == trained_model.similarity('sushi', 'restaurant')
          True

        """
        if not (len(ws1) and len(ws2)):
            raise ZeroDivisionError('At least one of the passed lists is empty.')
        v1 = [self[word] for word in ws1]
        v2 = [self[word] for word in ws2]
        return dot(matutils.unitvec(array(v1).mean(axis=0)),
                   matutils.unitvec(array(v2).mean(axis=0)))
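The extra guard makes the failure mode explicit: without it, an empty argument would only surface later as a numpy warning from the zero-length mean. A hedged usage sketch (model name hypothetical):

model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])  # fine
model.n_similarity([], ['restaurant'])  # raises ZeroDivisionError immediately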
Example #13
    def doesnt_match(self, docs):
        """
        Which doc from the given list doesn't go with the others?

        (TODO: Accept vectors of out-of-training-set docs, as if from inference.)

        """
        self.init_sims()

        docs = [doc for doc in docs if doc in self.doctags or 0 <= doc < self.count]  # filter out unknowns
        logger.debug("using docs %s", docs)
        if not docs:
            raise ValueError("cannot select a doc from an empty list")
        vectors = vstack([self.doctag_syn0norm[self._int_index(doc)] for doc in docs]).astype(REAL)
        mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL)
        dists = dot(vectors, mean)
        return sorted(zip(dists, docs))[0][1]
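A hedged usage sketch against the legacy docvecs interface (tags hypothetical):

# docs may be string tags or trained integer indexes
model.docvecs.doesnt_match(['doc_sports_1', 'doc_sports_2', 'doc_cooking_9'])
# -> 'doc_cooking_9', the docvec farthest from the group mean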
Example #14
    def object_predict(self, subject, relations, topn=10, chunksize=1000):
        """
        Find the top-N most similar entities to (subj * rel-1 * rel-2 * ... * rel-k).

        """
        self.init_sims_entity()

        if subject not in self.vocab:
            raise ValueError('entity %s not in vocabulary' % subject)

        if isinstance(relations, string_types):
            relations = [relations]

        # chain the relation matrices: m = rel-1 * rel-2 * ... * rel-k
        m = ones(self.rel_mat[0].shape)
        for rel in relations:
            if rel in self.vocab_rel:
                m = dot(m, self.rel_mat[self.vocab_rel[rel].index])

        # project the subject vector through the chained relations and normalize
        obj = dot(self.syn0[self.vocab[subject].index], m)
        obj = matutils.unitvec(obj).astype(REAL)

        jobs = Queue(maxsize=2 * self.workers)
        results = Queue()

        def worker_compute_dists():
            # score one chunk of candidate entities at a time, keep the local top-N
            while True:
                start = jobs.get()
                if start is None:
                    break
                end = min(start + chunksize, len(self.vocab))
                candidates = self.syn0norm[start:end]
                dists = dot(candidates, obj)
                best = argsort(dists)[::-1][:topn + 1]
                sub_index = self.vocab[subject].index
                result = [
                    (self.index2entity_name[sim + start], float(dists[sim]))
                    for sim in best if sim + start != sub_index
                ]
                results.put(result)

        workers = [threading.Thread(target=worker_compute_dists) for _ in range(self.workers)]
        for thread in workers:
            thread.daemon = True
            thread.start()

        # enqueue one job per chunk, then one sentinel per worker to stop them
        group_num = len(self.vocab) // chunksize + (1 if len(self.vocab) % chunksize != 0 else 0)
        for start in xrange(group_num):
            jobs.put(start * chunksize)
        for _ in xrange(self.workers):
            jobs.put(None)
        for thread in workers:
            thread.join()

        # merge the per-chunk winners and keep the global top-N
        candidates = []
        while not results.empty():
            candidates.extend(results.get_nowait())
        return sorted(candidates, key=lambda item: item[1], reverse=True)[:topn]
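The chunk-then-merge pattern above generalizes: each worker keeps only a small local top list, so the final merge stays cheap. A self-contained sketch of that reduction, independent of the model internals:

import numpy as np
from heapq import nlargest

def chunked_topn(scores, topn, chunksize):
    # scan a 1-D score array chunk by chunk, keeping each chunk's local top-N
    partial = []
    for start in range(0, len(scores), chunksize):
        chunk = scores[start:start + chunksize]
        best = np.argsort(chunk)[::-1][:topn]
        partial.extend((float(chunk[i]), int(start + i)) for i in best)
    # merge the local winners into the global top-N of (score, index) pairs
    return nlargest(topn, partial)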
Example #15
    def init_sims(self):
        if getattr(self, 'syn0norm', None) is None:
            logger.info("precomputing L2-norms of word weight vectors")
            self.syn0norm = vstack([matutils.unitvec(vec) for vec in self.syn0]).astype(REAL)
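The per-row unitvec loop can equivalently be written as one vectorized numpy expression; a sketch, assuming syn0 is a dense 2-D float array with no all-zero rows:

import numpy as np

syn0 = np.random.rand(1000, 100).astype(np.float32)  # stand-in weight matrix
norms = np.linalg.norm(syn0, axis=1, keepdims=True)
syn0norm = (syn0 / norms).astype(np.float32)         # row-wise unit vectors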
Example #16
    def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None, indexer=None):
        """
        Find the top-N most similar words. Positive words contribute positively towards the
        similarity, negative words negatively.

        This method computes cosine similarity between a simple mean of the projection
        weight vectors of the given words and the vectors for each word in the model.
        The method corresponds to the `word-analogy` and `distance` scripts in the original
        word2vec implementation.

        If topn is False, most_similar returns the vector of similarity scores.

        `restrict_vocab` is an optional integer which limits the range of vectors which
        are searched for most-similar values. For example, restrict_vocab=10000 would
        only check the first 10000 word vectors in the vocabulary order. (This may be
        meaningful if you've sorted the vocabulary by descending frequency.)

        Example::

          >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])
          [('queen', 0.50882536), ...]

        """
        self.init_sims()

        if isinstance(positive, string_types) and not negative:
            # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
            positive = [positive]

        # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
        positive = [
            (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word
            for word in positive
        ]
        negative = [
            (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word
            for word in negative
        ]

        # compute the weighted average of all words
        all_words, mean = set(), []
        for word, weight in positive + negative:
            if isinstance(word, ndarray):
                mean.append(weight * word)
            else:
                mean.append(weight * self.word_vec(word, use_norm=True))
                if word in self.vocab:
                    all_words.add(self.vocab[word].index)
        if not mean:
            raise ValueError("cannot compute similarity with no input")
        mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

        if indexer is not None:
            return indexer.most_similar(mean, topn)

        limited = self.syn0norm if restrict_vocab is None else self.syn0norm[:restrict_vocab]
        dists = dot(limited, mean)
        if not topn:
            return dists
        best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
        # ignore (don't return) words from the input
        result = [(self.index2word[sim], float(dists[sim])) for sim in best
                  if sim not in all_words]
        return result[:topn]
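A hedged call sketch of the extra knobs (model name hypothetical):

model.most_similar(positive=['woman', 'king'], negative=['man'], topn=10)

# restrict the search to the first 10000 vectors (useful with a frequency-sorted vocab)
model.most_similar('dog', topn=5, restrict_vocab=10000)

# topn=False returns the raw similarity score against every vocabulary word instead
all_scores = model.most_similar(positive=['dog'], topn=False)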