def main():
    """
    Prints a list of words sorted by how much they changed between two points in time.
    """
    if len(sys.argv) != 4 and len(sys.argv) != 3:
        raise Exception(
            "Provide 2+ arguments:\n\t1,first model\n\t2,second model\n\t3,Optional: number of min occurrences")
    start = sys.argv[1]
    end = sys.argv[2]
    if len(sys.argv) == 4:
        min_occ = int(sys.argv[3])
    else:
        min_occ = 0

    model1 = gensim.models.Word2Vec.load(start)
    model2 = gensim.models.Word2Vec.load(end)

    similarity = {}

    for word in model1.vocab:
        if model1.vocab[word].count >= min_occ and word in model2.vocab and model2.vocab[word].count >= min_occ:
            similarity[word] = dot(matutils.unitvec(
                model1[word]), matutils.unitvec(model2[word]))

    for w, c in sorted(similarity.items(), key=itemgetter(1)):
        print(w, c)
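The examples on this page all build on the same pattern: scale each vector to unit length with matutils.unitvec, then take the dot product to obtain the cosine similarity. A minimal standalone sketch of that pattern on toy numpy vectors (independent of the Word2Vec models loaded above):

import numpy as np
from numpy import dot
from gensim import matutils

v1 = np.array([1.0, 2.0, 3.0])
v2 = np.array([2.0, 4.0, 6.0])
print(dot(matutils.unitvec(v1), matutils.unitvec(v2)))  # 1.0: the vectors are parallel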
Example #2
    def lineReceived(self, line):
        #print( "LSA received " + line );
        try:
            sent1, sent2 = line.strip().split("\t")
        except:
            self.sendLine("INPUTERROR: missing tab character?")
            return

        #print( "LSA sentence 1: " + sent1 )
        #print( "LSA sentence 2: " + sent2 )
        try:
            vec_bow1 = dictionary.doc2bow( sent1.lower().split())
            vec_bow2 = dictionary.doc2bow( sent2.lower().split())
            vec_lsi1 = lsi[vec_bow1]
            vec_lsi2 = lsi[vec_bow2]
        except KeyError:
            self.sendLine(str(0))
            return
        if not vec_lsi1 or not vec_lsi2:
            self.sendLine(str(0))
            return

        #print "LSA vector1 :"
        #print vec_lsi1
        #print "LSA vector2 : "
        #print vec_lsi2
        try:
            cossim = numpy.dot(matutils.unitvec(numpy.array([ x[1] for x in vec_lsi1])),
                               matutils.unitvec(numpy.array([ x[1] for x in vec_lsi2])) )
        except:
            print "dot product faalt"
            cossim = 0
            raise
        self.sendLine(str(cossim))
Example #3
def smartirs_normalize(x, norm_scheme, return_norm=False):
    """Normalize a vector using the normalization scheme specified in `norm_scheme`.

    Parameters
    ----------
    x : numpy.ndarray
        Input array
    norm_scheme : {'n', 'c'}
        Normalizing function to use:
        `n`: no normalization
        `c`: unit L2 norm (scale `x` to unit euclidean length)
    return_norm : bool, optional
        Return the length of `x` as well?

    Returns
    -------
    numpy.ndarray
        Normalized array.
    float (only if return_norm is set)
        L2 norm of `x`.

    """
    if norm_scheme == "n":
        if return_norm:
            _, length = matutils.unitvec(x, return_norm=return_norm)
            return x, length
        else:
            return x
    elif norm_scheme == "c":
        return matutils.unitvec(x, return_norm=return_norm)
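A brief usage sketch of the helper above (assuming numpy is imported and smartirs_normalize is available in the same module):

import numpy as np

x = np.array([3.0, 4.0])
print(smartirs_normalize(x, "n"))   # [3. 4.] -- no normalization
print(smartirs_normalize(x, "c"))   # [0.6 0.8] -- scaled to unit L2 norm
_, length = smartirs_normalize(x, "n", return_norm=True)
print(length)                       # 5.0 -- the L2 norm of x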
Example #4
    def add_documents(self, corpus):
        """
        Extend the index with new documents.

        Internally, documents are buffered and then spilled to disk when there's
        `self.shardsize` of them (or when a query is issued).
        """
        min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
        if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
            # The last shard was incomplete (< shardsize); load it back and add the documents there, don't start a new shard
            self.reopen_shard()
        for doc in corpus:
            if isinstance(doc, numpy.ndarray):
                doclen = len(doc)
            elif scipy.sparse.issparse(doc):
                doclen = doc.nnz
            else:
                doclen = len(doc)
                if doclen < 0.3 * self.num_features:
                    doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T, self.norm)
                else:
                    doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features), self.norm)
            self.fresh_docs.append(doc)
            self.fresh_nnz += doclen
            if len(self.fresh_docs) >= self.shardsize:
                self.close_shard()
            if len(self.fresh_docs) % 10000 == 0:
                logger.info("PROGRESS: fresh_shard size=%i", len(self.fresh_docs))
Example #5
    def similarity(self, d1, d2):
        """
        Compute cosine similarity between two docvecs in the trained set, specified by int index or
        string tag. (TODO: Accept vectors of out-of-training-set docs, as if from inference.)

        """
        return dot(matutils.unitvec(self[d1]), matutils.unitvec(self[d2]))
Example #6
def main():
    # model parameters, taken from Kim et al. & Kulkarni et al.
    ALPHA = 0.01
    NET_SIZE = 200

    if len(sys.argv) < 11:
        raise Exception("""Provide 5+ arguments:\n\t1,path to save models\n\t2,path to corpora
            \t3,number of worker processes\n\t4,number of max. epochs\n\t5, minimum count
            \t6, hierarchic (0/1)\n\t7,neg sampling (0-20)\n\t8,downsampling (0-0.00001)
            \n\t9,max distance for convergence as exponent (e.g., 2 corresponding to 10^-2), use 0 to indicate no limit
            \n\t10+ files to train on (one model per file)""")
    model_path = sys.argv[1]
    corpus_path = sys.argv[2]
    workers = int(sys.argv[3])
    epochs = int(sys.argv[4])
    min_count = int(sys.argv[5])
    hs = int(sys.argv[6])
    negative = int(sys.argv[7])
    sample = float(sys.argv[8])
    if sys.argv[9] == "0":
        max_dist = None
    else:
        max_dist = 1 - 10**(-1 * float(sys.argv[9]))
    files = sys.argv[10:]

    if not os.path.exists(model_path):
        os.makedirs(model_path)
    old_model = None
    for f in files:
        if not os.path.exists(os.path.join(corpus_path, f)):
            logging.info("skipping %s", f)
            continue
        logging.info("processing %s", f)
        model = gensim.models.Word2Vec(
            size=NET_SIZE, window=4, min_count=min_count, workers=workers, alpha=ALPHA, sg=1,
            hs=hs, negative=negative, sample=sample)  # skip-gram on!
        corpus = Corpus(f, corpus_path)

        if old_model:
            update_vocab(corpus, old_model, model)
        else:
            model.build_vocab(corpus)

        epoch = 0
        dist = 0
        while epoch < epochs and (max_dist is None or dist < max_dist):
            epoch += 1
            if epoch > 1:
                old_syn0 = copy(model.syn0)
            model.train(corpus)
            if epoch > 1 and max_dist is not None:
                dist = sum([dot(unitvec(model.syn0[i]), unitvec(
                    old_syn0[i])) for i in range(len(model.vocab))]) / len(model.vocab)
        old_model = model

        fname = os.path.join(model_path, "model" + f)
        fvocab = os.path.join(model_path, "vocab" + f)
        model.save_word2vec_format(fname, fvocab=fvocab, binary=True)
        logging.info("finished after %s epochs", epoch)
Example #7
def cosine_similarity(word1, word2):

    global word_vector

    if word1.lower() in word_vector.keys() and word2.lower() in word_vector.keys():
        return dot(matutils.unitvec(word_vector[word1.lower()]), matutils.unitvec(word_vector[word2.lower()]))
    else:
        return 0
Example #8
def calc_w2v_similarity(words, use_ic=False):
    words1 = words[0]
    words2 = words[1]

    vec1 = sentence_vec(words1, use_ic)
    vec2 = sentence_vec(words2, use_ic)

    return [dot(matutils.unitvec(array(vec1)), matutils.unitvec(array(vec2)))]
Example #9
    def n_similarity(self, ds1, ds2):
        """
        Compute cosine similarity between two sets of docvecs from the trained set, specified by int
        index or string tag. (TODO: Accept vectors of out-of-training-set docs, as if from inference.)

        """
        v1 = [self[doc] for doc in ds1]
        v2 = [self[doc] for doc in ds2]
        return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0)))
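The same "average, normalize, dot" pattern sketched on plain numpy vectors (hypothetical data, independent of any trained docvecs):

import numpy as np
from numpy import dot, array
from gensim import matutils

set1 = [np.array([1.0, 0.0]), np.array([0.8, 0.2])]
set2 = [np.array([0.0, 1.0]), np.array([0.1, 0.9])]
sim = dot(matutils.unitvec(array(set1).mean(axis=0)),
          matutils.unitvec(array(set2).mean(axis=0)))
print(sim)  # small value: the two sets point in different directions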
Example #10
 def getUVectors(self, toks):
     '''
     Token Unit Vectors
     '''
     if isinstance(toks, basestring):
         uv = mu.unitvec(self.model[toks])
     else:
         uv = [mu.unitvec(self.model[tok]) for tok in toks]
     return uv    
Example #11
def puebaSimpleCosenos():
	model = Doc2Vec.load('./imdb_dm.d2v')

	source = 'data/trainneg.txt'
	generador = GeneraVectores(model)
	vecs = generador.getVecsFromFile(source)

	print "coseno primer vector, trainneg"
	print dot(matutils.unitvec(vecs[0]), matutils.unitvec(model.docvecs["TRAIN_NEG_0"]))
Example #12
    def similarity_unseen_docs(self, model, doc_words1, doc_words2, alpha=0.1, min_alpha=0.0001, steps=5):
        """
        Compute cosine similarity between two post-bulk out of training documents.

        Document should be a list of (word) tokens.
        """
        d1 = model.infer_vector(doc_words=doc_words1, alpha=alpha, min_alpha=min_alpha, steps=steps)
        d2 = model.infer_vector(doc_words=doc_words2, alpha=alpha, min_alpha=min_alpha, steps=steps)
        return dot(matutils.unitvec(d1), matutils.unitvec(d2))
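The same post-training comparison can also be written directly against an already trained model; a hypothetical sketch (the model file name is made up):

from numpy import dot
from gensim import matutils
from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec.load("my_doc2vec.model")   # assumed pre-trained Doc2Vec model
d1 = model.infer_vector("machine learning is fun".split())
d2 = model.infer_vector("i enjoy studying machine learning".split())
print(dot(matutils.unitvec(d1), matutils.unitvec(d2)))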
Example #13
 def similarity(self, word_a, word_b):
     try:
         a_vector = self.embeddings[word_a]
         b_vector = self.embeddings[word_b]
         diff = dot(matutils.unitvec(a_vector), matutils.unitvec(b_vector))
         return diff
     except KeyError:
         #logger.debug("'%s' or '%s' don't have a word vector" % (word_a.encode("utf-8"),
         #                                                        word_b.encode("utf-8")))
         return 0.0 if word_a != word_b else 1.0
Example #14
 def similarity(self, word_a, word_b):
     try:
         a_vector = self.embeddings[word_a]
         b_vector = self.embeddings[word_b]
         diff = dot(matutils.unitvec(a_vector), matutils.unitvec(b_vector))
         return diff
     except KeyError:
         logger.debug("'%s' or '%s' don't have a word vector" % (word_a,
                                                                 word_b))
         return 0.0
Example #15
def py_vq2(obs, code_book, check_finite=True):
    """2nd Python version of vq algorithm.

    The algorithm simply computes the euclidean distance between each
    observation and every frame in the code_book.

    Parameters
    ----------
    obs : ndarray
        Expect a rank 2 array. Each row is one observation.
    code_book : ndarray
        Code book to use. Same format as obs. Should have the same number of
        features (e.g. columns) as obs.
    check_finite : bool, optional
        Whether to check that the input matrices contain only finite numbers.
        Disabling may give a performance gain, but may result in problems
        (crashes, non-termination) if the inputs do contain infinities or NaNs.
        Default: True

    Returns
    -------
    code : ndarray
        code[i] gives the label of the ith observation; that is, its code is
        code_book[code[i]].
    min_dist : ndarray
        min_dist[i] gives the distance between the ith observation and its
        corresponding code.

    Notes
    -----
    This could be faster when number of codebooks is small, but it
    becomes a real memory hog when codebook is large. It requires
    N by M by O storage where N=number of obs, M = number of
    features, and O = number of codes.

    """
    obs = _asarray_validated(obs, check_finite=check_finite)
    code_book = _asarray_validated(code_book, check_finite=check_finite)
    d = shape(obs)[1]

    # code books and observations should have same number of features
    if not d == code_book.shape[1]:
        raise ValueError("""
            code book(%d) and obs(%d) should have the same
            number of features (eg columns)""" % (code_book.shape[1], d))

    #diff = obs[newaxis, :, :] - code_book[:,newaxis,:]
    #dist = sqrt(np.sum(diff * diff, -1))
    dist = dot(matutils.unitvec(obs[newaxis, :, :]), matutils.unitvec(code_book[:,newaxis,:]))
    code = argmin(dist, 0)
    min_dist = minimum.reduce(dist, 0)
    # The next line I think is equivalent and should be faster than the one
    # above, but in practice didn't seem to make much difference:
    # min_dist = choose(code,dist)
    return code, min_dist
Example #16
def main():
    """
    Training follows procedure described in Kim et al. (2014), cf. https://www.aclweb.org/anthology/W/W14/W14-2517.pdf
    """
    ALPHA = 0.01
    NET_SIZE = 200

    if len(sys.argv) < 9:
        raise Exception("""Provide 8+ arguments:\n\t1,path to save models\n\t2,path to corpora
            \t3,number of worker processes\n\t4,number of max. epochs\n\t5, minimum count
            \t6, hierarchic (0/1)\n\t7,neg sampling (0-20)\n\t8,downsampling (0-0.00001)
            9+ files to train on (one model per file)""")
    model_path = sys.argv[1]
    corpus_path = sys.argv[2]
    workers = int(sys.argv[3])
    epochs = int(sys.argv[4])
    min_count = int(sys.argv[5])
    hs = int(sys.argv[6])
    negative = int(sys.argv[7])
    sample = float(sys.argv[8])
    files = sys.argv[9:]

    if not os.path.exists(model_path):
        os.makedirs(model_path)
    old_model = None
    for f in files:
        if not os.path.exists(os.path.join(corpus_path, f)):
            logging.info("skipping %s", f)
            continue
        logging.info("processing %s", f)
        model = gensim.models.Word2Vec(
            size=NET_SIZE, window=4, min_count=min_count, workers=workers, alpha=ALPHA, sg=1,
            hs=hs, negative=negative, sample=sample)
        corpus = Corpus(f, corpus_path)

        if old_model:
            update_vocab(corpus, old_model, model)
        else:
            model.build_vocab(corpus)

        # training to convergence
        epoch = 0
        dist = 0
        while epoch < epochs and dist < 0.99:
            epoch += 1
            if epoch > 1:
                old_syn0 = copy(model.syn0)
            model.train(corpus)
            if epoch > 1:
                dist = sum([dot(unitvec(model.syn0[i]), unitvec(
                    old_syn0[i])) for i in range(len(model.vocab))]) / len(model.vocab)
        old_model = model
        model.save(os.path.join(model_path, "model" + f))
        logging.info("finished after %s epochs", epoch)
Example #17
    def __iter__(self):
        textual_lines = FileIOManager.read_textual_file()
        visual_file = open(FileIOManager.images_features_path, 'r')
        visual_file.readline()
        number_of_lines = 0
        for textual_line in textual_lines:
            number_of_lines += 1
            if self.limited_length is not None and number_of_lines > self.limited_length:
                break

            corpus_line_dict = dict()
            line_words = textual_line.split()
            textual_img_id = line_words[0]
            number_of_features = int(line_words[1])
            line_words = line_words[2:]
            for j in range(0, number_of_features*2, 2):
                word = self.dictionary.processWord(line_words[j].decode('utf-8'))
                # Normalize weight
                weight = float(line_words[j + 1]) / 100000
                if word not in self.dictionary.word2id:
                    continue
                # Get word id
                word_id = self.dictionary.word2id[word]
                if word_id not in corpus_line_dict:
                    corpus_line_dict[word_id] = weight
                else:
                    corpus_line_dict[word_id] += weight

            # Create array of tuples (word_id, weight) from dictionary
            corpus_line = []
            for key, value in corpus_line_dict.iteritems():
                corpus_line.append( (key, value) )

            # Normalize to unit vector
            corpus_line = matutils.unitvec(corpus_line)

            # Search for training images only for corresponding img
            visual_line = visual_file.readline().split()
            image_id = visual_line[0]
            while image_id != textual_img_id:
                visual_line = visual_file.readline().split()
                image_id = visual_line[0]

            # Append visual features
            corpus_line = corpus_line + utils.generate_corpus_for_image(visual_line[1:], self.dictionary.features_names2id)

            # Normalize to unit vector
            corpus_line = matutils.unitvec(corpus_line)

            yield corpus_line
Example #18
    def similarity(self, w1, w2):
        """
        Compute cosine similarity between two words.

        Example::

          >>> trained_model.similarity('woman', 'man')
          0.73723527

          >>> trained_model.similarity('woman', 'woman')
          1.0

        """
        return dot(matutils.unitvec(self[w1]), matutils.unitvec(self[w2]))
Example #19
    def add_documents(self, corpus):
        """Extend the index with new documents.

        Parameters
        ----------
        corpus : iterable of list of (int, number)
            Corpus in BoW format.

        Notes
        -----
        Internally, documents are buffered and then spilled to disk when there's `self.shardsize` of them
        (or when a query is issued).

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora.textcorpus import TextCorpus
            >>> from gensim.test.utils import datapath, get_tmpfile
            >>> from gensim.similarities import Similarity
            >>>
            >>> corpus = TextCorpus(datapath('testcorpus.mm'))
            >>> index_temp = get_tmpfile("index")
            >>> index = Similarity(index_temp, corpus, num_features=400)  # create index
            >>>
            >>> one_more_corpus = TextCorpus(datapath('testcorpus.txt'))
            >>> index.add_documents(one_more_corpus)  # add more documents in corpus

        """
        min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
        if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
            # The last shard was incomplete (< shardsize); load it back and add the documents there, don't start a new shard
            self.reopen_shard()
        for doc in corpus:
            if isinstance(doc, numpy.ndarray):
                doclen = len(doc)
            elif scipy.sparse.issparse(doc):
                doclen = doc.nnz
            else:
                doclen = len(doc)
                if doclen < 0.3 * self.num_features:
                    doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T, self.norm)
                else:
                    doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features), self.norm)
            self.fresh_docs.append(doc)
            self.fresh_nnz += doclen
            if len(self.fresh_docs) >= self.shardsize:
                self.close_shard()
            if len(self.fresh_docs) % 10000 == 0:
                logger.info("PROGRESS: fresh_shard size=%i", len(self.fresh_docs))
Example #20
    def __getitem__(self, query):
        """Get similarities of the given document or corpus against this index.

        Uses :meth:`~gensim.interfaces.SimilarityABC.get_similarities` internally.

        Notes
        -----
        Passing an entire corpus as `query` can be more efficient than passing its documents one after another,
        because it will issue queries in batches internally.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number)}
            Document in the sparse Gensim bag-of-words format, or a streamed corpus of such documents.

        Returns
        -------
        {`scipy.sparse.csr.csr_matrix`, list of (int, float)}
            Similarities of the given document or corpus against this index; the return type depends on `query`.

        """
        is_corpus, query = utils.is_corpus(query)
        if self.normalize:
            # self.normalize only works if the input is a plain gensim vector/corpus (as
            # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix
            # as well, but in that case assume tricks are happening and don't normalize
            # anything (self.normalize has no effect).
            if not matutils.ismatrix(query):
                if is_corpus:
                    query = [matutils.unitvec(v) for v in query]
                else:
                    query = matutils.unitvec(query)
        result = self.get_similarities(query)

        if self.num_best is None:
            return result

        # if maintain_sparsity is True, result is scipy sparse. Sort, clip the
        # topn and return as a scipy sparse matrix.
        if getattr(self, 'maintain_sparsity', False):
            return matutils.scipy2scipy_clipped(result, self.num_best)

        # if the input query was a corpus (=more documents), compute the top-n
        # most similar for each document in turn
        if matutils.ismatrix(result):
            return [matutils.full2sparse_clipped(v, self.num_best) for v in result]
        else:
            # otherwise, return top-n of the single input document
            return matutils.full2sparse_clipped(result, self.num_best)
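A short end-to-end query sketch against such an index, built here from gensim's bundled toy texts (hypothetical setup, shown only to illustrate the normalize-then-query flow):

from gensim.corpora import Dictionary
from gensim.similarities import Similarity
from gensim.test.utils import common_texts, get_tmpfile

dct = Dictionary(common_texts)
bow_corpus = [dct.doc2bow(text) for text in common_texts]
index = Similarity(get_tmpfile("index"), bow_corpus, num_features=len(dct))

query = dct.doc2bow("human computer interaction".split())
print(index[query])    # cosine similarities against every indexed document
index.num_best = 3
print(index[query])    # [(doc_id, similarity), ...] for the 3 best matches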
Example #21
    def __getitem__(self, query):
        """Get access to similarities of document/corpus `query` to all documents in the corpus.

        Using :meth:`~gensim.interfaces.SimilarityABC.get_similarities`


        Notes
        -----
        Passing a corpus as `query` (instead of a single document) can be more efficient, because the queries are processed in batches internally.

        Parameters
        ----------
        query : {list of (int, int), iterable of list of (int, int)}
            Document or corpus in BoW format.

        Returns
        -------
        {`scipy.sparse.csr.csr_matrix`, list of (int, float)}
            Similarities of the given document or corpus against this index; the return type depends on `query`.

        """
        is_corpus, query = utils.is_corpus(query)
        if self.normalize:
            # self.normalize only works if the input is a plain gensim vector/corpus (as
            # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix
            # as well, but in that case assume tricks are happening and don't normalize
            # anything (self.normalize has no effect).
            if not matutils.ismatrix(query):
                if is_corpus:
                    query = [matutils.unitvec(v) for v in query]
                else:
                    query = matutils.unitvec(query)
        result = self.get_similarities(query)

        if self.num_best is None:
            return result

        # if maintain_sparsity is True, result is scipy sparse. Sort, clip the
        # topn and return as a scipy sparse matrix.
        if getattr(self, 'maintain_sparsity', False):
            return matutils.scipy2scipy_clipped(result, self.num_best)

        # if the input query was a corpus (=more documents), compute the top-n
        # most similar for each document in turn
        if matutils.ismatrix(result):
            return [matutils.full2sparse_clipped(v, self.num_best) for v in result]
        else:
            # otherwise, return top-n of the single input document
            return matutils.full2sparse_clipped(result, self.num_best)
Example #22
def reject_words_1(A, B, model = model):
  '''Takes two **LISTS OF WORDS** and
  returns most_similar results for the words in A, while rejecting words whose meanings are closer to B.
  Seems to work better than simply passing B in as negative words.
  ''' 
  in_words = A+B
  basic_word = [model[each] for each in A]
  reject_word = [model[each] for each in B]
  basic_mean = matutils.unitvec(array(basic_word).mean(axis=0)).astype(REAL)
  reject_mean = matutils.unitvec(array(reject_word).mean(axis=0)).astype(REAL)
  r = reject(basic_mean, reject_mean)
  dists = np.linalg.linalg.dot(model.syn0norm, r)
  best  = matutils.argsort(dists, topn = 500, reverse = True)
  result = [(model.index2word[sim], float(dists[sim])) for sim in best if model.index2word[sim] not in in_words]
  return result
Example #23
def wordInfluenceOnTopics(model, noOfWords = 25):
    with open ('../Data/topic_words.txt', 'w') as fout:
        for t in range(model.K):
            fout.write ('================ TOPIC: %s ==============\n'% t)
            pq = PriorityQueue()
            for v in range(len(model.vocab)):
                word = model.index2word[v]
                if(('SENT' not in word) and ('TOPIC' not in word)):
                    vec_word = model.word_impact[v][t]
                    similarity = dot(matutils.unitvec(vec_word), matutils.unitvec(model['TOPIC_'+str(t)]))
                    pq.put((similarity, word))
            for i in range(noOfWords):
#                print pq.get()
                fout.write(str(pq.get()))
                fout.write('\n')
Example #24
    def find_instances(self, patterns, instances, child_conn):
        updated_patterns = list()
        candidate_tuples = list()
        while True:
            try:
                t = instances.get_nowait()
                if instances.qsize() % 500 == 0:
                    sys.stdout.write(
                        str(multiprocessing.current_process()) +
                        " Instances to process: " +
                        str(instances.qsize())+'\n')
                    sys.stdout.flush()

                # measure similarity towards every extraction pattern
                max_similarity = 0
                pattern_best = None
                for p in patterns:
                    good = 0
                    bad = 0
                    if self.config.alpha == 0 and self.config.gamma == 0:
                        for p_bet_v in list(p.bet_uniques_vectors):
                            if t.bet_vector is not None and p_bet_v is not None:
                                score = dot(
                                    matutils.unitvec(t.bet_vector),
                                    matutils.unitvec(asarray(p_bet_v))
                                )
                                if score >= self.config.threshold_similarity:
                                    good += 1
                                else:
                                    bad += 1

                    if good > bad:
                        p.update_selectivity(t, self.config)
                        if score > max_similarity:
                            max_similarity = score
                            pattern_best = p

                # if its above a threshold associated the pattern with it
                if max_similarity >= self.config.threshold_similarity:
                    candidate_tuples.append((t, pattern_best, max_similarity))

            except queue.Empty:
                print(multiprocessing.current_process(), "Queue is Empty")
                for p in patterns:
                    updated_patterns.append(p)
                pid = multiprocessing.current_process().pid
                child_conn.send((pid, updated_patterns, candidate_tuples))
                break
Example #25
 def train(self, read_article_ids = None, unread_article_ids = None):
     #Load user feedback if needed
     if read_article_ids is None:
         read_article_ids = (r.article.id for r in ReadArticleFeedback.objects(user_id = self.user.id).only("article"))
         
     user_feedback = Article.objects(id__in = read_article_ids)
     
     #TODO: cluster feedback articles and save more than one profile
     
     num_loaded_articles = 0
     centroid = numpy.zeros(self.num_features_, dtype=numpy.float32)
     
     for article in user_feedback:
         try:
             article_features_as_full_vec = self.get_features(article)
         except Exception as inst:
             logger.error("Could not get features for article %s: %s" %
                          (article.id, inst))
             continue
         
         #do we need this?
         tmp_doc = matutils.unitvec(article_features_as_full_vec)
         
         #add up tmp_doc
         centroid = numpy.add(centroid, tmp_doc)
         num_loaded_articles += 1 
         
     #average each element
     if num_loaded_articles != 0:
         centroid = centroid / num_loaded_articles
         
     centroid = matutils.full2sparse(centroid)
     
     #set user model data
     self.user_model_features = [centroid]
Example #26
def mean_word_vecs(model, positive=[], negative=[], skip_unknown=False):
    '''
    Compute a vector by adding up the given words, using a gensim Word2Vec model.
    This code is based on gensim.Word2Vec.most_similar.
    Returns None if none of the words are in the vocabulary.
    '''
    model.init_sims()

    # add weights for each word, if not already present; default to 1.0 for
    # positive and -1.0 for negative words
    positive = [(word, 1.0) for word in positive]
    negative = [(word, -1.0) for word in negative]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, numpy.ndarray):
            mean.append(weight * word)
        elif word in model.vocab:
            mean.append(weight * model.syn0norm[model.vocab[word].index])
            #all_words.add(model.vocab[word].index)
        elif not skip_unknown:
            words = tools.word_segmenter_ja(word, np=False)
            words = [w for w in words if len(w.strip()) > 0]
            mean_ = mean_word_vecs(model, positive=words, skip_unknown=True)
            if mean_ is not None:
                mean.append(weight * mean_)
            #raise KeyError("word '%s' not in vocabulary" % word)

    if not mean:
        #raise ValueError("cannot compute similarity with no input")
        return None

    mean = matutils.unitvec(numpy.array(mean).mean(axis=0)).astype(numpy.float32)
    return mean
Example #27
    def most_similar(self, positive=[], negative=[], topn=10):


        if isinstance(positive, string_types) and not negative:
            # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
            positive = [positive]

        # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
        positive = [(word, 1.0) if isinstance(word, string_types + (ndarray,))
                                else word for word in positive]
        negative = [(word, -1.0) if isinstance(word, string_types + (ndarray,))
                                 else word for word in negative]

        # compute the weighted average of all words
        all_words, mean = set(), []
        for word, weight in positive + negative:
            if isinstance(word, ndarray):
                mean.append(weight * word)
            elif word in self.vocab:
                mean.append(weight * self.syn0norm[self.vocab[word].index])
                all_words.add(self.vocab[word].index)
            else:
                raise KeyError("word '%s' not in vocabulary" % word)
        if not mean:
            raise ValueError("cannot compute similarity with no input")
        mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

        dists = dot(self.syn0norm, mean)
        if not topn:
            return dists
        best = argsort(dists)[::-1][:topn + len(all_words)]
        # ignore (don't return) words from the input
        result = [(self.index2word[sim], float(dists[sim]), self.syn0[sim]) for sim in best if sim not in all_words]
        return result[:topn]
Example #28
    def __getitem__(self, bow, eps=1e-12):
        """
        Return esa representation of the input vector and/or corpus.
        
        bow should already be weights, e.g. with TF-IDF
        """
        # if the input vector is in fact a corpus, return a transformed corpus 
        # as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        #use corpus as interpreter matrix
        #simply multiply feature vector of input with corpus matrix
        #to get the weight of the concept
        vector = numpy.dot(matutils.sparse2full(bow, self.num_features), self.corpus)

        #normalize
        vector = matutils.unitvec(vector)

        # make sure there are no explicit zeroes in the vector (must be sparse)
        vector = [(concept_id, weight)
                  for concept_id, weight
                  in enumerate(vector)
                  if abs(weight) > eps]
        return vector
Example #29
    def __getitem__(self, bow):
        """Get log entropy representation of the input vector and/or corpus.

        Parameters
        ----------
        bow : list of (int, int)
            Document in BoW format.

        Returns
        -------
        list of (int, float)
            Log-entropy vector for passed `bow`.

        """
        # if the input vector is in fact a corpus, return a transformed corpus
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge)
        vector = [
            (term_id, math.log(tf + 1) * self.entr.get(term_id))
            for term_id, tf in bow
            if term_id in self.entr
        ]
        if self.normalize:
            vector = matutils.unitvec(vector)
        return vector
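A minimal usage sketch of the log-entropy transform above; with the default normalize=True, the output vector is passed through matutils.unitvec exactly as shown:

from gensim.corpora import Dictionary
from gensim.models import LogEntropyModel

docs = [["human", "interface", "computer"],
        ["survey", "user", "computer", "system", "response"]]
dct = Dictionary(docs)
corpus = [dct.doc2bow(d) for d in docs]
log_ent = LogEntropyModel(corpus)   # normalize=True by default
print(log_ent[corpus[0]])           # log-entropy weights scaled to unit L2 norm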
Example #30
    def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None,
                 num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False):
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize
        self.maintain_sparsity = maintain_sparsity

        if corpus is not None:
            logger.info("creating sparse index")

            # iterate over input corpus, populating the sparse index matrix
            try:
                # use the more efficient corpus generation version, if the input
                # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
                num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
                logger.debug("using efficient sparse index creation")
            except AttributeError:
                # no MmCorpus, use the slower version (or maybe user supplied the
                # num_* params in constructor)
                pass
            if num_features is not None:
                # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity
                num_terms = num_features
            if num_terms is None:
                raise ValueError("refusing to guess the number of sparse features: specify num_features explicitly")
            corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else
                      (matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else
                       matutils.unitvec(v)) for v in corpus)
            self.index = matutils.corpus2csc(
                corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz,
                dtype=dtype, printprogress=10000).T

            # convert to Compressed Sparse Row for efficient row slicing and multiplications
            self.index = self.index.tocsr()  # currently no-op, CSC.T is already CSR
            logger.info("created %r", self.index)
Example #31
    def word_averaging(self, words):
        vecs = []

        for word in words:
            if isinstance(word, np.ndarray):
                vecs.append(word)
            elif word in self.wv.wv.vocab:
                id = self.wv.wv.vocab[word].index
                vecs.append(self.wv.wv.syn0norm[id])

        if not vecs:
            logging.getLogger(self.__class__.__name__).warning(
                "cannot compute similarity : %s", words)
            # FIXME: remove these examples in pre-processing
            return np.zeros(self.wv.layer1_size, )

        vec = np.array(vecs).mean(axis=0)
        vec = unitvec(vec).astype(np.float32)
        return vec
Example #32
def combTest(model, w1, w2, pDict):
    rstStandard = model.similarity(w1, w2)  # reference answer

    vec1c = model[w1] if len(w1) < 5 else vecMean(model, [i for i in w1 if i in model])
    vec2c = model[w2] if len(w2) < 5 else vecMean(model, [i for i in w2 if i in model])
    rstc = np.dot(matutils.unitvec(vec1c), matutils.unitvec(vec2c))

    vec1s = model[w1] if len(w1) < 5 else vecMean(model, pDict[w1])
    vec2s = model[w2] if len(w2) < 5 else vecMean(model, pDict[w2])
    rsts = np.dot(matutils.unitvec(vec1s), matutils.unitvec(vec2s))

    # The two K-Means variants follow
    vec1kc = model[w1] if len(w1) < 5 else vecKmean(model, w1, [i for i in w1 if i in model])
    vec2kc = model[w2] if len(w2) < 5 else vecKmean(model, w2, [i for i in w2 if i in model])
    rstkc = np.dot(matutils.unitvec(vec1kc), matutils.unitvec(vec2kc))

    vec1ks = model[w1] if len(w1) < 5 else vecKmean(model, w1, pDict[w1])
    vec2ks = model[w2] if len(w2) < 5 else vecKmean(model, w2, pDict[w2])
    rstks = np.dot(matutils.unitvec(vec1ks), matutils.unitvec(vec2ks))

    return rstStandard, rstc, rsts, rstkc, rstks
Example #33
def _most_similar(self: WordEmbeddingsKeyedVectors, author, input_word):
    topn = 10

    positive = [input_word]

    self.init_sims()

    # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
    positive = [
        (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word
        for word in positive
    ]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive:
        if isinstance(word, ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * self.word_vec(word, use_norm=True))
            index = encode_adj(word, author)

            if index >= 0:
                all_words.add(index)
    if not mean:
        raise ValueError("cannot compute similarity with no input")
    mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

    limited = self.vectors_norm
    dists = dot(limited, mean)

    if not topn:
        return dists
    best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
    # ignore (don't return) words from the input
    result = [(sim, float(dists[sim])) for sim in best if sim not in all_words]

    adj_res = [
        (decode(r[0], author), r[1]) for r in result
        if encode_adj(decode(r[0], author), author) >= 0
    ]

    return adj_res[:topn]
Example #34
def word_averaging(wv, words):
    all_words, mean = set(), []

    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.syn0norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)
            print "biswa"

    if not mean:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.layer_size,)

    mean = unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    print mean
    return mean
Example #35
    def _get_jieba_array(self, words):
        words = char_cleaner(words)
        seg_cut = jieba.lcut(words)
        seg_cut = char_list_cheaner(seg_cut)

        w2v_array = list()
        for word in seg_cut:
            try:
                similar_list = self.w2v_model[word]
                w2v_array.append(similar_list)
            except KeyError:
                continue

        if not w2v_array:
            w2v_array = [None] * self.size
        else:
            w2v_array = matutils.unitvec(np.array(w2v_array).mean(axis=0))

        return w2v_array
Example #36
    def most_similar_paragraph(self, positive=[], negative=[], topn=10):
        """
		Find the top-N most similar paragraphs.

		"""
        self.init_sims()

        if isinstance(positive, string_types) and not negative:
            # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
            positive = [positive]

        # add weights for each paragraph, if not already present; default to 1.0 for positive and -1.0 for negative paragraphs
        positive = [(paragraph, 1.0) if isinstance(paragraph, string_types +
                                                   (ndarray, )) else paragraph
                    for paragraph in positive]
        negative = [(paragraph, -1.0) if isinstance(paragraph, string_types +
                                                    (ndarray, )) else paragraph
                    for paragraph in negative]

        # compute the weighted average of all words
        all_paragraphs, mean = set(), []
        for paragraph, weight in positive + negative:
            if isinstance(paragraph, ndarray):
                mean.append(weight * paragraph)
            elif paragraph in self.paragraph_vocab:
                mean.append(weight * self.synparagraphnorm[
                    self.paragraph_vocab[paragraph].index])
                all_paragraphs.add(self.paragraph_vocab[paragraph].index)
            else:
                raise KeyError("paragraph '%s' not in vocabulary" % paragraph)
        if not mean:
            raise ValueError("cannot compute similarity with no input")
        mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

        dists = dot(self.synparagraphnorm, mean)
        if not topn:
            return dists
        best = argsort(dists)[::-1][:topn + len(all_paragraphs)]
        # ignore (don't return) words from the input
        result = [(self.index2paragraph[sim], float(dists[sim]), sim)
                  for sim in best if sim not in all_paragraphs]
        return result[:topn]
Example #37
    def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256, corpus_len=None):
        """
        `num_features` is the number of features in the corpus (will be determined
        automatically by scanning the corpus if not specified). See `Similarity`
        class for description of the other parameters.

        """
        if num_features is None:
            logger.warning("scanning corpus to determine the number of features (consider setting `num_features` explicitly)")
            num_features = 1 + utils.get_max_id(corpus)

        self.num_features = num_features
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize
        if corpus_len is None:
            corpus_len = len(corpus)

        if corpus is not None:
            if self.num_features <= 0:
                raise ValueError(
                    "cannot index a corpus with zero features (you must specify either `num_features` "
                    "or a non-empty corpus in the constructor)"
                )
            logger.info("creating matrix with %i documents and %i features", corpus_len, num_features)
            self.index = numpy.empty(shape=(corpus_len, num_features), dtype=dtype)
            # iterate over corpus, populating the numpy index matrix with (normalized)
            # document vectors
            for docno, vector in enumerate(corpus):
                if docno % 1000 == 0:
                    logger.debug("PROGRESS: at document #%i/%i", docno, corpus_len)
                # individual documents may in fact be in numpy or scipy.sparse format as well.
                # it's not documented because it's not fully supported throughout.
                # the user better know what he's doing (no normalization, must
                # explicitly supply num_features etc).
                if isinstance(vector, numpy.ndarray):
                    pass
                elif scipy.sparse.issparse(vector):
                    vector = vector.toarray().flatten()
                else:
                    vector = matutils.unitvec(matutils.sparse2full(vector, num_features))
                self.index[docno] = vector
Example #38
def get_elmo_vector(sess, texts, batcher, sentence_character_ids, elmo_sentence_input, nrs):
    vectors = []

    # Create batches of data.
    sentence_ids = batcher.batch_sentences(texts)
    print('Sentences in this chunk:', len(texts), file=sys.stderr)

    # Compute ELMo representations.
    elmo_sentence_input_ = sess.run(elmo_sentence_input['weighted_op'],
                                    feed_dict={sentence_character_ids: sentence_ids})
    print('ELMo sentence input shape:', elmo_sentence_input_.shape, file=sys.stderr)

    for sentence, nr in zip(range(len(texts)), nrs):
        # query_word = texts[sentence][nr]
        # print(texts[sentence])
        query_vec = elmo_sentence_input_[sentence, nr, :]
        query_vec = unitvec(query_vec)
        # print('Vector shape:', query_vec.shape)
        vectors.append(query_vec)
    return vectors
Example #39
    def doesnt_match(self, words):
        """
        Which word from the given list doesn't go with the others?

        Example::

          >>> trained_model.doesnt_match("breakfast cereal dinner lunch".split())
          'cereal'

        """
        self.init_sims()

        words = [word for word in words if word in self.vocab]  # filter out OOV words
        logger.debug("using words %s" % words)
        if not words:
            raise ValueError("cannot select a word from an empty list")
        vectors = vstack(self.syn0norm[self.vocab[word].index] for word in words).astype(REAL)
        mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL)
        dists = dot(vectors, mean)
        return sorted(zip(dists, words))[0][1]
Example #40
def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec):
    """
      Find the most similar sentences: dot the query sentence vector with the matrix of standard-question vectors.
    :param vec_ques: query sentence vector
    :param matrix_org_norm: normalized matrix of standard-question vectors
    :param matrix_org_index: mapping from matrix rows to question indices
    :param top_vec: number of top matches to return
    :return: list of [index, score] pairs
    """
    # Normalize the query sentence vector: scale it to unit length (the zero vector is returned unchanged).
    vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
    # Matrix dot product: dot the query vector with every sentence in the standard-question matrix.
    matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
    # Sort by similarity.
    most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)
    # Collect the index and score of the most similar standard questions.
    index_score = []
    for t in most_similar_sentence_vec_sort[:top_vec]:
        index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
    return index_score
Example #41
def generate_corpus_for_image(features, features2id):
    '''
    Create sparse vector (feature_id, weight) from visual features.
    :param features:
    :param features2id:
    :return:
    '''
    image_corpus_line = []
    values = []
    for i in range(0, 4096):
        feature_value = float(features[i])
        if feature_value != 0:
            values.append(feature_value)
            image_corpus_line.append((features2id[i], feature_value))
    # Get only X top significant elements.
    image_corpus_line = [
        x for (y, x) in sorted(zip(values, image_corpus_line), reverse=True)
    ][0:number_of_elements]
    #image_corpus_line = sorted(enumerate(image_corpus_line), key=lambda item: item[1], reverse=True)[0:number_of_elements]
    return matutils.unitvec(image_corpus_line)
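Note that matutils.unitvec accepts the sparse (feature_id, weight) format returned above as well as dense arrays; a tiny illustration:

from gensim import matutils

sparse_vec = [(0, 3.0), (5, 4.0)]
print(matutils.unitvec(sparse_vec))   # [(0, 0.6), (5, 0.8)] -- scaled to unit L2 norm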
Example #42
def updated_normalize(x, n_n):
    """Normalizes the final tf-idf value according to the value of `n_n`.

    Parameters
    ----------
    x : numpy.ndarray
        Input array
    n_n : {'n', 'c'}
        Parameter that decides the normalizing function to be used.

    Returns
    -------
    numpy.ndarray
        Normalized array.

    """
    if n_n == "n":
        return x
    elif n_n == "c":
        return matutils.unitvec(x)
Example #43
    def doesnt_match(self, docs):
        """
        Which doc from the given list doesn't go with the others?

        (TODO: Accept vectors of out-of-training-set docs, as if from inference.)

        """
        self.init_sims()

        docs = [
            doc for doc in docs if doc in self.doctags or 0 <= doc < self.count
        ]  # filter out unknowns
        logger.debug("using docs %s" % docs)
        if not docs:
            raise ValueError("cannot select a doc from an empty list")
        vectors = vstack(self.doctag_syn0norm[self._int_index(doc)]
                         for doc in docs).astype(REAL)
        mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL)
        dists = dot(vectors, mean)
        return sorted(zip(dists, docs))[0][1]
Example #44
def get_w2v_vectors(text, merge_vectors=False):
    '''
    Translates text into vector using word2vec.

    Args:
        text: str
        merge_vectors: bool, return sentence by sentence vectors or their mean

    Returns:
        numpy.ndarray (if merge_vectors)
        OR dict, where key (str) is a sentence, value (numpy.ndarray) is a vector
    '''
    sentences = preprocessing(text, stopwords=stopwords.words('russian'))
    vectors = [get_w2v_vector(sentences)]
    if vectors == []:
        return None
    if merge_vectors:
        return matutils.unitvec(np.array(vectors).mean(axis=0)).astype(
            np.float32)
    return {sentence: vector for sentence, vector in zip(text, vectors)}
Example #45
    def calc_norm(self, corpus):
        """Calculate the norm by calling :func:`~gensim.matutils.unitvec` with the norm parameter.

        Parameters
        ----------
        corpus : iterable of iterable of (int, number)
            Input corpus.

        """
        logger.info("Performing %s normalization...", self.norm)
        norms = []
        numnnz = 0
        docno = 0
        for bow in corpus:
            docno += 1
            numnnz += len(bow)
            norms.append(matutils.unitvec(bow, self.norm))
        self.num_docs = docno
        self.num_nnz = numnnz
        self.norms = norms
Example #46
def raw2ppmi(cooccur, k_shift=1.0):
    # following lines a bit tedious, as we try to avoid making temporary copies of the (large) `cooccur` matrix
    marginal_word = cooccur.sum(axis=1)
    marginal_context = cooccur.sum(axis=0)
    cooccur /= marginal_word[:, None]  
    cooccur /= marginal_context 
    cooccur *= marginal_word.sum() 
    np.log(cooccur, out=cooccur) 

    # Shifting PMI scores by log(k)
    cooccur -= np.log(k_shift) 

    #Clipping values to be non-negative
    cooccur.clip(0.0, out=cooccur)

    #Normalize PPMI vectors to unit length
    for i, vec in enumerate(cooccur):
        cooccur[i] = matutils.unitvec(vec)

    return cooccur
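A tiny numeric check of the final normalization step above, on hypothetical co-occurrence counts; after raw2ppmi, every non-zero row should have unit L2 norm:

import numpy as np

cooccur = np.array([[1.0, 2.0], [3.0, 4.0]])
ppmi = raw2ppmi(cooccur, k_shift=1.0)
print(np.linalg.norm(ppmi, axis=1))   # approximately [1. 1.]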
Example #47
    def _to_csv(df, col, size):
        file_name = '{col}_w2v.csv'.format(col=col)
        file_path = os.path.join(TEMP_DATA_PATH, file_name)
        if os.path.exists(file_path):
            os.remove(file_path)

        columns = ['{}_w2v_{}'.format(col, i) for i in range(size)]
        none_index_set = set()

        with open(file_path, 'a', encoding='utf-8') as f:
            # write columns
            f.write(','.join(columns) + '\n')

            for idx, item in tqdm(df[col].items()):
                if item == 'null':
                    item_list = [''] * size
                    none_index_set.add(idx)
                elif not item:
                    item_list = [''] * size
                    none_index_set.add(idx)
                else:
                    seg_cut = jieba.lcut(item)
                    seg_cut = char_list_cheaner(seg_cut)

                    w2v_array = list()
                    for word in seg_cut:
                        try:
                            similar_list = w2v_model[word]
                            w2v_array.append(similar_list)
                        except KeyError:
                            pass

                    if not w2v_array:
                        item_list = [''] * size
                        none_index_set.add(idx)
                    else:
                        item_list = matutils.unitvec(np.array(w2v_array).mean(axis=0))

                f.write(','.join(map(str, item_list)) + '\n')

        return none_index_set
Example #48
def get_elmo_vector_average(sess, texts, batcher, sentence_character_ids, elmo_sentence_input):
    vectors = []

    # Create batches of data.
    sentence_ids = batcher.batch_sentences(texts)
    print('Sentences in this chunk:', len(texts), file=sys.stderr)

    # Compute ELMo representations.
    elmo_sentence_input_ = sess.run(elmo_sentence_input['weighted_op'],
                                    feed_dict={sentence_character_ids: sentence_ids})
    print('ELMo sentence input shape:', elmo_sentence_input_.shape, file=sys.stderr)

    for sentence in range(len(texts)):
        sent_vec = np.zeros((elmo_sentence_input_.shape[1], elmo_sentence_input_.shape[2]))
        for word_vec in enumerate(elmo_sentence_input_[sentence, :, :]):
            sent_vec[word_vec[0], :] = word_vec[1]
        semantic_fingerprint = np.sum(sent_vec, axis=0)
        semantic_fingerprint = np.divide(semantic_fingerprint, sent_vec.shape[0])
        query_vec = unitvec(semantic_fingerprint)
        vectors.append(query_vec)
    return vectors
Example #49
    def _get_jieba_array(self, words, size=300):
        '''
        Segment the input words with jieba, look up the corresponding word vectors, and use their mean as the vector for the input.
        '''
        seg_cut = jieba.lcut(words)
        seg_cut = char_list_cheaner(seg_cut)

        w2v_array = list()
        for word in seg_cut:
            try:
                similar_list = self.w2v_model[word]
                w2v_array.append(similar_list)
            except KeyError:
                continue

        if not w2v_array:
            w2v_array = [None] * size
        else:
            w2v_array = matutils.unitvec(np.array(w2v_array).mean(axis=0))

        return w2v_array
Example #50
    def shift_clip_pmi(pmimtr, k_shift=1.0):
        """
        Turns a PMI matrix into a shifted PPMI matrix by shifting all values by -log(k) and then clipping negative
        values to 0.

        :param pmimtr: The matrix of PMI values.
        :param k_shift: The shift factor.
        :return: A PPMI matrix.
        """

        logger.info("shifting PMI scores by log(k) with k=%s" % (k_shift, ))
        pmimtr -= np.log(k_shift)  # shifted PMI = log(#(w, c) * D / (#w * #c)) - log(k)

        logger.info("clipping PMI scores to be non-negative PPMI")
        pmimtr.clip(0.0, out=pmimtr)  # SPPMI = max(0, log(#(w, c) * D / (#w * #c)) - log(k))

        logger.info("normalizing PPMI word vectors to unit length")
        for i, vec in enumerate(pmimtr):
            pmimtr[i] = matutils.unitvec(vec)

        return matutils.corpus2csc(matutils.Dense2Corpus(pmimtr, documents_columns=False)).T
Example #51
    def mostSimilarSent(self, sent, query, allDoc, topn):

        words2 = query.split()

        try:
            words2.remove(u'\ufeff')
        except ValueError:
            pass

        v2 = numpy.array([self[word] for word in words2], dtype=object)
        mean = matutils.unitvec(array(v2).mean(axis=0))

        print "starting search dist"
        dists = dot(allDoc[0:None], mean)
        best = matutils.argsort(dists, topn, reverse=True)
        print "done!"
        result = []
        for index in best:
            result.append(sent[index])

        return result
Example #52
def word_averaging(wv, words):
    """Calculate average word vectors.

    Args:
        wv: The keyed vectors instance to use to get word vectors as :class:`gensim.models.keyedvectors.WordEmbeddingsKeyedVectors`.
        words: The words to transform into vectors as :class:`list` of :class:`str`.

    Returns:
        The averaged vector as :class:`list` of :class:`float`.
    """
    all_words, mean = set(), []

    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.word_vec(word, use_norm=True))
            all_words.add(wv.vocab[word].index)

    mean = unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean
Example #53
    def most_similar(self, positive=[], negative=[], topn=10):

        if isinstance(positive, string_types) and not negative:
            # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
            positive = [positive]

        # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
        positive = [
            (word,
             1.0) if isinstance(word, string_types + (ndarray, )) else word
            for word in positive
        ]
        negative = [
            (word,
             -1.0) if isinstance(word, string_types + (ndarray, )) else word
            for word in negative
        ]

        # compute the weighted average of all words
        all_words, mean = set(), []
        for word, weight in positive + negative:
            if isinstance(word, ndarray):
                mean.append(weight * word)
            elif word in self.vocab:
                mean.append(weight * self.syn0norm[self.vocab[word].index])
                all_words.add(self.vocab[word].index)
            else:
                raise KeyError("word '%s' not in vocabulary" % word)
        if not mean:
            raise ValueError("cannot compute similarity with no input")
        mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

        dists = dot(self.syn0norm, mean)
        if not topn:
            return dists
        best = argsort(dists)[::-1][:topn + len(all_words)]
        # ignore (don't return) words from the input
        result = [(self.index2word[sim], float(dists[sim]), self.syn0[sim])
                  for sim in best if sim not in all_words]
        return result[:topn]
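Unlike gensim's built-in `most_similar`, this variant also returns the raw vector of each hit. For comparison, the stock KeyedVectors call looks like this (the model path is an assumption):

from gensim.models import KeyedVectors

kv = KeyedVectors.load("vectors.kv")  # hypothetical path
for word, score in kv.most_similar(positive=["king", "woman"], negative=["man"], topn=5):
    print(word, score)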
Example #54
0
    def similarity_3_contexts(self, p, t):
        (bef, bet, aft) = (0, 0, 0)

        if t.bef_vector is not None and p.bef_vector is not None:
            bef = dot(matutils.unitvec(t.bef_vector),
                      matutils.unitvec(p.bef_vector))

        if t.bet_vector is not None and p.bet_vector is not None:
            bet = dot(matutils.unitvec(t.bet_vector),
                      matutils.unitvec(p.bet_vector))

        if t.aft_vector is not None and p.aft_vector is not None:
            aft = dot(matutils.unitvec(t.aft_vector),
                      matutils.unitvec(p.aft_vector))

        return self.config.alpha * bef + self.config.beta * bet + self.config.gamma * aft
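A toy sketch of the weighted combination, with made-up vectors and weights standing in for the tuple/pattern contexts and `self.config`:

import numpy as np
from gensim import matutils

alpha, beta, gamma = 0.2, 0.6, 0.2  # illustrative weights, not the real config values
bef_t, bef_p = np.random.rand(50), np.random.rand(50)
bet_t, bet_p = np.random.rand(50), np.random.rand(50)
aft_t, aft_p = np.random.rand(50), np.random.rand(50)

score = (alpha * np.dot(matutils.unitvec(bef_t), matutils.unitvec(bef_p))
         + beta * np.dot(matutils.unitvec(bet_t), matutils.unitvec(bet_p))
         + gamma * np.dot(matutils.unitvec(aft_t), matutils.unitvec(aft_p)))
print(score)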
Example #55
0
    def __init__(self,
                 corpus,
                 num_best=None,
                 chunksize=500,
                 dtype=numpy.float32,
                 num_terms=None,
                 num_docs=None,
                 num_nnz=None):
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize

        if corpus is not None:
            logger.info("creating sparse index")

            # iterate over input corpus, populating the sparse index matrix
            try:
                # use the more efficient corpus generation version, if the input
                # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
                num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
                logger.debug("using efficient sparse index creation")
            except AttributeError:
                # no MmCorpus, use the slower version (or maybe user supplied the
                # num_* params in constructor)
                pass
            corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else
                      (matutils.full2sparse(v) if isinstance(v, numpy.ndarray)
                       else matutils.unitvec(v)) for v in corpus)
            self.index = matutils.corpus2csc(corpus,
                                             num_terms=num_terms,
                                             num_docs=num_docs,
                                             num_nnz=num_nnz,
                                             dtype=dtype,
                                             printprogress=10000).T

            # convert to Compressed Sparse Row for efficient row slicing and multiplications
            self.index = self.index.tocsr()  # currently a no-op: transposing CSC already yields CSR
            logger.info("created %r" % self.index)
def get_w2v_vector(sentence):
    '''
    Translates sentence into vector using word2vec.

    Args:
        sentence: list of strings

    Returns:
        numpy.ndarray
    '''
    all_words, mean = set(), []

    for word in sentence:
        if word in w2v_model.wv.vocab:
            mean.append(w2v_model.wv.word_vec(word))
            all_words.add(w2v_model.wv.vocab[word].index)

    if not mean:
        return np.zeros(w2v_model.layer1_size)

    mean = matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean
Example #57
0
    def doesnt_match(self, words):
        """
        Which word from the given list doesn't go with the others?

        Example::

          >>> trained_model.doesnt_match("breakfast cereal dinner lunch".split())
          'cereal'

        """
        self.init_sims()

        used_words = [word for word in words if word in self]
        if len(used_words) != len(words):
            ignored_words = set(words) - set(used_words)
            logger.warning("vectors for words %s are not present in the model, ignoring these words", ignored_words)
        if not used_words:
            raise ValueError("cannot select a word from an empty list")
        vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
        mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL)
        dists = dot(vectors, mean)
        return sorted(zip(dists, used_words))[0][1]
Example #58
0
    def get_synsets_of_rule_parse(self, dp, use_offset=True, convert=False):
        try:
            context = ' '.join(dp['captions'])
        except TypeError:  # NaN entries in the captions list
            context = ' '.join(
                [item for item in dp['captions'] if isinstance(item, str)])
        tokens = self.tokenize(context)
        vecs = self.get_vecs_for_BOW(context)
        context_vec = normalize_vec(array(vecs).mean(axis=0))
        context_vec = matutils.unitvec(context_vec)
        pos_tagged_context_dict = {
            self.lemmatize(k): v
            for k, v in self.tagger.tag(tokens)
        }  # <token>: <pos>
        new_atoms = []
        unique_entities = set([x for atom in dp['atoms'] for x in atom])
        entity_id_dict = {}
        for entity in unique_entities:
            try:
                pt_pos = pos_tagged_context_dict[entity]
            except KeyError:
                pt_pos = self.tagger.tag([entity])[0][-1]
            pos = self.pos_map[pt_pos]
            synset = self.link_word_to_wn(entity,
                                          context_vec,
                                          context_as_vec=True,
                                          pos=pos)
            #synset = self.link_word_to_wn(entity,context,pos=pos)
            offset = None if synset is None else synset.offset()
            entity_id_dict[entity] = offset if use_offset else synset
        for atom in dp['atoms']:
            new_atom = []
            for entity in atom:
                new_atom.append((entity, entity_id_dict[entity]))
            new_atoms.append(new_atom)

        return convert_logical_caption(new_atoms) if convert else new_atoms
Example #59
0
    def saveSentenceVect(self, sent, loc):
        # be patient! this operation can take a long time
        allDoc = []
        print("starting to calculate vectors")
        for phrase, title in sent:
            words1 = phrase.split()

            try:
                words1.remove(u'\ufeff')
            except ValueError:
                pass

            v1 = numpy.array([self[word] for word in words1], dtype=object)
            allDoc.append(matutils.unitvec(v1.mean(axis=0)))
        print("done!")

        #numpy.savez('obj/vect.npz', *allDoc)
        print("saving to a file...")
        numpy.save(loc, allDoc)  # writes loc + ".npy"
        print("done!")
    def classify(self, instance):
        """Classify a text instance

        Returns:
            distribution: dict mapping each class value to a normalized similarity score
        """

        distribution = {}

        words = instance.text.split()
        test_vec = self.model.infer_vector(words, steps=self.infer_num_passes)
        test_vec = unitvec(test_vec)
        for class_value, training_instances in self.training_data.items():
            best_score = 0
            for training_instance in training_instances:
                score = np.dot(test_vec, training_instance)
                if score > best_score:
                    best_score = score

            distribution[class_value] = max(0, best_score)

        return self._normalize_distribution(distribution)
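A hypothetical end-to-end use of the same nearest-neighbour scoring, assuming a trained `Doc2Vec` model and the gensim 3.x `infer_vector` API used above (newer releases rename `steps` to `epochs`); the model path, class labels and training texts are all made up:

import numpy as np
from gensim.models import Doc2Vec
from gensim.matutils import unitvec

model = Doc2Vec.load("doc2vec.model")  # hypothetical path

# unit-length training vectors per class, matching what classify() dots against
training_data = {
    "sports": [unitvec(model.infer_vector("the match went to extra time".split()))],
    "politics": [unitvec(model.infer_vector("parliament passed the new bill".split()))],
}

test_vec = unitvec(model.infer_vector("a late goal decided the final".split()))
scores = {label: max(0.0, max(np.dot(test_vec, v) for v in vecs))
          for label, vecs in training_data.items()}
print(max(scores, key=scores.get))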