Exemple #1
 def text_similarity(self, t1, t2):
     """Return the cosine similarity of two texts under the TF-IDF model.

     Both texts are projected with ``self.text2model``, converted with
     ``matutils.any2sparse`` and compared with ``matutils.cossim``.

     :param t1: first text, in whatever form ``self.text2model`` accepts
     :param t2: second text, same form as ``t1``
     :returns: the cosine similarity when ``self._model == 'tfidf'``,
         otherwise ``None``
     """
     # NOTE(review): the original repeated the three statements below after
     # the ``return``, where they could never execute. That looks like a lost
     # ``else`` branch; the observable behaviour (None for non-TF-IDF models)
     # is kept here -- confirm what other models are supposed to return.
     if self._model != 'tfidf':
         return None
     t1_vec = matutils.any2sparse(self.text2model(t1))
     t2_vec = matutils.any2sparse(self.text2model(t2))
     return matutils.cossim(t1_vec, t2_vec)
Exemple #3
def make_scores_for_sample():
    """Score every (query, document) pair of ``./sample.csv`` into ``./submission.csv``.

    Loads the saved doc2vec model, the unigram and bigram TF-IDF models and
    their dictionaries, then writes one ``QueryId, DocumentId, Score`` row per
    sample row. The score is a fixed weighted blend:

        (2 * content bigram TF-IDF cosine
         + 2 * title unigram TF-IDF cosine
         + 1 * content unigram TF-IDF cosine
         + 0.5 * doc2vec similarity) / 5.5

    The title bigram score was computed by the earlier version but never
    entered the blend, so it is no longer computed.
    """
    doc2vec_model = doc2vec.Doc2Vec.load('doc2vec_weigths')
    logging.info('doc2vec loaded')
    tfidf_unigram_model = TfidfModel.load('tfidf_unigram')
    logging.info('tfidf unigram loaded')
    tfidf_bigram_model = TfidfModel.load('tfidf_bigram')
    logging.info('tfidf bigram loaded')
    d1 = corpora.Dictionary.load('./dict_1.gensim')
    logging.info('dict1 loaded')
    d2 = corpora.Dictionary.load('./dict_2.gensim')
    logging.info('dict2 loaded')
    queries = pd.read_csv('./queries_norm.tsv', sep='\t', header=None, index_col=0)
    sample = pd.read_csv('./sample.csv', sep=',').sort_values(by=['DocumentId'])

    def _bigrams(words):
        # Adjacent word pairs joined by a tab, the token format fed to d2.
        return ['\t'.join(pair) for pair in zip(words[:-1], words[1:])]

    # newline='' lets the csv module control line endings itself; without it
    # rows get an extra '\r' on platforms that translate '\n'.
    with open('./submission.csv', 'w', newline='') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(['QueryId', 'DocumentId', 'Score'])
        for _, row in tqdm(sample.iterrows(), total=len(sample)):
            query_id = row['QueryId']
            doc_id = row['DocumentId']
            doc2vec_score = doc2vec_model.docvecs.similarity('DOC_%d' % doc_id, 'QUERY_%d' % query_id)
            doc = get_doc(doc_id)
            # NOTE(review): str() of a .loc[] row yields the pandas repr
            # (index labels and dtype included), not just the query text --
            # kept as in the original, confirm this is intended.
            query = str(queries.loc[query_id])

            title_tokens = str(doc[1]).split()
            content_tokens = str(doc[2]).split()
            query_tokens = query.split()

            content_bigrams = tfidf_bigram_model[d2.doc2bow(_bigrams(content_tokens))]
            query_bigrams = tfidf_bigram_model[d2.doc2bow(_bigrams(query_tokens))]

            title_unigrams = tfidf_unigram_model[d1.doc2bow(title_tokens)]
            content_unigrams = tfidf_unigram_model[d1.doc2bow(content_tokens)]
            query_unigrams = tfidf_unigram_model[d1.doc2bow(query_tokens)]

            tfidf_title_score_uni = matutils.cossim(title_unigrams, query_unigrams)
            tfidf_content_score_uni = matutils.cossim(content_unigrams, query_unigrams)
            tfidf_content_score_bi = matutils.cossim(content_bigrams, query_bigrams)

            score = (2 * tfidf_content_score_bi + 2 * tfidf_title_score_uni + tfidf_content_score_uni + 0.5 * doc2vec_score) / 5.5
            writer.writerow([query_id, doc_id, score])
Exemple #4
    def similarity(self, t, extraction_pattern):
        """Weighted similarity between tuple ``t`` and an extraction pattern.

        For each context (before / between / after) the tuple's vector is
        compared with the pattern's matching centroid via ``cossim``; a
        context whose tuple vector is ``None`` contributes 0. The three
        values are combined with ``self.config.alpha``, ``beta`` and
        ``gamma`` respectively.
        """
        context_scores = []
        for context in ('bef', 'bet', 'aft'):
            vector = getattr(t, context + '_vector')
            if vector is None:
                context_scores.append(0)
                continue
            centroid = getattr(extraction_pattern, 'centroid_' + context)
            context_scores.append(cossim(vector, centroid))

        bef, bet, aft = context_scores
        return self.config.alpha * bef + self.config.beta * bet + self.config.gamma * aft
Exemple #5
    def similarity(self, t, extraction_pattern):
        """Weighted similarity between tuple ``t`` and an extraction pattern.

        A context (before / between / after) is scored with ``cossim`` only
        when both the tuple's vector and the pattern's centroid are not
        ``None``; otherwise it contributes 0. The result is
        ``alpha*bef + beta*bet + gamma*aft`` using the weights in
        ``self.config``.
        """
        context_scores = []
        for context in ('bef', 'bet', 'aft'):
            vector = getattr(t, context + '_vector')
            if vector is None:
                context_scores.append(0)
                continue
            # The centroid is only looked up once the tuple vector exists.
            centroid = getattr(extraction_pattern, 'centroid_' + context)
            if centroid is None:
                context_scores.append(0)
                continue
            context_scores.append(cossim(vector, centroid))

        bef, bet, aft = context_scores
        return self.config.alpha*bef + self.config.beta*bet + self.config.gamma*aft
Exemple #6
def test(file_name):
    """Write the 10 most similar corpus entries for every query line.

    Each line of ``file_name`` must hold exactly two tab-separated fields: an
    identifier and a space-separated token string. The tokens are turned into
    a TF-IDF vector and compared with every document of the stored corpus;
    the ``k`` best ``tag:score`` pairs are written, ``&&``-joined, to
    ``./result/tfidf_res``.

    :param file_name: path of the GBK-encoded query file
    :raises Exception: when a line does not have exactly two fields
    """
    dictionary = corpora.Dictionary.load('./temp_tfidf/temp_dict')
    corpus = corpora.MmCorpus('./temp_tfidf/temp_mm')
    tfidf = models.TfidfModel.load('./temp_tfidf/tfidf_value')
    tags = load_tags('./temp_tfidf/tags')

    k = 10  # number of candidates kept per query

    # NOTE(review): line.decode()/str.encode() round-trips are Python 2
    # idioms; kept unchanged because the rest of the file's runtime is not
    # visible from here.
    with open(file_name, 'r') as f, open('./result/tfidf_res', 'w') as outf:
        corpus_tfidf = tfidf[corpus]
        for line in f:
            items = line.decode('gbk').strip().split('\t')
            if len(items) != 2:
                raise Exception('error')
            qes = items[1].split(' ')
            new_tfidf = tfidf[dictionary.doc2bow(qes)]
            h = []
            for cnt, dic in enumerate(corpus_tfidf):
                s = matutils.cossim(new_tfidf, dic)
                heapq.heappush(h, (s, cnt))
                if len(h) > k:
                    # Drop the current minimum so only the k best survive.
                    # (This statement was missing, leaving an empty `if`.)
                    heapq.heappop(h)
            # h is in heap order, not sorted by score.
            candidate = '&&'.join(['%s:%s' % (tags[i], s) for (s, i) in h])
            outf.write('%s\t%s\n' %
                       (items[0].encode('gbk'), candidate.encode('gbk')))
    # Compares a profile description with the DuckDuckGo description of
    # `query` and returns the cosine similarity wrapped in a one-element
    # list; returns [-1] when either side has no usable text.
    # `data` is accepted but not used in the visible code.
    def score(query, profile, data=None):
        if not len(profile.description):
            return [-1]

        vectorspace = VectorSpace([])

        tokenized_description = LowerTokenizer.tokenize(profile.description)
        # NOTE(review): the call below is cut off -- its argument list and
        # closing parenthesis are missing from this excerpt.
        description_vector = vectorspace.vector_for_document(

        ddg_description = DuckDuckDescription.query(query.lower())

        ddg_vector = []
        if ddg_description:
            ddg_text = ddg_description['description']['text']
            ddg_tokenized = LowerTokenizer.tokenize(ddg_text)
            # NOTE(review): this call is cut off as well (arguments missing).
            ddg_vector = vectorspace.vector_for_document(

        if not len(ddg_vector):
            return [-1]

        return [cossim(description_vector, ddg_vector)]
    def order_by_tf_id_rank(self, headline, sentences, number_of_sentences):
        """Return the sentences most similar to the headline as one string.

        ``sentences`` is split with ``self.tokenizer``; every sentence is
        scored by cosine similarity against the headline's TF-IDF vector and
        the best ``number_of_sentences`` are concatenated, each followed by a
        single space.

        :param headline: headline text
        :param sentences: body text to split and rank
        :param number_of_sentences: how many top sentences to keep; when the
            running count never equals this value every sentence is returned
        :returns: the selected sentences, highest score first
        """
        headline_bow = self.vocab.doc2bow(sent2stokens_wostop(headline))
        headline_tfidf = self.tfidf_model[headline_bow]

        # NOTE(review): each sentence is compared as a raw bag-of-words vector
        # (it is never passed through self.tfidf_model) -- kept as in the
        # original, confirm this is intended.
        scored_sentences = []
        for sentence in self.tokenizer.tokenize(sentences):
            sentence_bow = self.vocab.doc2bow(sent2stokens_wostop(sentence))
            scored_sentences.append([sentence, cossim(headline_tfidf, sentence_bow)])

        sorted_sentences = sorted(scored_sentences, key=lambda scored: scored[1], reverse=True)

        sentences_string = ""
        current_sentence_number = 0
        for sentence in sorted_sentences:
            current_sentence_number += 1
            sentences_string += sentence[0] + ' '
            if current_sentence_number == number_of_sentences:
                # Restored: the `if` had lost its body, a syntax error.
                break
        return sentences_string
def score(tweet, webpage):
    """Cosine similarity between a tweet and a web page in LDA topic space.

    The tweet's ``'terms'`` are turned into a bag-of-words with the shared
    dictionary and projected through the LDA model; the page vector comes
    from ``cached_news_vector`` applied to the UTF-8 encoded ``"content"``.
    """
    topic_model = ldamodel.get_lda()
    vocabulary = ldamodel.get_dictionary()
    tweet_topics = topic_model[vocabulary.doc2bow(tweet['terms'])]
    page_topics = cached_news_vector(webpage["content"].encode("utf-8"))
    return matutils.cossim(page_topics, tweet_topics)
Exemple #11
def ComputerSimilarity(model, corpus, blocks):
	"""Cosine similarity between the word vectors of two text blocks.

	For each block the topic distribution ``model[bow]`` is expanded into a
	sparse word vector: every topic contributes its top ``T`` terms (``T`` is
	a module-level name), weighted by the topic's share of the block's total
	topic weight. The first two resulting vectors are compared.

	:param model: topic model supporting ``model[bow]`` and ``get_topic_terms``
	:param corpus: object providing ``dictionary`` and ``proc(block)``
	:param blocks: the text blocks; the code reads the first two results
	:returns: ``matutils.cossim`` of the two aggregated word vectors
	"""
	print(blocks)
	vectors = []
	for block in blocks:
		block_lda = model[corpus.dictionary.doc2bow(corpus.proc(block))]
		topics = np.asarray(block_lda)
		totalWeight = topics.sum(axis=0)[1]
		# Sparse word vector: normalised combination of all topics' terms.
		words = {}
		for row in topics:
			weight = row[1] / totalWeight
			topicID = int(row[0])
			for word_id, word_n in model.get_topic_terms(topicID, T):
				# Accumulate; the original's lost `else:` made the second
				# assignment overwrite the running sum.
				words[word_id] = words.get(word_id, 0) + word_n * weight
		# Restored: `vectors` was never filled, so vectors[0] always failed.
		vectors.append(list(words.items()))

	return matutils.cossim(vectors[0], vectors[1])
    def sim_matrix(self, _topics2cousines):
        """Return two pairwise similarity matrices for the given cuisines.

        :param _topics2cousines: sequence of ``(label, topic_vector)`` pairs;
            element ``[0]`` is used as the row label and ``[1]`` as the vector
        :returns: ``(euclidean_matrix, cosine_matrix)``; each is a list of
            ``[label, row]`` where ``row`` holds the similarity to every
            cuisine in input order
        """
        _cuisine_matrix_e = []
        _cuisine_matrix_c = []

        for label, doc_a in ((entry[0], entry[1]) for entry in _topics2cousines):
            sim_vecs_e = []
            sim_vecs_c = []

            for other in _topics2cousines:
                doc_b = other[1]

                w_sum_cs = matutils.cossim(doc_a, doc_b)
                # 1 - distance, folded to its absolute value.
                w_sum_ed = abs(1 - self.euclidean_distance(list(doc_a), list(doc_b)))

                # NOTE(review): restored -- in the excerpt both row lists were
                # never filled, so every row came back empty. Confirm that the
                # plain scores are what callers expect in each row.
                sim_vecs_e.append(w_sum_ed)
                sim_vecs_c.append(w_sum_cs)

            _cuisine_matrix_e.append([label, sim_vecs_e])
            _cuisine_matrix_c.append([label, sim_vecs_c])

        return _cuisine_matrix_e, _cuisine_matrix_c
def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gamma, num_docs):
    """Calculate the indirect cosine measure for every segmented pair.

    Given context vectors u = V(W') and w = V(W*) for the word sets of a pair
    S_i = (W', W*), the indirect cosine measure is the cosine similarity
    between u and w.

    :param topics: topics obtained from the trained topic model
    :param segmented_topics: output of the segmentation module; a list of
        lists of ``(W', W*)`` tuples
    :param per_topic_postings: output of the probability_estimation module; a
        dictionary of the posting lists of all topics
    :param measure: name of the direct confirmation measure; only ``"nlr"``
        (normalized log ratio) is supported
    :param gamma: gamma value used when computing the W', W* vectors
    :param num_docs: total number of documents in the corresponding corpus
    :returns: list of cosine similarities, one per segmented pair
    :raises ValueError: if ``measure`` is not supported
    """
    if measure == 'nlr':
        measure = direct_confirmation_measure.normalized_log_ratio_measure
    else:
        # Restored `else:` -- without it the error was raised unconditionally.
        raise ValueError("The direct confirmation measure you entered is not currently supported.")
    backtrack = {}
    s_cos_sim = []
    for top_words, s_i in zip(topics, segmented_topics):
        for w_prime, w_star in s_i:
            w_prime_context_vectors, backtrack_i = _make_seg(w_prime, top_words, per_topic_postings, measure, gamma, backtrack, num_docs)
            # Feed the cache handed back by _make_seg into later calls.
            backtrack.update(backtrack_i)
            w_star_context_vectors, backtrack_i = _make_seg(w_star, top_words, per_topic_postings, measure, gamma, backtrack, num_docs)
            backtrack.update(backtrack_i)
            s_cos_sim_i = cossim(w_prime_context_vectors.items(), w_star_context_vectors.items())
            # Restored: results were computed but never collected.
            s_cos_sim.append(s_cos_sim_i)

    return s_cos_sim
    def order_by_tf_id_rank(self, headline, sentences, number_of_sentences):
        """Return the sentences most similar to the headline as one string.

        ``sentences`` is split with ``self.tokenizer``; every sentence is
        scored by cosine similarity against the headline's TF-IDF vector and
        the best ``number_of_sentences`` are concatenated, each followed by a
        single space.

        :param headline: headline text
        :param sentences: body text to split and rank
        :param number_of_sentences: how many top sentences to keep; when the
            running count never equals this value every sentence is returned
        :returns: the selected sentences, highest score first
        """
        headline_bow = self.vocab.doc2bow(sent2stokens_wostop(headline))
        headline_tfidf = self.tfidf_model[headline_bow]

        # NOTE(review): each sentence is compared as a raw bag-of-words vector
        # (it is never passed through self.tfidf_model) -- kept as in the
        # original, confirm this is intended.
        scored_sentences = []
        for sentence in self.tokenizer.tokenize(sentences):
            sentence_bow = self.vocab.doc2bow(sent2stokens_wostop(sentence))
            scored_sentences.append([sentence, cossim(headline_tfidf, sentence_bow)])

        # Restored: the sorted() call had lost its iterable and closing
        # parenthesis. NOTE(review): descending order is assumed so that the
        # best-scoring sentences come first -- confirm.
        sorted_sentences = sorted(
            scored_sentences,
            key=lambda scored: scored[1],
            reverse=True,
        )

        sentences_string = ""
        current_sentence_number = 0
        for sentence in sorted_sentences:
            current_sentence_number += 1
            sentences_string += sentence[0] + ' '
            if current_sentence_number == number_of_sentences:
                # Restored: the `if` had lost its body, a syntax error.
                break
        return sentences_string
# Writes, for every transcript of every configured dataset, a TSV file with
# each sentence's LSA coherence to the preceding sentence(s).
# NOTE(review): several statements in this excerpt are cut off (marked
# below); the code is left untouched because the missing text cannot be
# recovered from what is visible.
def calculateCoherence(config):

    lsaModel = models.LsiModel.load(config['LSA']['modelFileLocation'])

    dictionary = corpora.Dictionary.load(
        config['corpus']['corpusFolderLocation'] + 'corpus.dict')

    # NOTE(review): cut off -- the second operand and closing parenthesis are missing.
    corpus = corpora.MmCorpus(config['corpus']['corpusFolderLocation'] +

    tfidf = models.TfidfModel(corpus)

    for dataset in config['corpus']['datasets']:

        for transcriptPath in os.listdir(dataset['path']):
            # NOTE(review): cut off -- remaining arguments are missing.
            document = loadFileIntoList(dataset['path'] + "/" + transcriptPath,

            with open(
                    config['coherence']['outputFolderLocation'] +
                    transcriptPath + "_results_lsa.tsv", 'w') as outputFile:

                # write header
                # NOTE(review): the call that wraps this header string is missing.
                    "coherence to previous sentence(s)\tpreprocessed sentence\tfull sentence\tcorresponding turn\n"

                lastSentencesLSAList = []
                for sentence in document:
                    # Skip sentences whose preprocessed tokens join to an empty string.
                    if " ".join(sentence[0]):

                        sentenceBow = dictionary.doc2bow(sentence[0])

                        # Weighted mean of similarities to earlier sentences:
                        # the i-th entry of the list gets weight 1/i.
                        weightIndex = 1
                        simLSA = 0
                        weightNormalizer = 0
                        for el in lastSentencesLSAList:
                            simLSA += 1 / weightIndex * matutils.cossim(
                                lsaModel[tfidf[sentenceBow]], el)
                            weightNormalizer += 1 / weightIndex
                            weightIndex += 1

                        if weightNormalizer > 0:
                            simLSA /= weightNormalizer

                        # NOTE(review): the start of this statement is missing;
                        # only the tail of a call taking (0, vector) survives.
                            0, lsaModel[tfidf[sentenceBow]])

                        # Trim the list once it exceeds the configured sliding window.
                        if len(lastSentencesLSAList
                               ) > config['coherence']['slidingWindow']:
                            del lastSentencesLSAList[-1]

                        outputFile.write(str(simLSA) + "\t")
                        outputFile.write(" ".join(sentence[0]) + "\t")
                        outputFile.write(" ".join(sentence[1]) + "\t")
                        outputFile.write(sentence[2] + "\n")
Exemple #18
    # Compares the bag-of-words of `content` with every entry of self.data
    # and records time-decayed similarities in the entries' 'sim_list'.
    # NOTE(review): this excerpt is damaged -- the docstring quotes, the body
    # of `if tid == topic_id:` and the tails of both insert(...) calls are
    # missing. Code left untouched.
    def _update_pairwise_similarity(self, topic_id, content, date):
        updates similarity data within the corpus
        bow = self.dictionary.doc2bow(content)
        for tid, data in self.data.items():
            # Age difference in days between the new item and the stored one.
            day_delta = (int(date) - int(data['date'])) / NUM_SECONDS_PER_DAY
            time_factor = math.pow(self.time_decay, day_delta)
            if tid == topic_id:
            bow1 = self.dictionary.doc2bow(data['body'])
            sim = matutils.cossim(bow, bow1)
            # Two directional scores: each is the raw similarity scaled by a
            # factor capped at 1.0 (1/time_factor and time_factor respectively).
            sim_1 = sim * min(1.0, 1 / time_factor)
            sim_2 = sim * min(1.0, time_factor)

            if self.irrelevant_thresh <= sim_1 <= self.duplicate_thresh:
                del_id = insert(data['sim_list'], topic_id, sim_1,
                if del_id is not None:
                    self.data[tid]['updated'] = True
                    if del_id != '':
                        remove(self.data[del_id]['appears_in'], tid)

            if self.irrelevant_thresh <= sim_2 <= self.duplicate_thresh:
                del_id = insert(self.data[topic_id]['sim_list'], tid, sim_2,
                if del_id is not None:
                    if del_id != '':
                        remove(self.data[del_id]['appears_in'], topic_id)
Exemple #19
def calculate_similarity(vec_sentence1, vec_sentence2, network_type):
    """Similarity of two sentence vectors, chosen by representation type.

    ``'tfidf'`` vectors are compared with ``matutils.cossim``; the embedding
    types ``'d2v'``, ``'gd2v'``, ``'fastT'``, ``'gloVe'`` and ``'s2v'`` use
    ``1 - spatial.distance.cosine``. Any other type yields ``None``.
    """
    if network_type == 'tfidf':
        return matutils.cossim(vec_sentence1, vec_sentence2)

    embedding_types = ('d2v', 'gd2v', 'fastT', 'gloVe', 's2v')
    if network_type not in embedding_types:
        return None
    cosine_distance = spatial.distance.cosine(vec_sentence1, vec_sentence2)
    return 1 - cosine_distance
    def mapper(self, _, line):
        """Yield similarity and distance for restaurant pairs of two users.

        ``line`` is a tab-separated record whose first two fields are a user
        and a similar user. Every restaurant row of the user in ``self.df``
        is paired with every row of the similar user whose restaurant the
        user has not got already; each pair yields
        ``None, (user, sim_user, rest, sim_rest, sim_score, dist)``.
        """
        fields = next(csv.reader([line], delimiter="\t"))
        user, sim_user = fields[0], fields[1]

        frame = self.df
        own_rows = frame[frame["user"] == user]
        own_restaurants = list(own_rows["rest"].values)
        candidate_rows = frame[(frame["user"] == sim_user)
                               & (~frame["rest"].isin(own_restaurants))]

        for own_pos in range(len(own_rows)):
            own = own_rows.iloc[own_pos]
            rest = own["rest"]
            own_point = (float(own["la"]), float(own["lon"]))
            # The 'vec' column stores a Python literal as text.
            lsi1 = self.lsi[ast.literal_eval(own["vec"])]
            for cand_pos in range(len(candidate_rows)):
                cand = candidate_rows.iloc[cand_pos]
                sim_rest = cand["rest"]
                cand_point = (float(cand["la"]), float(cand["lon"]))
                lsi2 = self.lsi[ast.literal_eval(cand["vec"])]
                sim_score = cossim(lsi1, lsi2)
                dist = haversine_distance(own_point, cand_point)
                yield None, (user, sim_user, rest, sim_rest, sim_score, dist)
Exemple #22
# Looks up the text of two documents in `doc_data` by id, lemmatizes both,
# projects them through `model` and returns their cosine similarity.
# NOTE(review): this excerpt is damaged -- the end of the parameter list, the
# docstring quotes and the tails of both lemmatize_an_idea(...) calls are
# missing. Code left untouched.
def calculate_lsi_requested_sim_in_df(model, dictionary, doc1ID, doc2ID,
                                      doc_data, IDfield, contentField,
    Expects a dataframe that actually holds the text associated with each doc (which
    is just passed in as strings that denote the id)
    Built in some flexibility for different field naming conventions by making
    the IDfield and contentField variables
    # sim_output = []
    # for pair in pairs:
    #     low_id = pair[0]
    #     high_id = pair[1]
    #     idea1 = lemmatize_an_idea(all_ideas[all_ideas.ideaID == low_id].idea.values[0])
    #     idea2 = lemmatize_an_idea(all_ideas[all_ideas.ideaID == high_id].idea.values[0])
    #     vec1 = model[dictionary.doc2bow(idea1)]
    #     vec2 = model[dictionary.doc2bow(idea2)]
    #     sim_output.append((low_id, high_id,cossim(vec1,vec2)))

    # First matching row's content for each id.
    text1 = lemmatize_an_idea(
        doc_data[doc_data[IDfield] == doc1ID][contentField].values[0],
    text2 = lemmatize_an_idea(
        doc_data[doc_data[IDfield] == doc2ID][contentField].values[0],
    vec1 = model[dictionary.doc2bow(text1)]
    vec2 = model[dictionary.doc2bow(text2)]

    return cossim(vec1, vec2)
# For each topic of each model, finds the best-matching topic (highest
# cossim of show_topic outputs) in every other model and stores it in
# topic_cos_map under the key "i:u_j:v".
# NOTE(review): the bodies of `if month < 9:`, `if month != 9:` and
# `if i == j:` are missing from this excerpt, and no return statement is
# visible. Code left untouched.
def cossim_pairs(topic_models, num_topics=20):
    topic_cos_map = {}

    # NOTE(review): this overwrites the `num_topics` argument with 20.
    num_topics = 20
    for i, m in enumerate(topic_models):
        # `temp_dates` is not defined in this block; dates are split on "-"
        # and the second field is read as the month.
        cur_date = temp_dates[i]
        month = int(cur_date.split("-")[1])
        if month < 9:
        if month != 9:
        for u in range(num_topics):
            for j, n in enumerate(topic_models):
                if i == j:
                top_cs = -1
                top_topic = ""
                for v in range(num_topics):
                    cs = cossim(m.show_topic(u), n.show_topic(v))
                    if cs > top_cs:
                        top_cs = cs
                        top_topic = "{}:{}_{}:{}".format(i, u, j, v)

                topic_cos_map[top_topic] = top_cs
 def tfidf_sim(self, train_data, body_dict, threshold):
     """Evaluate headline/body TF-IDF cosine similarity against a threshold.

     A dictionary and TF-IDF model are built from the tokenized bodies; each
     training headline is compared with its own body and the outcome is
     accumulated via ``create_lists`` and reported via ``print_results``.

     :param train_data: a list of training samples of type
         ``['headline', 'bodyID', 'stance']``
     :param body_dict: a dictionary of values containing ``{bodyID: 'bodyText'}``
     :param threshold: used to distinguish between similar and not similar
     """
     bodyText_list = list(body_dict.values())
     # Position of each body id, matching the order of bodyText_list.
     bodyIds_index = {k: index for index, k in enumerate(body_dict.keys())}
     bodyText_w = [sent2stokens_wostop(text) for text in bodyText_list]
     vocab = corpora.Dictionary(bodyText_w)
     corporaBody_bow = [vocab.doc2bow(text) for text in bodyText_w]
     tfidf_model = models.TfidfModel(corporaBody_bow)
     unrelated, related, y_true, y_pred = [], [], [], []
     for headline, bodyID, stance in train_data:
         headline_bow = vocab.doc2bow(sent2stokens_wostop(headline))
         headlines_tfidf = tfidf_model[headline_bow]
         corporaBody_tfidf = tfidf_model[corporaBody_bow[bodyIds_index[bodyID]]]
         sim = cossim(headlines_tfidf, corporaBody_tfidf)
         unrelated, related, y_true, y_pred = create_lists(sim, stance, threshold, [unrelated, related, y_true, y_pred])
     print_results([unrelated, related, y_true, y_pred], self.model_type)
Exemple #25
    # Predicts a rating for `item_id` as the similarity-weighted average of
    # the user's ratings, with similarity taken between corpus vectors of
    # the target movie and each rated movie.
    # NOTE(review): the MovieDescriptions.objects.filter( call below is cut
    # off in this excerpt. Code left untouched.
    def predict_score(self, user_id, item_id):

        ratings = Rating.objects.filter(user_id=user_id)
        # movie_id -> rating for everything this user rated
        rated_movies = {r['movie_id']: r['rating'] for r in ratings.values()}

        md = MovieDescriptions.objects.filter(imdb_id=item_id).first()
        rated_movies_desc = MovieDescriptions.objects.filter(

        if md is None:
            return 0

        if rated_movies_desc is None:
            return 0
        if rated_movies_desc.count() == 0:
            return 0

        top = 0.0
        bottom = 0.0

        for rm in rated_movies_desc:
            # lda_vector fields are used as integer indexes into self.corpus
            lda_vector = self.corpus[int(md.lda_vector)]
            lda_vector_sim = self.corpus[int(rm.lda_vector)]
            sim = matutils.cossim(lda_vector, lda_vector_sim)
            rating = rated_movies[rm.imdb_id]

            top += sim * float(rating)
            bottom += sim

        # NOTE(review): raises ZeroDivisionError if every similarity is 0.
        return top / bottom
Exemple #26
 # NOTE(review): this excerpt is damaged -- the four `is None` checks have no
 # bodies, `if cnt == 1:` appears to have lost lines, `topics` is never
 # filled in the visible code, and the final two assignments look like a lost
 # if/else pair. Code left untouched.
 def assign_article_topics(self, article_id, heading, process_all = False):
     """ Assign the appropriate topics to the given article in the database """
     if self._dictionary is None:
     if self._tfidf is None:
     if self._model is None:
     if self._topics is None:
     with SessionContext(commit = True) as session:
         # Stems, categories and counts of the article's words
         q = session.query(Word.stem, Word.cat, Word.cnt) \
             .filter(Word.article_id == article_id).all()
         wlist = []
         for stem, cat, cnt in q:
             # Convert stem to lowercase and replace spaces with underscores
             w = w_from_stem(stem, cat)
             if cnt == 1:
                 wlist.extend([w] * cnt)
         topics = []
         article_vector = []
         if self._topics and wlist:
             # bag-of-words -> TF-IDF -> model space
             bag = self._dictionary.doc2bow(wlist)
             tfidf = self._tfidf[bag]
             article_vector = self._model[tfidf]
             topic_names = []
             if self._verbose:
                 print("{0} : {1}".format(article_id, heading))
             for topic_id, topic_info in self._topics.items():
                 topic_name = topic_info["name"]
                 topic_vector = topic_info["vector"]
                 topic_threshold = topic_info["threshold"]
                 # Calculate the cosine similarity between the article and the topic
                 similarity = matutils.cossim(article_vector, topic_vector)
                 if self._verbose:
                     print("   Similarity to topic {0} is {1:.3f}".format(topic_name, similarity))
                 if similarity >= topic_threshold:
                     # Similar enough: this is a topic of the article
                     topic_names.append((topic_name, similarity))
             if topic_names and not process_all:
                 print("Article '{0}':\n   topics {1}".format(heading, topic_names))
         # Topics found (if any): delete previous ones (if any)
         session.execute(ArticleTopic.table().delete().where(ArticleTopic.article_id == article_id))
         # ...and add the new ones
         for topic_id in topics:
             session.add(ArticleTopic(article_id = article_id, topic_id = topic_id))
         # Update the indexed timestamp and the article topic vector
         a = session.query(Article).filter(Article.id == article_id).one_or_none()
         if a is not None:
             a.indexed = datetime.utcnow()
             if article_vector:
                 # Store a pure list of floats
                 topic_vector = [ t[1] for t in article_vector ]
                 a.topic_vector = json.dumps(topic_vector)
                 a.topic_vector = None
def getComparable(source_lsi_doc, target_lsi_corpus):
	"""Return ``(index, similarity)`` of the corpus entry closest to the source.

	Every entry of ``target_lsi_corpus`` is compared with ``source_lsi_doc``
	using ``matutils.cossim``; the pair with the highest similarity is
	returned, the earliest entry winning ties. An empty corpus raises
	``IndexError``.
	"""
	scores = [
		matutils.cossim(source_lsi_doc, target_lsi_corpus[position])
		for position in range(len(target_lsi_corpus))
	]
	ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
	return ranked[0]
# Returns -2 when either title is empty/falsy.
# NOTE(review): the rest of the function is cut off -- the cossim( call has
# no arguments and, as indented here, sits after the `return -2`.
def title_similarity(t1, t2):
    if not t1 or not t2:
        return -2
        return cossim(
# Returns -2 when either description is empty/falsy.
# NOTE(review): the rest of the function is cut off -- the cossim( call has
# no arguments and, as indented here, sits after the `return -2`.
def description_similarity(d1, d2):
    if not d1 or not d2:
        return -2
        return cossim(
Exemple #30
def getComparable(source_lsi_doc, target_lsi_corpus):
    """Return ``(index, similarity)`` of the corpus entry closest to the source.

    Every entry of ``target_lsi_corpus`` is compared with ``source_lsi_doc``
    using ``matutils.cossim``; the best-scoring ``(index, similarity)`` pair
    is returned, the earliest entry winning ties.

    :raises IndexError: if ``target_lsi_corpus`` is empty
    """
    sims = [
        matutils.cossim(source_lsi_doc, target_lsi_corpus[i])
        for i in range(len(target_lsi_corpus))
    ]
    sortedSims = sorted(enumerate(sims), key=lambda item: -item[1])
    # Restored: this copy computed the ranking but returned nothing.
    return sortedSims[0]
    def compute_similarity(self, doc1, doc2):
        """Compute the cosine similarity between two documents.

        :doc1: a list of strings, representing the first document.
        :doc2: a list of strings, representing the second document.
        :returns: a number between -1 and 1, representing the similarity
        between the two documents.
        """
        # The docstring above had lost its closing quotes, which swallowed
        # the return statement into the string.
        return cossim(self.get_vector(doc1), self.get_vector(doc2))
Exemple #33
 # NOTE(review): this excerpt is damaged -- the four `is None` checks have no
 # bodies, `if cnt == 1:` appears to have lost lines, `topics` is never
 # filled in the visible code, the delete and add calls are reduced to their
 # argument tails, and the body of `if a:` is missing. Code left untouched.
 def assign_article_topics(self, article_id, heading):
     """ Assign the appropriate topics to the given article in the database """
     if self._dictionary is None:
     if self._tfidf is None:
     if self._model is None:
     if self._topics is None:
     with SessionContext(commit=True) as session:
         # Stems, categories and counts of the article's words
         q = session.query(Word.stem, Word.cat, Word.cnt) \
             .filter(Word.article_id == article_id).all()
         wlist = []
         for stem, cat, cnt in q:
             # Convert stem to lowercase and replace spaces with underscores
             w = stem.lower().replace(" ", "_") + "/" + cat
             if cnt == 1:
                 wlist.extend([w] * cnt)
         topics = []
         if self._topics and wlist:
             # bag-of-words -> TF-IDF -> model space
             bag = self._dictionary.doc2bow(wlist)
             tfidf = self._tfidf[bag]
             article_vector = self._model[tfidf]
             topic_names = []
             if self._verbose:
                 print("{0} : {1}".format(article_id, heading))
             for topic_id, topic_info in self._topics.items():
                 topic_name = topic_info["name"]
                 topic_vector = topic_info["vector"]
                 topic_threshold = topic_info["threshold"]
                 # Calculate the cosine similarity between the article and the topic
                 similarity = matutils.cossim(article_vector, topic_vector)
                 if self._verbose:
                     print("   Similarity to topic {0} is {1:.3f}".format(
                         topic_name, similarity))
                 if similarity >= topic_threshold:
                     # Similar enough: this is a topic of the article
                     topic_names.append((topic_name, similarity))
             if topic_names:
                 print("Article '{0}': topics {1}".format(
                     heading, topic_names))
         # Topics found (if any): delete previous ones (if any)
             ArticleTopic.article_id == article_id))
         # ...and add the new ones
         for topic_id in topics:
                 ArticleTopic(article_id=article_id, topic_id=topic_id))
         # Update the indexed timestamp
         a = session.query(Article).filter(
             Article.id == article_id).one_or_none()
         if a:
def getMaxSimilarity(dictTopic, vector):
    """Find the cluster whose members are, on average, most similar to ``vector``.

    For every ``key -> cluster`` entry of ``dictTopic`` the mean
    ``matutils.cossim`` between ``vector`` and the cluster's members is
    computed. Returns ``(key, mean_similarity)`` of the best cluster, or
    ``(-1, 0)`` when no cluster scores above 0.
    """
    best_key, best_score = -1, 0
    for key, members in dictTopic.items():
        similarities = [matutils.cossim(vector, member) for member in members]
        mean_similarity = np.mean(similarities)
        if not mean_similarity > best_score:
            continue
        best_key, best_score = key, mean_similarity
    return best_key, best_score
Exemple #35
# Lower-cases and whitespace-splits both strings, converts them to
# bag-of-words with `unidict` and projects them through `unilsi`.
# NOTE(review): the excerpt ends here -- neither LSI vector is used and
# nothing is returned in the visible code.
def wordsim(left, right):
    leftvec = unidict.doc2bow(left.lower().split())
    rightvec = unidict.doc2bow(right.lower().split())
    leftlsi = unilsi[leftvec]
    rightlsi = unilsi[rightvec]
    #leftlda = unilda[leftvec] # matutils.sparse2full(..., 300)
    #rightlda = unilda[rightvec]

def vsm_dist(song_A, song_B):
	"""TF-IDF cosine similarity between two songs' tokenized comments.

	A ``TfidfModel`` is built from the module-level ``a_corps`` and applied
	to each song's ``'tokenized_comments'``. A similarity of exactly 0 is
	replaced by ``0.0000001`` so callers can divide by the result.
	"""
	tfidf_model = models.TfidfModel(a_corps)
	weighted_a = tfidf_model[song_A['tokenized_comments']]
	weighted_b = tfidf_model[song_B['tokenized_comments']]
	similarity = matutils.cossim(weighted_a, weighted_b)
	# avoid a later division by zero
	return 0.0000001 if similarity == 0 else similarity
Exemple #37
    # Tracks, for three TF-IDF views of `article` (title+content, content,
    # title), the cluster index with the highest mean similarity.
    # NOTE(review): each np.mean([ ... ]) expression is cut off (the element
    # expression and closing brackets are missing) and no return statement
    # is visible. Code left untouched.
    def get_max_similarity(self, article):
        title_content_word_tfidfs = article.title_content_effective_word_tfidfs
        title_content_max_sim = 0
        title_content_max_sim_cluster_id = -1

        content_word_tfidfs = article.content_effective_word_tfidfs
        content_max_sim = 0
        content_max_sim_cluster_id = -1

        title_word_tfidfs = article.title_effective_word_tfidfs
        title_max_sim = 0
        title_max_sim_cluster_id = -1

        for i in np.arange(len(self.clusters)):
            cluster = self.clusters[i]
            # title_content
            title_content_similarity = np.mean([
                for article in cluster.articles
            if title_content_similarity > title_content_max_sim:
                title_content_max_sim = title_content_similarity
                title_content_max_sim_cluster_id = i

            # content
            content_similarity = np.mean([
                for article in cluster.articles
            if content_similarity > content_max_sim:
                content_max_sim = content_similarity
                content_max_sim_cluster_id = i

            # title
            title_similarity = np.mean([
                for article in cluster.articles
            if title_similarity > title_max_sim:
                title_max_sim = title_similarity
                title_max_sim_cluster_id = i


 def _get_doc_similarity(self, doc1_tk, doc2_tk):
     """Return the cosine similarity of two documents' topic distributions.

     :param doc1_tk: Preprocessed documents as tokens
     :param doc2_tk: Preprocessed documents as tokens
     :returns: ``matutils.cossim`` of the two ``get_topic_distrb`` results
     """
     dis1 = self.get_topic_distrb(doc1_tk)
     dis2 = self.get_topic_distrb(doc2_tk)
     # return 1 - matutils.hellinger(dis1, dis2)
     # Restored return, matching get_link_scores, which performs the same
     # computation; the excerpt had lost it and returned None.
     return matutils.cossim(dis1, dis2)
    # Preprocesses both texts, converts them to bag-of-words and projects
    # them through the LSI model.
    # NOTE(review): the excerpt ends here -- the two projected vectors are
    # never compared and nothing is returned in the visible code.
    def similarity_lsi(self, text1, text2):
        # convert text into bag of words model
        text1_bow = self.__dictionary.doc2bow(self.__preprocess_text_document(text1))
        text2_bow = self.__dictionary.doc2bow(self.__preprocess_text_document(text2))

        # transform text into the model's domain
        text1_model = self.__model_lsi[text1_bow]
        text2_model = self.__model_lsi[text2_bow]

 # Takes the first element of `value`, reads two token lists from it and
 # computes the cosine similarity of their bag-of-words vectors.
 # NOTE(review): the excerpt ends after join_key is built -- nothing is
 # yielded or returned in the visible code.
 def reducer(self, key, value):
     lst_of_rvws = list(value)[0]
     rvws1 = lst_of_rvws[0]
     rvws2 = lst_of_rvws[1]
     # print(rvws1)
     sim = cossim(self.dct.doc2bow(rvws1), self.dct.doc2bow(rvws2))
     # similarity.append((biz[i], biz[j], sim))
     # print('reducer')
     # Tab-joined pair of the two key components
     join_key = str(key[0]) + '\t' + str(key[1])
def calculate_lsi_requested_sim(model, dictionary, text1, text2, stoplist):
    """Return the similarity of two strings in the provided gensim model space.

    Base version that just takes in two strings: both are lemmatized with
    ``lemmatize_an_idea`` (using ``stoplist``), converted to bag-of-words with
    ``dictionary``, projected through ``model`` and compared with ``cossim``.

    :param model: gensim model supporting ``model[bow]``
    :param dictionary: dictionary providing ``doc2bow``
    :param text1: first text
    :param text2: second text
    :param stoplist: passed through to ``lemmatize_an_idea``
    :returns: cosine similarity of the two projected vectors
    """
    text1_lemm = lemmatize_an_idea(text1, stoplist)
    text2_lemm = lemmatize_an_idea(text2, stoplist)
    vec1 = model[dictionary.doc2bow(text1_lemm)]
    vec2 = model[dictionary.doc2bow(text2_lemm)]

    return cossim(vec1, vec2)
def get_abstract_similarity(row):
    """LSI cosine similarity of a row's two abstracts, or ``None``.

    ``row['abstract_lr']`` and ``row['abstract_cp']`` are preprocessed,
    converted to bag-of-words with ``abstract_dict`` and projected through
    ``abstract_lsi``. If either abstract is null the function returns
    ``None``.
    """
    if not pd.notnull(row['abstract_lr']) or not pd.notnull(row['abstract_cp']):
        return None

    columns = ('abstract_lr', 'abstract_cp')
    docs = [preprocess_doc(row[column]) for column in columns]
    bows = [abstract_dict.doc2bow(doc) for doc in docs]
    lr_lsi, cp_lsi = [abstract_lsi[bow] for bow in bows]
    return matutils.cossim(lr_lsi, cp_lsi)
def calculate_best_score(candidate_id, reference_ids):
    """Return the highest cosine similarity between a candidate and its references.

    :param candidate_id: id resolved to an LSI vector through ``id2lsi_vec``
    :param reference_ids: iterable of ids to compare the candidate against
    :return: maximum of the ``matutils.cossim`` scores
    :raises ValueError: if ``reference_ids`` is empty (``max`` of an empty list)
    """
    # translate candidate to LSI vector
    vec_lsi = id2lsi_vec(candidate_id)

    # determine similarities for each of the references
    scores = [
        matutils.cossim(vec_lsi, id2lsi_vec(reference_id))
        for reference_id in reference_ids
    ]

    return max(scores)
 def getMaxSimilarity(self, dictTopic, vector):
     """Find the cluster whose members are, on average, most similar to ``vector``.

     Computes the text similarity between the incoming document and the
     existing documents; cosine similarity is used here. A cluster only wins
     when its mean similarity is strictly greater than the best seen so far
     (starting from 0).

     NOTE(review): the visible code tracks the best key and score but does not
     return them; the snippet looks truncated.
     """
     best_score = 0
     best_key = -1
     for cluster_key, members in dictTopic.items():
         mean_similarity = np.mean(
             [matutils.cossim(vector, member) for member in members])
         if mean_similarity > best_score:
             best_score, best_key = mean_similarity, cluster_key
    def compare_strings(self, s1, s2):
        """Return the cosine similarity of two strings in LSI space.

        Each string is lower-cased, split on whitespace, converted to a
        bag-of-words vector with ``self.dictionary`` and projected with
        ``self.lsi``.

        :param s1: first string
        :param s2: second string
        :return: result of ``matutils.cossim`` on the two LSI vectors
        """
        # Build vector for doc1
        vec_bow1 = self.dictionary.doc2bow(s1.lower().split())
        vec_lsi1 = self.lsi[vec_bow1]  # convert the query to LSI space
        # Build vector for doc2
        vec_bow2 = self.dictionary.doc2bow(s2.lower().split())
        vec_lsi2 = self.lsi[vec_bow2]

        # Calculate cosine similarity and hand it back to the caller
        # (the score was previously computed and then dropped).
        sim = matutils.cossim(vec_lsi1, vec_lsi2)
        return sim
 def get_link_scores(self, source, target):
     """Score a source/target pair by the cosine similarity of their topic distributions.

     :param source: mapping whose ``'tokens'`` entry is a whitespace-separated token string
     :param target: mapping whose ``'tokens'`` entry is a whitespace-separated token string
     :return: result of ``matutils.cossim`` on the two topic distributions
     """
     doc1_tk = source['tokens'].split()
     doc2_tk = target['tokens'].split()
     dis1 = self.get_topic_distrb(doc1_tk)
     dis2 = self.get_topic_distrb(doc2_tk)
     # Cosine similarity is used; a Hellinger-based score
     # (1 - matutils.hellinger(dis1, dis2)) was the earlier alternative.
     return matutils.cossim(dis1, dis2)
def loadDocuments():
	# Walk the loaded documents sentence by sentence and compare every sentence
	# with the previous one in LSI and LDA space.
	# NOTE(review): this snippet appears truncated -- `info` is never bound in the
	# visible code, `document` is never added to `documents`, the similarities
	# are not stored anywhere, and nothing is returned.
	documents = []

		# NOTE(review): the deeper indentation suggests an enclosing loop header
		# (presumably the one binding `info`) is missing above this line.
		for transcriptPath in os.listdir(info[0]):
			document = loadFileIntoList(info[0] + "/" + transcriptPath, info[1], info[2], info[3])
	for doc in documents:
		# The first sentence of each document is compared against an empty vector.
		last_sentence_lsi = []
		last_sentence_lda = []
		for sentence in doc:
			sentence_bow = dictionary.doc2bow(sentence.split(" "))
			# LSI is applied on top of the TF-IDF weighting; LDA on the raw bag of words.
			sim_lsi = matutils.cossim(lsi[tfidf[sentence_bow]], last_sentence_lsi)
			sim_lda = matutils.cossim(lda[sentence_bow], last_sentence_lda)
			# NOTE(review): only the LSI "previous sentence" is refreshed in the
			# visible code; last_sentence_lda stays empty.
			last_sentence_lsi = lsi[tfidf[sentence_bow]]
def ComputerSimilarityOld(model, corpus, blocks):
	# Count, per block, how many words fall into each dominant topic and return
	# the cosine similarity of the first two entries of `vectors`.
	# NOTE(review): this snippet appears truncated -- `vectors` is never appended
	# to in the visible code, so `vectors[0]` would fail on the empty list.
	# `X` and `WordProbThreshold` are not defined in this function; presumably
	# they are module-level settings -- confirm.
	#print blocks;
	vectors = [];
	##There are 2 blocks, the one before the sentence and the one after

	for block in blocks:
		words = corpus.proc(block)
		topic_distribution = {};
		words_culled = [];
		for word in words:
			# Topic mixture of the single word; pick its most probable topic.
			block_lda = model[corpus.dictionary.doc2bow(corpus.proc(word))];
			topic_id = 0;
			prob = 0;
			for t in block_lda:
				if t[1] > prob:
					topic_id = t[0];
					prob = t[1]

			# NOTE(review): this `if` has no body in the visible code (syntax
			# error); presumably low-probability words were skipped here.
			if (prob < .1):

			# Probability of this word inside the chosen topic (0 when not found
			# or when WordProbThreshold is falsy).
			ppp = 0;
			word_id = corpus.dictionary.token2id[word];
			if WordProbThreshold:
				terms = model.get_topic_terms(topic_id,100000);
				for term in terms:
					if term[0] == word_id:
						ppp = term[1];

			# NOTE(review): the word is only reported as "CUT"; nothing in the
			# visible code actually skips it afterwards.
			if (ppp < X and WordProbThreshold):
				print("CUT  " + word + "    " + str(ppp));

			print(word + "    " + str(ppp));
			# NOTE(review): dict.has_key exists only in Python 2. Both assignments
			# below run in sequence, so the count is always reset to 1 -- an
			# `else:` before the second one appears to be missing.
			if topic_distribution.has_key(topic_id):
				topic_distribution[topic_id] = topic_distribution[topic_id]+1;
				topic_distribution[topic_id] = 1;
			#print((topic_id, prob));


	# Despite its name, this holds the cosine similarity returned by cossim.
	dot_product = matutils.cossim(vectors[0],vectors[1])
	return dot_product;
    def score(query, profile, data=None):
        # Score a profile description against a query using a fresh VectorSpace.
        # Returns [-1] when the profile has an empty description.
        # `data` is not used in the visible code.
        # NOTE(review): this snippet appears truncated -- both
        # vector_for_document(...) calls below are cut off before their
        # arguments, and no score is computed or returned in the visible code.
        if not len(profile.description):
            return [-1]

        vectorspace = VectorSpace([])

        # Both texts go through the same tokenizer.
        tokenized_query = LowerTokenizer.tokenize(query)
        tokenized_description = LowerTokenizer.tokenize(profile.description)

        query_vector = vectorspace.vector_for_document(

        description_vector = vectorspace.vector_for_document(

    def test_get_similarities(self, mock_q_simmx_file_path, mock_a_simmx_file_path, mock_dict_file_path, mock_md_file_path):
        # Check the similarities returned by TfIdfModelStruct.get_similarities
        # against values computed directly with cossim.
        # The four mock_* arguments are pointed at the test fixture paths held on
        # the test case (presumably injected by patch decorators not shown here).
        mock_md_file_path.return_value = self.test_md_file_path
        mock_dict_file_path.return_value = self.test_dict_file_path
        mock_q_simmx_file_path.return_value = self.test_q_simmx_file_path
        mock_a_simmx_file_path.return_value = self.test_a_simmx_file_path

        model_struct = TfIdfModelStruct.get_model(data_store=self.data_store)

        query_doc = "Is brocolli tasty to eat?"
        compare_docs = self.data_store.doc_set

        sims = model_struct.get_similarities(query_doc, compare_docs)

        # Element 1 of each result entry is compared with the expected similarity.
        # NOTE(review): the cossim(...) call below is cut off in this snippet, so
        # the expected value's arguments are not visible.
        for idx, sim in enumerate(sims):
            expected_sim = cossim(
            self.assertAlmostEqual(sim[1], expected_sim)
Exemple #51
def tfidf_distance(corpora, tfidf, tfidf_web, mean_vec, web_2, loss_weight):
    """compute the distance (as a function of cosine similarity) between two websites using tfidf model

    :param corpora: collection indexed by document number, yielding a bag-of-words vector
    :param tfidf: tf-idf model, indexed with a bag-of-words vector
    :param tfidf_web: mapping of document number -> website
    :param mean_vec: reference vector the website is compared against
    :param web_2: website to look up among the values of ``tfidf_web``
    :param loss_weight: value returned when ``mean_vec`` is empty or ``web_2`` is unknown
    """
    if len(mean_vec) == 0:
        return loss_weight

    # Materialise the views as lists: Python 3 dict views support neither
    # ``.index`` nor integer indexing (this also works unchanged on Python 2).
    try:
        indx_2 = list(tfidf_web.values()).index(web_2)
    except ValueError:
        return loss_weight

    doc_num_2 = list(tfidf_web.keys())[indx_2]

    bow_2 = corpora[doc_num_2]

    tf_rap_2 = matutils.unitvec(tfidf[bow_2])       # get its tfidf representation

    # Clamp at 1.0 so the similarity never exceeds its upper bound.
    cosine_sim = min(matutils.cossim(mean_vec, tf_rap_2), 1.0)
    # NOTE(review): the visible snippet ends here without turning cosine_sim
    # into the returned distance; the original formula is not shown, so it is
    # deliberately not guessed -- restore the final return from the source.

    def value_for_text(self, t, rp=default_rp):
        # For every sentence after the first, compare the sentence's LSA vector
        # with its projection onto the space built from the preceding sentences,
        # storing one cosine value per sentence in `spans`.
        # Returns 0 when the text has fewer than two sentences.
        # NOTE(review): this snippet appears truncated -- the projection-matrix
        # expression and the final cossim(...) call are cut off, and no result is
        # returned in the visible code.
        space = rp.lsa_space()
        num_topics = space.num_topics

        tokens = rp.tokens(t)
        tokens = [[token.lower() for token in sentence] for sentence in tokens]

        if len(tokens) < 2:
            return 0

        # One entry per sentence except the first (which has no predecessors).
        spans = np.zeros(len(tokens) - 1)
        for i in range(1, len(tokens)):
            # Slice copy: edits to past_sentences below do not touch `tokens`.
            past_sentences = tokens[:i]
            span_dim = len(past_sentences)

            if span_dim > num_topics - 1:
                # It's not clear, from the papers I read, what should be done
                # in this case. I did what seemed to not imply in losing
                # information.
                # The earliest sentences are flattened into a single token list
                # that replaces the first entry.
                beginning = past_sentences[0:span_dim - num_topics]
                past_sentences[0] = list(chain.from_iterable(beginning))

            past_vectors = [sparse2full(space.get_vector(sent), num_topics)
                            for sent in past_sentences]

            curr_vector = sparse2full(space.get_vector(tokens[i]), num_topics)
            curr_array = np.array(curr_vector).reshape(num_topics, 1)

            # Columns of A are the vectors of the preceding sentences.
            A = np.array(past_vectors).transpose()

            projection_matrix = dot(dot(A,

            projection = dot(projection_matrix, curr_array).ravel()

            spans[i - 1] = cossim(full2sparse(curr_vector),

def cosine_by_tfidf(list1, list2, feature):
    """Return the cosine similarity of two item lists weighted by tf-idf.

    Term frequency is an item's count divided by the length of its list;
    inverse document frequency is ``log(movie_count / feature_count)``, with
    the per-item counts read from ``reference_dict[feature]``.

    :param list1: first list of items
    :param list2: second list of items
    :param feature: key into ``reference_dict`` selecting the count table
    :return: ``0.0`` when the lists share no item, otherwise the result of
        ``matutils.cossim`` on the two tf-idf dictionaries
    """
    # Lists without a common item cannot be similar; skip the lookups.
    if not (set(list1) & set(list2)):
        return 0.0

    feature_count = {
        item: reference_dict[feature][item]
        for item in set(list1) | set(list2)
    }

    def _tfidf_weights(items):
        # float() keeps both divisions exact even under Python 2 integer division
        total = float(len(items))
        return {
            item: (count / total) * math.log(float(movie_count) / feature_count[item])
            for item, count in Counter(items).items()
        }

    # The similarity was previously computed and then dropped; return it.
    return matutils.cossim(_tfidf_weights(list1), _tfidf_weights(list2))
    def query(self, query, sorted=False):
        """Given a search query, returns a list of document IDs and their
        cosine similarity to that query.

        If sorted is True, results are returned sorted in descending order of
        cosine similarity. Otherwise, results are returned ordered by document
        index.

        NOTE(review): this snippet appears truncated -- the first
        vector_for_document(...) call is cut off after its first argument and
        the unsorted path has no visible return statement.

        NOTE(review): the ``sorted`` parameter shadows the builtin of the same
        name, so ``sorted(results, ...)`` below calls the boolean argument
        rather than the builtin.
        """
        tokenized = self.tokenizer.tokenize(query)
        query_vector = self.vectorspace.vector_for_document(tokenized,

        results = []
        for index, document in enumerate(self.documents):
            tokenized = self.tokenizer.tokenize(document)
            document_vector = self.vectorspace.vector_for_document(tokenized)
            similarity = cossim(query_vector, document_vector)
            results.append((index, similarity))

        if sorted:
            return sorted(results, key=lambda item: -item[1])

        # Nearest-neighbour labelling: every test vector takes the label of the
        # most similar training vector.
        # NOTE(review): this fragment has no visible function header and appears
        # truncated -- `labels` and `confidences` are never appended to, and the
        # confidence computation is cut off mid-expression.
        labels = []
        confidences = []
        for test_lda in X_test:
            # Similarity of this test vector to every training vector.
            similarities = map(
                lambda article_lda: cossim(article_lda, test_lda), self.X_train_lda)
            # (similarity, label) pairs, most similar first.
            similarityByLabel = sorted(
                zip(similarities, self.y_train_labels), key=lambda item: item[0], reverse=True)

            chosen_label = similarityByLabel[0][1]
            max_similarity = similarityByLabel[0][0]

            # Pairs whose label differs from the chosen one.
            similarityByLabelForNotChosen = [
                item for item in similarityByLabel if item[1] != chosen_label]

            # NOTE(review): the deeper-indented assignment below suggests a
            # missing `if` line; `min_confidence` is not defined in view.
            confidence = min_confidence
                confidence = max_similarity - \

        print "Cossine sim"
        simMatrixCos = []

        for x in range(0,50):
            topicMatrixCos = []
            for y in range(0,50):
                model = self.model

                vec1 = model.get_topic_terms(x, topn=model.num_terms)

                #ldaVec1 = sorted(model.get_topic_terms(x, topn=model.num_terms))
                #ldaVec2 = sorted(model.get_topic_terms(y, topn=model.num_terms))

                ldaVec1 = model.get_topic_terms(x, topn = N)
                ldaVec2 = model.get_topic_terms(y, topn = N)

                #dense1 = gensim.matutils.sparse2full(ldaVec1, model.num_terms)
                #dense2 = gensim.matutils.sparse2full(ldaVec2, model.num_terms)

                sim = matutils.cossim(ldaVec1, ldaVec2)
                #simDict = (x, y, sim, self.model.show_topic(y))
                simDict = (x, y, sim)


            simMatrixCos.append(sorted(topicMatrixCos, key=itemgetter(2), reverse=True))

        #for element in simMatrix:
        #    print simMatrix
        #topic1Sorted = sorted(simMatrixCos[0], key=itemgetter(2))
        #x = topic1Sorted
        return simMatrixCos
