Example #1
    def init_text_to_text_links(self):
        logging.info('Finding text to text links for {NAME}'.format(NAME=self.name()))
        lemmatizer = Lemmatizer()

        # Assign a running index over the combined document set: news first.
        index = 0
        for _news in self.news.get_documents():
            _news.index = index
            index += 1

        # Then tweets, lemmatizing each text so it can be matched against the news.
        for tweet in self.tweets.get_documents():
            tweet.words = lemmatizer.split_text_to_lemmas(tweet.text)
            tweet.index = index
            index += 1

        # Pairwise similarity over all documents, then the news-to-tweet links.
        similarity_matrix = get_similarity_matrix(
            self.get_documents(), self.get_documents(),
            self.corpus, self.tf_idf_matrix)
        self.text_to_text_links = get_text_to_text_relation(
            self.news.get_documents(), self.tweets.get_documents(), similarity_matrix)
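
get_similarity_matrix and get_text_to_text_relation are twnews helpers whose implementations are not shown here. As a minimal sketch only, assuming the similarity reduces to pairwise cosine similarity over tf-idf vectors (a common choice for this kind of text linking; the real helpers may weight things differently):

# Sketch, not the twnews implementation: pairwise cosine similarity
# over tf-idf rows, using only standard scikit-learn calls.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = ['storm hits the coast', 'storm damages coastal homes', 'election results announced']
tfidf = TfidfVectorizer().fit_transform(docs)  # rows = documents
similarity = cosine_similarity(tfidf)          # (n_docs, n_docs) matrix
print(similarity[0, 1], similarity[0, 2])      # the two storm texts score higher together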
Example #2
from sklearn.feature_extraction.text import TfidfVectorizer

from twnews.dataset.storage import NewsStorage
from twnews.utils.text_processors import Lemmatizer

news_storage = NewsStorage()
texts = news_storage.get_texts()

# constants
wm = 1e-2

# lemmatize the raw texts so tf-idf runs over normalized tokens
lemmatizer = Lemmatizer()
lemmas_list = lemmatizer.split_texts_to_lemmas(texts)
texts = [' '.join(lemmas) for lemmas in lemmas_list]

# build X and corpus
tvf = TfidfVectorizer()
tfidf_matrix = tvf.fit_transform(texts)

X = tfidf_matrix.transpose()      # rows = terms, columns = documents
corpus = tvf.get_feature_names()  # vocabulary, aligned with the rows of X

import numpy as np
from twnews.utils.timeit import timeit

@timeit
def build_weight_matrix(matrix):
    '''Slow, naive implementation. TODO: rewrite.'''
    F = matrix.copy().todense().tolist()  # dense term-by-document matrix, as nested lists
    W = []
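
The rest of build_weight_matrix is not shown in this listing. Purely as a hypothetical sketch, under the assumption that wm acts as a sparsity threshold on the tf-idf weights, one vectorized shape such a function could take:

# Hypothetical sketch; not the original continuation of build_weight_matrix.
def build_weight_matrix_sketch(matrix, threshold=wm):
    '''Zero out tf-idf weights below `threshold`; returns a dense numpy array.'''
    dense = np.asarray(matrix.todense())  # works for scipy sparse matrices
    dense[dense < threshold] = 0.0
    return dense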