def init_text_to_text_links(self):
    logging.info('Finding text to text links for {NAME}'.format(NAME=self.name()))
    lemmatizer = Lemmatizer()

    # Assign one running index across news first, then tweets, so positions
    # in the similarity matrix line up with document indices.
    index = 0
    for _news in self.news.get_documents():
        _news.index = index
        index += 1
    for tweet in self.tweets.get_documents():
        # Tweets are lemmatized here; news texts are assumed to be
        # lemmatized elsewhere.
        tweet.words = lemmatizer.split_text_to_lemmas(tweet.text)
        tweet.index = index
        index += 1

    # Pairwise similarities over the full document set (news + tweets).
    similarity_matrix = get_similarity_matrix(
        self.get_documents(), self.get_documents(),
        self.corpus, self.tf_idf_matrix)
    self.text_to_text_links = get_text_to_text_relation(
        self.news.get_documents(), self.tweets.get_documents(),
        similarity_matrix)
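# Minimal usage sketch (an assumption: the enclosing class and its
# construction are not shown in this excerpt; names are illustrative):
#
#     dataset = Dataset(news=news_storage, tweets=tweet_storage)
#     dataset.init_text_to_text_links()
#     links = dataset.text_to_text_links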
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from twnews.dataset.storage import NewsStorage
from twnews.utils.text_processors import Lemmatizer
from twnews.utils.timeit import timeit

news_storage = NewsStorage()
texts = news_storage.get_texts()

# Constant: weight given to missing (zero) cells of the term-document matrix.
wm = 1e-2

# Lemmatize every text and rejoin the lemmas into whitespace-separated
# strings so TfidfVectorizer can tokenize them.
lemmatizer = Lemmatizer()
lemmas_list = lemmatizer.split_texts_to_lemmas(texts)
texts = [' '.join(lemmas) for lemmas in lemmas_list]

# Build the term-document matrix X (terms as rows) and the vocabulary.
tvf = TfidfVectorizer()
tfidf_matrix = tvf.fit_transform(texts)  # documents x terms
X = tfidf_matrix.transpose()             # terms x documents
corpus = tvf.get_feature_names()  # get_feature_names_out() in newer scikit-learn


@timeit
def build_weight_matrix(matrix):
    '''Slow and ugly implementation. TODO: rewrite.'''
    F = matrix.copy().todense().tolist()
    W = []
    # The original body is truncated here; the following completion is an
    # assumption, not from the source: observed (nonzero) cells get weight 1,
    # missing cells get the small weight wm.
    for row in F:
        W.append([1.0 if value != 0 else wm for value in row])
    return np.array(W)
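# Hedged usage sketch: apply the weighting to the term-document matrix built
# above. The call itself is an assumption about intended use; build_weight_matrix
# and X are defined in this file.
W = build_weight_matrix(X)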