Esempio n. 1
0
from sklearn.feature_extraction.text import TfidfVectorizer

from twnews.dataset.storage import NewsStorage
from twnews.utils.text_processors import Lemmatizer

# Load the raw news texts from storage.
news_storage = NewsStorage()
texts = news_storage.get_texts()

# constants
# NOTE(review): wm is not used in this section; presumably it is consumed by
# the weight-matrix code further down -- confirm.
wm = 1e-2

# lemmatize: every text is replaced by its lemmas joined with single spaces,
# so the vectorizer below works on lemmas instead of raw word forms
lemmatizer = Lemmatizer()
lemmas_list = lemmatizer.split_texts_to_lemmas(texts)
texts = [' '.join(lemmas) for lemmas in lemmas_list]

# build X and corpus
tvf = TfidfVectorizer()
tfidf_matrix = tvf.fit_transform(texts)  # rows: documents, columns: terms

# Transposed so that rows are terms and columns are documents.
X = tfidf_matrix.transpose()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when available and convert the returned array so
# that corpus stays a plain list of str, as it was with the old method.
if hasattr(tvf, 'get_feature_names_out'):
    corpus = tvf.get_feature_names_out().tolist()
else:
    corpus = tvf.get_feature_names()

import numpy as np
from twnews.utils.timeit import timeit

@timeit
def build_weight_matrix(matrix):
    '''Slow and ugly realization TODO: rewrite'''
    F = X.copy().todense().tolist()
    W = []