def main(args): vector_space = {} file_list_vs = open("vector_space.txt", "r") vector_space = JSONDecoder().decode(file_list_vs.read()) file_list_vs.close() documents = {} file_list_documents = open("doc_wt_sw.txt", "r") documents = JSONDecoder().decode(file_list_documents.read()) file_list_documents.close() dic = [] file_dic = open("words.txt", "r") dic = JSONDecoder().decode(file_dic.read()) file_dic.close() tfidf = {} inv_frec_vector = [] doc_lenght = len(vector_space) t0 = time() for word in dic: count = 0 for document in documents.values(): if word in document: count += 1 inv_frec = log(doc_lenght / count) inv_frec_vector.append(inv_frec) print("done in %0.3fs." % (time() - t0)) t0 = time() for key, value in vector_space.items(): newtable = [] for id, ter_frec in enumerate(value): eq = 0 if ter_frec > 0: eq = ter_frec * inv_frec_vector[id] newtable.append(eq) tfidf[key] = newtable print("done in %0.3fs." % (time() - t0)) file = open("tfidf.txt", "w") file.write(JSONEncoder().encode(tfidf)) file.close() file = open("inv_frec_vector.txt", "w") file.write(JSONEncoder().encode(inv_frec_vector)) file.close()
# --- NMF topic-modeling setup: load corpus and extract TF-IDF features ---
# NOTE(review): n_samples, n_features and TfidfVectorizer are expected to be
# defined/imported earlier in the file — confirm they are in scope here.
n_topics = 10
n_top_words = 30

t0 = time()
print("Loading dataset and extracting TF-IDF features...")

# Spanish stop-word list: one token per whitespace-separated entry.
# Context managers replace the original manual open/close pairs.
with open("spanish.txt", "r", encoding='utf-8') as file_stopwords:
    stopwords = file_stopwords.read().split()

# Corpus is a JSON object mapping document ids to raw text.
# Fix: explicit utf-8 encoding — the original relied on the platform default,
# inconsistent with the utf-8 open of spanish.txt above.
with open('corpus.txt', 'r', encoding='utf-8') as f:
    corpus = JSONDecoder().decode(f.read())

# Lower-case every document before vectorizing (comprehension instead of the
# original append loop, which also shadowed the name `documents`).
dataset = [document.lower() for document in corpus.values()]

vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                             max_features=n_features,
                             stop_words=stopwords)
tfidf = vectorizer.fit_transform(dataset[:n_samples])
print("done in %0.3fs." % (time() - t0))
print()
print("Fitting the NMF model with n_samples=%d and n_features=%d..."
      % (n_samples, n_features))