def run_hicocluster_create_matrix():
    """Build a gensim dictionary/corpus from the cluster texts and write a
    dense document-term count matrix to matrix.txt.

    Side effects (all in the current working directory):
      - dictionary.dict : saved gensim Dictionary
      - corpus.mm       : serialized MmCorpus
      - matrix.txt      : one row per document, space-separated term counts
    """
    # Observed corpus stats: number of docs: 1950, number of items: 21826.
    # os.path.join instead of hard-coded "\\" keeps this portable off Windows.
    texts = JsonParser.get_texts(os.path.join(os.getcwd(), "clusters"))
    tokenized = [text.split() for text in texts]

    dictionary = corpora.Dictionary(tokenized)
    dictionary.save(os.path.join(os.getcwd(), "dictionary.dict"))
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
    corpora.MmCorpus.serialize(os.path.join(os.getcwd(), "corpus.mm"), corpus)

    print("length of docs: " + str(dictionary.num_docs))
    print("length of items: " + str(len(dictionary.token2id)))

    features = len(dictionary.token2id)
    set_doc_terms = []
    for doc in corpus:
        # Expand the sparse bag-of-words pairs (term_id, count) into a
        # dense count row; empty docs yield an all-zero row so the matrix
        # stays row-aligned with the corpus.
        doc_terms = [0] * features
        for term_id, count in doc:
            doc_terms[term_id] = count
        set_doc_terms.append(doc_terms)

    # `with` guarantees the handle is closed even if a write fails
    # (the original left the file unclosed on error).
    with open(os.path.join(os.getcwd(), "matrix.txt"), "w") as matrix:
        for line in set_doc_terms:
            # Original format: every value followed by a space, then newline.
            matrix.write(" ".join(str(v) for v in line) + " \n")
def get_combination():
    """Concatenate tf-idf features with doc2vec features per document.

    Fits tf-idf on the JSON cluster texts, loads the doc2vec vectors via
    load_d2v(), and returns a 2-D numpy array with one row per document:
    tf-idf columns first, then the doc2vec columns.
    """
    print("run_combination")

    tfidf = ExTFIDF.TfIdf()
    tfidf.fit_data(JsonParser.get_texts(os.path.join(os.getcwd(), "clusters")))
    tf_vectors = tfidf.get_data_as_vector()
    print("Length of tfidf feature: " + str(len(tf_vectors[0])))

    # load_d2v() returns a pair; index 1 holds the per-document doc2vec
    # vectors — TODO confirm against load_d2v's definition.
    pairs = load_d2v()
    single = pairs[1]
    print("Length of doc2vec feature: " + str(len(single[0])))

    # Column-wise concatenation: each row is [tfidf features | d2v features].
    final = numpy.hstack((tf_vectors, single))
    print("Length of final features: " + str(len(final[0])))
    return final
def algorithm_tfidf():
    """Fit tf-idf on the JSON cluster texts and cluster the vectors with
    K-means via Algorithm.algorithm_Kmean."""
    print("Running TFIDF")

    tfidf = ExTFIDF.TfIdf()
    tfidf.fit_data(JsonParser.get_texts(os.path.join(os.getcwd(), "clusters")))

    # Hoisted: the original called get_data_as_vector() twice.
    # Also fixed the typo in the log message ("lennth" -> "length").
    vectors = tfidf.get_data_as_vector()
    print("length of tfidf : " + str(len(vectors)))

    print("Running algorithm with TFIDF")
    Algorithm.algorithm_Kmean(vectors)