def algorithm_HAC(pairs):
    print "algorithm HAC"
    # Google
    # Configurations tried; exactly one must be uncommented so that hac is
    # defined before fit() is called below.
    hac = AgglomerativeClustering(n_clusters=300, linkage='ward', affinity='euclidean')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='complete', affinity='euclidean')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='complete', affinity='l1')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='complete', affinity='l2')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='complete', affinity='manhattan')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='complete', affinity='cosine')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='complete', affinity='precomputed')  # not yet
    # hac = AgglomerativeClustering(n_clusters=300, linkage='average', affinity='euclidean')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='average', affinity='l1')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='average', affinity='l2')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='average', affinity='manhattan')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='average', affinity='cosine')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='average', affinity='precomputed')  # not yet
    if len(pairs) == 2:
        # pairs is [labels, vectors]
        hac.fit(pairs[1])
    else:
        hac.fit(pairs)
    print "HAC"
    # Google
    # print "ARI: " + str(metrics.adjusted_rand_score(GoogleNewsParser.get_target_labels(), hac.labels_))
    # print "NMI: " + str(metrics.adjusted_mutual_info_score(GoogleNewsParser.get_target_labels(), hac.labels_))
    print "ARI: " + str(metrics.adjusted_rand_score(JsonParser.get_target_labels(), hac.labels_))
    print "NMI: " + str(metrics.adjusted_mutual_info_score(JsonParser.get_target_labels(), hac.labels_))
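# The 'precomputed' affinities above are marked "not yet". Below is a minimal,
# illustrative sketch (not part of the original pipeline) of how that option
# could be wired in: build a square cosine-distance matrix first and hand it
# to AgglomerativeClustering. The helper name algorithm_HAC_precomputed and
# the assumption that the input is a list of document vectors (e.g. pairs[1]
# from load_d2v()) are illustrative assumptions.
def algorithm_HAC_precomputed(vectors):
    from sklearn.metrics import pairwise_distances
    # 'precomputed' expects an n_samples x n_samples distance matrix
    distances = pairwise_distances(np.array(vectors), metric='cosine')
    hac = AgglomerativeClustering(n_clusters=300, linkage='average', affinity='precomputed')
    hac.fit(distances)
    print "ARI: " + str(metrics.adjusted_rand_score(JsonParser.get_target_labels(), hac.labels_))
    print "NMI: " + str(metrics.adjusted_mutual_info_score(JsonParser.get_target_labels(), hac.labels_))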
def run_hicocluster_create_matrix():
    # Number of docs: 1950
    # Number of items: 21826
    # Builds a gensim dictionary/corpus from the cluster texts and writes a
    # dense document-term count matrix to matrix.txt (one row per non-empty doc).
    texts = JsonParser.get_texts(os.getcwd() + "\\clusters")
    newTexts = []
    for text in texts:
        newTexts.append(text.split())
    # print newTexts[0]
    dictionary = corpora.Dictionary(newTexts)
    dictionary.save(os.getcwd() + "\\dictionary.dict")
    corpus = [dictionary.doc2bow(text) for text in newTexts]
    corpora.MmCorpus.serialize(os.getcwd() + "\\corpus.mm", corpus)
    print "length of docs: " + str(dictionary.num_docs)
    print "length of items: " + str(len(dictionary.token2id.items()))
    features = len(dictionary.token2id.items())
    row = 1
    set_doc_terms = []
    for doc in corpus:
        doc_terms = [0] * features
        if len(doc) > 0:
            row += 1
            for term in doc:
                doc_terms[term[0]] = term[1]
            set_doc_terms.append(doc_terms)
    matrix = open(os.getcwd() + "\\matrix.txt", "w")
    for line in set_doc_terms:
        for i in range(len(line)):
            matrix.write(str(line[i]) + " ")
        matrix.write("\n")
    matrix.close()
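# Hedged sketch, not in the original file: reloading the artifacts written by
# run_hicocluster_create_matrix() for later use. Dictionary.load, MmCorpus and
# numpy.loadtxt are standard gensim/numpy calls; the function name
# load_hicocluster_matrix is illustrative.
def load_hicocluster_matrix():
    dictionary = corpora.Dictionary.load(os.getcwd() + "\\dictionary.dict")
    corpus = corpora.MmCorpus(os.getcwd() + "\\corpus.mm")
    print "reloaded docs: " + str(dictionary.num_docs) + ", corpus size: " + str(len(corpus))
    # matrix.txt holds one space-separated row of term counts per non-empty document
    return numpy.loadtxt(os.getcwd() + "\\matrix.txt")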
def get_combination():
    print "run_combination"
    # Google data
    # parser = GoogleNewsParser.NewsParsers()
    # parser.parse_data_from_tok()
    # Json Google
    tfidf = ExTFIDF.TfIdf()
    # tfidf.fit_data(parser.get_texts())
    tfidf.fit_data(JsonParser.get_texts(os.getcwd() + "\\" + "clusters"))
    tf_vectors = tfidf.get_data_as_vector()
    print "Length of tfidf feature: " + str(len(tf_vectors[0]))
    # print tf_vectors[0]
    pairs = load_d2v()
    single = pairs[1]
    print "Length of doc2vec feature: " + str(len(single[0]))
    # print single[0]
    final = numpy.hstack((tf_vectors, single))
    print "Length of final features: " + str(len(final[0]))
    # final = []
    # for i in range(length):
    #     temp = tf_vectors[i] + single[i]
    #     final.append(temp)
    return final
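# Possible usage of the combined features, given as a sketch rather than the
# project's actual driver: feed the stacked TF-IDF + doc2vec matrix into the
# existing k-means routine. The wrapper name run_combination is assumed from
# the "run_combination" print above; swap in Algorithm.algorithm_Kmean if the
# clustering functions live in a separate module, as in algorithm_tfidf below.
def run_combination():
    final = get_combination()
    algorithm_Kmean(final)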
def run_doc2vec():
    print "run_doc2vec"
    path = os.getcwd() + "\\clusters"
    # raw clusters
    # pairs = GoogleNewsParser.get_docs_labels(path=path)
    # clusters from json
    pairs = JsonParser.get_docs_labels(path=path)
    documents = ExD2V.DocIterator(pairs[0], pairs[1])
    # model = Doc2Vec(size=100, window=10, min_count=1, workers=4, alpha=0.025, min_alpha=0.025)
    model = Doc2Vec(size=200, window=10, min_count=1, workers=4, alpha=0.025, min_alpha=0.025)
    model.build_vocab(documents)
    for epoch in range(10):
        model.train(documents)
        model.alpha -= 0.002
        model.min_alpha = model.alpha
    print "length of model : " + str(len(model.docvecs))
    model.save(os.getcwd() + "\\google.d2v")
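# Minimal sketch, not part of the original training flow: obtaining a vector
# for a new, unseen document from the saved model. This assumes the installed
# gensim version provides Doc2Vec.infer_vector; the helper name and the idea
# of tokenising with split() mirror the rest of the file but are illustrative.
def infer_new_doc_vector(text):
    model = Doc2Vec.load(os.getcwd() + "\\google.d2v")
    return model.infer_vector(text.split())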
def algorithm_Kmean(pairs):
    # Number of full clusters: 2305
    # Number of half clusters: 1236
    # Number of quad clusters: 495 : 4 - 10
    # Number of eight: 262 : 4 - 6
    # Number of one cluster: 147
    # Number of json cluster: 300
    print "algorithm Kmean"
    # Google
    # kmeans = KMeans(n_clusters=300, init='k-means++', n_init=100)
    # kmeans = KMeans(n_clusters=300, init='random', n_init=100)  # ndarray
    kmeans = KMeans(n_clusters=300, n_init=100)  # not yet
    if len(pairs) == 2:
        # ndarray
        X = np.array(pairs[1])
        kmeans.fit(X)
        # kmeans.fit(pairs[1])
    else:
        # ndarray
        X = np.array(pairs)
        kmeans.fit(X)
        # kmeans.fit(pairs)
    print "Kmeans"
    # print "Original label"
    # print GoogleNewsParser.get_target_labels()
    print "Target labels length : " + str(len(JsonParser.get_target_labels()))
    print "Kmeans labels length : " + str(len(kmeans.labels_))
    # print kmeans.labels_
    # Google
    # print "ARI: " + str(metrics.adjusted_rand_score(GoogleNewsParser.get_target_labels(), kmeans.labels_))
    # print "NMI: " + str(metrics.adjusted_mutual_info_score(GoogleNewsParser.get_target_labels(), kmeans.labels_))
    print "ARI: " + str(metrics.adjusted_rand_score(JsonParser.get_target_labels(), kmeans.labels_))
    print "NMI: " + str(metrics.adjusted_mutual_info_score(JsonParser.get_target_labels(), kmeans.labels_))
def algorithm_tfidf():
    print "Running TFIDF"
    # Google data
    # parser = GoogleNewsParser.NewsParsers()
    # parser.parse_data_from_tok()
    # Json Google
    tfidf = ExTFIDF.TfIdf()
    # tfidf.fit_data(parser.get_texts())
    tfidf.fit_data(JsonParser.get_texts(os.getcwd() + "\\" + "clusters"))
    print "length of tfidf : " + str(len(tfidf.get_data_as_vector()))
    print "Running algorithm with TFIDF"
    Algorithm.algorithm_Kmean(tfidf.get_data_as_vector())
def load_d2v():
    print "load_d2v"
    pairs = []
    module = Doc2Vec.load(os.getcwd() + "\\google.d2v")
    print "length " + str(len(module.docvecs))
    # raw clusters
    # labels = GoogleNewsParser.get_only_labels(os.getcwd() + "\\clusters")
    # json clusters
    labels = JsonParser.get_only_labels(os.getcwd() + "\\clusters")
    docs = []
    for i in range(len(labels)):
        docs.append(module.docvecs[labels[i]])
    pairs.append(labels)
    pairs.append(docs)
    return pairs
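# A possible end-to-end order for the routines above, given as a hedged sketch
# rather than the project's actual entry point: train doc2vec, reload the
# labelled vectors, then cluster and score them.
if __name__ == "__main__":
    run_doc2vec()            # train and save google.d2v
    pairs = load_d2v()       # pairs = [labels, doc2vec vectors]
    algorithm_Kmean(pairs)   # k-means, prints ARI/NMI
    algorithm_HAC(pairs)     # agglomerative clustering, prints ARI/NMI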