Example #1
def algorithm_HAC(pairs):
    # Assumed imports: from sklearn.cluster import AgglomerativeClustering; from sklearn import metrics
    # JsonParser and GoogleNewsParser are project-local modules.
    print "algorithm HAC"
    # Google configurations (exactly one must be active so that `hac` is defined before fit is called):
    hac = AgglomerativeClustering(n_clusters=300, linkage='ward', affinity='euclidean')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='complete', affinity='euclidean')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='complete', affinity='l1')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='complete', affinity='l2')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='complete', affinity='manhattan')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='complete', affinity='cosine')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='complete', affinity='precomputed') # not yet
    # hac = AgglomerativeClustering(n_clusters=300, linkage='average', affinity='euclidean')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='average', affinity='l1')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='average', affinity='l2')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='average', affinity='manhattan')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='average', affinity='cosine')
    # hac = AgglomerativeClustering(n_clusters=300, linkage='average', affinity='precomputed') # not yet
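    # Note: affinity='precomputed' expects fit() to receive a precomputed distance matrix rather
    # than raw feature vectors, which is presumably why those options are marked "not yet".
    # `pairs` can be either a [labels, vectors] pair (as returned by load_d2v) or a plain
    # feature matrix, hence the length check that follows.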

    if len(pairs) == 2:
        hac.fit(pairs[1])
    else:
        hac.fit(pairs)

    print "HAC"

    # Google
    # print "ARI: " + str(metrics.adjusted_rand_score(GoogleNewsParser.get_target_labels(), hac.labels_))
    # print "NMI: " + str(metrics.adjusted_mutual_info_score(GoogleNewsParser.get_target_labels(), hac.labels_))

    print "ARI: " + str(metrics.adjusted_rand_score(JsonParser.get_target_labels(), hac.labels_))
    print "NMI: " + str(metrics.adjusted_mutual_info_score(JsonParser.get_target_labels(), hac.labels_))
Example #2
def run_hicocluster_create_matrix():
    # Assumed imports: import os; from gensim import corpora. JsonParser is a project-local module.
    # Number of docs: 1950
    # Number of items: 21826
    texts = JsonParser.get_texts(os.getcwd() + "\\clusters")
    newTexts = []
    for text in texts:
        newTexts.append(text.split())
    # print newTexts[0]

    dictionary = corpora.Dictionary(newTexts)
    dictionary.save(os.getcwd() + "\\dictionary.dict")

    corpus = [dictionary.doc2bow(text) for text in newTexts]
    corpora.MmCorpus.serialize(os.getcwd() + "\\corpus.mm", corpus)
    print "length of docs: " + str(dictionary.num_docs)
    print "length of items: " + str(len(dictionary.token2id.items()))

    features = len(dictionary.token2id.items())
    row = 1
    set_doc_terms = []
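    # Expand each sparse (term_id, count) bag-of-words document into a dense row of
    # term counts, skipping empty documents, so it can be written out as a plain matrix.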
    for doc in corpus:
        doc_terms = [0] * features
        if len(doc) > 0:
            row += 1
            for term in doc:
                doc_terms[term[0]] = term[1]
            set_doc_terms.append(doc_terms)
    matrix = open(os.getcwd() + "\\matrix.txt", "w")
    for line in set_doc_terms:
        for i in range(len(line)):
            matrix.write(str(line[i]) + " ")
        matrix.write("\n")
    matrix.close()
Example #3
def get_combination():
    # Assumed imports: import os; import numpy. ExTFIDF and JsonParser are project-local modules.
    print "get_combination"
    # Google data
    # parser = GoogleNewsParser.NewsParsers()
    # parser.parse_data_from_tok()

    # Json Google
    tfidf = ExTFIDF.TfIdf()
    # tfidf.fit_data(parser.get_texts())
    tfidf.fit_data(JsonParser.get_texts(os.getcwd() + "\\" + "clusters"))
    tf_vectors = tfidf.get_data_as_vector()
    print "Length of tfidf feature: " + str(len(tf_vectors[0]))
    # print tf_vectors[0]

    pairs = load_d2v()
    single = pairs[1]
    print "Length of doc2vec feature: " + str(len(single[0]))
    # print single[0]

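    # numpy.hstack concatenates the TF-IDF vector and the doc2vec vector of each document
    # column-wise, so every row of `final` carries both feature sets for one document.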
    final = numpy.hstack((tf_vectors, single))
    print "Length of final features: " + str(len(final[0]))
    # final = []
    # for i in range(length):
    #     temp = tf_vectors[i] + single[i]
    #     final.append(temp)
    return final
Example #4
def run_doc2vec():
    # Assumed imports: import os; from gensim.models import Doc2Vec. ExD2V and JsonParser are project-local modules.
    print "run_doc2vec"
    path = os.getcwd() + "\\clusters"

    # raw clusters
    # pairs = GoogleNewsParser.get_docs_labels(path=path)

    # clusters from json
    pairs = JsonParser.get_docs_labels(path=path)

    documents = ExD2V.DocIterator(pairs[0], pairs[1])
    # model = Doc2Vec(size=100, window=10, min_count=1, workers=4, alpha=0.025, min_alpha=0.025)
    model = Doc2Vec(size=200,
                    window=10,
                    min_count=1,
                    workers=4,
                    alpha=0.025,
                    min_alpha=0.025)
    model.build_vocab(documents)
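    # Older gensim pattern: the learning rate is decayed manually by training for several
    # passes and lowering alpha each time; recent gensim versions handle this internally
    # when train() is given an epochs argument.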
    for epoch in range(10):
        model.train(documents)
        model.alpha -= 0.002
        model.min_alpha = model.alpha
    print "length of model : " + str(len(model.docvecs))
    model.save(os.getcwd() + "\\google.d2v")
Example #5
def algorithm_Kmean(pairs):
    # Assumed imports: import numpy as np; from sklearn.cluster import KMeans; from sklearn import metrics
    # JsonParser and GoogleNewsParser are project-local modules.
    # Number of full clusters: 2305
    # Number of half clusters: 1236
    # Number of quad clusters: 495 : 4 - 10
    # Number of eight: 262 : 4 - 6
    # Number of one cluster: 147
    # Number of json cluster: 300
    print "algorithm Kmean"
    # Google
    # kmeans = KMeans(n_clusters=300, init='k-means++', n_init=100)
    # kmeans = KMeans(n_clusters=300, init='random', n_init=100)
    #ndarray
    kmeans = KMeans(n_clusters=300, n_init=100) # not yet
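    # Without an explicit init, KMeans defaults to init='k-means++', i.e. the first commented option above.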
    if len(pairs) == 2:
        #ndarray
        X = np.array(pairs[1])
        kmeans.fit(X)
        # kmeans.fit(pairs[1])
    else:
        # ndarray
        X = np.array(pairs)
        kmeans.fit(X)
        # kmeans.fit(pairs)
    print "Kmeans"


    # print "Original label"
    # print GoogleNewsParser.get_target_labels()
    print "Target labels length : " + str(len(JsonParser.get_target_labels()))
    print "Kmeans label lenght : " + str(len(kmeans.labels_))
    # print kmeans.labels_
    # Google
    # print "ARI: " + str(metrics.adjusted_rand_score(GoogleNewsParser.get_target_labels(), kmeans.labels_))
    # print "NMI: " + str(metrics.adjusted_mutual_info_score(GoogleNewsParser.get_target_labels(), kmeans.labels_))

    print "ARI: " + str(metrics.adjusted_rand_score(JsonParser.get_target_labels(), kmeans.labels_))
    print "NMI: " + str(metrics.adjusted_mutual_info_score(JsonParser.get_target_labels(), kmeans.labels_))
Example #6
def algorithm_tfidf():
    # Assumed imports: import os. ExTFIDF, JsonParser and Algorithm are project-local modules.
    print "Running TFIDF"
    # Google data
    # parser = GoogleNewsParser.NewsParsers()
    # parser.parse_data_from_tok()

    # Json Google
    tfidf = ExTFIDF.TfIdf()
    # tfidf.fit_data(parser.get_texts())
    tfidf.fit_data(JsonParser.get_texts(os.getcwd() + "\\" + "clusters"))

    print "lennth of tfidf : " + str(len(tfidf.get_data_as_vector()))

    print "Running algorithm with TFIDF"
    Algorithm.algorithm_Kmean(tfidf.get_data_as_vector())
Example #7
def load_d2v():
    # Assumed imports: import os; from gensim.models import Doc2Vec. JsonParser is a project-local module.
    print "load_d2v"
    pairs = []
    module = Doc2Vec.load(os.getcwd() + "\\google.d2v")
    print "length " + str(len(module.docvecs))

    # raw clusters
    # labels = GoogleNewsParser.get_only_labels(os.getcwd() + "\\clusters")

    # json clusters
    labels = JsonParser.get_only_labels(os.getcwd() + "\\clusters")

    docs = []
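    # Each label was used as the document tag when the Doc2Vec model was trained (see run_doc2vec),
    # so docvecs can be indexed by label to look up that document's trained vector.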
    for i in range(len(labels)):
        docs.append(module.docvecs[labels[i]])
    pairs.append(labels)
    pairs.append(docs)
    return pairs