        if result is not None:
            # result is assumed to be a raw float32 buffer; np.fromstring is deprecated,
            # np.frombuffer is the direct replacement for binary input.
            vec = np.frombuffer(result,dtype=np.float32)
            embeddings_dic[i] = vec
        else:
            count += 1
            embeddings_dic[i] = getRandom_vec()
    # print(count)
    return embeddings_dic
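
# (Imports such as numpy as np, gensim, load_data, model_util and sklearn's LinearSVC
# are assumed from the truncated header of this example.)
# get_tfidf_top is not shown above; a minimal sketch, assuming it keeps the
# top_number highest-weighted (term_id, weight) pairs of each TF-IDF document vector:
def get_tfidf_top(corpus_tfidf, top_number=10):
    corpus_top = []
    for doc in corpus_tfidf:
        ranked = sorted(doc, key=lambda pair: pair[1], reverse=True)
        corpus_top.append(ranked[:top_number])
    return corpus_top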


corpus,dic,labels = load_data.load_corpus()
# TF-IDF
tfidf = gensim.models.TfidfModel(corpus=corpus,dictionary=dic)
# corpus_tfidf = [tfidf[doc] for doc in corpus]
corpus_tfidf = tfidf[corpus]
corpus_top = get_tfidf_top(corpus_tfidf,top_number=10)

dic_embeddings = get_dic_embeddings(dic)
doc_embeddings = model_util.get_doc_embeddings(corpus_top,dic_embeddings)
train_data,train_label,test_data,test_label = load_data.get_train_test(doc_embeddings,labels)
print("train size: "+str(train_data.shape[0]))
print("test size: "+str(test_data.shape[0]))

clf = LinearSVC()
clf.fit(train_data,train_label)
score = clf.score(test_data,test_label)
print(score)
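
# Note: model_util.get_doc_embeddings used above is external; a plausible sketch, assuming
# each document vector is the TF-IDF-weighted average of its top terms' word embeddings
# (dic_embeddings maps term id -> fixed-size np.float32 vector). The real helper may differ.
def get_doc_embeddings(corpus_top, dic_embeddings):
    dim = len(next(iter(dic_embeddings.values())))
    doc_vectors = np.zeros((len(corpus_top), dim), dtype=np.float32)
    for row, doc in enumerate(corpus_top):
        total_weight = sum(weight for _, weight in doc)
        for term_id, weight in doc:
            doc_vectors[row] += dic_embeddings[term_id] * weight
        if total_weight > 0:
            doc_vectors[row] /= total_weight
    return doc_vectors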



Example 2
    return matrix
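
# Only the final line of get_corpus_topic_distribution survives above; a minimal
# reconstruction, assuming it spreads each document's (topic_id, probability) list
# into a dense (num_docs, num_topic) numpy matrix (numpy as np assumed imported in
# the truncated header):
def get_corpus_topic_distribution(doc_topics, num_topic=4):
    matrix = np.zeros((len(doc_topics), num_topic), dtype=np.float32)
    for row, topics in enumerate(doc_topics):
        for topic_id, prob in topics:
            matrix[row][topic_id] = prob
    return matrix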


corpus,dic,labels = load_data.load_corpus()

"""
# TF-IDF
tfidf = gensim.models.TfidfModel(corpus=corpus,dictionary=dic)
corpus_tfidf = tfidf[corpus]
# LDA
lda_model = gensim.models.LdaModel(corpus_tfidf,num_topics=4,id2word=dic)
"""

lda_model = gensim.models.wrappers.LdaMallet('F:/mallet-2.0.8/bin/mallet.bat',corpus=corpus,num_topics=4,id2word=dic)
# LdaMallet does not implement get_document_topics(); convert the wrapper to a
# standard LdaModel so the per-document loop below works.
lda_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_model)

doc_topics = []
for doc in corpus:
    doc_topics.append(lda_model.get_document_topics(doc,minimum_probability=0))

doc_topics_matrix = get_corpus_topic_distribution(doc_topics,num_topic=4)
train_data,train_label,test_data,test_label = load_data.get_train_test(doc_topics_matrix,labels)
print("train size: "+str(train_data.shape[0]))
print("test size: "+str(test_data.shape[0]))

# SVM classification
# clf = SVC()
clf = LinearSVC()
clf.fit(train_data,train_label)
score = clf.score(test_data,test_label)
print(score)
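
# Optional addition: inspect the learned topics (works on the converted LdaModel from above).
for topic_id, topic in lda_model.print_topics(num_topics=4, num_words=10):
    print(topic_id, topic)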
Example 3
# -*- coding: utf-8 -*-

__author__ = 'PC-LiNing'

import gensim
from lda import load_data
import numpy as np
# sklearn.lda was removed from newer scikit-learn releases; LinearDiscriminantAnalysis
# is its replacement and is aliased here so the code below stays unchanged.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

corpus,dic,labels = load_data.load_corpus()
tfidf = gensim.models.TfidfModel(corpus=corpus,dictionary=dic)
corpus_tfidf = [tfidf[doc] for doc in corpus]

matrix = load_data.convert_to_matrix(corpus_tfidf)
train_data,train_label,test_data,test_label = load_data.get_train_test(matrix,labels)

lda = LDA(solver='svd',store_covariance=True)
lda.fit(train_data,train_label)
score = lda.score(test_data,test_label)
print(score)
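
# The load_data helpers used throughout these examples are external; a minimal sketch,
# assuming convert_to_matrix densifies a sparse gensim corpus and get_train_test does a
# simple shuffled split. The real implementations may differ.
import numpy as np
from gensim import matutils

def convert_to_matrix(corpus_sparse):
    # Dense (num_docs, num_terms) array from a list of (term_id, weight) documents.
    num_terms = 1 + max(term_id for doc in corpus_sparse for term_id, _ in doc)
    return matutils.corpus2dense(corpus_sparse, num_terms=num_terms).T

def get_train_test(data, labels, test_ratio=0.2):
    # Shuffle, then split into the train/test portions in the return order used above.
    labels = np.asarray(labels)
    indices = np.random.permutation(data.shape[0])
    split = int(data.shape[0] * (1 - test_ratio))
    train_idx, test_idx = indices[:split], indices[split:]
    return data[train_idx], labels[train_idx], data[test_idx], labels[test_idx]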