def estimate(news_title):
    # Predict the category of an unseen news title using the trained global estimator
    vec = dictionary.doc2bow(M.isMecab(news_title))
    print(vec)
    pre = list(matutils.corpus2dense([vec], num_terms=len(dictionary)).T[0])
    print(pre)
    label_predict = estimator.predict(pre)
    print(label_predict)
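# For context (not part of the original script): doc2bow returns sparse (token_id, count)
# pairs, and matutils.corpus2dense expands them into a fixed-length vector over the whole
# vocabulary. A minimal self-contained sketch with toy tokens, purely illustrative:
from gensim import corpora, matutils

toy_docs = [["iphone", "docomo"], ["soccer", "goal", "goal"]]
toy_dic = corpora.Dictionary(toy_docs)                     # vocabulary of 4 tokens
bow = toy_dic.doc2bow(["goal", "goal", "iphone"])          # e.g. [(id_iphone, 1), (id_goal, 2)]
print(bow)
dense = list(matutils.corpus2dense([bow], num_terms=len(toy_dic)).T[0])
print(dense)                                               # length-4 dense vector, mostly zeros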
def cleate_dic():
    ARTICLE_NAME = ["Computer", "Entertainment", "Sports", "Science", "Economy", "World", "Politics", "Society"]
    ARTICLE = {"Computer": "", "Entertainment": "", "Sports": "", "Science": "", "Economy": "", "World": "", "Politics": "", "Society": ""}
    ret = []
    # Tokenize every line of every category file with MeCab and collect the nouns
    for n in ARTICLE_NAME:
        f = codecs.open('/Users/Soma/Onedrive/News_Dataset/article' + n + '.txt', 'r')
        ARTICLE[n] = f.readlines()
        f.close()
        for j in ARTICLE[n]:
            ret += M.isMecab2(j)
    preprocessed_docs = {}
    for name in ret:
        preprocessed = gensim.parsing.preprocess_string(name)
        preprocessed_docs[name] = preprocessed
        # print name, ":", preprocessed
    # Build the gensim dictionary from the collected tokens and save it as plain text
    documents = corpora.Dictionary(ret)
    documents.save_as_text('noun_dic.txt')
def cleate_lda_model():
    ARTICLE_NAME = ["Computer", "Entertainment", "Sports", "Science", "Economy", "World", "Politics", "Society"]
    ARTICLE = {
        "Computer": "",
        "Entertainment": "",
        "Sports": "",
        "Science": "",
        "Economy": "",
        "World": "",
        "Politics": "",
        "Society": "",
    }
    dictionary = corpora.Dictionary.load_from_text("noun_dic.txt")
    for n in ARTICLE_NAME:
        print "\n" + n + " LDA model create..\n"
        f = codecs.open("/Users/Soma/Onedrive/News_Dataset/article" + n + ".txt", "r")
        ARTICLE[n] = f.readlines()
        f.close()
        # data_train is built here, but the LDA below is trained on the pre-serialized .mm corpus
        data_train = [dictionary.doc2bow(M.isMecab(j)) for j in ARTICLE[n]]
        tfidf_corpus = gensim.corpora.MmCorpus("news_noun_" + n + ".mm")
        lda = models.LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=30)
        lda.save("model_" + n + ".lda")
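# Note (assumption, not in the original code): cleate_lda_model() expects a pre-existing
# "news_noun_<category>.mm" file per category; whether TF-IDF weighting was applied to it
# is only suggested by the variable name tfidf_corpus. A minimal sketch of how such a
# Matrix Market file could be produced from doc2bow vectors built as above:
from gensim import corpora, models

def save_category_corpus(n, data_train):
    # data_train: list of doc2bow vectors for category n, as built in cleate_lda_model()
    tfidf = models.TfidfModel(data_train)
    corpora.MmCorpus.serialize("news_noun_" + n + ".mm", tfidf[data_train])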
def train():
    ARTICLE_NAME = ["Computer", "Entertainment", "Sports", "Science", "Economy", "World", "Politics", "Society"]
    ARTICLE = {"Computer": "", "Entertainment": "", "Sports": "", "Science": "", "Economy": "", "World": "", "Politics": "", "Society": ""}
    ret = []
    data_train = [[] for row in range(8)]
    train_num = 0
    train_sum = 0
    # Count the lines in every category file to size the training containers
    for n in ARTICLE_NAME:
        data_range = countline('/Users/Soma/Onedrive/News_Dataset/article' + n + '.txt')
        print data_range
        train_sum += data_range
        for col in range(data_range):
            data_train[train_num].append(None)
        train_num += 1
    print "\nDataset : " + str(train_sum)
    label_train = [None for col in range(train_sum)]
    article_data = [None for col in range(train_sum)]
    cate_num = 0
    label_num = 0
    global dictionary
    dictionary = corpora.Dictionary.load_from_text('noun_dic.txt')
    # Turn every article line into a dense bag-of-words vector and record its category label
    for n in ARTICLE_NAME:
        num2 = 0
        f = codecs.open('/Users/Soma/Onedrive/News_Dataset/article' + n + '.txt', 'r')
        ARTICLE[n] = f.readlines()
        f.close()
        for j in ARTICLE[n]:
            tmp = dictionary.doc2bow(M.isMecab(j))
            data_train[cate_num][num2] = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
            label_train[label_num] = n
            num2 += 1
            label_num += 1
        cate_num += 1
    # Flatten the per-category vectors into a single feature matrix
    ar_num = 0
    for n in data_train:
        for d in n:
            article_data[ar_num] = d
            ar_num = ar_num + 1
    print "\ntrain start!"
    print "please wait..\n"
    global estimator
    """
    # Hyper-parameter optimization of the classifier
    print "Starting classifier optimization test"
    tuned_parameters = [{'n_estimators': [10, 30, 50, 70, 90, 110, 130, 150],
                         'max_features': ['auto', 'sqrt', 'log2', None]}]
    clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=2, scoring='accuracy', n_jobs=-1)
    clf.fit(article_data, label_train)
    print("Best estimator:")
    print(clf.best_estimator_)
    print("Mean CV scores on the training data:")
    for params, mean_score, all_scores in clf.grid_scores_:
        print("{:.3f} (+/- {:.3f}) for {}".format(mean_score, all_scores.std() / 2, params))
    y_true, y_pred = label_test_s, clf.predict(data_test_s)
    print(classification_report(y_true, y_pred))
    """
    # Train the classifier
    estimator = RandomForestClassifier(n_estimators=train_sum / 100)
    estimator.fit(article_data, label_train)
    print("\n==== Score when training and prediction data are the same ====")
    print(estimator.score(article_data, label_train))
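# The score printed above is measured on the same vectors the forest was fit on, so it
# mostly reflects memorization. A minimal sketch of a held-out evaluation, assuming
# article_data and label_train are built as in train() above (the 20% split and
# n_estimators=100 are illustrative choices, not part of the original script):
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer versions
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(article_data, label_train, test_size=0.2, random_state=0)
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
print "held-out accuracy:", clf.score(X_test, y_test)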
def train():
    ARTICLE_NAME = ["Computer", "Entertainment", "Sports", "Science", "Economy", "World", "Politics", "Society"]
    ARTICLE = {"Computer": "", "Entertainment": "", "Sports": "", "Science": "", "Economy": "", "World": "", "Politics": "", "Society": ""}
    ret = []
    data_train = [[] for row in range(8)]
    train_num = 0
    train_sum = 0
    for n in ARTICLE_NAME:
        data_range = countline('/Users/somatakei/Onedrive/News_Dataset/article' + n + '.txt')
        print data_range
        train_sum += data_range
        for col in range(data_range):
            data_train[train_num].append(None)
        train_num += 1
    print "\nDataset : " + str(train_sum)
    label_train = [None for col in range(train_sum)]
    article_data = [None for col in range(train_sum)]
    cate_num = 0
    label_num = 0
    global dictionary
    dictionary = corpora.Dictionary.load_from_text('noun_dic.txt')
    for n in ARTICLE_NAME:
        num2 = 0
        f = codecs.open('/Users/somatakei/Onedrive/News_Dataset/article' + n + '.txt', 'r')
        ARTICLE[n] = f.readlines()
        f.close()
        for j in ARTICLE[n]:
            tmp = dictionary.doc2bow(M.isMecab(j))
            data_train[cate_num][num2] = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
            label_train[label_num] = n
            num2 += 1
            label_num += 1
        cate_num += 1
    ar_num = 0
    for n in data_train:
        for d in n:
            article_data[ar_num] = d
            ar_num = ar_num + 1

    # Insert the dictionary entries into an array as bag-of-words vectors
    # (names, preprocessed_docs, dct and vec2dense are assumed to be defined elsewhere)
    print "---Bag of Words Corpus---"
    bow_docs = {}
    bow_docs_all_zeros = {}
    for name in names:
        sparse = dictionary.doc2bow(preprocessed_docs[name])
        bow_docs[name] = sparse
        dense = vec2dense(sparse, num_terms=len(dct))
        print name, ":", dense
        bow_docs_all_zeros[name] = all(d == 0 for d in dense)
    print "\nall zeros...\n", [name for name in bow_docs_all_zeros if bow_docs_all_zeros[name]]

    # LSI modelling
    print "\nlsi modeling.."
    names = dictionary.keys()
    lsi_docs = {}
    num_topics = 2
    lsi_model = gensim.models.LsiModel(bow_docs.values(), id2word=dictionary, num_topics=num_topics)
    for name in names:
        vec = data_train[name]
        sparse = lsi_model[vec]
        dense = vec2dense(sparse, num_topics)
        lsi_docs[name] = sparse
        print name, ":", dense
    print "\nTopics"
    print lsi_model.print_topics()

    # Normalize the dimensionality-reduced vectors (the direction of the vector is what matters)
    print "\nunit vectorization.."
    unit_vecs = {}
    for name in names:
        vec = vec2dense(lsi_docs[name], num_topics)
        norm = sqrt(sum(num ** 2 for num in vec))
        unit_vec = [num / norm for num in vec]
        unit_vecs[name] = unit_vec
        print name, ":", unit_vec

    # Train an SVM on the bag-of-words vectors
    print "\ntrain start!"
    print "please wait..\n"
    global estimator
    estimator = SVC()
    estimator.fit(article_data, label_train)
    print("\n==== Score when training and prediction data are the same ====")
    print(estimator.score(article_data, label_train))
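# The LSI block above calls a vec2dense helper (and a dct dictionary) that are not defined
# in this listing. A minimal sketch of what the helper presumably does, following the
# corpus2dense pattern used elsewhere in this code:
from gensim import matutils

def vec2dense(vec, num_terms):
    # Expand a sparse gensim vector [(token_id, weight), ...] into a dense list of length num_terms
    return list(matutils.corpus2dense([vec], num_terms=num_terms).T[0])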
def train():
    ARTICLE_NAME = ["Computer", "Entertainment", "Sports", "Science", "Economy", "World", "Politics", "Society"]
    ARTICLE = {
        "Computer": "",
        "Entertainment": "",
        "Sports": "",
        "Science": "",
        "Economy": "",
        "World": "",
        "Politics": "",
        "Society": "",
    }
    ret = []
    data_train = [[] for row in range(8)]
    train_num = 0
    train_sum = 0
    for n in ARTICLE_NAME:
        data_range = countline("/Users/Soma/Onedrive/News_Dataset/article" + n + ".txt")
        print data_range
        train_sum += data_range
        for col in range(data_range):
            data_train[train_num].append(None)
        train_num += 1
    print "\nDataset : " + str(train_sum)
    label_train = [None for col in range(train_sum)]
    article_data = [None for col in range(train_sum)]
    cate_num = 0
    label_num = 0
    global dictionary
    dictionary = corpora.Dictionary.load_from_text("noun_dic.txt")
    for n in ARTICLE_NAME:
        num2 = 0
        f = codecs.open("/Users/Soma/Onedrive/News_Dataset/article" + n + ".txt", "r")
        ARTICLE[n] = f.readlines()
        f.close()
        for j in ARTICLE[n]:
            tmp = dictionary.doc2bow(M.isMecab(j))
            data_train[cate_num][num2] = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
            label_train[label_num] = n
            num2 += 1
            label_num += 1
        cate_num += 1
    ar_num = 0
    for n in data_train:
        for d in n:
            article_data[ar_num] = d
            ar_num = ar_num + 1
    print "\ntrain start!"
    print "please wait..\n"
    global estimator
    estimator = RandomForestClassifier()
    # Train the classifier
    estimator.fit(article_data, label_train)
    print("\n==== Score when training and prediction data are the same ====")
    print(estimator.score(article_data, label_train))
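# train() and estimate() communicate through the global dictionary and estimator, so
# training has to run first. A minimal usage sketch (assumes cleate_dic() has already
# written noun_dic.txt; the sample title is taken from the test snippet below):
if __name__ == "__main__":
    train()                                  # fits the classifier and loads noun_dic.txt into the globals
    estimate(u"香川ループ弾「衝撃」と賞賛")  # prints the BoW vector and the predicted category label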
ARTICLE_NAME = ["Computer", "Entertainment", "Sports", "Science", "Economy", "World", "Politics", "Society"]
ARTICLE = {"Computer": "", "Entertainment": "", "Sports": "", "Science": "", "Economy": "", "World": "", "Politics": "", "Society": ""}
ret = []
data_train = [[] for row in range(8)]
train_num = 0
train_sum = 0
for n in ARTICLE_NAME:
    data_range = countline('/Users/Soma/Onedrive/News_Dataset/article' + n + '.txt')
    train_sum += data_range
    for col in range(data_range):
        data_train[train_num].append(None)
    train_num += 1
label_train = [None for col in range(train_sum)]
article_data = [None for col in range(train_sum)]
num = 0
num3 = 0
dictionary = corpora.Dictionary.load_from_text('test_dic4.txt')
M.pp(M.isMecab("香川ループ弾「衝撃」と賞賛"))
test = [[] for col in range(6)]
test[0].append("a")
test[0].append("b")
test[0].append("c")
test[1].append("d")
test[1].append("e")
print(test)
"Sports": "", "Science": "", "Economy": "", "World": "", "Politics": "", "Society": "", } dictionary = corpora.Dictionary.load_from_text("noun_dic.txt") for n in ARTICLE_NAME: print "\n" + n + " LDA modl cleate..\n" f = codecs.open("/Users/Soma/Onedrive/News_Dataset/article" + n + ".txt", "r") ARTICLE[n] = f.readlines() f.close() data_train = [dictionary.doc2bow(M.isMecab(j)) for j in ARTICLE[n]] tfidf_corpus = gensim.corpora.MmCorpus("news_noun_" + n + ".mm") lda = models.LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=30) lda.save("model_" + n + ".lda") if __name__ == "__main__": print "cleate_dic.." # CD.cleate_dic() cleate_lda_model() lda = models.LdaModel.load("model_Sports.lda") lda2 = models.LdaModel.load("model_Computer.lda") for n in range(30): M.pp(lda.print_topics(n + 1)) for n in range(30): M.pp(lda2.print_topics(n + 1))
from sklearn.ensemble import RandomForestClassifier
import simplejson as json
import Test_MeCab as M

# Sample Japanese news article about iPhone / DoCoMo calling plans, used as test input for MeCab
text = "iPhone 従来の「カケホーダイプラン」は、国内音声通話が回数の制限なしで無料となるもので、\
月額料金は2年定期契約で2700円だ。それに対してカケホーダイライトプランは月額料金が1700円と1000円分安くなる。\
その条件として5分以内の通話であれば、何回かけても無料だが、5分を超えた場合は30秒あたり20円がかかることとなった。\
ただし「ファミリー割引」に加入していれば家族間通話が無料となるので、家族で申込みする場合は利用したい。\
■データ定額プランは5GBからパケットパックとの組み合わせは、1か月のパケット上限が5GBまでとなる「データMパック」から用意される。\
こちらは月額料金が5000円。インターネット接続サービスの「spモード」300円に加入すれば、利用料金は月額7000円になる。\
ここに携帯電話の購入代金がかかってくる。たとえば『iPhone 6s』16GBモデルをMNPで購入すると、実質負担金は月額432円なので、\
トータル7432円/月で新型iPhoneを使うことができる。「データ通信よりも通話がメインだ」というユーザーなら、\
「カケホーダイプラン」2700円/月に容量2GBの「データSパック」3500円/月を組み合わせる方法もある。\
こちらで『iPhone 6s』16GBモデルをMNPで購入すると、月額6932円となる。"
# Short unrelated sentence that shares a few keywords with the article above
text2 = "この辺にぃ、おいしいラーメン屋の屋台があるみたいなんですが、行きませんかー?カケホーダイ,iPhone"

ret = M.isMecab(text)
ret2 = M.isMecab(text2)

dictionary = corpora.Dictionary.load_from_text('test_dic.txt')

# Dense bag-of-words vector for the first text
vec = dictionary.doc2bow(ret)
print(vec)
tmp = dictionary.doc2bow(ret)
dense = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
print(dense)

# Dense bag-of-words vector for the second text
vec = dictionary.doc2bow(ret2)
print(vec)
tmp = dictionary.doc2bow(ret2)
dense2 = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
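# The two dense vectors are built but never compared in this listing. A minimal sketch of
# one possible comparison, cosine similarity (an assumption about intent, not original code):
from math import sqrt

def cosine(a, b):
    # Cosine similarity between two equal-length dense vectors
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = sqrt(sum(x * x for x in a))
    norm_b = sqrt(sum(y * y for y in b))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0

print cosine(dense, dense2)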