Beispiel #1
0
# Reference site
#https://qiita.com/ground0state/items/155b77f4c07e1a509a14
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from sklearn.metrics import accuracy_score
from q51_2 import load_data
# X_list: list of article headlines
# Y_list: list of category names
#def my_logistic_regression(X,Y,model):

# Load the pickled model. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
# NOTE: unpickling executes arbitrary code; only load model files you trust.
with open("my_lr.model", "rb") as model_file:
    lr = pickle.load(model_file)

# Feature tables and the raw train/test tables, read via the project helper.
train_feature=load_data("train.feature.txt")
test_feature=load_data("test.feature.txt")
train=load_data("train.txt")
test=load_data("test.txt")
# Load the training data
#train_df=pd.read_table("test.feature.txt",
#                                   header=None,
#                                   sep="\t",
#                                   encoding="UTF-8")
#X_train=train_df.drop(train_df.columns[[len(train_df.columns)-1]], axis=1)  # get the features
#Y_test=train_df[len(train_df.columns)-1]  # get the labels

# Predict labels for both the training and the test feature tables.
pred_train=lr.predict(train_feature)
pred_test=lr.predict(test_feature)
#print(pred_train)
#print(Y_train)
Beispiel #2
0
# coding: utf-8

# In[3]:

#https://zenn.dev/yagiyuki/articles/0d6f97028fdd40209b7f
#https://qiita.com/FujiedaTaro/items/5784eda386146f1fd6e7
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from q51_2 import load_data

# Load the feature tables and the category labels for each split.
X_train = load_data("train.feature.txt")
X_valid = load_data("valid.feature.txt")
X_test = load_data("test.feature.txt")
Y_train = load_data("train.txt")["CATEGORY"]
Y_valid = load_data("valid.txt")["CATEGORY"]
Y_test = load_data("test.txt")["CATEGORY"]
# Seven candidate values for C, log-spaced from 10^-3 to 10^3.
hyper_param = np.logspace(-3, 3, num=7)

# Best C and its accuracy found so far.
# NOTE(review): neither variable is updated in the visible part of the loop;
# the script appears to be truncated here - confirm against the full file.
best_param = 0
best_accuracy = 0

for c in hyper_param:  # sweep the hyper-parameter from 10^(-3) to 10^3

    lr = LogisticRegression(max_iter=1000, C=c)  # create an instance with the given hyper-parameter
    lr.fit(X_train, Y_train)  # learn the weights
    pred_valid = lr.predict(X_valid)
Beispiel #3
0
#!/usr/bin/env python
# coding: utf-8

# In[1]:

# Reference site
#https://qiita.com/ground0state/items/155b77f4c07e1a509a14
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from q51_2 import load_data
import pickle
# X_list: list of article headlines
# Y_list: list of category names
#def my_logistic_regression(X,Y,model):

# Load the pickled model. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
# NOTE: unpickling executes arbitrary code; only load model files you trust.
with open("my_lr.model", "rb") as model_file:
    lr = pickle.load(model_file)
test_feature = load_data("test.feature.txt")
test = load_data("test.txt")
# Predicted label for each row of the test features.
Y_pred = lr.predict(test_feature)

Y_pred_proba = lr.predict_proba(test_feature)  # list of probabilities of belonging to classes 0-3
print(Y_pred)
print(Y_pred_proba)

# In[ ]:
Beispiel #4
0
#!/usr/bin/env python
# coding: utf-8

# In[2]:

import pandas as pd
from sklearn.linear_model import LogisticRegression
from q51_2 import load_data
import pickle
# Read the training feature table and the raw training table (the latter
# supplies the "CATEGORY" label column used below).
train_feature = load_data("train.feature.txt")
train = load_data("train.txt")
#print(train_df)
#print(len(train_df.columns))
#print(train_df[len(train_df.columns)-1])
#X_train=train_df[0:len(train_df.columns)-2]  # get the category names
#print(X_train)
#print(train_df.iloc[train_df.columns-1:train_df.columns])
#print(train_feature)
#print(train)
# Create the estimator; max_iter is raised to 1000 because training did not
# converge with the default setting.
lr = LogisticRegression(max_iter=1000)
lr.fit(train_feature, train["CATEGORY"])  # learn the weights

# Serialize and save the model. The context manager guarantees the file is
# flushed and closed even if pickling raises part-way through.
filename = "my_lr.model"
with open(filename, "wb") as model_file:
    pickle.dump(lr, model_file)

# In[ ]:
Beispiel #5
0
import pickle
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from q51_2 import load_data
# X_list: list of article headlines
# Y_list: list of category names
#def my_logistic_regression(X,Y,model):
# Load the pickled model. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
# NOTE: unpickling executes arbitrary code; only load model files you trust.
with open("my_lr.model", "rb") as model_file:
    lr = pickle.load(model_file)
#df=pd.read_table("tmp.txt",  # evaluation data with a header added
#                                   sep="\t",
#                                   encoding="UTF-8")
#print(df.columns)

# Only the column labels of this table are used below, as feature names.
df=load_data("test.feature.txt")

# For each class, print the features with the smallest and largest weights.
for cl, coef in zip(lr.classes_, lr.coef_):
#    print(len(coef))
#    print(len(df.columns))
#    print(coef)
    sorted_index = coef.argsort()  # feature indices, ascending by weight
    # Never index past the available features when there are fewer than 10.
    top_n = min(10, len(sorted_index))
    print(f"class:{cl}")
    for i in range(top_n):
        idx = sorted_index[i]
        # A single f-string also works when column labels are not strings.
        print(f"下位{i+1}:{df.columns[idx]},{coef[idx]}")
    print()

    for i in range(top_n):
        idx = sorted_index[-i-1]
        print(f"上位{i+1}:{df.columns[idx]},{coef[idx]}")
    print()