def predict_function():
    """Predict sentiment labels for the training set with the saved model.

    Returns:
        pred: predicted labels for each document.
        y_train: the gold labels (as returned by get_feature()).
        prob: per-class probabilities for each document.
    """
    predict_doc = joblib.load('logreg.pkl')
    # Use the vectorizer fitted at training time (dumped alongside the
    # model).  The original fitted a brand-new DictVectorizer here, which
    # assigns feature-column indices from the prediction data — those need
    # not line up with the columns the model was trained on.
    word_vec = joblib.load("word_vec.pkl")
    y_train, x_train = get_feature()
    # One word-count dict per document.
    x_list = [dict(Counter(doc)) for doc in x_train]
    X = word_vec.transform(x_list)
    pred = predict_doc.predict(X)
    prob = predict_doc.predict_proba(X)
    return pred, y_train, prob
def log_regression():
    """Train a bag-of-words logistic-regression sentiment model and save it.

    Builds word-count feature dicts from get_feature(), vectorizes them
    with a DictVectorizer, fits LogisticRegression, then persists both
    the model ('logreg.pkl') and the vectorizer ('word_vec.pkl').
    """
    logreg = LogisticRegression()
    y_train, x_train = get_feature()
    # One token-frequency dict per training document.
    x_list = [dict(Counter(tokens)) for tokens in x_train]
    word_vec = DictVectorizer()
    X = word_vec.fit_transform(x_list)
    logreg.fit(X, y_train)
    joblib.dump(logreg, 'logreg.pkl')
    joblib.dump(word_vec, "word_vec.pkl")
def log_regression():
    """Fit the sentiment classifier and persist model plus vectorizer."""
    y_train, x_train = get_feature()
    # Word-frequency dictionary for every document.
    counts = []
    for doc in x_train:
        counts.append(dict(Counter(doc)))
    vectorizer = DictVectorizer()
    matrix = vectorizer.fit_transform(counts)
    model = LogisticRegression()
    model.fit(matrix, y_train)
    # Persist both artifacts so prediction can reuse the same columns.
    joblib.dump(model, 'logreg.pkl')
    joblib.dump(vectorizer, "word_vec.pkl")
def predict_function():
    """Predict the sentiment of a sample sentence with the saved model.

    Loads the trained classifier and the vectorizer fitted at training
    time, vectorizes one hard-coded sentence, and prints the predicted
    label together with the class probabilities.
    """
    predict_doc = joblib.load('logreg.pkl')
    feature_doc = joblib.load("word_vec.pkl")
    # Sample input to classify.
    line = "bad bad good good"
    counts = dict(Counter(line.split()))
    # transform (NOT fit_transform) keeps the feature columns aligned
    # with the ones the model was trained on.  The original re-fitted a
    # fresh DictVectorizer, left the loaded vectorizer unused, and
    # aborted via exit() before ever reaching the prediction code.
    X = feature_doc.transform([counts])
    pred = predict_doc.predict(X)
    prob = predict_doc.predict_proba(X)
    for label, probability in zip(pred, prob):
        print(label, probability)
def predict_function():
    """Debug-state prediction routine.

    Builds a feature dict for one hard-coded sentence against the
    vocabulary of the first training document, dumps the intermediate
    state, then aborts via exit().

    NOTE(review): everything after the first loop iteration is dead code
    because of the exit() call — presumably leftover scaffolding.  The
    loaded vectorizer (word_vec.pkl) is never used.
    """
    feature_rows = []
    bag = {}
    model = joblib.load('logreg.pkl')
    trained_vec = joblib.load("word_vec.pkl")
    y_train, x_train = get_feature()
    sentence_tokens = "bad bad good good".split()
    for doc in x_train:
        # Zero a slot for every token of this training document …
        for token in doc:
            bag[token] = 0
        # … then overlay the counts of the sample sentence.
        bag.update(dict(Counter(sentence_tokens)))
        for item in sorted(bag.items(), key=lambda kv: kv[1]):
            print(item)
        feature_rows.append(bag)
        print(feature_rows)
        exit()
    X = DictVectorizer().fit_transform(feature_rows)
    pred = model.predict(X)
    prob = model.predict_proba(X)
    for pred, prob in zip(pred, prob):
        print(pred, prob)
def main():
    """Classify one line of sentiment.txt with the saved model.

    Loads the model and vocabulary, reads the 125th line of the corpus
    as sample input, vectorizes it, and prints the predicted polarity
    with its probability.
    """
    # Load the trained model and its vocabulary.
    model = joblib.load('model')
    vocab = joblib.load('vocab')
    # Read the sample sentence; 'with' guarantees the file is closed
    # (the original leaked the file handle).
    with open('sentiment.txt', encoding='utf8') as f:
        for _ in range(124):
            next(f)
        sentence = f.readline()
    print('input\n', sentence)
    sentence = sentence.strip()[3:]  # drop the leading "+1 " / "-1 " label
    feature = get_feature(sentence)
    vectorizer = TfidfVectorizer(vocabulary=vocab)
    feature_vec = vectorizer.fit_transform([feature]).toarray()
    predict = model.predict(feature_vec)[0]
    probability = model.predict_proba(feature_vec)[0]
    # predict_proba columns follow model.classes_ in ascending order, so
    # for labels [-1, +1] column 0 is the negative-class probability.
    # The original printed probability[1] in BOTH branches, i.e. reported
    # the positive-class probability even for a negative prediction.
    if predict == -1:
        print(f'label : negative ({probability[0] * 100:.3f}%)')
    else:
        print(f'label : positive ({probability[1] * 100:.3f}%)')
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and
# removed in 0.23 — import the standalone joblib package instead.
import joblib
from knock72 import get_feature

# Train a logistic-regression sentiment classifier on sentiment.txt
# (each line: "<label> <sentence>") and persist the fitted model.
clf = LogisticRegression()
dic2vec = DictVectorizer()
features = list()
y = list()
# 'with' guarantees the corpus file is closed (original leaked it).
with open('sentiment.txt') as fp:
    for line in fp:
        spl = line.rstrip('\n').split()
        label, sent = spl[0], ' '.join(spl[1:])
        features.append(get_feature(sent))
        y.append(int(label))
x = dic2vec.fit_transform(features)
clf.fit(x, y)
joblib.dump(clf, 'knock73.model')
# NOTE(review): fragment — the lines up to the `return` continue a loop
# inside a function (presumably make_kfold) whose `def` starts before
# this chunk; indentation reconstructed from context.
# Copy this test document's word counts into the zero-initialized
# feature dict, keeping only words seen at training time.
for key, value in dict(Counter(feature[test])).items():
    if key in feature_dict:
        feature_dict[key] = value
pred_list.append(feature_dict)
# Predict the whole fold at once.  NOTE(review): fit_transform re-fits
# the vectorizer on the fold — presumably transform was intended; verify
# against the training half of make_kfold.
preds.append(logreg.predict(word_vec.fit_transform(pred_list)))
return preds, test_numbers


def get_score(preds, target, test_numbers):
    """Average accuracy, precision, recall and F1 over the k folds.

    Args:
        preds: per-fold arrays of predicted labels.
        target: gold labels for the full dataset, indexed by document id.
        test_numbers: per-fold lists of document indices used as the test set.

    Returns:
        Tuple of mean (accuracy, precision, recall, F1) across folds.
    """
    all_accuracy = []
    all_precision = []
    all_recall = []
    all_f_values = []
    for tests, pred in zip(test_numbers, preds):
        # Gold labels for this fold's test indices.
        answers = []
        for test in tests:
            answers.append(target[test])
        all_accuracy.append(accuracy_score(answers, pred))
        all_precision.append(precision_score(answers, pred))
        all_recall.append(recall_score(answers, pred))
        all_f_values.append(f1_score(answers, pred))
    return np.array(all_accuracy).mean(), np.array(all_precision).mean(
    ), np.array(all_recall).mean(), np.array(all_f_values).mean()


if __name__ == '__main__':
    target, feature = get_feature()
    preds, test_numbers = make_kfold(target, feature)
    # Prints: accuracy / precision / recall / F1 (labels in Japanese).
    print("正解率:{}\t適合率:{}\t再現率:{}\tF1スコア:{}".format(
        *get_score(preds, target, test_numbers)))
from sklearn.feature_extraction import DictVectorizer
from sklearn.externals import joblib
from knock72 import get_feature

# Re-score the training corpus with the knock73 model and print, per
# document: gold label, predicted label, probability of the predicted class.
model = joblib.load('knock73.model')
vectorizer = DictVectorizer()
feature_dicts = list()
gold_labels = list()
sentences = list()
for raw in open('sentiment.txt'):
    parts = raw.rstrip('\n').split()
    label, sentence = parts[0], ' '.join(parts[1:])
    sentences.append(sentence)
    feature_dicts.append(get_feature(sentence))
    gold_labels.append(int(label))
# NOTE(review): the vectorizer is re-fitted here rather than loaded; this
# matches the training columns only because the same file is read in the
# same order as during training — verify against knock73.
matrix = vectorizer.fit_transform(feature_dicts)
predictions = model.predict(matrix)
probabilities = model.predict_proba(matrix)
for guess, gold, (p_neg, p_pos) in zip(predictions, gold_labels, probabilities):
    confidence = p_neg if guess == -1 else p_pos
    print('{}\t{}\t{}'.format(gold, guess, confidence))
# NOTE(review): fragment — the lines up to the `return` continue a loop
# inside a function (presumably make_kfold) whose `def` starts before
# this chunk; indentation reconstructed from context.
for test in tests:
    # Start from an all-zero vector over the training vocabulary …
    feature_dict = defaultdict(int)
    for f in word_vec.get_feature_names():
        feature_dict[f] = 0
    # … then fill in this test document's counts for known words only.
    for key, value in dict(Counter(feature[test])).items():
        if key in feature_dict:
            feature_dict[key] = value
    pred_list.append(feature_dict)
# Predict the whole fold at once.  NOTE(review): fit_transform re-fits
# the vectorizer — presumably transform was intended; verify.
preds.append(logreg.predict(word_vec.fit_transform(pred_list)))
return preds, test_numbers


def get_score(preds, target, test_numbers):
    """Average accuracy, precision, recall and F1 over the k folds.

    Args:
        preds: per-fold arrays of predicted labels.
        target: gold labels for the full dataset, indexed by document id.
        test_numbers: per-fold lists of test-set document indices.

    Returns:
        Tuple of mean (accuracy, precision, recall, F1) across folds.
    """
    all_accuracy = []
    all_precision = []
    all_recall = []
    all_f_values = []
    for tests, pred in zip(test_numbers, preds):
        # Gold labels for this fold's test indices.
        answers = []
        for test in tests:
            answers.append(target[test])
        all_accuracy.append(accuracy_score(answers, pred))
        all_precision.append(precision_score(answers, pred))
        all_recall.append(recall_score(answers, pred))
        all_f_values.append(f1_score(answers, pred))
    return np.array(all_accuracy).mean(), np.array(all_precision).mean(), np.array(all_recall).mean(), np.array(all_f_values).mean()


if __name__ == '__main__':
    target, feature = get_feature()
    preds, test_numbers = make_kfold(target, feature)
    # Prints: accuracy / precision / recall / F1 (labels in Japanese).
    print("正解率:{}\t適合率:{}\t再現率:{}\tF1スコア:{}".format(*get_score(preds, target, test_numbers)))
if __name__ == '__main__':
    # Load the trained model.
    with open('model', 'rb') as f:
        model = pickle.load(f)
    # Load the stopword list.
    with open('stopwords', 'rb') as f:
        stopwords = pickle.load(f)
    # Load the training vocabulary.
    with open('vocab_size', 'rb') as f:
        vocab = pickle.load(f)
    # Read a sentence and extract features (drop the leading "+1 "/"-1 ").
    sentence = input('Please input sentence -> ').strip()[3:]
    sent_feature = get_feature(sentence, stopwords)
    # Vectorize against the training vocabulary (also fixes the original
    # 'voctorizer' typo).
    vectorizer = CountVectorizer(vocabulary=vocab)
    feature_vec = vectorizer.fit_transform([sent_feature]).toarray()
    # Predict.
    label_predict = model.predict(feature_vec)
    prob = model.predict_proba(feature_vec)
    # predict_proba columns follow model.classes_ in ascending order, so
    # for integer labels -1 sorts BEFORE +1 — the original hard-coded
    # prob[0][0] as "+1" and prob[0][1] as "-1", swapping the two.  Map
    # each class to its probability explicitly instead of assuming order.
    class_prob = dict(zip(model.classes_, prob[0]))
    print(f'label : {label_predict[0]}')
    print(f'prob(+1): {round(class_prob.get(1, prob[0][0]) * 100, 3)}%')
    print(f'prob(-1): {round(class_prob.get(-1, prob[0][1]) * 100, 3)}%')
from knock72 import get_feature
from sklearn.feature_extraction.text import CountVectorizer
import pickle

# Classify one hard-coded sample sentence with the pickled model.
with open("./chapter08/model", "rb") as f:
    model = pickle.load(f)
with open("./chapter08/vocab_size", "rb") as f:
    vocab_size = pickle.load(f)

sentence = "This is a very good wonderful fantastic movie."
sentence_feature = get_feature(sentence)

# Vectorize against the training vocabulary, then predict.
bow = CountVectorizer(vocabulary=vocab_size)
feature_vec = bow.fit_transform([sentence_feature]).toarray()
label_predict = model.predict(feature_vec)
prob = model.predict_proba(feature_vec)
print(label_predict, prob)