def record_res(cls, filename, taskname, f, n):
    pos_feature, neg_feature = Ngram.build_features(flag=f, number=n)
    shuffle(pos_feature)
    shuffle(neg_feature)
    # Hold out 20% of each class for testing (index = 20% of the average class size).
    index = int(((len(pos_feature) + len(neg_feature)) / 2) * 0.2)
    x_train = pos_feature[index:] + neg_feature[index:]
    x_test = pos_feature[:index] + neg_feature[:index]
    BNB_A, BNB_PP, BNB_PR, BNB_PF1, BNB_NP, BNB_NR, BNB_NF1 = Ngram.score(BernoulliNB(), x_train, x_test)
    MNB_A, MNB_PP, MNB_PR, MNB_PF1, MNB_NP, MNB_NR, MNB_NF1 = Ngram.score(MultinomialNB(), x_train, x_test)
    LR_A, LR_PP, LR_PR, LR_PF1, LR_NP, LR_NR, LR_NF1 = Ngram.score(LogisticRegression(), x_train, x_test)
    LSVC_A, LSVC_PP, LSVC_PR, LSVC_PF1, LSVC_NP, LSVC_NR, LSVC_NF1 = Ngram.score(LinearSVC(), x_train, x_test)
    NSVC_A, NSVC_PP, NSVC_PR, NSVC_PF1, NSVC_NP, NSVC_NR, NSVC_NF1 = Ngram.score(NuSVC(), x_train, x_test)
    SVC_A, SVC_PP, SVC_PR, SVC_PF1, SVC_NP, SVC_NR, SVC_NF1 = Ngram.score(SVC(), x_train, x_test)
    # Log accuracy, positive precision (pos_precision), positive recall (pos_recall), positive F1,
    # negative precision (neg_precision), negative recall (neg_recall), and negative F1 for each classifier.
    log.console_out(
        filename, taskname, n,
        ('BNB', BNB_A, 'pos:', BNB_PP, BNB_PR, BNB_PF1, 'neg:', BNB_NP, BNB_NR, BNB_NF1),
        ('MNB', MNB_A, 'pos:', MNB_PP, MNB_PR, MNB_PF1, 'neg:', MNB_NP, MNB_NR, MNB_NF1),
        ('LR', LR_A, 'pos:', LR_PP, LR_PR, LR_PF1, 'neg:', LR_NP, LR_NR, LR_NF1),
        ('SVC-rbf', SVC_A, 'pos:', SVC_PP, SVC_PR, SVC_PF1, 'neg:', SVC_NP, SVC_NR, SVC_NF1),
        ('LSVC', LSVC_A, 'pos:', LSVC_PP, LSVC_PR, LSVC_PF1, 'neg:', LSVC_NP, LSVC_NR, LSVC_NF1),
        ('NuSVC', NSVC_A, 'pos:', NSVC_PP, NSVC_PR, NSVC_PF1, 'neg:', NSVC_NP, NSVC_NR, NSVC_NF1))
# Variant of record_res that evaluates only an RBF SVC with a tuned gamma.
def record_res(cls, filename, taskname, f, n):
    pos_feature, neg_feature = Ngram.build_features(flag=f, number=n)
    shuffle(pos_feature)
    shuffle(neg_feature)
    # Hold out 20% of each class for testing.
    index = int(((len(pos_feature) + len(neg_feature)) / 2) * 0.2)
    x_train = pos_feature[index:] + neg_feature[index:]
    x_test = pos_feature[:index] + neg_feature[:index]
    SVC_A, SVC_PP, SVC_PR, SVC_PF1, SVC_NP, SVC_NR, SVC_NF1 = Ngram.score(SVC(gamma=0.015), x_train, x_test)
    # Log accuracy, positive precision/recall/F1, and negative precision/recall/F1.
    log.console_out(filename, taskname, n,
                    ('SVC-rbf', SVC_A, 'pos:', SVC_PP, SVC_PR, SVC_PF1, 'neg:', SVC_NP, SVC_NR, SVC_NF1))
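# Ngram.score is used above but is not shown in this excerpt. Below is a minimal
# sketch of what it plausibly computes, assuming each feature is an NLTK-style
# (feature_dict, label) pair with labels 'pos'/'neg', and that the scikit-learn
# estimator is wrapped with nltk's SklearnClassifier. The name score_sketch and
# this body are assumptions for illustration, not the project's implementation.
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn import metrics

def score_sketch(estimator, train_set, test_set):
    classifier = SklearnClassifier(estimator)  # adapts a scikit-learn estimator to NLTK feature dicts
    classifier.train(train_set)
    y_true = [label for _, label in test_set]
    y_pred = classifier.classify_many([feats for feats, _ in test_set])
    accuracy = metrics.accuracy_score(y_true, y_pred)
    # Per-class precision/recall/F1, matching the 7 values unpacked by record_res.
    pos_p = metrics.precision_score(y_true, y_pred, pos_label='pos')
    pos_r = metrics.recall_score(y_true, y_pred, pos_label='pos')
    pos_f1 = metrics.f1_score(y_true, y_pred, pos_label='pos')
    neg_p = metrics.precision_score(y_true, y_pred, pos_label='neg')
    neg_r = metrics.recall_score(y_true, y_pred, pos_label='neg')
    neg_f1 = metrics.f1_score(y_true, y_pred, pos_label='neg')
    return accuracy, pos_p, pos_r, pos_f1, neg_p, neg_r, neg_f1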
def train_lstm(cls, n_symbols, embedding_weights, x_train, y_train, x_test, y_test, use_word_dim):
    model = Sequential()
    # Embedding layer initialized with pretrained word2vec weights; input_length is
    # required so downstream layers know the (padded) sequence length.
    model.add(Embedding(output_dim=cls.vocab_dim,
                        input_dim=n_symbols,
                        mask_zero=True,
                        weights=[embedding_weights],
                        input_length=cls.input_length))
    model.add(LSTM(units=50, activation='sigmoid', recurrent_activation='hard_sigmoid'))
    model.add(Dropout(0.55))  # dropout to reduce overfitting
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    # Compile the model for binary classification.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    loss_history = LossHistory()
    # early_stopping = EarlyStopping(monitor='val_loss', patience=3)  # stop when val_loss has not improved for 3 epochs
    # Alternative: split the held-out data into 10% test / 10% validation:
    # x_test, y_test, x_val, y_val = train_test_split(x_test, y_test, test_size=0.5, shuffle=True)
    hist = model.fit(x_train, y_train,
                     batch_size=cls.batch_size,
                     epochs=cls.n_epoch,
                     verbose=1,
                     validation_split=0.11111,
                     callbacks=[loss_history])
    print('=============>history ', hist.history)  # per-epoch training/validation loss and accuracy
    score = model.evaluate(x_test, y_test, batch_size=cls.batch_size)  # evaluate on the held-out test set
    # Persist the architecture as YAML and the weights separately.
    yaml_string = model.to_yaml()
    with open(LSTM_YML_PATH.format(use_word_dim), 'w') as outfile:
        outfile.write(yaml.dump(yaml_string, default_flow_style=True))
    model.save_weights(LSTM_MODEL_PATH.format(use_word_dim))
    loss_history.loss_plot('epoch')
    print('Test score:', score)
    log.console_out('lstm_w2v.txt', (250, hist.history, score))
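# LossHistory is referenced above but not defined in this excerpt. A minimal sketch
# of such a Keras callback is below, assuming it records per-epoch metrics and that
# loss_plot('epoch') draws train/validation curves with matplotlib; the class name
# and attribute layout are assumptions for illustration.
import matplotlib.pyplot as plt
from keras.callbacks import Callback

class LossHistorySketch(Callback):
    def on_train_begin(self, logs=None):
        self.losses = {'epoch': []}
        self.val_loss = {'epoch': []}

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        self.losses['epoch'].append(logs.get('loss'))
        self.val_loss['epoch'].append(logs.get('val_loss'))

    def loss_plot(self, loss_type):
        # loss_type is 'epoch' here; plot training vs. validation loss per epoch.
        iters = range(len(self.losses[loss_type]))
        plt.plot(iters, self.losses[loss_type], label='train loss')
        plt.plot(iters, self.val_loss[loss_type], label='val loss')
        plt.xlabel(loss_type)
        plt.ylabel('loss')
        plt.legend(loc='upper right')
        plt.show()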
def load_file(cls):
    # header=None so pandas does not treat the first row as column names.
    neg = pd.read_excel('corpus/negT.xlsx', header=None)
    pos = pd.read_excel('corpus/posT.xlsx', header=None)
    # pos[1] is the second column of the Excel sheet (the review text).
    cw = lambda x: cls.text_parse(x)  # tokenization function
    pos['words'] = pos[1].apply(cw)
    neg['words'] = neg[1].apply(cw)
    log.console_out("log_svm.txt", "pos length =", len(pos), "neg length =", len(neg))
    # Use 1 for positive sentiment, 0 for negative.
    y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))
    # Merge the two corpora and split 80/20 into train/test sets.
    x_train, x_test, y_train, y_test = train_test_split(
        np.concatenate((pos['words'], neg['words'])), y, test_size=0.2)
    np.save('svm_data/y_train.npy', y_train)
    np.save('svm_data/y_test.npy', y_test)
    log.console_out("log_svm.txt", "load_file done!")
    return x_train, x_test
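# text_parse is the project's tokenizer and is not shown in this excerpt. Since the
# corpus is Chinese reviews, a plausible minimal sketch uses jieba word segmentation;
# this is an assumption (the real implementation may also strip stop words or punctuation).
import jieba

def text_parse_sketch(text):
    # Segment one Chinese review into a list of tokens, dropping pure whitespace.
    return [tok for tok in jieba.lcut(str(text)) if tok.strip()]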
def train(cls):
    x_train, x_test = cls.load_file()
    cls.save_train_vecs(x_train, x_test)  # compute word2vec document vectors
    train_vecs, y_train, test_vecs, y_test = cls.get_data()
    # clf = SVC(kernel='rbf', verbose=True, probability=True)
    # clf = LinearSVC()
    # clf = LogisticRegression()
    clf = BernoulliNB()
    clf.fit(train_vecs, y_train)
    joblib.dump(clf, 'svm_data/svm_model/model.pkl')
    predict_y = clf.predict(test_vecs)  # predictions on the held-out test set
    test_accuracy = metrics.accuracy_score(y_test, predict_y)  # accuracy on the test set
    # test_precision = metrics.precision_score(y_test, predict_y, average='weighted')
    # test_recall = metrics.recall_score(y_test, predict_y, average='weighted')
    log.console_out("log_svm.txt", "SVM score = ", clf.score(test_vecs, y_test))  # log mean accuracy
    log.console_out("log_svm.txt", "test_accuracy = ", test_accuracy)  # log test accuracy
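# get_data is called above but not shown. Given the np.save calls in load_file and
# save_train_vecs, it presumably just reloads the four saved arrays; the function
# body below is a sketch under that assumption (paths copied from those functions).
import numpy as np

def get_data_sketch():
    train_vecs = np.load('svm_data/train_vecs.npy')
    y_train = np.load('svm_data/y_train.npy')
    test_vecs = np.load('svm_data/test_vecs.npy')
    y_test = np.load('svm_data/y_test.npy')
    return train_vecs, y_train, test_vecs, y_test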
def save_train_vecs(cls, x_train, x_test):
    n_dim = 128
    # Initialize the model and build the vocabulary from the training tokens.
    comment_w2v = Word2Vec(size=n_dim, min_count=5)
    comment_w2v.build_vocab(x_train)
    # Train word2vec over the training reviews (this may take several minutes).
    comment_w2v.train(x_train, total_examples=comment_w2v.corpus_count, epochs=comment_w2v.iter)
    train_vecs = np.concatenate([cls.build_wordvector(z, n_dim, comment_w2v) for z in x_train])
    # train_vecs = scale(train_vecs)
    np.save('svm_data/train_vecs.npy', train_vecs)
    log.console_out("log_svm.txt", "save train vecs done!")
    # Continue training word2vec on the test reviews, then persist the model.
    comment_w2v.train(x_test, total_examples=comment_w2v.corpus_count, epochs=comment_w2v.iter)
    comment_w2v.save('svm_data/w2v_model/w2v_model.pkl')
    # Build test review vectors.
    test_vecs = np.concatenate([cls.build_wordvector(z, n_dim, comment_w2v) for z in x_test])
    # test_vecs = scale(test_vecs)
    np.save('svm_data/test_vecs.npy', test_vecs)
    log.console_out("log_svm.txt", "save test vecs done!")
    log.console_out("log_svm.txt", "save_train_vecs function done!")
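# build_wordvector is used above but not defined here. A common pattern, and a
# reasonable assumption for this code, is to average the word2vec vectors of the
# tokens in one review, skipping out-of-vocabulary words; this sketch follows the
# old gensim API (w2v_model[word]) that the surrounding code also uses.
import numpy as np

def build_wordvector_sketch(tokens, size, w2v_model):
    vec = np.zeros((1, size))
    count = 0
    for word in tokens:
        try:
            vec += w2v_model[word].reshape((1, size))  # raises KeyError if word not in vocab
            count += 1
        except KeyError:
            continue
    if count != 0:
        vec /= count  # average so review length does not change the scale
    return vec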
    'uni_bi_tri', 'uni_info_bi_tri'
]
ns = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
# nss = [11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000]
# Earlier run: sweep the larger feature counts for uni_info_bi_tri only.
# for i in range(6, 7):
#     filename = 'record4.txt'
#     if tasks[i] == tasks[6]:  # uni_info_bi_tri
#         for n in nss:
#             Ngram.record_res(filename, tasks[i], i, n)
for i in range(0, 7):
    filename = 'test.txt'
    if tasks[i] == tasks[0]:  # uni
        Ngram.record_res(filename, tasks[i], i, 0)
    elif tasks[i] == tasks[1]:  # bi
        Ngram.record_res(filename, tasks[i], i, 0)
    elif tasks[i] == tasks[2]:  # uni_bi
        Ngram.record_res(filename, tasks[i], i, 0)
    elif tasks[i] == tasks[3]:  # uni_info: sweep feature counts
        for n in ns:
            Ngram.record_res(filename, tasks[i], i, n)
    elif tasks[i] == tasks[4]:  # uni_info_bi: sweep feature counts
        for n in ns:
            Ngram.record_res(filename, tasks[i], i, n)
    elif tasks[i] == tasks[5]:  # uni_bi_tri
        Ngram.record_res(filename, tasks[i], i, 0)
    elif tasks[i] == tasks[6]:  # uni_info_bi_tri: sweep feature counts
        for n in ns:
            Ngram.record_res(filename, tasks[i], i, n)
log.console_out(filename, "All Tasks Done")
print(time.strftime('%Y-%m-%d %A %H:%M:%S', time.localtime(time.time())))
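# The dispatch above repeats the same call in every branch. A table-driven sketch
# (not the original code) maps each task name to the n values it should sweep,
# which keeps the driver to a single loop:
sweeps = {'uni': [0], 'bi': [0], 'uni_bi': [0], 'uni_bi_tri': [0],
          'uni_info': ns, 'uni_info_bi': ns, 'uni_info_bi_tri': ns}
for i, task in enumerate(tasks):
    for n in sweeps[task]:
        Ngram.record_res('test.txt', task, i, n)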