Example #1
    def record_res(cls, filename, taskname, f, n):
        pos_feature, neg_feature = Ngram.build_features(flag=f, number=n)
        shuffle(pos_feature)
        shuffle(neg_feature)
        # Hold out 20% of each class for testing (uses the average class
        # size, so this assumes pos/neg are roughly balanced).
        index = int(((len(pos_feature) + len(neg_feature)) / 2) * 0.2)
        x_train = pos_feature[index:] + neg_feature[index:]
        x_test = pos_feature[:index] + neg_feature[:index]

        BNB_A, BNB_PP, BNB_PR, BNB_PF1, BNB_NP, BNB_NR, BNB_NF1 = Ngram.score(
            BernoulliNB(), x_train, x_test)
        MNB_A, MNB_PP, MNB_PR, MNB_PF1, MNB_NP, MNB_NR, MNB_NF1 = Ngram.score(
            MultinomialNB(), x_train, x_test)
        LR_A, LR_PP, LR_PR, LR_PF1, LR_NP, LR_NR, LR_NF1 = Ngram.score(
            LogisticRegression(), x_train, x_test)
        LSVC_A, LSVC_PP, LSVC_PR, LSVC_PF1, LSVC_NP, LSVC_NR, LSVC_NF1 = Ngram.score(
            LinearSVC(), x_train, x_test)
        NSVC_A, NSVC_PP, NSVC_PR, NSVC_PF1, NSVC_NP, NSVC_NR, NSVC_NF1 = Ngram.score(
            NuSVC(), x_train, x_test)
        SVC_A, SVC_PP, SVC_PR, SVC_PF1, SVC_NP, SVC_NR, SVC_NF1 = Ngram.score(
            SVC(), x_train, x_test)
        # Log accuracy, positive precision/recall/F1 and negative precision/recall/F1 for each classifier
        log.console_out(
            filename, taskname, n,
            ('BNB', BNB_A, 'pos:', BNB_PP, BNB_PR, BNB_PF1, 'neg:', BNB_NP, BNB_NR, BNB_NF1),
            ('MNB', MNB_A, 'pos:', MNB_PP, MNB_PR, MNB_PF1, 'neg:', MNB_NP, MNB_NR, MNB_NF1),
            ('LR', LR_A, 'pos:', LR_PP, LR_PR, LR_PF1, 'neg:', LR_NP, LR_NR, LR_NF1),
            ('SVC-rbf', SVC_A, 'pos:', SVC_PP, SVC_PR, SVC_PF1, 'neg:', SVC_NP, SVC_NR, SVC_NF1),
            ('LSVC', LSVC_A, 'pos:', LSVC_PP, LSVC_PR, LSVC_PF1, 'neg:', LSVC_NP, LSVC_NR, LSVC_NF1),
            ('NuSVC', NSVC_A, 'pos:', NSVC_PP, NSVC_PR, NSVC_PF1, 'neg:', NSVC_NP, NSVC_NR, NSVC_NF1))
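
The first two examples lean on an Ngram.score helper that is not shown. Here is a minimal sketch of what it plausibly does, assuming NLTK-style (featureset, label) pairs and the labels 'pos'/'neg'; the helper's name, signature and return order are inferred from the calls above, the body is an assumption:

    import collections

    from nltk.classify.scikitlearn import SklearnClassifier
    from nltk.metrics.scores import precision, recall, f_measure

    def score(estimator, train_set, test_set):
        # Wrap the sklearn estimator so it accepts NLTK featuresets.
        classifier = SklearnClassifier(estimator)
        classifier.train(train_set)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        correct = 0
        for i, (feats, label) in enumerate(test_set):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)
            if observed == label:
                correct += 1

        accuracy = correct / len(test_set)
        # Accuracy, then precision/recall/F1 for 'pos', then for 'neg'.
        return (accuracy,
                precision(refsets['pos'], testsets['pos']),
                recall(refsets['pos'], testsets['pos']),
                f_measure(refsets['pos'], testsets['pos']),
                precision(refsets['neg'], testsets['neg']),
                recall(refsets['neg'], testsets['neg']),
                f_measure(refsets['neg'], testsets['neg']))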
Example #2
    def record_res(cls, filename, taskname, f, n):
        pos_feature, neg_feature = Ngram.build_features(flag=f, number=n)
        shuffle(pos_feature)
        shuffle(neg_feature)
        index = int(((len(pos_feature) + len(neg_feature)) / 2) * 0.2)
        x_train = pos_feature[index:] + neg_feature[index:]
        x_test = pos_feature[:index] + neg_feature[:index]

        SVC_A, SVC_PP, SVC_PR, SVC_PF1, SVC_NP, SVC_NR, SVC_NF1 = Ngram.score(SVC(gamma=0.015), x_train, x_test)
        # Log accuracy, positive precision/recall/F1 and negative precision/recall/F1
        log.console_out(filename, taskname, n,
                        ('SVC-rbf', SVC_A, 'pos:', SVC_PP, SVC_PR, SVC_PF1, 'neg:', SVC_NP, SVC_NR, SVC_NF1))
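
The gamma=0.015 here is hand-picked. If the features were vectorized into arrays, a cross-validated grid search would be the usual way to choose it; a hedged sketch, where X and y are placeholders for vectorized features and labels rather than objects from the original code:

    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    # Search a small band around the hand-picked value.
    param_grid = {'gamma': [0.001, 0.005, 0.01, 0.015, 0.02, 0.05]}
    search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
    # search.fit(X, y)          # X, y: vectorized features and labels
    # print(search.best_params_)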
Example #3
    def train_lstm(cls, n_symbols, embedding_weights, x_train, y_train, x_test,
                   y_test, use_word_dim):
        model = Sequential()
        model.add(
            Embedding(output_dim=cls.vocab_dim,
                      input_dim=n_symbols,
                      mask_zero=True,
                      weights=[embedding_weights],
                      input_length=cls.input_length))  # Adding Input Length
        model.add(
            LSTM(units=50,
                 activation='sigmoid',
                 recurrent_activation='hard_sigmoid'))
        model.add(Dropout(0.55))  # dropout to guard against overfitting
        model.add(Dense(1))
        model.add(Activation('sigmoid'))

        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])  # compile the model
        loss_history = LossHistory()
        # early_stopping = EarlyStopping(monitor='val_loss', patience=3)  # stop once validation loss has not improved for 3 epochs
        # x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, shuffle=True)  # split the held-out data in half: 10% test, 10% validation
        hist = model.fit(x_train,
                         y_train,
                         batch_size=cls.batch_size,
                         epochs=cls.n_epoch,
                         verbose=1,
                         validation_split=0.11111,
                         callbacks=[loss_history])

        print('=============>history ',
              hist.history)  # per-epoch training/validation loss and accuracy
        score = model.evaluate(x_test, y_test,
                               batch_size=cls.batch_size)  # evaluate on the test set

        yaml_string = model.to_yaml()
        with open(LSTM_YML_PATH.format(use_word_dim), 'w') as outfile:
            outfile.write(yaml_string)  # to_yaml() already returns a YAML string; dumping it again would double-encode it
        model.save_weights(LSTM_MODEL_PATH.format(use_word_dim))
        loss_history.loss_plot('epoch')
        print('Test score:', score)
        log.console_out('lstm_w2v.txt', (250, hist.history, score))
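
The LossHistory callback used above is not shown. A minimal sketch that matches how it is used (passed via callbacks=[loss_history], then loss_history.loss_plot('epoch')); the internals are an assumption:

    import matplotlib.pyplot as plt
    from keras.callbacks import Callback

    class LossHistory(Callback):
        """Records per-epoch losses and plots the curves (sketch)."""

        def on_train_begin(self, logs=None):
            self.losses = {'epoch': []}
            self.val_loss = {'epoch': []}

        def on_epoch_end(self, epoch, logs=None):
            logs = logs or {}
            self.losses['epoch'].append(logs.get('loss'))
            self.val_loss['epoch'].append(logs.get('val_loss'))

        def loss_plot(self, loss_type):
            # loss_type is 'epoch' in the example above
            iters = range(len(self.losses[loss_type]))
            plt.plot(iters, self.losses[loss_type], label='train loss')
            plt.plot(iters, self.val_loss[loss_type], label='val loss')
            plt.xlabel(loss_type)
            plt.ylabel('loss')
            plt.legend(loc='upper right')
            plt.show()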
Example #4
    def load_file(cls):
        neg = pd.read_excel('corpus/negT.xlsx', header=None, index_col=None)
        pos = pd.read_excel('corpus/posT.xlsx', header=None, index_col=None)

        # pos[1] is the second column of the Excel sheet
        pos['words'] = pos[1].apply(cls.text_parse)  # tokenize each review
        neg['words'] = neg[1].apply(cls.text_parse)
        # print(pos)
        log.console_out("log_svm.txt", "pos length =", len(pos), "neg length =", len(neg))
        # use 1 for positive sentiment, 0 for negative
        y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))  # build the label vector
        x_train, x_test, y_train, y_test = train_test_split(np.concatenate((pos['words'], neg['words'])), y,
                                                            test_size=0.2)  # 80/20 train/test split
        # print(x_train, '\n', y_train)
        np.save('svm_data/y_train.npy', y_train)
        np.save('svm_data/y_test.npy', y_test)
        log.console_out("log_svm.txt", "load_file done!")
        return x_train, x_test
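
cls.text_parse is the tokenizer this example assumes but does not show. Given the Chinese-language corpus, jieba segmentation is a plausible stand-in; a minimal sketch in which only the name comes from the example, the body is an assumption:

    import jieba

    def text_parse(text):
        # Segment the sentence with jieba, dropping whitespace-only tokens.
        return [w for w in jieba.cut(str(text)) if w.strip()]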
Example #5
    def train(cls):
        x_train, x_test = cls.load_file()
        cls.save_train_vecs(x_train, x_test)  # compute word2vec vectors for the corpus
        train_vecs, y_train, test_vecs, y_test = cls.get_data()
        # clf = SVC(kernel='rbf', verbose=True, probability=True)
        clf = BernoulliNB()
        # clf = LinearSVC()
        # clf = LogisticRegression()
        clf.fit(train_vecs, y_train)
        joblib.dump(clf, 'svm_data/svm_model/model.pkl')
        # print(test_vecs)
        predict_y = clf.predict(test_vecs)  # predictions on the held-out set
        test_accuracy = metrics.accuracy_score(y_test, predict_y)  # accuracy on the held-out set

        # y_pred = clf.predict(test_vecs)
        # test_precision = metrics.precision_score(y_test, y_pred, average='weighted')
        # test_recall = metrics.recall_score(y_test, y_pred, average='weighted')
        log.console_out("log_svm.txt", "SVM score = ", clf.score(test_vecs, y_test))  # 记录得分
        log.console_out("log_svm.txt", "test_accuracy = ", test_accuracy)  # 记录准确率
Example #6
    def save_train_vecs(cls, x_train, x_test):
        n_dim = 128
        # Initialize model and build vocab (gensim < 4 API: size= and .iter;
        # newer gensim uses vector_size= and .epochs)
        comment_w2v = Word2Vec(size=n_dim, min_count=5)
        comment_w2v.build_vocab(x_train)

        # Train the model over train_reviews (this may take several minutes)
        comment_w2v.train(x_train, total_examples=comment_w2v.corpus_count, epochs=comment_w2v.iter)
        train_vecs = np.concatenate([cls.build_wordvector(z, n_dim, comment_w2v) for z in x_train])
        # train_vecs = scale(train_vecs)

        np.save('svm_data/train_vecs.npy', train_vecs)
        log.console_out("log_svm.txt", "save train vecs done!")

        # Continue training word2vec on the test texts
        comment_w2v.train(x_test, total_examples=len(x_test), epochs=comment_w2v.iter)  # corpus_count still reflects x_train, so pass len(x_test)
        comment_w2v.save('svm_data/w2v_model/w2v_model.pkl')
        # Build test text vectors (scaling left commented out, as above)
        test_vecs = np.concatenate([cls.build_wordvector(z, n_dim, comment_w2v) for z in x_test])
        # test_vecs = scale(test_vecs)
        np.save('svm_data/test_vecs.npy', test_vecs)

        log.console_out("log_svm.txt", "save test vecs done!")
        log.console_out("log_svm.txt", "save_train_vecs function done!")
Example #7
    tasks = [
        'uni', 'bi', 'uni_bi', 'uni_info', 'uni_info_bi',
        'uni_bi_tri', 'uni_info_bi_tri'
    ]
    ns = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
    # nss = [11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000]
    # for i in range(6, 7):
    #     filename = 'record4.txt'
    #     if tasks[i] == tasks[6]:  # uni_info_bi_tri
    #         for n in nss:
    #             Ngram.record_res(filename, tasks[i], i, n)
    for i in range(0, 7):
        filename = 'test.txt'
        if tasks[i] == tasks[0]:  # uni
            Ngram.record_res(filename, tasks[i], i, 0)
        elif tasks[i] == tasks[1]:  # bi
            Ngram.record_res(filename, tasks[i], i, 0)
        elif tasks[i] == tasks[2]:  # uni_bi
            Ngram.record_res(filename, tasks[i], i, 0)
        elif tasks[i] == tasks[3]:  # uni_info
            for n in ns:
                Ngram.record_res(filename, tasks[i], i, n)
        elif tasks[i] == tasks[4]:  # uni_info_bi
            for n in ns:
                Ngram.record_res(filename, tasks[i], i, n)
        elif tasks[i] == tasks[5]:  # uni_bi_tri
            Ngram.record_res(filename, tasks[i], i, 0)
        elif tasks[i] == tasks[6]:  # uni_info_bi_tri
            for n in ns:
                Ngram.record_res(filename, tasks[i], i, n)
    log.console_out(filename, "All Tasks Done")
    print(time.strftime('%Y-%m-%d %A %H:%M:%S', time.localtime(time.time())))
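
Behaviorally, tasks 3, 4 and 6 (the *_info variants) sweep the feature count while the rest run once with n=0, so the if/elif chain above can be collapsed. A behavior-preserving sketch under that reading:

    filename = 'test.txt'
    for i, task in enumerate(tasks):
        if i in (3, 4, 6):  # uni_info, uni_info_bi, uni_info_bi_tri sweep n
            for n in ns:
                Ngram.record_res(filename, task, i, n)
        else:  # the remaining tasks run once with n=0
            Ngram.record_res(filename, task, i, 0)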