Example #1
def NB_Accuracy(features_train, labels_train, features_test, labels_test):
    """ Compute the classifier's accuracy """
    ### Import GaussianNB from sklearn
    from sklearn.naive_bayes import GaussianNB

    ### Create the classifier
    clf = GaussianNB()

    ### Train the classifier
    X = features_train
    Y = labels_train
    clf.fit(X, Y)

    ### Use the trained classifier to predict labels for the test set
    pred = clf.predict(features_test)

    print("Predictions:")
    print(pred)

    ### Compute and return the accuracy on the test set
    from sklearn.metrics import accuracy_score, confusion_matrix
    y_pred = pred
    y_true = labels_test

    cm = confusion_matrix(y_true, y_pred)
    import matplotlib.pyplot as plt

    class_names = [0, 1]
    plt.figure()
    plot_confusion_matrix(cm, classes=class_names, title='Confusion matrix')
    plt.show()

    # normalize=True (the default) returns the fraction correct rather than a raw count
    return accuracy_score(y_true, y_pred)
    #return plt
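A quick way to exercise this function is a small binary train/test split; the dataset choice and split parameters below are illustrative assumptions, and a plot_confusion_matrix helper (see the sketch after Example #2) must be in scope:

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=42)
print('Accuracy:', NB_Accuracy(X_tr, y_tr, X_te, y_te))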
Example #2
def plot(class_num, y_test, ans_best):
    # Build string class labels '0'..str(class_num - 1); the examples use 2-4 classes
    classes = [str(i) for i in range(class_num)]

    np.set_printoptions(precision=2)
    plot_confusion_matrix(y_test,
                          ans_best,
                          classes=classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues)
    plt.show()
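Every example on this page calls a plot_confusion_matrix helper defined elsewhere in its project, and the signature varies: Example #1 passes a precomputed matrix, while this example passes y_true/y_pred. A minimal sketch of the y_true/y_pred variant, assuming sklearn.metrics.confusion_matrix semantics (modeled on the well-known scikit-learn docs recipe):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y_true, y_pred, classes, normalize=False,
                          title=None, cmap=plt.cm.Blues):
    # Compute the raw matrix, then optionally row-normalize to rates
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    fig.colorbar(im, ax=ax)
    ax.set(xticks=np.arange(len(classes)), yticks=np.arange(len(classes)),
           xticklabels=classes, yticklabels=classes,
           ylabel='True label', xlabel='Predicted label', title=title)
    # Annotate each cell with its count (or rate)
    fmt = '.2f' if normalize else 'd'
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt), ha='center', va='center',
                    color='white' if cm[i, j] > cm.max() / 2 else 'black')
    return ax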
Example #3
            # (tail of determine_depth_tree: count correct test predictions
            #  for this depth/split combination)
            predictions = model_rf.predict(X_test2)
            results_rf[i].iloc[j] = np.sum(predictions == y_test2)

    return results_rf.mean(axis=1)


#results_rf = determine_depth_tree(X_train,y_train)

# Results of the first model

model_rf = RandomForestClassifier(n_estimators=500,
                                  max_depth=1000,
                                  class_weight="balanced")
model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)

# Plot non-normalized confusion matrix
plot_confusion_matrix(y_test,
                      y_pred,
                      classes=["Non-seizure", "Seizure"],
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plot_confusion_matrix(y_test,
                      y_pred,
                      classes=["Non-seizure", "Seizure"],
                      normalize=True,
                      title='Normalized confusion matrix')

plt.show()
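Only the tail of determine_depth_tree survives above; a plausible reconstruction, assuming it scores a grid of max_depth values across repeated random splits (the depth grid, split count, and signature are assumptions inferred from the surviving tail, and .loc replaces the fragment's chained indexing):

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

def determine_depth_tree(X_train, y_train,
                         depths=(5, 10, 50, 100, 500, 1000), n_splits=5):
    # Rows: candidate depths; columns: repeated random splits
    results_rf = pd.DataFrame(0, index=depths, columns=range(n_splits))
    for i in range(n_splits):
        X_train2, X_test2, y_train2, y_test2 = train_test_split(
            X_train, y_train, test_size=0.2)
        for depth in depths:
            model_rf = RandomForestClassifier(n_estimators=500,
                                              max_depth=depth,
                                              class_weight="balanced")
            model_rf.fit(X_train2, y_train2)
            predictions = model_rf.predict(X_test2)
            results_rf.loc[depth, i] = np.sum(predictions == y_test2)

    # Average the per-split correct-prediction counts for each depth
    return results_rf.mean(axis=1)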
Example #4
    # (tail of an evaluation routine: fit the pipeline and score the held-out set;
    #  current_git_sha() and compare() are project helpers defined elsewhere)
    pipeline.fit(train_X, train_y)
    y_true = test_y
    y_pred = pipeline.predict(test_X)

    weighted_f1_score = f1_score(y_true, y_pred, average='weighted')
    f1_scores = f1_score(y_true, y_pred,
                         average=None,
                         labels=unique_labels)
    class_f1_scores = dict(zip(unique_labels, f1_scores))

    evaluation = {
        'weighted_f1_score': weighted_f1_score,
        'class_f1_scores': class_f1_scores
    }

    filename = 'evaluation-{}.json'.format(current_git_sha())
    with open(filename, 'w') as f:
        json.dump(evaluation, f,
                  sort_keys=True,
                  indent=4,
                  separators=(',', ': '))

    compare('evaluation-best.json', filename)

    if args.confusion_matrix:
        cm = confusion_matrix(y_true, y_pred, labels=unique_labels)
        plot_confusion_matrix(cm, unique_labels, normalize=True)
        plt.show()
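current_git_sha() and compare() are not shown in the snippet; the former presumably tags the metrics file with the checked-out commit. A minimal sketch of that assumption using subprocess:

import subprocess

def current_git_sha():
    # Short hash of the currently checked-out commit, e.g. 'a1b2c3d'
    return subprocess.check_output(
        ['git', 'rev-parse', '--short', 'HEAD']).decode().strip()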
Example #5
def main(args):
    train_data_count = pd.read_csv('dataset/training/train.csv')
    val_data_count = pd.read_csv('dataset/training/valid.csv')
    test_data_count = pd.read_csv('dataset/training/test.csv')

    print('The count for each label in the training set is:')
    print(train_data_count['label'].value_counts())
    print()
    print('The count for each label in the validation set is:')
    print(val_data_count['label'].value_counts())
    print()
    print('The count for each label in the testing set is:')
    print(test_data_count['label'].value_counts())
    print()

    if args.tokenizer == 'crazy':
        print('The tokenizer is: CrazyTokenizer \n')
        tokenizer = CrazyTokenizer().tokenize
    elif args.tokenizer == 'nltk':  # elif, or the else below overrides the 'crazy' branch
        print('The tokenizer is: NLTK \n')
        tokenizer = sent_tokenize
    else:
        print('The tokenizer is: spacy \n')
        tokenizer = 'spacy'

    print('The model used is:', args.model, '\n')

    text = data.Field(sequential=True, lower=True, include_lengths=True)
    labels = data.Field(sequential=False, use_vocab=False)

    train_data, val_data, test_data = data.TabularDataset.splits(
        path='./dataset', train='./training/train.csv',
        validation='./training/valid.csv', test='./training/test.csv', format='csv',
        skip_header=True, fields=[('text', text), ('label', labels)])

    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train_data, val_data, test_data), batch_sizes=(args.batch_size, args.batch_size, args.batch_size),
        sort_key=lambda x: len(x.text), device=None, sort_within_batch=True, repeat=False)

    text.build_vocab(train_data, val_data, test_data)

    text.vocab.load_vectors(torchtext.vocab.GloVe(name='6B', dim=100))
    vocab = text.vocab

    print('Shape of Vocab:', text.vocab.vectors.shape, '\n')

    lr = args.lr
    num_classes = args.num_class
    epochs = args.epochs
    model_type = args.model
    emb_dim = args.emb_dim
    rnn_hidden_dim = args.rnn_hidden_dim
    num_filt = args.num_filt

    if model_type == 'cnn':
        net = CNN(emb_dim, vocab, num_filt, [3, 4], num_classes)
    elif model_type == 'rnn':
        net = RNN(emb_dim, vocab, rnn_hidden_dim, num_classes)
    elif model_type == 'gru':
        net = GRU(emb_dim, vocab, rnn_hidden_dim, num_classes)
    elif model_type == 'lstm':
        net = LSTM(emb_dim, vocab, rnn_hidden_dim, num_classes)
    else:
        net = Baseline(emb_dim, vocab, num_classes)

    # Use CUDA model if available:
    net.to(device)

    # Setup using Adam optimizer
    optimizer = optim.Adam(net.parameters(), lr=lr)
    loss_fcn = nn.CrossEntropyLoss()

    # Plotting data
    plot_epoch = list(range(1, args.epochs + 1))
    plot_train_loss, plot_train_acc, plot_valid_loss, plot_valid_acc = [], [], [], []

    print('---------- TRAINING LOOP ---------- \n')

    # Begin Training Loop
    for epoch in range(epochs):
        cum_loss = 0
        for (i, batch) in enumerate(train_iter, 1):
            # Setting network to training mode
            net.train()
            optimizer.zero_grad()

            # Getting data for current batch
            batch_input, batch_length = batch.text
            batch_input = batch_input.to(device)
            batch_label = nn.functional.one_hot(batch.label).float()

            # Forward step to get prediction
            if model_type == 'rnn' or model_type == 'gru' or model_type == 'lstm':
                output = net(batch_input, batch_length)
            else:
                output = net(batch_input)

            # Loss calculation and parameter update
            loss = loss_fcn(output, many_cold(batch_label).long().to(device))
            cum_loss += loss.item()  # .item() avoids holding the autograd graph
            loss.backward()
            optimizer.step()

        # Stats for plotting
        net.eval()
        train_loss, train_acc = eval_acc(net, train_iter, loss_fcn, model_type, 'train')
        valid_loss, valid_acc = eval_acc(net, val_iter, loss_fcn, model_type, 'val')

        plot_train_loss.append(train_loss / (train_data_count.shape[0]))
        plot_train_acc.append(train_acc)
        plot_valid_loss.append(valid_loss / (val_data_count.shape[0]))
        plot_valid_acc.append(valid_acc)

        # Print progress once per epoch to monitor training
        print('[%d] Train Loss: %.3f  Valid Loss: %.3f Train Acc: %.3f Valid Acc: %.3f' %
              (epoch + 1, cum_loss / i, valid_loss / (val_data_count.shape[0]),
               train_acc, valid_acc))
    # Final Results
    test_loss, test_acc = eval_acc(net, test_iter, loss_fcn, model_type, 'test')
    val_loss, val_acc = eval_acc(net, val_iter, loss_fcn, model_type, 'valid')
    train_loss, train_acc = eval_acc(net, train_iter, loss_fcn, model_type, 'train')

    print()
    print('---------- FINAL RESULTS ----------')
    print()
    print('Final Training Loss: ' + str(train_loss / train_data_count.shape[0]) + ', Final Training Acc: ' + str(train_acc))
    print('Final Validation Loss: ' + str(val_loss / val_data_count.shape[0]) + ', Final Validation Acc: ' + str(val_acc))
    print('Final Test Loss: ' + str(test_loss / test_data_count.shape[0]) + ', Final Test Acc: ' + str(test_acc))

    '''

    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train_data, val_data, test_data), batch_sizes=(len(train_data), len(val_data), len(test_data)),
        sort_key=lambda x: len(x.text), device=None, sort_within_batch=True, repeat=False)


    for (i, batch) in enumerate(train_iter, 1):
        # Setting network to eval mode
        net.eval()

        # Getting data for current batch
        batch_input, batch_length = batch.text
        batch_input = batch_input.to(device)
        batch_label = nn.functional.one_hot(batch.label).float()

        # Forward step to get prediction
        if model_type == 'rnn' or model_type == 'gru':
            output = net(batch_input, batch_length)
        else:
            output = net(batch_input)

    outputs = many_cold(output)
    batch_label = many_cold(batch_label)
    print("Below is Confusion Matrix for Training Set")
    print(confusion_matrix(batch_label, outputs))
    '''

    batch_label = torch.empty(0).to(device).float()
    output = torch.empty(0).to(device)

    net.eval()  # inference only from here on (also covers the test loop below)
    for (i, batch) in enumerate(val_iter, 1):

        # Getting data for current batch
        batch_input, batch_length = batch.text
        batch_input = batch_input.to(device)
        batch_label = torch.cat((batch_label, batch.label.to(device).float()))

        # Forward step to get prediction
        if model_type == 'rnn' or model_type == 'gru' or model_type == 'lstm':
            output = torch.cat((output, net(batch_input, batch_length)))
        else:
            output = torch.cat((output, net(batch_input)))

    outputs = many_cold(output)

    # Print number of trainable parameters in the model
    print()
    print('The number of trainable parameters in the model is:')
    print(sum(p.numel() for p in net.parameters() if p.requires_grad))
    # https://discuss.pytorch.org/t/how-do-i-check-the-number-of-parameters-of-a-model/4325/7

    print()
    print("Below is Confusion Matrix for Validation Set")
    print(confusion_matrix(batch_label.cpu(), outputs.cpu()))

    batch_label = torch.empty(0).to(device).float()
    output = torch.empty(0).to(device)

    for (i, batch) in enumerate(test_iter, 1):

        # Getting data for current batch
        batch_input, batch_length = batch.text
        batch_input = batch_input.to(device)
        batch_label = torch.cat((batch_label, batch.label.to(device).float()))

        # Forward step to get prediction
        if model_type == 'rnn' or model_type == 'gru' or model_type == 'lstm':
            output = torch.cat((output, net(batch_input, batch_length)))
        else:
            output = torch.cat((output, net(batch_input)))

    outputs = many_cold(output)

    # Saving model
    if args.save:
        torch.save(net, 'model_' + model_type + '.pt')

    # Confusion Matrix
    print()
    print("Below is Confusion Matrix for Test Set")
    plot_confusion_matrix(batch_label.cpu(), outputs.cpu(), classes=subreddits)
    plt.savefig('model_' + model_type + '_confusion.png')
    plt.show()

    # Plot Losses and Accuracy
    plt.figure()
    plt.plot(plot_epoch, plot_train_loss, label='Training Loss')
    plt.plot(plot_epoch, plot_valid_loss, label='Validation Loss')
    plt.title('Losses as Function of Epoch (' + args.model + ')')
    plt.ylabel("Loss")
    plt.xlabel("Epoch")
    plt.legend()
    plt.savefig('model_' + model_type + '_loss.png')
    plt.show()

    # Plot accuracy
    plt.figure()
    plt.plot(plot_epoch, plot_train_acc, label='Training Accuracy')
    plt.plot(plot_epoch, plot_valid_acc, label='Validation Accuracy')
    plt.title('Accuracy as Function of Epoch (' + args.model + ')')
    plt.ylim(0, 1.01)
    plt.ylabel("Accuracy")
    plt.xlabel("Epoch")
    plt.legend()
    plt.savefig('model_' + model_type + '_accuracy.png')
    plt.show()
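many_cold and eval_acc are project helpers that aren't shown. Judging from its usage, which inverts nn.functional.one_hot and reduces network outputs to class indices, many_cold is presumably an argmax over the last dimension; a minimal sketch under that assumption:

import torch

def many_cold(one_hot_or_logits):
    # Collapse a one-hot (or logit) tensor back to integer class indices
    return torch.argmax(one_hot_or_logits, dim=-1)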
Example #6
def nn(X_train, X_test, y_train, y_test, class_num, input_dim, epochs,
       batch_size, optimizer, loss):
    # Neural network
    model = Sequential()
    model.add(Dense(32, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(16, activation='relu'))
    model.add(Dropout(0.2))
    # model.add(Dense(4, activation='relu'))
    # model.add(Dropout(0.3))
    model.add(Dense(class_num, activation='softmax'))

    if optimizer == "sgd":
        sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
        optimizer_using = sgd
    elif optimizer == "adam":
        optimizer_using = "adam"

    if loss == "binary":
        loss_using = 'binary_crossentropy'
    elif loss == "categorical":
        loss_using = 'categorical_crossentropy'

    model.compile(loss=loss_using,
                  optimizer=optimizer_using,
                  metrics=['accuracy'])

    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)

    y_pred = model.predict(X_test)
    # Convert predicted probabilities and one-hot test labels back to class indices
    pred = list(np.argmax(y_pred, axis=1))
    test = list(np.argmax(y_test, axis=1))

    from sklearn.metrics import accuracy_score
    a = accuracy_score(test, pred)  # (y_true, y_pred) order
    print("")
    print('Accuracy is:', a * 100)
    print("")
    print("----------------------")

    if class_num == 2:
        classes = ['0', '1']
    else:
        classes = ['0', '1', '2', '3']
    np.set_printoptions(precision=2)
    plot_confusion_matrix(test,
                          pred,
                          classes=classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues)
    plt.show()

    # Fit again with validation tracking so accuracy/loss curves can be plotted
    # (note: this continues training the already-fitted model for 100 more epochs)
    history = model.fit(X_train,
                        y_train,
                        validation_data=(X_test, y_test),
                        epochs=100,
                        batch_size=64)

    # Older Keras logs 'acc'/'val_acc'; tf.keras 2.x uses 'accuracy'/'val_accuracy'
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('Model accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()
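Because the function argmax-decodes both predictions and labels, it expects one-hot y arrays; an illustrative call, where the data arrays and hyperparameters are assumptions and to_categorical comes from tf.keras:

from tensorflow.keras.utils import to_categorical

y_train_oh = to_categorical(y_train, num_classes=2)
y_test_oh = to_categorical(y_test, num_classes=2)
nn(X_train, X_test, y_train_oh, y_test_oh, class_num=2,
   input_dim=X_train.shape[1], epochs=50, batch_size=32,
   optimizer='adam', loss='categorical')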
Example #7
def ml(class_num,
       epochs,
       method,
       source_data,
       twitter_source,
       google_source,
       ig_source,
       judge=True,
       nan=True):
    if judge:
        tmp_data = pie(num=11, source_data=source_data, all_option=True)
        final_data = combine(tmp_data, twitter_source, google_source,
                             ig_source)
        input_x, revised_y = data_preprocess(final_data, nan=nan)
        a = pd.to_datetime(final_data['上映日期'])  # '上映日期' = release date
        cut = a.dt.weekofyear  # deprecated in newer pandas; use a.dt.isocalendar().week there
        test_list = []
        train_list = []
        test = 0
        train = 0
        # print(cut)
        # Titles whose release week is a multiple of 4 go to test; the rest to train
        for i in range(len(final_data)):
            if cut[i] % 4 == 0:
                test += 1
                test_list.append(i)
            else:
                train += 1
                train_list.append(i)
        # print(train_list)
        # print(test_list)
        print(train)
        print(test)
        # print("final",final_data.shape)
        train_final_data = final_data
        test_final_data = final_data
        for i in test_list:
            train_final_data = train_final_data.drop(final_data.index[i])
        # print(len(train_final_data))
        train_final_data = train_final_data.reset_index(drop=True)
        # print(train_new_youtube_file_v3_data)
        for i in train_list:
            test_final_data = test_final_data.drop(final_data.index[i])
        # print(len(test_new_youtube_file_v3_data))
        test_final_data = test_final_data.reset_index(drop=True)
        # print(test_new_youtube_file_v3_data)

        X_train, y_train = data_preprocess(train_final_data, nan=nan)
        X_test, y_test = data_preprocess(test_final_data, nan=nan)

        if method == "random_forest":
            y_test, ans_best = random_forest(input_x,
                                             revised_y,
                                             X_train,
                                             y_train,
                                             X_test,
                                             y_test,
                                             judge=judge)
        elif method == "decision_tree":
            y_test, ans_best = decision_tree(input_x,
                                             revised_y,
                                             X_train,
                                             y_train,
                                             X_test,
                                             y_test,
                                             judge=judge)
        else:
            y_test, ans_best = xgboost(input_x,
                                       revised_y,
                                       X_train,
                                       y_train,
                                       X_test,
                                       y_test,
                                       class_num=class_num,
                                       num=epochs,
                                       judge=judge)

        if class_num == 2:
            classes = ['0', '1']
        elif class_num == 4:
            classes = ['0', '1', '2', '3']

        np.set_printoptions(precision=2)
        plot_confusion_matrix(y_test,
                              ans_best,
                              classes=classes,
                              normalize=False,
                              title=None,
                              cmap=plt.cm.Blues)
        plt.show()

    else:
        # judge=False: the model helpers perform their own split internally,
        # so the train/test arguments below are placeholders
        tmp_data = pie(num=11, source_data=source_data, all_option=True)
        final_data = combine(tmp_data, twitter_source, google_source,
                             ig_source)
        input_x, revised_y = data_preprocess(final_data, nan=nan)
        if method == "random_forest":
            y_test, ans_best = random_forest(input_x,
                                             revised_y,
                                             X_train=0,
                                             y_train=0,
                                             X_test=0,
                                             y_test=0,
                                             judge=judge)
        elif method == "decision_tree":
            y_test, ans_best = decision_tree(input_x,
                                             revised_y,
                                             X_train=0,
                                             y_train=0,
                                             X_test=0,
                                             y_test=0,
                                             judge=judge)
        else:
            y_test, ans_best = xgboost(input_x,
                                       revised_y,
                                       X_train=0,
                                       y_train=0,
                                       X_test=0,
                                       y_test=0,
                                       class_num=class_num,
                                       num=epochs,
                                       judge=judge)

        if class_num == 2:
            classes = ['0', '1']
        elif class_num == 4:
            classes = ['0', '1', '2', '3']

        np.set_printoptions(precision=2)
        plot_confusion_matrix(y_test,
                              ans_best,
                              classes=classes,
                              normalize=False,
                              title=None,
                              cmap=plt.cm.Blues)
        plt.show()
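An illustrative invocation of ml; every data argument here is a stand-in for the project's own frames, and the names are assumptions:

ml(class_num=4,
   epochs=100,
   method='xgboost',
   source_data=movie_df,
   twitter_source=twitter_df,
   google_source=google_df,
   ig_source=ig_df,
   judge=True,
   nan=True)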
Example #8
        # (tail of a test() routine: checkpoint the model when accuracy improves)
        if acc > best_acc:
            print('Saving..')
            torch.save(net.state_dict(), './model.pkl')
            best_acc = acc


best_acc = 0
train_acc = []
test_acc = []

classes = np.array([0, 1, 2, 3, 4])
pred_y = []
truth_y = []

if LOAD:
    print('Loading model ...')
    net.load_state_dict(torch.load(Model))
    test()
else:
    if CONT:
        print('Continue training !')
        net.load_state_dict(torch.load(Model))
    for epoch in range(EPOCH):
        train(epoch)
        test()

plot_confusion_matrix(truth_y, pred_y, classes, True)
plt.show()

#print(train_acc)
#print(test_acc)
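truth_y and pred_y are module-level lists, so test() presumably extends them with batch labels and argmax predictions, with the surviving checkpointing fragment above as its tail. A hypothetical reconstruction under those assumptions (test_loader, net, and device are assumed to exist):

def test():
    global best_acc
    net.eval()
    correct = 0
    with torch.no_grad():
        for inputs, labels in test_loader:  # test_loader is an assumption
            outputs = net(inputs.to(device))
            preds = outputs.argmax(dim=1).cpu()
            correct += (preds == labels).sum().item()
            pred_y.extend(preds.tolist())
            truth_y.extend(labels.tolist())
    acc = 100. * correct / len(test_loader.dataset)
    test_acc.append(acc)
    if acc > best_acc:
        print('Saving..')
        torch.save(net.state_dict(), './model.pkl')
        best_acc = acc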