Exemple #1
0
def predictResult(x_train, y_train, y_test, x_test):
    """Train/evaluate the global logistic-regression model and predict one sample.

    Fits the module-level ``logisticR`` estimator on the training split, prints
    cross-validation scores and hold-out metrics (confusion matrix, F1,
    precision, recall, classification report, accuracy), shows a confusion
    matrix heatmap and a ROC curve, then predicts the single record stored in
    ``/tmp/predict_result.csv``.

    Note the argument order is (x_train, y_train, y_test, x_test).

    Returns:
        The prediction array for the single CSV record.
    """
    data2 = pd.read_csv("/tmp/predict_result.csv", header=0)
    # Walk the file holding the value under test: take every column except the
    # target column and collect the feature values into an array.
    cols2 = data2.columns[(data2.columns != columnResultName)]
    fts2 = data2[cols2]
    # NOTE(review): the Normalizer is fit on this one prediction sample alone,
    # not on the training data — confirm this matches how x_train was scaled.
    fts2 = Normalizer().fit_transform(fts2)

    # Cross-validated scores on the training split (30 parallel workers).
    scores = cross_val_score(logisticR, x_train, y_train, n_jobs=30)
    print("scores cross val")
    print(scores)

    # Fit, persist, and immediately reload the model (round-trip through disk).
    logisticR.fit(x_train, y_train)
    dump(logisticR, 'logistic.model')

    logisticLoaded = load('logistic.model')

    # Hold-out evaluation metrics.
    prFit = logisticLoaded.predict(x_test)
    print("predicao:", prFit)
    print("Matriz de Confusao LR:")
    print(cfm(y_test, prFit))
    print("F1 score LR:")
    print(f1s(y_test, prFit))
    print("Precision score LR:")
    print(ps(y_test, prFit))
    print("Recall score LR:")
    print(rs(y_test, prFit))
    print("Classification Report")
    print(cr(y_test, prFit))
    print("Accuracy score")
    print(asc(y_test, prFit))

    # Confusion-matrix heatmap.
    class_names = [0, 1]  # names of the two classes
    fig, ax = plt.subplots()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)
    # create heatmap
    sns.heatmap(pd.DataFrame(cfm(y_test, prFit)),
                annot=True,
                cmap="YlGnBu",
                fmt='g')
    ax.xaxis.set_label_position("top")
    plt.tight_layout()
    plt.title('Confusion matrix', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    # ROC curve with AUC; column 1 of predict_proba is the positive class.
    y_pred_proba = logisticLoaded.predict_proba(x_test)[::, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
    auc = metrics.roc_auc_score(y_test, y_pred_proba)
    plt.plot(fpr, tpr, label="data 1, auc=" + str(auc))
    plt.legend(loc=4)
    plt.show()

    # Predict the single record loaded from the CSV.
    pr1 = logisticLoaded.predict(fts2)
    print("predico unica", pr1)
    return pr1
Exemple #2
0
def predictResult(x_train, y_train, y_test, x_test):
    """Train/evaluate the global random-forest model and predict one sample.

    Fits the module-level ``randomForest`` estimator on the training split,
    prints hold-out metrics, then predicts the single record stored in
    ``/tmp/predict_result.csv``.

    Note the argument order is (x_train, y_train, y_test, x_test).

    Returns:
        The prediction array for the single CSV record.
    """
    data2 = pd.read_csv("/tmp/predict_result.csv", header=0)
    # Take every column except the target column as the feature vector.
    cols2 = data2.columns[(data2.columns != columnResultName)]
    fts2 = data2[cols2]
    # NOTE(review): Normalizer is fit on this one sample only — confirm it
    # matches the scaling applied to x_train.
    fts2 = Normalizer().fit_transform(fts2)

    # Fit, persist, and reload the model (round-trip through disk).
    randomForest.fit(x_train, y_train)

    dump(randomForest, 'randomForest.model')

    randomForestLoaded = load('randomForest.model')

    # Hold-out evaluation metrics.  BUG FIX: the labels previously said "LR",
    # a copy/paste leftover from the logistic-regression variant; this model
    # is a random forest (RF).
    prFit = randomForestLoaded.predict(x_test)
    print("predicao:", prFit)
    print("Matriz de Confusao RF:")
    print(cfm(y_test, prFit))
    print("F1 score RF:")
    print(f1s(y_test, prFit))
    print("Precision score RF:")
    print(ps(y_test, prFit))
    print("Recall score RF:")
    print(rs(y_test, prFit))
    print("Classification Report")
    print(cr(y_test, prFit))

    # Predict the single record loaded from the CSV.
    pr1 = randomForestLoaded.predict(fts2)
    print("predico unica", pr1)
    return pr1
Exemple #3
0
def predictResult(betterN, x_train, y_train, y_test, x_test):
    """Optionally (re)train the global KNN model and predict one sample.

    Args:
        betterN: best number of neighbours found by a prior search.  When
            ``betterN > 0`` the module-level ``knn`` estimator is refit and
            evaluated on the hold-out split; otherwise the caller is expected
            to have already loaded a fitted model into ``knn``.
        x_train, y_train, y_test, x_test: train/test splits (note the order).

    Returns:
        The prediction array for the record in ``/tmp/predict_result.csv``.
    """
    data2 = pd.read_csv("/tmp/predict_result.csv", header=0)
    # Take every column except the target column as the feature vector.
    cols2 = data2.columns[(data2.columns != columnResultName)]
    fts2 = np.array(data2[cols2])

    # When no betterN value is passed, the model was loaded from disk instead.
    if betterN > 0:
        knn.n_neighbors = betterN
        knn.fit(x_train, y_train)

        # dump(knn, 'models/knn_teste.joblib')

        # Hold-out evaluation metrics.  BUG FIX: the labels previously said
        # "NB" (naive Bayes) — a copy/paste leftover; this model is a KNN.
        prFit = knn.predict(x_test)
        print("predicao: a", prFit)
        print("Matriz de Confusao KNN:")
        print(cfm(y_test, prFit))
        print("F1 score KNN:")
        print(f1s(y_test, prFit))
        print("Precision score KNN:")
        print(ps(y_test, prFit))
        print("Recall score KNN:")
        print(rs(y_test, prFit))
        print("Classification Report")
        print(cr(y_test, prFit))

    # Predict the single record loaded from the CSV.
    pr1 = knn.predict(fts2)
    print("predico unica", int(pr1[0]))
    print("predicao unica score")
    print(pr1)
    return pr1
Exemple #4
0
def confusion_matrix(y_valid, y_pred):
    """Print and return sensitivity and specificity for a binary problem.

    NOTE(review): under sklearn's layout (rows = actual, cols = predicted)
    these formulas divide by *column* sums — e.g. "sensitivity" here is
    C[0,0] / (C[0,0] + C[1,0]).  Verify this matches the intended definition
    (TP / (TP + FN) divides by a row sum).

    Args:
        y_valid: ground-truth labels.
        y_pred: predicted labels.

    Returns:
        Tuple ``(sensitivity, specificity)``.  (The original returned None;
        returning the printed values is backward compatible for callers that
        ignored the result.)
    """
    # Use a distinct local name: the original shadowed both this function and
    # the `cfm`-aliased import, which is error-prone.
    matrix = cfm(y_valid, y_pred)
    sensitivity = matrix[0, 0] / (matrix[0, 0] + matrix[1, 0])
    specificity = matrix[1, 1] / (matrix[1, 1] + matrix[0, 1])
    print("sensitivity = {}/({}+{}) = {}".format(matrix[0, 0],
                                                 matrix[0, 0],
                                                 matrix[1, 0],
                                                 sensitivity))
    print("specificity = {}/({}+{}) = {}".format(matrix[1, 1],
                                                 matrix[1, 1],
                                                 matrix[0, 1],
                                                 specificity))
    return sensitivity, specificity
Exemple #5
0
def confusion_matrix(truth, predictions):
    """Return the confusion matrix of *predictions* against *truth*.

    Thin wrapper around the module-level ``cfm`` alias.
    """
    return cfm(truth, predictions)
                             save_best_only=True,
                             mode='max',
                             period=2)
# Log training to TensorBoard alongside the (externally created) checkpoint
# callback.
tensorboard = TensorBoard(log_dir='./logs', batch_size=batch_size)
callbacks_list = [checkpoint, tensorboard]

epochs = 25
steps_per_epoch = 100
# NOTE(review): float division — Keras expects an integer step count; confirm
# generator_validation.n is a multiple of batch_size.
number_of_validation_batches = generator_validation.n / batch_size
history = new_model.fit_generator(
    generator=generator_train,
    epochs=epochs,
    steps_per_epoch=steps_per_epoch,
    validation_data=generator_validation,
    validation_steps=number_of_validation_batches,
    callbacks=callbacks_list)
acc = history.history['categorical_accuracy']

# create a dataloader for this one.
prediction = new_model.predict_generator(generator_test)
print(type(prediction), type(test_ID_list))
# Class index with the highest predicted probability per test sample.
pred_name = np.argmax(prediction, axis=1)
mat = cfm(generator_test.classes, pred_name)
print("confusion matrix")
print(mat)
# Test accuracy = sum of the confusion-matrix diagonal / number of samples.
true = 0.0
for i in range(len(mat)):
    true += mat[i][i]
acc = true / len(test_ID_list)
print("test accuracy", acc)
    for (index, replacement) in zip(unique, range(len(unique))):
        ID_list[ID_list == index] = replacement
    return imgs, ID_list


# #dataArrays
# Load image arrays and their labels for both splits.
test_image_array, test_ID_list = parse(test_dir)
train_image_array, train_ID_list = parse(train_dir)
print("arrays created")

# #get output from VGG
# Use the VGG 'fc2' layer activations as fixed feature vectors.
layer_name = 'fc2'
fc2_layer_model = Model(inputs=model.input,
                        outputs=model.get_layer(layer_name).output)

#get train and test features
feature = fc2_layer_model.predict(train_image_array)
testfeature = fc2_layer_model.predict(test_image_array)
print("features created")

#SVM fitting
# Standardize using statistics computed from the training features only.
scaler = StandardScaler()
scaler.fit(feature)
feature = scaler.transform(feature)
testfeature = scaler.transform(testfeature)
svc = LinearSVC(random_state=5)
# BUG FIX: the SVM was previously fit on the *test* features and then scored
# on the same data (test-set leak), leaving `feature`/`train_ID_list` unused.
# Train on the training split instead.
svc.fit(feature, train_ID_list)
ycap = svc.predict(testfeature)
print(cfm(test_ID_list, ycap))
print('test score is :', svc.score(testfeature, test_ID_list))
Exemple #8
0
    print(
        cr(test_label.argmax(axis=1),
           prediction_1.argmax(axis=1),
           target_names=label_names))  # Classification Report

    # Training and Validation Curves
    training_and_validation_accuracy(result_1)
    training_and_validation_loss(result_1)

    # Confusion Matrix Visualization
    prediction_class_1 = np.argmax(
        prediction_1, axis=1)  # Convert predictions classes to one hot vectors
    test_label_cfm = np.argmax(
        test_label,
        axis=1)  # Convert validation observations to one hot vectors
    confusion_mtx = cfm(test_label_cfm,
                        prediction_class_1)  # Compute the confusion matrix
    plot_confusion_matrix(confusion_mtx,
                          classes=label_names)  # Plot the confusion matrix

    random_test_images(model_1)
else:
    model_2, result_2 = optimize(cnn_model_two())  # Train CNN Model 2

    prediction_2 = model_2.predict(test_data)
    print("Evaluate Test")
    model_2.evaluate(test_data, test_label)
    print(
        cr(test_label.argmax(axis=1),
           prediction_2.argmax(axis=1),
           target_names=label_names))  # Classification Report
    else:
        print("No aspect/opinion pairs")
    if has_outliers:
        outliers_dic[stars] += 1
    print(
        "----------------------------------------------------------------------------------------------------------"
    )
    print(
        "----------------------------------------------------------------------------------------------------------"
    )
    input()

# Ground-truth vs. predicted labels, unpacked from (true, pred) pairs.
y_true = [p[0] for p in predictions]
y_pred = [p[1] for p in predictions]
cm = cfm(y_true, y_pred)
labels = ['NEG', "NEU", "POS"]
# Heatmap of the 3-class (NEG/NEU/POS) confusion matrix.
f = seaborn.heatmap(cm,
                    annot=True,
                    xticklabels=labels,
                    yticklabels=labels,
                    fmt='g')

print("Total reviews: " + str(len(revs)))
print("Predicted: " + str(len(predictions)))
print(metrics.classification_report(y_true, y_pred))

inc_revs = {"revs": incoherent_revs}
print("Incoherent reviews: " + str(len(incoherent_revs)))
print("Outliers Stats: ", outliers_dic)
                                                  labels,
                                                  test_size=0.25,
                                                  shuffle=True,
                                                  stratify=labels,
                                                  random_state=5)

print("number of training data points:   ", x_train.shape[0])
print('number of validation data points: ', x_val.shape[0])

#Building and Fitting a simple Logistic Regression Model on training data
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)

#Results of Logistic Regression model on the validation split
y_pred = log_reg.predict(x_val)
print('accuracy: ', log_reg.score(x_val, y_val))
print('f1_score: ', f1_score(y_val, y_pred))
print('confusion matrix: ')
print(cfm(y_val, y_pred))

#Using the trained regression model for predicting likelihood score on test data
# predict_proba column 1 is the positive-class probability; scale to 0-100.
prediction = log_reg.predict_proba(test_vectorised)
model_predictions = pd.DataFrame({
    'news': test_news,
    'virality_likelihood': prediction[:, 1] * 100
})
model_predictions.head()

#Saving predictions to csv file
model_predictions.to_csv('predicted_virality.csv')
Exemple #11
0
# Build a binary prediction per ID: 1 if any output row for that ID was
# labelled 1, else 0.  (Python 2 script — see the print statement below.)
pred_arr = np.array([])
real_df = pd.read_csv('id_label_val.csv')
pred_df = pd.read_csv('output.csv')    
real_arr = real_df.LABEL.values    
for i in range(3595, 4793):  # ID range of the validation set — TODO confirm
    temp1 = pred_df[pred_df.ID == i]
    if len(temp1[temp1.LABEL == 1]) > 0:
        pred_arr = np.append(pred_arr, 1)        
    else:
        pred_arr = np.append(pred_arr, 0)

fpr, tpr, _ = roc_curve(real_arr, pred_arr)
roc_auc = auc(fpr, tpr)

# Plot of a ROC curve for a specific class
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

# conf matrix
print cfm(real_arr, pred_arr)
# print clr(real_arr, pred_arr)
def evaluate(params, mode="test", confusion=False):
    """Evaluate the latest training checkpoint on the validation or test set.

    Restores the newest checkpoint from the training directory, runs the model
    over every batch of the chosen split, logs precision and mean loss to a
    TensorBoard summary writer, and optionally pickles a confusion matrix.

    Args:
        params: hyper-parameter object; only ``params.batch_size`` is read.
        mode: "valid" or "test" — selects data directory and batch count.
        confusion: when True, accumulate predictions/labels and dump the
            confusion matrix to ``<save_to_dir>/confusion_matrix.p``.

    Raises:
        ValueError: if ``mode`` is neither "valid" nor "test".
    """
    if mode == "valid":
        data_dir = ENV.valid_dir
        num_batches = ENV.num_valid_batches
        pattern = "valid"
    elif mode == "test":
        data_dir = ENV.test_dir
        num_batches = ENV.num_test_batches
        pattern = "test"
    else:
        # BUG FIX: `raise ("...")` raises a TypeError because a plain string
        # is not an exception; raise a proper exception type instead.
        raise ValueError("Evaluate doesn't know passed mode")
    # Reuse the "train" variable scope so restored variable names match the
    # ones saved during training.
    with tf.variable_scope("train"):
        x, y = create_data_reader(data_dir,
                                  mode="eval",
                                  batch_size=params.batch_size,
                                  pattern=pattern)
        rn = build_model(x, y, params, mode="eval")
    save_to = os.path.join(ENV.save_to_dir, mode)
    model_dir = os.path.join(ENV.save_to_dir, "train")
    saver = tf.train.Saver()
    writer = tf.summary.FileWriter(save_to)
    config = tf.ConfigProto(allow_soft_placement=True)
    sess = tf.Session(config=config)
    tf.train.start_queue_runners(sess)
    while True:
        #time.sleep(30)
        # Wait for a checkpoint to appear.  NOTE(review): this retries
        # immediately (busy loop) when none is found.
        checkpoint = tf.train.get_checkpoint_state(model_dir)
        if not checkpoint or not checkpoint.model_checkpoint_path:
            tf.logging.error("Model not found {0}".format(model_dir))
            continue
        tf.logging.info("Start loading checkpoint at {0}".format(model_dir))
        saver.restore(sess, checkpoint.model_checkpoint_path)
        correct, total, total_loss = 0.0, 0.0, 0.0
        cfm_pred = []
        cfm_true = []
        for i in range(num_batches):
            run = [
                rn.inference, rn.labels, rn.loss, rn.global_step, rn.summary
            ]
            pred, true, loss, step, summary = sess.run(run)
            # Convert one-hot outputs/labels to class indices.
            pred = np.argmax(pred, axis=1)
            true = np.argmax(true, axis=1)
            if confusion:
                cfm_pred = np.append(cfm_pred, pred)
                cfm_true = np.append(cfm_true, true)
            correct += np.sum(true == pred)
            total_loss += loss
            total += pred.shape[0]
        # "precision" here is plain accuracy: correct / total examples.
        precision = correct / total
        summary_precision = tf.Summary()
        summary_precision.value.add(tag=mode + "_precision",
                                    simple_value=precision)
        writer.add_summary(summary_precision, step)
        summary_loss = tf.Summary()
        total_loss /= num_batches
        summary_loss.value.add(tag=mode + "_loss", simple_value=total_loss)
        writer.add_summary(summary_loss, step)
        msg = "{0}_precision: {1:.5f}, {0}_loss: {2:.5f}"
        tf.logging.info(msg.format(mode, precision, total_loss))
        writer.flush()
        if confusion:
            confusion_matrix = cfm(cfm_true, cfm_pred)
            cfm_path = ENV.save_to_dir + "/confusion_matrix.p"
            with open(cfm_path, "wb") as cfm_file:
                pickle.dump(confusion_matrix, cfm_file)
        break
Exemple #13
0
from sklearn.metrics import confusion_matrix as cfm

# Score classifier output: each line of output_dev.txt contains
# "guess=<label>" and "gold=<label>" tokens; map label names to integer ids.
# (Python 2 script — see the print statements below.)
with open("output_dev.txt") as f:
	data = f.readlines()

pred = []
y = []
label = {"drug":1,"person":2,"place":3,"movie":4,"company":5}

for line in data:
	strs = line.split()
	for s in strs:
		if "guess=" in s:
			pred.append(label[s.replace("guess=","")])
		if "gold=" in s:
			y.append(label[s.replace("gold=","")])

# Confusion matrix of gold vs. guessed labels.
print cfm(y,pred)

# Print every line where the guess disagrees with the gold label.
index = [i for i,e in enumerate(y) if e!=pred[i]]
for idx in index:
	print data[idx].strip()
						
Exemple #14
0
# Candidate neighbour counts for the KNN sweep.  (Python 2 script.)
neighbors = [3, 5, 7, 13, 1]

for n in range(0, 5):
    print "Quantidade Vizinhos:", neighbors[n]
    knn3 = KNeighborsClassifier(n_neighbors=neighbors[n])
    knn3.fit(x_train, y_train)

    print "Accuracy Training KNN:", knn3.score(x_train, y_train)

    predictions = knn3.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, predictions)

    print "Accuracy Test KNN:", accuracy
    print "Matriz de Confusao KNN:"
    print cfm(y_test, predictions)
    print "F1 score KNN:"
    print f1s(y_test, predictions)
    print "Precision score KNN:"
    print ps(y_test, predictions)
    print "Recall score KNN:"
    print rs(y_test, predictions)

#svm kernel linear
# NOTE(review): this rebinds the name `svm` from the sklearn module to the
# fitted estimator, shadowing the module for the rest of the script.
svm = svm.SVC(kernel='linear', C=1.0)
svm.fit(x_train, y_train)

predictionsSvm = svm.predict(x_test)

accuracySvm = metrics.accuracy_score(predictionsSvm, y_test)