Ejemplo n.º 1
0
def develop():
    """Train an SVM on the cached embeddings, report accuracy, save the model.

    The train/test features and labels are read from ``./xtr.npy``,
    ``./xte.npy``, ``./ytr.npy`` and ``./yte.npy``. When any one of these
    files is missing, all four are regenerated with
    ``getEmbeddings("datasets/train.csv")`` and saved before loading.

    Side effects: prints progress markers and the test accuracy, pickles
    the fitted classifier to ``finalized_model.pkl`` and draws the
    confusion matrix through ``plot_cmat``.
    """
    # Rebuild the cache when at least one of the four arrays is missing
    cache_files = ('./xtr.npy', './xte.npy', './ytr.npy', './yte.npy')
    if not all(os.path.isfile(path) for path in cache_files):
        xtr, xte, ytr, yte = getEmbeddings("datasets/train.csv")
        np.save('./xtr', xtr)
        np.save('./xte', xte)
        np.save('./ytr', ytr)
        np.save('./yte', yte)

    # Always read back from disk so both code paths use the same arrays
    xtr = np.load('./xtr.npy')
    xte = np.load('./xte.npy')
    ytr = np.load('./ytr.npy')
    yte = np.load('./yte.npy')
    print("Here")

    # Use the built-in SVM for classification
    clf = SVC()
    clf.fit(xtr, ytr)
    y_pred = clf.predict(xte)

    # Accuracy = share of test samples whose prediction matches the label
    m = yte.shape[0]
    n = (yte != y_pred).sum()
    print("Accuracy = " + format((m - n) / m * 100, '.2f') + "%")  # 88.42%

    # Persist the fitted model; the context manager guarantees the file is
    # flushed and closed even if pickling fails part-way through.
    filename = 'finalized_model.pkl'
    with open(filename, 'wb') as model_out:
        pickle.dump(clf, model_out)
    print("Classified")

    # Draw the confusion matrix
    plot_cmat(yte, y_pred)
Ejemplo n.º 2
0
def random_forest_model():
    """Build, save and evaluate the random forest classifier.

    The train/test features and labels are read from ``./xtr.npy``,
    ``./xte.npy``, ``./ytr.npy`` and ``./yte.npy`` (NumPy array files).
    When any one of these files is missing, all four are regenerated with
    ``getEmbeddings("datasets/train.csv")`` and saved before loading.

    Side effects: pickles the fitted model to ``random_forest_model.sav``
    so it can be reused without retraining, prints the test accuracy and
    draws the confusion matrix through ``plot_cmat``.
    """
    # Rebuild the cache when at least one of the four arrays is missing
    cache_files = ('./xtr.npy', './xte.npy', './ytr.npy', './yte.npy')
    if not all(os.path.isfile(path) for path in cache_files):
        xtr, xte, ytr, yte = getEmbeddings("datasets/train.csv")
        np.save('./xtr', xtr)
        np.save('./xte', xte)
        np.save('./ytr', ytr)
        np.save('./yte', yte)

    # Load the files into local variables
    xtr = np.load('./xtr.npy')
    xte = np.load('./xte.npy')
    ytr = np.load('./ytr.npy')
    yte = np.load('./yte.npy')

    # Create the RandomForestClassifier and fit it on the training split
    # (xtr = training features, ytr = training labels)
    rdf = RandomForestClassifier(n_estimators=200, n_jobs=3)
    rdf.fit(xtr, ytr)

    # Save the fitted model so the pretrained classifier can be reused; the
    # context manager guarantees the file is flushed and closed.
    model_file = 'random_forest_model.sav'
    with open(model_file, 'wb') as model_out:
        pickle.dump(rdf, model_out)

    # Predict labels for the test features (xte)
    y_pred = rdf.predict(xte)

    # Print the accuracy on the test split
    print("Accuracy = " +
          format(metrics.accuracy_score(yte, y_pred) * 100, '.2f') + "%")

    # Draw the confusion matrix
    plot_cmat(yte, y_pred)
Ejemplo n.º 3
0
def main():
    """Train, evaluate and visualise the TensorFlow estimator.

    The Doc2Vec features and labels are read from ``./xtr.npy``,
    ``./xte.npy``, ``./ytr.npy`` and ``./yte.npy``; when any one of them is
    missing, all four are first regenerated with
    ``getEmbeddings("datasets/train.csv")``. Labels are reshaped to a
    single column of ``int32`` values.

    An ``Estimator`` built from ``model_fn`` is trained for ``TRAIN_STEP``
    steps, evaluated on the test split (results are printed), and its
    predicted classes are passed to ``plot_cmat`` with the test labels.
    """
    cache_paths = ('./xtr.npy', './xte.npy', './ytr.npy', './yte.npy')

    # Build the cached arrays first when at least one of them is absent
    if not all(os.path.isfile(path) for path in cache_paths):
        emb_xtr, emb_xte, emb_ytr, emb_yte = \
            getEmbeddings("datasets/train.csv")
        for stem, array in (('./xtr', emb_xtr), ('./xte', emb_xte),
                            ('./ytr', emb_ytr), ('./yte', emb_yte)):
            np.save(stem, array)

    # Load the Doc2Vec arrays in the same order they were saved
    features_train, features_eval, labels_train, labels_eval = (
        np.load(path) for path in cache_paths)
    labels_train = labels_train.reshape((-1, 1)).astype(np.int32)
    labels_eval = labels_eval.reshape((-1, 1)).astype(np.int32)

    # Estimator wrapping the project's model function
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=tensorflow_tmp)

    # Log the tensor named "softmax_tensor" every 200 iterations
    tf.logging.set_verbosity(tf.logging.INFO)
    hook = tf.train.LoggingTensorHook(
        tensors={"probabilities": "softmax_tensor"},
        every_n_iter=200)

    make_input_fn = tf.estimator.inputs.numpy_input_fn

    # Training: shuffled batches of 50, repeated until TRAIN_STEP is reached
    fit_input_fn = make_input_fn(x={"x": features_train},
                                 y=labels_train,
                                 batch_size=50,
                                 num_epochs=None,
                                 shuffle=True)
    estimator.train(input_fn=fit_input_fn, steps=TRAIN_STEP, hooks=[hook])

    # Evaluation: one ordered pass over the test split
    score_input_fn = make_input_fn(x={"x": features_eval},
                                   y=labels_eval,
                                   num_epochs=1,
                                   shuffle=False)
    scores = estimator.evaluate(input_fn=score_input_fn)
    print(scores)  # 81.42%

    # Prediction: one ordered pass, then plot the confusion matrix
    infer_input_fn = make_input_fn(x={"x": features_eval},
                                   num_epochs=1,
                                   shuffle=False)
    predicted_classes = []
    for prediction in estimator.predict(input_fn=infer_input_fn):
        predicted_classes.append(prediction["classes"])
    plot_cmat(labels_eval, predicted_classes)
Ejemplo n.º 4
0
def logistic_regression_model():
    """Build, save and evaluate the logistic regression classifier.

    The train/test features and labels are read from ``./xtr.npy``,
    ``./xte.npy``, ``./ytr.npy`` and ``./yte.npy`` (NumPy array files).
    When any one of these files is missing, all four are regenerated with
    ``getEmbeddings("datasets/train.csv")`` and saved before loading.

    Side effects: pickles the fitted model to
    ``logistic_regression_model.sav`` so it can be reused without
    retraining, prints the test accuracy and draws the confusion matrix
    through ``plot_cmat``.
    """
    # Rebuild the cache when at least one of the four arrays is missing
    cache_files = ('./xtr.npy', './xte.npy', './ytr.npy', './yte.npy')
    if not all(os.path.isfile(path) for path in cache_files):
        xtr, xte, ytr, yte = getEmbeddings("datasets/train.csv")
        np.save('./xtr', xtr)
        np.save('./xte', xte)
        np.save('./ytr', ytr)
        np.save('./yte', yte)

    # Load the files into local variables
    xtr = np.load('./xtr.npy')
    xte = np.load('./xte.npy')
    ytr = np.load('./ytr.npy')
    yte = np.load('./yte.npy')

    # Create the LogisticRegression classifier and fit it on the training
    # split (xtr = training features, ytr = training labels)
    log_R = linear_model.LogisticRegression()
    log_R.fit(xtr, ytr)

    # Save the fitted model so the pretrained classifier can be reused; the
    # context manager guarantees the file is flushed and closed.
    model_file = 'logistic_regression_model.sav'
    with open(model_file, 'wb') as model_out:
        pickle.dump(log_R, model_out)

    # Predict labels for the test features (xte)
    y_pred = log_R.predict(xte)

    # Print the accuracy on the test split
    print("Accuracy = " +
          format(metrics.accuracy_score(yte, y_pred) * 100, '.2f') + "%")

    # Draw the confusion matrix
    plot_cmat(yte, y_pred)
Ejemplo n.º 5
0
def main():
    """Train and evaluate the TensorFlow estimator, then print a report.

    Features and labels come straight from
    ``getEmbeddings("datasets/train.csv")``; labels are reshaped to a
    single column of ``int32`` values. An ``Estimator`` built from
    ``model_fn`` is trained for ``TRAIN_STEP`` steps and evaluated on the
    test split. The evaluation results are printed (twice, once before and
    once after the confusion matrix is drawn), followed by scikit-learn's
    classification report for the two classes.
    """
    # Fetch the embeddings and turn the labels into int32 column vectors
    features_train, features_eval, labels_train, labels_eval = \
        getEmbeddings("datasets/train.csv")
    labels_train = labels_train.reshape((-1, 1)).astype(np.int32)
    labels_eval = labels_eval.reshape((-1, 1)).astype(np.int32)

    # Estimator wrapping the project's model function
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=tensorflow_tmp)

    # Log the tensor named "softmax_tensor" every 200 iterations
    tf.logging.set_verbosity(tf.logging.INFO)
    hook = tf.train.LoggingTensorHook(
        tensors={"probabilities": "softmax_tensor"},
        every_n_iter=200)

    make_input_fn = tf.estimator.inputs.numpy_input_fn

    # Training: shuffled batches of 50, repeated until TRAIN_STEP is reached
    fit_input_fn = make_input_fn(x={"x": features_train},
                                 y=labels_train,
                                 batch_size=50,
                                 num_epochs=None,
                                 shuffle=True)
    estimator.train(input_fn=fit_input_fn, steps=TRAIN_STEP, hooks=[hook])

    # Evaluation: one ordered pass over the test split
    score_input_fn = make_input_fn(x={"x": features_eval},
                                   y=labels_eval,
                                   num_epochs=1,
                                   shuffle=False)
    scores = estimator.evaluate(input_fn=score_input_fn)
    print(scores)  # 81.42%

    # Prediction: one ordered pass, then plot the confusion matrix
    infer_input_fn = make_input_fn(x={"x": features_eval},
                                   num_epochs=1,
                                   shuffle=False)
    predicted_classes = []
    for prediction in estimator.predict(input_fn=infer_input_fn):
        predicted_classes.append(prediction["classes"])
    plot_cmat(labels_eval, predicted_classes)
    print(scores)  # 81.42%

    # Per-class precision/recall summary for the two labels
    from sklearn.metrics import classification_report
    report = classification_report(labels_eval,
                                   predicted_classes,
                                   target_names=['class 0', 'class 1'])
    print(report)
Ejemplo n.º 6
0
def data():
    """Return the cached training features and flat ``int32`` labels.

    The arrays are read from ``./xtr.npy`` and ``./ytr.npy``. When any one
    of the four cache files (``xtr``, ``xte``, ``ytr``, ``yte``) is missing,
    all four are regenerated with ``getEmbeddings("datasets/train.csv")``
    and saved first.

    Returns:
        A ``(train_data, train_labels)`` tuple, where ``train_labels`` is a
        one-dimensional ``int32`` array.
    """
    # Rebuild the whole cache when at least one of the four arrays is missing
    cache_files = ('./xtr.npy', './xte.npy', './ytr.npy', './yte.npy')
    if not all(os.path.isfile(path) for path in cache_files):
        xtr, xte, ytr, yte = getEmbeddings("datasets/train.csv")
        np.save('./xtr', xtr)
        np.save('./xte', xte)
        np.save('./ytr', ytr)
        np.save('./yte', yte)

    # Only the training split is returned, so the evaluation arrays are not
    # loaded at all.
    train_data = np.load('./xtr.npy')
    train_labels = np.load('./ytr.npy').astype(np.int32).reshape(-1)

    return train_data, train_labels
Ejemplo n.º 7
0
from getEmbeddings import getEmbeddings
from sklearn.naive_bayes import GaussianNB
import numpy as np
import matplotlib.pyplot as plt
import scikitplot.plotters as skplt


def plot_cmat(yte, ypred):
    '''Plot the confusion matrix of labels ``yte`` against ``ypred``.

    Both arguments are passed unchanged to
    ``skplt.plot_confusion_matrix``; the figure is then displayed with
    ``plt.show()``.
    '''
    skplt.plot_confusion_matrix(yte, ypred)
    plt.show()


# Generate the embeddings and save the four arrays to disk.
# NOTE(review): this runs unconditionally, so the embeddings are rebuilt and
# the .npy files overwritten on every execution.
xtr, xte, ytr, yte = getEmbeddings("datasets/train.csv")
np.save('./xtr', xtr)
np.save('./xte', xte)
np.save('./ytr', ytr)
np.save('./yte', yte)

# Read the arrays back from the files that were just written
xtr = np.load('./xtr.npy')
xte = np.load('./xte.npy')
ytr = np.load('./ytr.npy')
yte = np.load('./yte.npy')

# Fit a Gaussian Naive Bayes classifier on the training split and predict
# the test split
gnb = GaussianNB()
gnb.fit(xtr, ytr)
y_pred = gnb.predict(xte)
# m = number of test samples, n = number of misclassified samples
m = yte.shape[0]
n = (yte != y_pred).sum()
print("Accuracy = " + format((m - n) / m * 100, '.2f') + "%")  # 72.94%
Ejemplo n.º 8
0
import scikitplot.plotters as skplt
import os


def plot_cmat(yte, ypred):
    '''Plot the confusion matrix of labels ``yte`` against ``ypred``.

    Both arguments are passed unchanged to
    ``skplt.plot_confusion_matrix``; the figure is then displayed with
    ``plt.show()``.
    '''
    skplt.plot_confusion_matrix(yte, ypred)
    plt.show()


# Read the data: regenerate and save the four arrays only when at least one
# of the cached .npy files is missing.
# NOTE(review): the CSV location is a hard-coded absolute Windows path.
if not os.path.isfile('./xtr.npy') or \
    not os.path.isfile('./xte.npy') or \
    not os.path.isfile('./ytr.npy') or \
    not os.path.isfile('./yte.npy'):
    xtr, xte, ytr, yte = getEmbeddings("C:/Users/admin/fakenews/train.csv")
    np.save('./xtr', xtr)
    np.save('./xte', xte)
    np.save('./ytr', ytr)
    np.save('./yte', yte)

# Load the arrays from the cache files
xtr = np.load('./xtr.npy')
xte = np.load('./xte.npy')
ytr = np.load('./ytr.npy')
yte = np.load('./yte.npy')

# Use the built-in Naive Bayes classifier: fit on the training split and
# predict the test split
gnb = GaussianNB()
gnb.fit(xtr, ytr)
y_pred = gnb.predict(xte)
# m = number of test samples
m = yte.shape[0]
Ejemplo n.º 9
0
from keras.layers import Dense, Dropout, LSTM, Embedding, Input, RepeatVector
from keras.optimizers import SGD
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import scikitplot.plotters as skplt


# Plot the confusion matrix of labels `yte` against `ypred`: both arguments
# are passed unchanged to skplt.plot_confusion_matrix and the figure is then
# displayed with plt.show().
def plot_cmat(yte, ypred):

    skplt.plot_confusion_matrix(yte, ypred)
    plt.show()


# Generate the embeddings and save the four arrays to disk.
# NOTE(review): this runs unconditionally on every execution, and the CSV
# location is a hard-coded absolute path on one user's machine.
xtr, xte, ytr, yte = getEmbeddings(
    "/home/rashi/Desktop/task_NUS/Fake-news-Detection-master/datasets/train.csv"
)
np.save('./xtr', xtr)
np.save('./xte', xte)
np.save('./ytr', ytr)
np.save('./yte', yte)
# Read the arrays back from the .npy files (NumPy's array file format) that
# were just written
xtr = np.load('./xtr.npy')
xte = np.load('./xte.npy')
ytr = np.load('./ytr.npy')
yte = np.load('./yte.npy')


def baseline_model():

    model = Sequential()