def develop():
    """Train an SVM on the cached embeddings, report accuracy and save the model.

    The train/test splits are read from ``./xtr.npy``, ``./xte.npy``,
    ``./ytr.npy`` and ``./yte.npy``. If any of the four files is missing,
    all of them are regenerated by calling ``getEmbeddings`` on
    ``datasets/train.csv``. The fitted classifier is pickled to
    ``finalized_model.pkl`` and the confusion matrix of the test split is
    drawn with ``plot_cmat``.
    """
    # Read the data, rebuilding the on-disk cache when any split is missing.
    cache_files = ('./xtr.npy', './xte.npy', './ytr.npy', './yte.npy')
    if not all(os.path.isfile(path) for path in cache_files):
        xtr, xte, ytr, yte = getEmbeddings("datasets/train.csv")
        np.save('./xtr', xtr)
        np.save('./xte', xte)
        np.save('./ytr', ytr)
        np.save('./yte', yte)

    xtr = np.load('./xtr.npy')
    xte = np.load('./xte.npy')
    ytr = np.load('./ytr.npy')
    yte = np.load('./yte.npy')
    print("Here")

    # Use the built-in SVM for classification
    clf = SVC()
    clf.fit(xtr, ytr)
    y_pred = clf.predict(xte)

    # Accuracy = share of test labels that match the predictions.
    m = yte.shape[0]
    n = (yte != y_pred).sum()
    print("Accuracy = " + format((m - n) / m * 100, '.2f') + "%")  # 88.42%

    # Persist the fitted model; the context manager guarantees the file is
    # flushed and closed even if pickling raises.
    filename = 'finalized_model.pkl'
    with open(filename, 'wb') as model_out:
        pickle.dump(clf, model_out)
    print("Classified")

    # Draw the confusion matrix
    plot_cmat(yte, y_pred)
def random_forest_model():
    """Train a random forest classifier on the cached embeddings.

    The train/test splits are read from ``./xtr.npy``, ``./xte.npy``,
    ``./ytr.npy`` and ``./yte.npy`` (NumPy array files). If any of them is
    missing, ``getEmbeddings`` (see getEmbeddings.py) is called on
    ``datasets/train.csv`` and all four files are rewritten. The fitted model
    is pickled to ``random_forest_model.sav``, the test accuracy is printed
    and the confusion matrix is drawn with ``plot_cmat``.
    """
    # Rebuild the .npy cache when any of the four split files is missing.
    cache_files = ('./xtr.npy', './xte.npy', './ytr.npy', './yte.npy')
    if not all(os.path.isfile(path) for path in cache_files):
        xtr, xte, ytr, yte = getEmbeddings("datasets/train.csv")
        np.save('./xtr', xtr)
        np.save('./xte', xte)
        np.save('./ytr', ytr)
        np.save('./yte', yte)

    # Load the files to local variables.
    xtr = np.load('./xtr.npy')
    xte = np.load('./xte.npy')
    ytr = np.load('./ytr.npy')
    yte = np.load('./yte.npy')

    # Use the built-in Random Forest classifier: create it and fit it on the
    # training split (xtr, ytr).
    rdf = RandomForestClassifier(n_estimators=200, n_jobs=3)
    rdf.fit(xtr, ytr)

    # Save the model to random_forest_model.sav so the pretrained model can be
    # reused; the context manager closes the file even if pickling raises.
    model_file = 'random_forest_model.sav'
    with open(model_file, 'wb') as model_out:
        pickle.dump(rdf, model_out)

    # Predict the labels of the test split (xte).
    y_pred = rdf.predict(xte)

    # Print the accuracy on the test split.
    print("Accuracy = " + format(metrics.accuracy_score(yte, y_pred) * 100, '.2f') + "%")

    # Draw the confusion matrix
    plot_cmat(yte, y_pred)
def main():
    """Train, evaluate and visualise the TensorFlow estimator on cached embeddings.

    The four split files (``./xtr.npy``, ``./xte.npy``, ``./ytr.npy``,
    ``./yte.npy``) are regenerated through ``getEmbeddings`` when any of them
    is missing. The estimator built from ``model_fn`` is trained for
    ``TRAIN_STEP`` steps, its evaluation metrics are printed, and the
    confusion matrix of its predictions is drawn with ``plot_cmat``.
    """
    # Get the training and testing data from getEmbeddings if the cache is incomplete
    cache_files = ('./xtr.npy', './xte.npy', './ytr.npy', './yte.npy')
    if not all(os.path.isfile(path) for path in cache_files):
        xtr, xte, ytr, yte = getEmbeddings("datasets/train.csv")
        np.save('./xtr', xtr)
        np.save('./xte', xte)
        np.save('./ytr', ytr)
        np.save('./yte', yte)

    # Read the Doc2Vec data
    train_data = np.load('./xtr.npy')
    eval_data = np.load('./xte.npy')
    train_labels = np.load('./ytr.npy')
    eval_labels = np.load('./yte.npy')

    # Labels become int32 column vectors
    train_labels = train_labels.reshape((-1, 1)).astype(np.int32)
    eval_labels = eval_labels.reshape((-1, 1)).astype(np.int32)

    # Create the Estimator
    classifier = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=tensorflow_tmp,
    )

    # Setup logging hook for prediction
    tf.logging.set_verbosity(tf.logging.INFO)
    logging_hook = tf.train.LoggingTensorHook(
        tensors={"probabilities": "softmax_tensor"},
        every_n_iter=200,
    )

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=50,
        num_epochs=None,
        shuffle=True,
    )
    classifier.train(
        input_fn=train_input_fn,
        steps=TRAIN_STEP,
        hooks=[logging_hook],
    )

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        y=eval_labels,
        num_epochs=1,
        shuffle=False,
    )
    eval_results = classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)  # 81.42%

    # Draw the confusion matrix from the predicted class of every eval sample
    predict_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        num_epochs=1,
        shuffle=False,
    )
    predict_labels = []
    for prediction in classifier.predict(input_fn=predict_input_fn):
        predict_labels.append(prediction["classes"])
    plot_cmat(eval_labels, predict_labels)
def logistic_regression_model():
    """Train a logistic regression classifier on the cached embeddings.

    The train/test splits are read from ``./xtr.npy``, ``./xte.npy``,
    ``./ytr.npy`` and ``./yte.npy`` (NumPy array files). If any of them is
    missing, ``getEmbeddings`` (see getEmbeddings.py) is called on
    ``datasets/train.csv`` and all four files are rewritten. The fitted model
    is pickled to ``logistic_regression_model.sav``, the test accuracy is
    printed and the confusion matrix is drawn with ``plot_cmat``.
    """
    # Rebuild the .npy cache when any of the four split files is missing.
    cache_files = ('./xtr.npy', './xte.npy', './ytr.npy', './yte.npy')
    if not all(os.path.isfile(path) for path in cache_files):
        xtr, xte, ytr, yte = getEmbeddings("datasets/train.csv")
        np.save('./xtr', xtr)
        np.save('./xte', xte)
        np.save('./ytr', ytr)
        np.save('./yte', yte)

    # Load the files to local variables.
    xtr = np.load('./xtr.npy')
    xte = np.load('./xte.npy')
    ytr = np.load('./ytr.npy')
    yte = np.load('./yte.npy')

    # Use the logistic regression classifier of the scikit-learn library:
    # create it and fit it on the training split (xtr, ytr).
    log_R = linear_model.LogisticRegression()
    log_R.fit(xtr, ytr)

    # Save the model to logistic_regression_model.sav so the pretrained model
    # can be reused; the context manager closes the file even if pickling raises.
    model_file = 'logistic_regression_model.sav'
    with open(model_file, 'wb') as model_out:
        pickle.dump(log_R, model_out)

    # Predict the labels of the test split (xte).
    y_pred = log_R.predict(xte)

    # Print the accuracy on the test split.
    print("Accuracy = " + format(metrics.accuracy_score(yte, y_pred) * 100, '.2f') + "%")

    # Draw the confusion matrix
    plot_cmat(yte, y_pred)
def main():
    """Train and evaluate the TensorFlow estimator on freshly computed embeddings.

    Embeddings are obtained directly from ``getEmbeddings`` on
    ``datasets/train.csv`` (no on-disk cache is consulted). After training
    for ``TRAIN_STEP`` steps the evaluation metrics are printed, the
    confusion matrix is drawn with ``plot_cmat``, the metrics are printed a
    second time, and a scikit-learn classification report is printed.
    """
    # Get the training and testing data from getEmbeddings
    train_data, eval_data, train_labels, eval_labels = getEmbeddings(
        "datasets/train.csv"
    )

    # Labels become int32 column vectors
    train_labels = train_labels.reshape((-1, 1)).astype(np.int32)
    eval_labels = eval_labels.reshape((-1, 1)).astype(np.int32)

    # Create the Estimator
    classifier = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=tensorflow_tmp,
    )

    # Setup logging hook for prediction
    tf.logging.set_verbosity(tf.logging.INFO)
    logging_hook = tf.train.LoggingTensorHook(
        tensors={"probabilities": "softmax_tensor"},
        every_n_iter=200,
    )

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=50,
        num_epochs=None,
        shuffle=True,
    )
    classifier.train(
        input_fn=train_input_fn,
        steps=TRAIN_STEP,
        hooks=[logging_hook],
    )

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        y=eval_labels,
        num_epochs=1,
        shuffle=False,
    )
    eval_results = classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)  # 81.42%

    # Draw the confusion matrix from the predicted class of every eval sample
    predict_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        num_epochs=1,
        shuffle=False,
    )
    predict_labels = []
    for prediction in classifier.predict(input_fn=predict_input_fn):
        predict_labels.append(prediction["classes"])
    plot_cmat(eval_labels, predict_labels)
    print(eval_results)  # 81.42%

    # Per-class precision / recall / F1 summary
    from sklearn.metrics import classification_report
    target_names = ['class 0', 'class 1']
    report = classification_report(
        eval_labels,
        predict_labels,
        target_names=target_names,
    )
    print(report)
def data():
    """Return the cached training embeddings and their labels.

    If any of ``./xtr.npy``, ``./xte.npy``, ``./ytr.npy`` or ``./yte.npy`` is
    missing, all four are regenerated by calling ``getEmbeddings`` on
    ``datasets/train.csv``. Only the training split is then loaded; the
    evaluation files are not read because this function does not return them.

    Returns:
        A ``(train_data, train_labels)`` tuple where ``train_data`` is the
        array stored in ``./xtr.npy`` and ``train_labels`` is the content of
        ``./ytr.npy`` as a one-dimensional ``int32`` array.
    """
    # Get the training and testing data from getEmbeddings when the cache is
    # incomplete; all four files are written so the cache stays consistent.
    cache_files = ('./xtr.npy', './xte.npy', './ytr.npy', './yte.npy')
    if not all(os.path.isfile(path) for path in cache_files):
        xtr, xte, ytr, yte = getEmbeddings("datasets/train.csv")
        np.save('./xtr', xtr)
        np.save('./xte', xte)
        np.save('./ytr', ytr)
        np.save('./yte', yte)

    # Read the Doc2Vec training data
    train_data = np.load('./xtr.npy')
    train_labels = np.load('./ytr.npy')

    # Cast to int32 and flatten to 1-D in one step.
    return train_data, train_labels.astype(np.int32).flatten()
from getEmbeddings import getEmbeddings
from sklearn.naive_bayes import GaussianNB
import numpy as np
import matplotlib.pyplot as plt
import scikitplot.plotters as skplt


def plot_cmat(yte, ypred):
    """Draw the confusion matrix of true labels against predictions and show it."""
    skplt.plot_confusion_matrix(yte, ypred)
    plt.show()


# Compute the embeddings and write each split to its .npy file.
xtr, xte, ytr, yte = getEmbeddings("datasets/train.csv")
np.save('./xtr', xtr)
np.save('./xte', xte)
np.save('./ytr', ytr)
np.save('./yte', yte)

# Read the splits back from disk, in the same order they were written.
xtr, xte, ytr, yte = (
    np.load('./xtr.npy'),
    np.load('./xte.npy'),
    np.load('./ytr.npy'),
    np.load('./yte.npy'),
)

# Fit a Gaussian Naive Bayes classifier and predict the test split.
gnb = GaussianNB()
gnb.fit(xtr, ytr)
y_pred = gnb.predict(xte)

# m = number of test samples, n = number of misclassified samples.
m = yte.shape[0]
n = np.sum(yte != y_pred)
print("Accuracy = {}%".format(format((m - n) / m * 100, '.2f')))  # 72.94%
import scikitplot.plotters as skplt
import os

# NOTE(review): plt, np, getEmbeddings and GaussianNB are used below but are
# not imported in the visible part of this script — presumably imported
# earlier in the file; confirm.


def plot_cmat(yte, ypred):
    '''Plot the confusion matrix of true labels (yte) against predictions (ypred).

    Delegates to skplt.plot_confusion_matrix and then displays the figure
    with plt.show().
    '''
    skplt.plot_confusion_matrix(yte, ypred)
    plt.show()


# Read the data: when any of the four cached split files is missing,
# recompute the embeddings and rewrite all four files.
# NOTE(review): the CSV path below is an absolute, machine-specific Windows
# path; the script will not run elsewhere without editing it.
if not os.path.isfile('./xtr.npy') or \
        not os.path.isfile('./xte.npy') or \
        not os.path.isfile('./ytr.npy') or \
        not os.path.isfile('./yte.npy'):
    xtr, xte, ytr, yte = getEmbeddings("C:/Users/admin/fakenews/train.csv")
    np.save('./xtr', xtr)
    np.save('./xte', xte)
    np.save('./ytr', ytr)
    np.save('./yte', yte)

# Load the splits from the on-disk cache.
xtr = np.load('./xtr.npy')
xte = np.load('./xte.npy')
ytr = np.load('./ytr.npy')
yte = np.load('./yte.npy')

# Use the built-in Naive Bayes classifier: fit on the training split and
# predict the test split.
gnb = GaussianNB()
gnb.fit(xtr, ytr)
y_pred = gnb.predict(xte)

# Number of test samples (size of the first axis of yte).
m = yte.shape[0]
from keras.layers import Dense, Dropout, LSTM, Embedding, Input, RepeatVector
from keras.optimizers import SGD
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import scikitplot.plotters as skplt

# NOTE(review): plt, np, getEmbeddings and Sequential are used below but are
# not imported in the visible part of this script — presumably imported
# earlier in the file; confirm.


# Plots the confusion matrix of true labels (yte) against predictions (ypred)
# via skplt.plot_confusion_matrix, then displays it with plt.show().
def plot_cmat(yte, ypred):
    skplt.plot_confusion_matrix(yte, ypred)
    plt.show()


# Compute the embeddings and write every split to disk. This runs
# unconditionally; no existing cache is checked first.
# NOTE(review): the CSV path is an absolute, machine-specific path; the
# script will not run elsewhere without editing it.
xtr, xte, ytr, yte = getEmbeddings(
    "/home/rashi/Desktop/task_NUS/Fake-news-Detection-master/datasets/train.csv"
)
np.save('./xtr', xtr)
np.save('./xte', xte)
np.save('./ytr', ytr)
np.save('./yte', yte)

# Reload the splits from the NumPy array (.npy) files just written.
xtr = np.load('./xtr.npy')
xte = np.load('./xte.npy')
ytr = np.load('./ytr.npy')
yte = np.load('./yte.npy')


# NOTE(review): only the first statement of this function is visible here;
# the remainder of its body presumably continues beyond this excerpt.
def baseline_model():
    model = Sequential()