def calc(args, X, X_test, final_model):
    """Fit a final LinearSVC, report its test accuracy and print the top-weighted words.

    Parameters
    ----------
    args : object
        Parameter bundle. This function reads ``X_train``, ``Y_train``,
        ``X_val``, ``Y_val`` (hyperparameter search), ``Target`` (training
        labels for ``X``), ``Target_test`` (labels for ``X_test``), ``Cv``
        (a fitted vectorizer exposing its feature names) and
        ``Number_we_are_interested_in`` (how many words to print per list).
    X : feature matrix the final SVM is fitted on.
    X_test : feature matrix the final SVM is evaluated on.
    final_model : fitted linear model whose ``coef_[0]`` is used for the
        word ranking.

    Returns
    -------
    None. All results are printed.
    """
    print("-" * 100)
    print(LocalTime.get(), " Words selected report: SVM ")
    print("-" * 100)

    # NOTE(review): C is tuned on args.X_train / args.X_val, while the final
    # model below is fitted on X -- confirm both share the same feature space.
    best_c = Linear_SVM.get_best_hyperparameter(
        args.X_train, args.Y_train, args.Y_val, args.X_val)

    final_svm = LinearSVC(C=best_c)
    final_svm.fit(X, args.Target)
    predictions = final_svm.predict(X_test)
    final_accuracy_score = accuracy_score(args.Target_test, predictions)
    print("Final SVM Accuracy: %s" % final_accuracy_score)
    Report_Matricies.accuracy(args.Target_test, predictions)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back to the old name on older releases.
    get_names = (getattr(args.Cv, "get_feature_names_out", None)
                 or args.Cv.get_feature_names)

    # NOTE(review): the ranking uses final_model.coef_, not the coefficients
    # of the SVM fitted above -- confirm this is intended.
    feature_to_coef = dict(zip(get_names(), final_model.coef_[0]))
    itemz = feature_to_coef.items()
    top_n = args.Number_we_are_interested_in

    list_positive = sorted(itemz, key=lambda x: x[1], reverse=True)[:top_n]
    print("-" * 100)
    print(LocalTime.get(), "--- Most popular positve words")
    for best_positive in list_positive:
        print(best_positive)

    print("-" * 100)
    print(LocalTime.get(), "--- Most popular negative words")
    list_negative = sorted(itemz, key=lambda x: x[1])[:top_n]
    for best_negative in list_negative:
        print(best_negative)
def calc(args, no_of_words):
    """Fit an n-gram logistic regression, report accuracy and print the top-weighted n-grams.

    Parameters
    ----------
    args : object
        Parameter bundle. This function reads ``Train_text`` and
        ``Test_text`` (documents to vectorize), ``X_train``, ``Y_train``,
        ``X_val``, ``Y_val`` (hyperparameter search), ``Target`` and
        ``Target_test`` (labels) and ``Number_we_are_interested_in`` (how
        many n-grams to print per list).
    no_of_words : int
        Upper bound of the n-gram range; the vectorizer uses
        ``ngram_range=(1, no_of_words)``.

    Returns
    -------
    None. All results are printed.
    """
    print("-" * 100)
    print(LocalTime.get(), " Words selected report: NGram where n = ", no_of_words)
    print("-" * 100)

    ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, no_of_words))
    X = ngram_vectorizer.fit_transform(args.Train_text)
    X_test = ngram_vectorizer.transform(args.Test_text)

    # NOTE(review): C is tuned on args.X_train / args.X_val rather than on the
    # n-gram matrix built above -- confirm this is intended.
    best_c = Logistic_Regression.get_best_hyperparameter(
        args.X_train, args.Y_train, args.Y_val, args.X_val)

    final_ngram = LogisticRegression(C=best_c)
    final_ngram.fit(X, args.Target)
    predictions = final_ngram.predict(X_test)
    final_accuracy_score = accuracy_score(args.Target_test, predictions)
    print("Final NGram Accuracy: %s" % final_accuracy_score)
    Report_Matricies.accuracy(args.Target_test, predictions)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back to the old name on older releases.
    get_names = (getattr(ngram_vectorizer, "get_feature_names_out", None)
                 or ngram_vectorizer.get_feature_names)
    feature_to_coef = dict(zip(get_names(), final_ngram.coef_[0]))
    itemz = feature_to_coef.items()
    top_n = args.Number_we_are_interested_in

    list_positive = sorted(itemz, key=lambda x: x[1], reverse=True)
    print("-" * 100)
    print(LocalTime.get(), "--- Most popular positve words")
    for best_positive in list_positive[:top_n]:
        print(best_positive)

    print("-" * 100)
    print(LocalTime.get(), "--- Most popular negative words")
    list_negative = sorted(itemz, key=lambda x: x[1])
    for best_negative in list_negative[:top_n]:
        print(best_negative)
def get_best_hyperparameter(X_train, y_train, y_val, X_val):
    """Pick the LinearSVC regularisation strength C with the best validation accuracy.

    A LinearSVC is fitted on ``(X_train, y_train)`` for each candidate C and
    scored with ``accuracy_score`` on ``(X_val, y_val)``. The accuracy of
    every candidate is printed. On ties the earliest candidate wins.

    Note the argument order: ``y_val`` comes before ``X_val``.

    Returns
    -------
    float
        The winning candidate C. Always one of the candidates, never the
        0.0 placeholder, which is not a usable value for C.
    """
    # Start below any achievable accuracy so that the first candidate is
    # always accepted, even when every candidate scores 0.0 on validation.
    best_accuracy = float("-inf")
    best_c = 0.0
    for c in [0.01, 0.05, 0.25, 0.5, 1]:
        svm = LinearSVC(C=c)
        svm.fit(X_train, y_train)
        accuracy_ = accuracy_score(y_val, svm.predict(X_val))
        if accuracy_ > best_accuracy:
            best_accuracy = accuracy_
            best_c = c
        print("---SVM Accuracy for C=%s: %s" % (c, accuracy_))

    print(LocalTime.get(), "best hyperparameter c = ", best_c)
    return best_c
def get_best_hyperparameter(X_train, y_train, y_val, X_val):
    """Pick the LogisticRegression regularisation parameter C with the best validation accuracy.

    A LogisticRegression is fitted on ``(X_train, y_train)`` for each
    candidate C and scored with ``accuracy_score`` on ``(X_val, y_val)``.
    The accuracy of every candidate is printed. On ties the earliest
    candidate wins.

    Note the argument order: ``y_val`` comes before ``X_val``.

    Returns
    -------
    float
        The winning candidate C. Always one of the candidates, never the
        0.0 placeholder, which is not a usable value for C.
    """
    # Start below any achievable accuracy so that the first candidate is
    # always accepted, even when every candidate scores 0.0 on validation.
    best_accuracy = float("-inf")
    best_c = 0.0
    for c in [0.01, 0.05, 0.25, 0.5, 1]:
        lr = LogisticRegression(C=c)
        lr.fit(X_train, y_train)
        accuracy_ = accuracy_score(y_val, lr.predict(X_val))
        if accuracy_ > best_accuracy:
            best_accuracy = accuracy_
            best_c = c
        print("---Accuracy for C=%s: %s" % (c, accuracy_))

    print(LocalTime.get(), "best hyperparameter for regularisation: c = ", best_c)
    return best_c
#%matplotlib inline
# Script section: prints a start banner, defines the image constants and
# loads the pickled MNIST arrays into module-level names.
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
from scipy.stats import truncnorm
from files import Files
from local_time import LocalTime
from neural_network import NeuralNetwork

print('\n' * 10)
print("*"*100)
print("** Start at ", LocalTime.get())
print("*"*100)

# These constants were previously assigned twice with identical values;
# they are now defined once, before use.
image_size = 28  # width and height
no_of_different_labels = 10  # i.e. 0, 1, 2, 3, ..., 9
image_pixels = image_size * image_size

f = Files("pickled_mnist.pkl")
# SECURITY: pickle.load executes arbitrary code embedded in the file; only
# load a pickle that this project produced itself.
with open(os.path.join(".", f.file_path), "rb") as fh:
    data = pickle.load(fh)

# The pickle holds a 6-item sequence, unpacked by position.
train_imgs = data[0]
test_imgs = data[1]
train_labels = data[2]
test_labels = data[3]
train_labels_one_hot = data[4]
test_labels_one_hot = data[5]
from datalook import Datalook
from files import Files
from logistic_regression import Logistic_Regression
from linear_svm import Linear_SVM
from local_time import LocalTime
from sentences1 import Sentences1
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS
from n_gram import N_Gram
from svm import SVM
from params import Params

# How many top words the later reports should list.
number_we_are_interested_in = 5

# Start-up banner.
print('\n' * 10)
print("=" * 80)
print("Local current time started :", LocalTime.get())
print("=" * 80)

# Resolve the tweet CSV via the project's Files helper and load it.
# NOTE(review): `pd` is not imported in this section; presumably pandas is
# imported earlier in the file -- confirm.
twitter_file = "auspol2019.csv"
f1 = Files(twitter_file)
print(LocalTime.get(), twitter_file, " read")
twitter = pd.read_csv(
    f1.file_path,
    parse_dates=['created_at', 'user_created_at'],
)

# Score every tweet with TextBlob's polarity.
twitter['sentiment'] = twitter['full_text'].map(
    lambda text: TextBlob(text).sentiment.polarity
)
print("twitter number of rows = ", twitter.shape[0])

# Keep only tweets whose polarity is non-zero (drop neutral sentiment).
is_not_neutral = twitter['sentiment'] != 0
twitter1 = twitter[is_not_neutral]

# Targets (1 = positive sentiment, 0 = negative) are set further on.
print(LocalTime.get(), "sentiment rating calculated")
from local_time import LocalTime
from sentences1 import Sentences1
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS
from n_gram import N_Gram
from svm import SVM
from params import Params
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# How many top words the later reports should list.
number_we_are_interested_in = 5

# Start-up banner.
print('\n' * 10)
print("="*80)
print("Local current time started :", LocalTime.get())
print("="*80)

twitter_file = "auspol2019.csv"
print(LocalTime.get(), twitter_file, " read")
# NOTE(review): machine-specific absolute path; `twitter_file` above is not
# used to build it. Consider resolving the path relative to the project.
# NOTE(review): `pd` is not imported in this section; presumably pandas is
# imported earlier in the file -- confirm.
twitter = pd.read_csv('C:/Users/asus/Desktop/701_coursework-master/files/auspol2019.csv')
print(twitter.shape)

# Drop the identifying user columns.
t1 = twitter.drop(['id','user_screen_name','user_location','user_description','user_name'], axis = 1)

# Normalise the text: force str (missing values become the string "nan"),
# lower-case every token and collapse whitespace runs to single spaces.
t1['full_text'] = t1['full_text'].astype(str)
t1['full_text'] = t1['full_text'].apply(
    lambda text: " ".join(word.lower() for word in text.split()))

# Remove English stop words. `stop` stays a list for any later users; the
# membership test goes through a set built once, so each token lookup is
# O(1) instead of a scan of the whole list.
stop = stopwords.words('english')
stop_lookup = frozenset(stop)
t1['full_text'] = t1['full_text'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_lookup))

# Stem every remaining token.
st = PorterStemmer()
t1['full_text'] = t1['full_text'].apply(
    lambda text: " ".join([st.stem(word) for word in text.split()]))

# Score the pre-processed text and keep only rows with non-zero polarity.
t1['sentiment'] = t1['full_text'].map(lambda text: TextBlob(text).sentiment.polarity)
twitter1 = t1[t1.sentiment != 0]
#%matplotlib inline
# Script section: prints a start banner, loads the pickled MNIST arrays and
# rebuilds the one-hot label matrices.
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
from scipy.stats import truncnorm
from files import Files
from local_time import LocalTime
from neural_network import NeuralNetwork

print("*" * 100)
print("** Start at ", LocalTime.get())
print("*" * 100)

image_size = 28  # width and height
no_of_different_labels = 10  # i.e. 0, 1, 2, 3, ..., 9
image_pixels = image_size * image_size

f = Files("pickled_mnist.pkl")
# SECURITY: pickle.load executes arbitrary code embedded in the file; only
# load a pickle that this project produced itself.
with open(os.path.join(".", f.file_path), "br") as fh:
    data = pickle.load(fh)

# The pickle holds a 6-item sequence, unpacked by position.
train_imgs = data[0]
test_imgs = data[1]
train_labels = data[2]
test_labels = data[3]
train_labels_one_hot = data[4]
test_labels_one_hot = data[5]

lr = np.arange(no_of_different_labels)

# transform labels into one hot representation
# The one-hot arrays loaded from the pickle are overwritten here.
# np.float was removed in NumPy 1.24; builtin float is the same dtype.
# assumes the label arrays have shape (n, 1) so `lr == labels` broadcasts to
# (n, 10) -- TODO confirm
train_labels_one_hot = (lr == train_labels).astype(float)
test_labels_one_hot = (lr == test_labels).astype(float)

# we don't want zeroes and ones in the labels neither: