def test_convert_to_TFIDF(self):
    """Load the trained LSTM-CNN review model, predict an overall rating
    for a user-entered pros/cons review, and check the predicted label.

    Fix: the original assertions claimed ``type(np.argmax(...)) == str``
    is ``True`` — ``np.argmax`` returns a NumPy integer, so that assertion
    could never pass.  The checks now assert the label is an integer type
    and is neither a bool nor a string.
    """
    import pickle
    import pandas as pd
    import numpy as np
    from keras.models import model_from_json
    from keras.preprocessing.sequence import pad_sequences

    revModel = "review_model_gpu.json"
    revModelWeights = "review_model_gpu.h5"

    # load json and create model (review predicts rating); `with` ensures
    # the architecture file is closed even if reading fails
    with open(revModel, 'r') as json_file:
        loaded_model_json = json_file.read()
    review_model = model_from_json(loaded_model_json)
    review_model.load_weights(revModelWeights)
    review_model.compile(loss='categorical_crossentropy', optimizer='adam',
                         metrics=['categorical_accuracy'])

    pros_rev = input("Enter a pros review: ")
    cons_rev = input("Enter a cons review: ")
    # preProcessing is defined elsewhere in this file (cleans the text)
    combine_rev = preProcessing(pros_rev + " " + cons_rev)
    combine_rev = pd.Series(combine_rev)

    # loading the tokenizer fitted at training time
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)

    maxlen = 200  # must match the sequence length used at training time
    tokenized_rev = tokenizer.texts_to_sequences(combine_rev)
    user_rev = pad_sequences(tokenized_rev, maxlen=maxlen,
                             padding='post', truncating='post')

    # Predict rating based on user review (LSTM-CNN)
    model_pred = review_model.predict([user_rev], batch_size=1024, verbose=1)
    predicted_rating = np.argmax(model_pred[0])
    print("LSTM-CNN Overall Rating:", predicted_rating)

    # np.argmax yields a NumPy integer label — never a bool or a string.
    self.assertIsInstance(predicted_rating, np.integer)
    self.assertNotIsInstance(predicted_rating, bool)
    self.assertNotIsInstance(predicted_rating, str)
# ---------------------------------------------------------------------------
# Data preprocessing for the review model: select the review column, clean
# the text (remove emoticons, non-alphabetic characters and digits), one-hot
# encode the labels, and tokenize.
# NOTE(review): relies on x_train_rev/x_test_rev, y_train/y_test,
# ytrain_arr/ytest_arr, preProcessing, to_categorical and Tokenizer being
# defined earlier in this file.
# ---------------------------------------------------------------------------
x_train_rev = x_train_rev["review"]
x_test_rev = x_test_rev["review"]

# Class distributions of the ratings (kept for later inspection).
y_train_val = y_train.value_counts()
y_test_val = y_test.value_counts()

# Clean every review and renumber the rows from zero after the split.
x_train_rev = x_train_rev.apply(preProcessing).reset_index(drop=True)
x_test_rev = x_test_rev.apply(preProcessing).reset_index(drop=True)

# One hot encode y
y_train_rev = to_categorical(ytrain_arr)
y_test_rev = to_categorical(ytest_arr)

# Tokenizer / embedding hyper-parameters.
max_features = 20000  # vocabulary size
maxlen = 200          # padded sequence length
embed_size = 300      # embedding dimension

# Fit the vocabulary on the training reviews only, then map both splits
# to integer index sequences.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(x_train_rev))
tokenized_train = tokenizer.texts_to_sequences(x_train_rev)
tokenized_test = tokenizer.texts_to_sequences(x_test_rev)
# NOTE(review): stray closing bracket — terminates a list literal begun in an
# earlier chunk of this file (not visible here); do not remove in isolation.
]
"""""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """ Data preprocessing (train test split, remove non-alphabetic characters, remove digit, remove emoticons, one hot encode labels, tokenization) """ """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """""" """"""
# Keep only the two columns the model needs: the free-text review and its
# star rating.
reviews = reviews[["review", "rating"]]
X = reviews["review"]
Y = reviews["rating"]
# 80/20 train/test split.  NOTE(review): no random_state is fixed, so the
# split differs between runs — confirm that is intentional.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
#y_train_val = y_train.value_counts()
#y_test_val = y_test.value_counts()
#print (y_train_val)
#print (y_test_val)
# Clean each review via preProcessing (defined elsewhere in this file) and
# reset the indices so they are contiguous after the split.
x_train = x_train.apply(lambda x: preProcessing(x)).reset_index(drop=True)
x_test = x_test.apply(lambda x: preProcessing(x)).reset_index(drop=True)
# Array created for resampling purpose (data imbalanced) and one hot encoding
ytrain_arr = np.array(y_train)
ytest_arr = np.array(y_test)
# One hot encode y
y_train = to_categorical(ytrain_arr)
y_test = to_categorical(ytest_arr)
# Model / tokenizer hyper-parameters.
max_features = 20000  # vocabulary size
maxlen = 200          # padded sequence length
embed_size = 300      # embedding dimension
batch_size = 128
epochs = 20
ratings_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy']) # load svm models svm_model_rev = pickle.load(open("svm_model_rev.sav", "rb")) svm_model_rat = pickle.load(open("svm_model_rat.sav", "rb")) # Ask user to enter pros and cons reviews or aspect rating print() mode = input( "Use review or aspect ratings to predict overall rating (r/a)? ") if mode.strip().lower() == "r": pros_rev = input("Enter pros review: ") cons_rev = input("Enter cons review: ") combine_rev = preProcessing(pros_rev + " " + cons_rev) combine_rev = pd.Series(combine_rev) # loading with open('tokenizer.pickle', 'rb') as handle: tokenizer = pickle.load(handle) maxlen = 200 tokenized_rev = tokenizer.texts_to_sequences(combine_rev) user_rev = pad_sequences(tokenized_rev, maxlen=maxlen, padding='post', truncating='post') # SVM with open("tfidfVectorizer.pickle", "rb") as handle:
def test_removeBoth(self):
    """preProcessing drops digits and non-alphabetic symbols together,
    leaving pure alphabetic text (or an empty string) behind."""
    # assertEqual instead of assertTrue(a == b): on failure unittest then
    # reports both values instead of just "False is not true".
    self.assertEqual(preProcessing("1@#$$abc23456"), "abc")
    self.assertEqual(preProcessing("asd12bcd*@#"), "asdbcd")
    self.assertEqual(preProcessing(" "), "")
    self.assertEqual(preProcessing(" "), "")
    self.assertEqual(preProcessing("abcdef"), "abcdef")
def test_removeDigits(self):
    """preProcessing strips all digit characters from its input."""
    # assertEqual instead of assertTrue(a == b) for informative failures.
    self.assertEqual(preProcessing("123456"), "")
    self.assertEqual(preProcessing("asd12bcd"), "asdbcd")
    self.assertEqual(preProcessing(""), "")
def test_removeNonAlphabetics(self):
    """preProcessing strips all non-alphabetic (symbol) characters."""
    # assertEqual instead of assertTrue(a == b) for informative failures.
    self.assertEqual(preProcessing("#@$&@#)($*@)"), "")
    self.assertEqual(preProcessing("abc#@$&@#"), "abc")
    self.assertEqual(preProcessing(""), "")