def read_train(filename, stem=False, bigram=False):
    global doc_data, cnt, size_voc
    cnt = 0
    for i in range(5):
        class_dict[i][0] = Counter()
    for doc in utils.json_reader(filename):
        # Tokenize on word characters / apostrophes instead of word_tokenize or a plain split
        txt = re.findall(r"[\w']+", doc["text"].lower())
        if stem:
            txt = utils.getStemmedDocuments(" ".join(txt))
            txt = [item for item in txt if not item.isdigit()]
        if bigram:
            txt = list(nltk.bigrams(txt))
        vocab.update(txt)
        cnt += 1
        # class_dict[c] holds [word Counter, total word count, document count] for star c + 1
        class_dict[int(doc["stars"]) - 1][0].update(txt)
        class_dict[int(doc["stars"]) - 1][2] += 1
        doc_data.append([doc["stars"], Counter(txt)])
    for i in range(5):
        class_dict[i][1] = sum(class_dict[i][0].values())
        print class_dict[i][1], class_dict[i][2]
    print "vocab"
    print len(vocab)
    size_voc = len(vocab)
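# read_train() and read_t() assume a utils.json_reader helper that streams one review
# per line of a line-delimited JSON file. Its implementation is not part of this excerpt;
# a minimal sketch of what such a generator could look like:
import json

def json_reader(filename):
    # Yield one parsed JSON object (review) per non-empty line.
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)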
def read_t(filename, stem=False, bigram=False):
    global doc_data
    for doc in utils.json_reader(filename):
        txt = re.findall(r"[\w']+", doc["text"].lower())
        if stem:
            txt = utils.getStemmedDocuments(" ".join(txt))
            txt = [item for item in txt if not item.isdigit()]
        if bigram:
            txt = list(nltk.bigrams(txt))
        doc_data.append([doc["stars"], Counter(txt)])
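# A minimal sketch of how the structures filled by read_train() could drive prediction.
# It assumes class_dict[c] = [word Counter, total word count, doc count], cnt = number of
# training documents, and size_voc = vocabulary size, and applies Laplace (add-one)
# smoothing; this predict helper is an illustration, not part of the original file.
import math

def predict_stars(tokens):
    best_class, best_score = None, None
    for c in range(5):
        counts, total_words, doc_count = class_dict[c]
        score = math.log(float(doc_count) / cnt)  # log prior of class c
        for w in tokens:
            score += math.log((counts[w] + 1.0) / (total_words + size_voc))
        if best_score is None or score > best_score:
            best_class, best_score = c, score
    return best_class + 1  # back to a 1-5 star rating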
def main():
    # Per-label vocabularies built from the training data
    vocab_list = [{}, {}, {}, {}, {}]
    vocabulary = {}
    vocab_list_bigrams = [{}, {}, {}, {}, {}]
    vocabulary_bigrams = {}
    # Counts of labels, words and bigrams per label in the training data
    label_count = np.zeros(5)
    label_word_count = np.zeros(5)
    label_bigram_count = np.zeros(5)
    start1 = time.time()

    ##############################################################################
    # Training part
    train_iter = ut.json_reader(train)
    i1 = 0
    for element in train_iter:
        i1 += 1
        if (i1 % 1000) == 0:
            print("Training: ", i1 / 1000)
        label_count[int(element["stars"]) - 1] += 1
        # Switch these lines to count raw tokens instead of stemmed ones
        # label_word_count[int(element["stars"]) - 1] += len((element["text"]).split())
        stemmed = ut.getStemmedDocuments(element["text"])
        bigram = nltk.bigrams(stemmed)
        bigramlist = list(map(''.join, bigram))
        label_word_count[int(element["stars"]) - 1] += len(stemmed)
        label_bigram_count[int(element["stars"]) - 1] += len(bigramlist)
        for x in stemmed:
            word = x.strip(string.punctuation)
            if word == "":
                continue
            if word in vocab_list[int(element["stars"]) - 1]:
                vocab_list[int(element["stars"]) - 1][word] += 1
            else:
                vocab_list[int(element["stars"]) - 1][word] = 1
            vocabulary[word] = 1
        for x in bigramlist:
            word = x.strip(string.punctuation)
            if word == "":
                continue
            if word in vocab_list_bigrams[int(element["stars"]) - 1]:
                vocab_list_bigrams[int(element["stars"]) - 1][word] += 1
            else:
                vocab_list_bigrams[int(element["stars"]) - 1][word] = 1
            vocabulary_bigrams[word] = 1
    ##############################################################################
    end1 = time.time()
    print("Training done, Time taken(mins)", int(end1 - start1) / 60)

    prior = label_count / TRAINSIZE
    actual_value = []
    predicted_value = []
    random_prediction = []
    start2 = time.time()

    ##############################################################################
    # Testing
    i2 = 0
    test_iter = ut.json_reader(test)
    for test_element in test_iter:
        i2 += 1
        if (i2 % 1000) == 0:
            print("Testing: ", i2 / 1000)
        # Random baseline: predict a uniform rating in 1-5
        random_prediction.append(random.randint(1, 5))
        actual_value.append(int(test_element["stars"]))
        test_list = ut.getStemmedDocuments(test_element["text"])
        bigram = nltk.bigrams(test_list)
        bigramlist = list(map(''.join, bigram))

        results = []
        for i in range(5):
            # Log posterior for rating i + 1: log prior + unigram and bigram log-likelihoods
            py = prior[i]
            logr = 0
            for x in test_list:
                word = x.strip(string.punctuation)
                if word == "":
                    continue
                if word in vocab_list[i]:
                    probability = (vocab_list[i][word] + 1) / (label_word_count[i] + len(vocabulary))
                    logr += math.log(probability)
                else:
                    logr += math.log(1 / (label_word_count[i] + len(vocabulary)))
            for x in bigramlist:
                word = x.strip(string.punctuation)
                if word == "":
                    continue
                if word in vocab_list_bigrams[i]:
                    probability = (vocab_list_bigrams[i][word] + 1) / (label_bigram_count[i] + len(vocabulary_bigrams))
                    logr += math.log(probability)
                else:
                    logr += math.log(1 / (label_bigram_count[i] + len(vocabulary_bigrams)))
            results.append(logr + math.log(py))
        predicted_value.append(results.index(max(results)) + 1)
    ##############################################################################

    # Evaluation: Naive Bayes vs. random and majority-class baselines
    major = list(label_count).index(max(label_count)) + 1
    correct = 0
    correct_random = 0
    correct_major = 0
    confusion = np.zeros((5, 5))
    calc_f1_score = np.zeros(5)
    for i in range(len(predicted_value)):
        if predicted_value[i] == actual_value[i]:
            correct += 1
        if random_prediction[i] == actual_value[i]:
            correct_random += 1
        if major == actual_value[i]:
            correct_major += 1
        confusion[predicted_value[i] - 1][actual_value[i] - 1] += 1
    row_sum = np.sum(confusion, axis=1)
    column_sum = np.sum(confusion, axis=0)
    for i in range(5):
        precision = confusion[i][i] / row_sum[i]
        recall = confusion[i][i] / column_sum[i]
        calc_f1_score[i] = 2 * ((precision * recall) / (precision + recall))
    end2 = time.time()
    print("Testing done, Time taken(mins)", int(end2 - start2) / 60)

    print("Accuracy using Naive Bayes: ", int(correct / len(actual_value) * 100), "%")
    print("Accuracy using Random prediction: ", int(correct_random / len(actual_value) * 100), "%")
    print("Accuracy using Majority prediction: ", int(correct_major / len(actual_value) * 100), "%")
    print("Confusion Matrix: ")
    print(confusion)
    print("F1 Scores:")
    for i in range(5):
        print("Label", i + 1, ": ", calc_f1_score[i])
    # Cross-check with scikit-learn's metrics
    new_confu = confusion_matrix(actual_value, predicted_value)
    new_f_score = f1_score(actual_value, predicted_value, average=None)
    print("New Confusion")
    print(new_confu)
    print("New F1_score")
    print(new_f_score)
    print(np.mean(new_f_score))
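# Entry point. The script above assumes module-level constants and imports that are not
# shown in this excerpt (train / test file paths, TRAINSIZE, numpy as np, nltk, random,
# string, math, time, utils as ut, and sklearn's confusion_matrix / f1_score).
if __name__ == "__main__":
    main()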
yelp_test = pd.read_json(path_test, lines=True)

# Per-star slices of the training text, plus the full train/test splits
X_train1 = yelp_train['text'][yelp_train['stars'] == 1].copy()
X_train2 = yelp_train['text'][yelp_train['stars'] == 2].copy()
X_train3 = yelp_train['text'][yelp_train['stars'] == 3].copy()
X_train4 = yelp_train['text'][yelp_train['stars'] == 4].copy()
X_train5 = yelp_train['text'][yelp_train['stars'] == 5].copy()
X_train = yelp_train['text'].copy()
Y_train = yelp_train['stars'].copy()
X_test = yelp_test['text'].copy()
Y_test = yelp_test['stars'].copy()

if part == 'd':
    # Part (d): stem every document before building the model
    for i in X_train1.keys():
        X_train1[i] = utils.getStemmedDocuments(X_train1[i], False)
    for i in X_train2.keys():
        X_train2[i] = utils.getStemmedDocuments(X_train2[i], False)
    for i in X_train3.keys():
        X_train3[i] = utils.getStemmedDocuments(X_train3[i], False)
    for i in X_train4.keys():
        X_train4[i] = utils.getStemmedDocuments(X_train4[i], False)
    for i in X_train5.keys():
        X_train5[i] = utils.getStemmedDocuments(X_train5[i], False)
    for i in X_train.keys():
        X_train[i] = utils.getStemmedDocuments(X_train[i], False)
    for i in X_test.keys():
        X_test[i] = utils.getStemmedDocuments(X_test[i], False)

# Concatenate all 1-star reviews into a single string
X_merged1 = [X_train1.str.cat(sep=' ')]
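# The excerpt only shows X_merged1; assuming X_merged2 ... X_merged5 are built the same
# way, one possible continuation is to turn each merged string into per-class word counts
# and score a review with Laplace-smoothed log-likelihoods (a sketch, not the original code):
from collections import Counter
import math

def class_scores(tokens, merged_texts):
    # merged_texts: [X_merged1, ..., X_merged5], each a one-element list of concatenated text
    counters = [Counter(m[0].split()) for m in merged_texts]
    vocab_size = len(set().union(*counters))
    scores = []
    for c in counters:
        total = sum(c.values())
        scores.append(sum(math.log((c[w] + 1.0) / (total + vocab_size)) for w in tokens))
    return scores  # add log priors before taking the argmax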
def naiveBayes_d_e_f(partNum):
    # Stem both splits once, then reuse them for parts (d), (e) and (f)
    textStemmedTrain = []
    for i in textTrain:
        textStemmedTrain.append(getStemmedDocuments(i, False))
    textStemmedTest = []
    for i in textTest:
        textStemmedTest.append(getStemmedDocuments(i, False))

    stars = list(sorted(set(starsTest)))

    if partNum == "d":
        print("\n Stemmed Data Training : \n")
        prediction = naiveBayes_a(textStemmedTrain, textStemmedTest,
                                  feature=None, accuracyTrain=True)
        print("F-score: ")
        f_score = f1_score(starsTest, prediction, average=None, labels=stars)
        print(f_score)
        f_score_avg = f1_score(starsTest, prediction, labels=stars, average='macro')
        print("Macro F-score:", f_score_avg)
    else:
        # Part (e): bigram features
        print("\n Bigrams Data Training : \n")
        prediction = naiveBayes_a(textStemmedTrain, textStemmedTest,
                                  feature="bigrams", accuracyTrain=False)
        print("F-score: ")
        f_score = f1_score(starsTest, prediction, average=None, labels=stars)
        print(f_score)
        f_score_avg = f1_score(starsTest, prediction, labels=stars, average='macro')
        print("Macro F-score:", f_score_avg)

        # Part (f): TF-IDF features
        print("\n TF-IDF Data Training : \n")
        prediction = naiveBayes_a(textStemmedTrain, textStemmedTest,
                                  feature="tf-idf", accuracyTrain=False)
        print("F-score: ")
        f_score = f1_score(starsTest, prediction, average=None, labels=stars)
        print(f_score)
        f_score_avg = f1_score(starsTest, prediction, labels=stars, average='macro')
        print("Macro F-score:", f_score_avg)
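# naiveBayes_a() is defined elsewhere in this project. For the "tf-idf" branch, a roughly
# equivalent scikit-learn baseline (an illustration of the idea, not the original
# implementation) could be:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

def tfidf_nb_baseline(train_texts, train_stars, test_texts):
    # Bag-of-words TF-IDF features feeding a multinomial Naive Bayes classifier
    model = make_pipeline(TfidfVectorizer(), MultinomialNB())
    model.fit(train_texts, train_stars)
    return model.predict(test_texts)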
for st in xrange(5):
    reviews_star[st] = voacabulary["star" + str(st)]

for line in open(sys.argv[2], 'r'):
    data.append(json.loads(line))
    data_point = json.loads(line)
    reviews.append(data_point["text"])
    ratings.append(data_point["stars"])
print "reading done"

if MakeVocab:
    for i in xrange(len(reviews)):
        l = utils.getStemmedDocuments(reviews[i])
        star = int(float(ratings[i])) - 1
        reviews_star[star] += 1
        reviews_word_count[star] += len(l)
        # Per-word counts: voacabulary[word] is a list of 5 class counts, initialised
        # to 1 so add-one smoothing is baked in
        for j in xrange(len(l)):
            if l[j] in voacabulary:
                voacabulary[l[j]][star] += 1
            else:
                d = [1 for x in range(5)]
                d[star] += 1
                voacabulary[l[j]] = d
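# Every snippet here relies on a getStemmedDocuments helper from the course's utils
# module; its implementation is not included in this excerpt. A plausible minimal
# version (tokenize, drop English stopwords, Porter-stem; the return_tokens flag mirrors
# the second argument some call sites pass) would be:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

_stemmer = PorterStemmer()
_stopwords = set(stopwords.words('english'))

def getStemmedDocuments(text, return_tokens=True):
    tokens = re.findall(r"[\w']+", text.lower())
    stemmed = [_stemmer.stem(t) for t in tokens if t not in _stopwords]
    return stemmed if return_tokens else " ".join(stemmed)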
def text_processing(text, type='None'):
    if type == 'stemming':
        return utility.getStemmedDocuments(text)
    if type == 'lemmatize':
        return lemmatize(text)
    return text.split()
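# The lemmatize helper referenced above is not shown in this excerpt; a minimal
# NLTK-based stand-in and a usage example (both illustrative assumptions):
from nltk.stem import WordNetLemmatizer

_lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    # Lowercase, split on whitespace, and lemmatize each token with WordNet
    return [_lemmatizer.lemmatize(tok) for tok in text.lower().split()]

# Example:
# tokens = text_processing("The burritos were amazing", type='lemmatize')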