Example #1
def read_train(filename, stem=False, bigram=False):
    global doc_data, cnt, size_voc
    cnt = 0
    for i in range(5):
        class_dict[i][0] = Counter()
    for doc in utils.json_reader(filename):
        # Tokenize on word characters and apostrophes.
        txt = re.findall(r"[\w']+", doc["text"].lower())
        if stem:
            txt = utils.getStemmedDocuments(" ".join(txt))
            txt = [item for item in txt if not item.isdigit()]
        if bigram:
            txt = list(nltk.bigrams(txt))
        vocab.update(txt)
        cnt += 1
        class_dict[int(doc["stars"]) - 1][0].update(txt)
        class_dict[int(doc["stars"]) - 1][2] += 1
        doc_data.append([doc["stars"], Counter(txt)])

    for i in range(5):
        # class_dict[i] holds [word Counter, total word count, doc count].
        class_dict[i][1] = sum(class_dict[i][0].values())
        print(class_dict[i][1], class_dict[i][2])
    print("vocab")
    print(len(vocab))
    size_voc = len(vocab)
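Every example on this page calls a getStemmedDocuments helper from a local utils module that is not shown. As a minimal sketch of what such a helper typically looks like, assuming NLTK's word_tokenize, English stopword list, and PorterStemmer (the real helpers in these projects may differ):

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def getStemmedDocuments(text):
    # Lowercase, tokenize, drop stopwords, and Porter-stem each token.
    stemmer = PorterStemmer()
    stop = set(stopwords.words('english'))
    return [stemmer.stem(t) for t in nltk.word_tokenize(text.lower())
            if t not in stop]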
Example #2
def read_t(filename, stem=False, bigram=False):
    global doc_data
    for doc in utils.json_reader(filename):
        # Tokenize on word characters and apostrophes.
        txt = re.findall(r"[\w']+", doc["text"].lower())
        if stem:
            txt = utils.getStemmedDocuments(" ".join(txt))
            txt = [item for item in txt if not item.isdigit()]
        if bigram:
            txt = list(nltk.bigrams(txt))
        doc_data.append([doc["stars"], Counter(txt)])
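A hypothetical driver for these two readers (the file names are made up): read_train builds the vocabulary and per-class counts from the training split, while read_t only tokenizes the test split.

read_train("train.json", stem=True)
read_t("test.json", stem=True)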
Example #3
def main():
    # Per-label vocabularies (star ratings 1-5) built from the training data.
    vocab_list = [{}, {}, {}, {}, {}]
    vocabulary = {}
    vocab_list_bigrams = [{}, {}, {}, {}, {}]
    vocabulary_bigrams = {}
    # Per-label document, word, and bigram counts over the training data.
    label_count = np.zeros(5)
    label_word_count = np.zeros(5)
    label_bigram_count = np.zeros(5)

    start1 = time.time()
    ##############################################################################
    # Training part
    reader = ut.json_reader(train)
    i1 = 0
    for element in reader:
        i1 += 1
        if i1 % 1000 == 0:
            print("Training:", i1 // 1000)
        label_count[int(element["stars"]) - 1] += 1
        # Stem the review text, then build bigrams over the stemmed tokens.
        stemmed = ut.getStemmedDocuments(element["text"])
        bigram = nltk.bigrams(stemmed)
        bigramlist = list(map(''.join, bigram))

        # Count only stemmed tokens; adding the raw-split length as well
        # would double count every document's words.
        label_word_count[int(element["stars"]) - 1] += len(stemmed)
        label_bigram_count[int(element["stars"]) - 1] += len(bigramlist)

        for x in stemmed:
            word = x.strip(string.punctuation)
            if word == "":
                continue
            if word in vocab_list[int(element["stars"]) - 1]:
                vocab_list[int(element["stars"]) - 1][word] += 1
            else:
                vocab_list[int(element["stars"]) - 1][word] = 1

            vocabulary[word] = 1

        for x in bigramlist:
            word = x.strip(string.punctuation)
            if word == "":
                continue
            if word in vocab_list_bigrams[int(element["stars"]) - 1]:
                vocab_list_bigrams[int(element["stars"]) - 1][word] += 1
            else:
                vocab_list_bigrams[int(element["stars"]) - 1][word] = 1

            vocabulary_bigrams[word] = 1

##############################################################################

    end1 = time.time()
    print("Training done, Time taken(mins)", int(end1 - start1) / 60)

    # Class priors P(c) from label frequencies (TRAINSIZE defined elsewhere).
    prior = label_count / TRAINSIZE

    actual_value = []
    predicted_value = []
    random_prediction = []
    start2 = time.time()
    ##############################################################################
    # Testing
    i2 = 0
    iter2 = ut.json_reader(test)
    for test_element in iter2:
        i2 += 1
        if i2 % 1000 == 0:
            print("Testing:", i2 // 1000)
        # Baseline: a uniformly random rating in 1-5 (randint is inclusive).
        random_prediction.append(random.randint(1, 5))
        actual_value.append(int(test_element["stars"]))
        test_list = ut.getStemmedDocuments(test_element["text"])
        bigram = nltk.bigrams(test_list)
        bigramlist = list(map(''.join, bigram))
        results = []
        for i in range(5):
            # Log-posterior for rating i+1: log P(c) + sum of log P(w|c).
            py = prior[i]
            logr = 0
            for x in test_list:
                word = x.strip(string.punctuation)
                if word == "":
                    continue
                if word in vocab_list[i]:
                    # Laplace-smoothed likelihood of a seen word.
                    probability = (vocab_list[i][word] + 1) / (
                        label_word_count[i] + len(vocabulary))
                    logr += math.log(probability)
                else:
                    # Unseen word: smoothed count of zero.
                    logr += math.log(1 /
                                     (label_word_count[i] + len(vocabulary)))

            for x in bigramlist:
                word = x.strip(string.punctuation)
                if word == "":
                    continue
                if word in vocab_list_bigrams[i]:
                    probability = (vocab_list_bigrams[i][word] + 1) / (
                        label_bigram_count[i] + len(vocabulary_bigrams))
                    logr += math.log(probability)
                else:
                    logr += math.log(
                        1 / (label_bigram_count[i] + len(vocabulary_bigrams)))
            results.append(logr + math.log(py))

        predicted_value.append(results.index(max(results)) + 1)


##############################################################################

    # Majority-class baseline: always predict the most frequent training label.
    major = list(label_count).index(max(label_count)) + 1
    correct = 0
    correct_random = 0
    correct_major = 0
    confusion = np.zeros((5, 5))
    calc_f1_score = np.zeros(5)

    for i in range(len(predicted_value)):
        if predicted_value[i] == actual_value[i]:
            correct += 1
        if random_prediction[i] == actual_value[i]:
            correct_random += 1
        if major == actual_value[i]:
            correct_major += 1
        confusion[predicted_value[i] - 1][actual_value[i] - 1] += 1
    # Rows of the confusion matrix are predictions, columns are actual labels.
    row_sum = np.sum(confusion, axis=1)
    column_sum = np.sum(confusion, axis=0)
    for i in range(5):
        precision = confusion[i][i] / row_sum[i]
        recall = confusion[i][i] / column_sum[i]
        calc_f1_score[i] = 2 * (precision * recall) / (precision + recall)

    end2 = time.time()
    print("Testing done, Time taken(mins)", int(end2 - start2) / 60)

    # print("Correct")
    # print(correct)
    # print(len(actual_value))
    print("Accuracy using Naive Bayes: ",
          int(correct / len(actual_value) * 100), "%")
    print("Accuracy using Random prediciton: ",
          int(correct_random / len(actual_value) * 100), "%")
    print("Accuracy using Majority prediciton: ",
          int(correct_major / len(actual_value) * 100), "%")
    print("Confusion Matrix: ")
    print(confusion)
    print("F1 Scores:")
    for i in range(5):
        print("Label", i + 1, ": ", calc_f1_score[i])

    # Cross-check the manual computation against scikit-learn.
    new_confu = confusion_matrix(actual_value, predicted_value)
    new_f_score = f1_score(actual_value, predicted_value, average=None)
    print("New Confusion")
    print(new_confu)
    print("New F1_score")
    print(new_f_score)
    print(np.mean(new_f_score))
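The scoring loop in main() is multinomial Naive Bayes with Laplace (add-one) smoothing: P(w|c) = (count(w, c) + 1) / (total words in c + |V|). A minimal self-contained sketch of that rule (the names below are illustrative, not from the example):

import math

def smoothed_log_likelihood(tokens, counts, total_words, vocab_size):
    # Sum of log P(w|c) with add-one smoothing; unseen words get count 0.
    return sum(math.log((counts.get(w, 0) + 1) / (total_words + vocab_size))
               for w in tokens)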
Example #4
yelp_test = pd.read_json(path_test, lines=True)

X_train1 = yelp_train['text'][(yelp_train['stars'] == 1)].copy()
X_train2 = yelp_train['text'][(yelp_train['stars'] == 2)].copy()
X_train3 = yelp_train['text'][(yelp_train['stars'] == 3)].copy()
X_train4 = yelp_train['text'][(yelp_train['stars'] == 4)].copy()
X_train5 = yelp_train['text'][(yelp_train['stars'] == 5)].copy()

X_train = yelp_train['text'].copy()
Y_train = yelp_train['stars'].copy()
X_test = yelp_test['text'].copy()
Y_test = yelp_test['stars'].copy()

if part == 'd':
    # Stem every review: the per-star subsets and the full train/test splits.
    for series in (X_train1, X_train2, X_train3, X_train4, X_train5,
                   X_train, X_test):
        for i in series.keys():
            series[i] = utils.getStemmedDocuments(series[i], False)

X_merged1 = [X_train1.str.cat(sep=' ')]
Example #5
def naiveBayes_d_e_f(partNum):
    # Stem the raw train/test texts before feature extraction.
    textStemmedTrain = []
    for i in textTrain:
        tokens = getStemmedDocuments(i, False)
        textStemmedTrain.append(tokens)

    textStemmedTest = []
    for i in textTest:
        tokens = getStemmedDocuments(i, False)
        textStemmedTest.append(tokens)
    if partNum == "d":
        stars = list(sorted(set(starsTest)))
        print("\nStemmed Data Training:\n")
        prediction = naiveBayes_a(textStemmedTrain,
                                  textStemmedTest,
                                  feature=None,
                                  accuracyTrain=True)
        print("F-score:")
        f_score = f1_score(starsTest, prediction, average=None, labels=stars)
        print(f_score)
        f_score_avg = f1_score(starsTest,
                               prediction,
                               labels=stars,
                               average='macro')
        print("Macro F-score:", f_score_avg)

    else:
        stars = list(sorted(set(starsTest)))
        # Part e: bigram features.
        print("\nBigrams Data Training:\n")
        prediction = naiveBayes_a(textStemmedTrain,
                                  textStemmedTest,
                                  feature="bigrams",
                                  accuracyTrain=False)
        print("F-score:")
        f_score = f1_score(starsTest, prediction, average=None, labels=stars)
        print(f_score)

        f_score_avg = f1_score(starsTest,
                               prediction,
                               labels=stars,
                               average='macro')
        print("Macro F-score:", f_score_avg)

        # Part f: tf-idf features.
        print("\nTF-IDF Data Training:\n")
        prediction = naiveBayes_a(textStemmedTrain,
                                  textStemmedTest,
                                  feature="tf-idf",
                                  accuracyTrain=False)

        print("F-score:")
        f_score = f1_score(starsTest, prediction, average=None, labels=stars)
        print(f_score)

        f_score_avg = f1_score(starsTest,
                               prediction,
                               labels=stars,
                               average='macro')
        print("Macro F-score:", f_score_avg)
Example #6
        for st in range(5):
            reviews_star[st] = vocabulary["star" + str(st)]

    for line in open(sys.argv[2], 'r'):
        # Parse each JSON line once; keep the raw record, text, and rating.
        data_point = json.loads(line)
        data.append(data_point)
        reviews.append(data_point["text"])
        ratings.append(data_point["stars"])

    print("reading done")
    if MakeVocab:
        for i in range(len(reviews)):
            l = utils.getStemmedDocuments(reviews[i])

            star = int(float(ratings[i])) - 1
            reviews_star[star] += 1
            reviews_word_count[star] += len(l)

            # Per-word counts with a Laplace base count of 1 for every class.
            for j in range(len(l)):
                if l[j] in vocabulary:
                    vocabulary[l[j]][star] += 1
                else:
                    d = [1 for x in range(5)]
                    d[star] += 1
                    vocabulary[l[j]] = d
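In this example each vocabulary entry is a five-slot list of per-star counts, pre-seeded with 1 as a Laplace base count. A hypothetical lookup at prediction time reads those slots directly:

# Smoothed count of word w under star rating s (1-5); w and s are illustrative.
count_ws = vocabulary[w][s - 1]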
Example #7
def text_processing(text, type='None'):
    # Dispatch on the requested preprocessing: stem, lemmatize, or plain split.
    if type == 'stemming':
        return utility.getStemmedDocuments(text)
    if type == 'lemmatize':
        return lemmatize(text)
    return text.split()
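A quick usage sketch for text_processing (the input sentence is made up):

print(text_processing("The food was excellent", type='stemming'))  # stemmed tokens
print(text_processing("The food was excellent"))                   # plain whitespace split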