Exemple #1
0
def main():
    """Run the Naive Bayes experiments described by a JSON config file.

    The config path is read from ``sys.argv[1]`` (the original comment names
    ``naive-config.json`` as an example). The config must contain a
    ``"tests"`` list of test groups, each holding an ``"entries"`` list.
    Every entry supplies ``"N"``, a ``"backoff"`` flag, and ``"training"`` /
    ``"test"`` objects whose ``"start"`` / ``"end"`` values are converted to
    ``int`` and used as slice bounds into the tokenized reviews.

    For each entry a ``NaiveBayes`` model is trained on the training slice
    and scored on the test slice; the resulting counts are handed to
    ``WriteResult``.
    """
    reviews = yelp_data.getReviewsTokenized()

    # Parse the whole config first so the file handle is released before the
    # experiments run, rather than staying open for their entire duration.
    with open(sys.argv[1], 'rb') as f:
        config = json.load(f)

    for test_group in config["tests"]:
        WriteEntryHeader(test_group)
        for entry in test_group["entries"]:
            WriteResultHeader(entry)
            maxN = entry["N"]

            train_bounds = entry["training"]
            test_bounds = entry["test"]
            training_set = reviews[int(train_bounds["start"]):int(train_bounds["end"])]
            test_set = reviews[int(test_bounds["start"]):int(test_bounds["end"])]

            # The vocabulary comes from the training slice only and is then
            # applied to both slices.
            vocab = yelp_data.buildVocab(training_set)
            training_set_prep = yelp_data.preProcessN(training_set, vocab, maxN)
            test_set_prep = yelp_data.preProcessN(test_set, vocab, maxN)

            naiveBayes = NaiveBayes(vocab)
            naiveBayes.Train(training_set_prep, maxN)

            # The prediction method is fixed per entry, so pick it once
            # instead of re-checking the flag for every review.
            if entry["backoff"]:
                predict = naiveBayes.PredictPositiveStupidBackoff
            else:
                predict = naiveBayes.PredictPositive

            # Test accuracy. Counters are kept as floats, matching the values
            # that have always been passed to WriteResult.
            total = 0.0
            right = 0.0
            false_pos = 0.0
            false_neg = 0.0
            for review in test_set_prep:
                total += 1.0
                is_positive = predict(review, maxN)
                stars = review[STARS]

                if stars in naiveBayes.positive:
                    if is_positive:
                        right += 1.0
                    else:
                        false_neg += 1.0
                elif stars in naiveBayes.negative:
                    if is_positive:
                        false_pos += 1.0
                    else:
                        right += 1.0
                # A rating in neither set only contributes to ``total``.

            WriteResult(total, right, false_pos, false_neg)
Exemple #2
0
def main():
    """Build the maxent training and test ARFF files from tagged reviews.

    Requests 1000 reviews from ``yelp_data.getReviewsTokenizedandTagged``,
    uses ``reviews[0:900]`` for training and ``reviews[900:1000]`` for
    testing, trains ``Ngrams`` with N = 3, and writes
    ``yelp_maxent_training.arff`` and ``yelp_maxent_test.arff`` through
    ``Maxent.buildARFFfile``.
    """
    order = 3
    corpus, nlp_obj = yelp_data.getReviewsTokenizedandTagged(1000)

    train_reviews = corpus[0:900]
    held_out_reviews = corpus[900:1000]

    # The vocabulary is built from the training slice and reused for both.
    vocabulary = yelp_data.buildVocab(train_reviews)
    train_prepared = yelp_data.preProcess(train_reviews, vocabulary)
    held_out_prepared = yelp_data.preProcess(held_out_reviews, vocabulary)

    ngram_model = Ngrams(nlp_obj)
    ngram_model.Train(train_prepared, order)
    # Select the k POS bigrams, then the k POS trigrams, with the highest PMI.
    for gram_size in (2, 3):
        ngram_model.CalculateNgramPMI(2800, gram_size)

    maxent = Maxent(vocabulary, nlp_obj)
    maxent.buildChunks(train_prepared)
    maxent.buildFeatures(ngram_model, order)

    outputs = (
        (train_prepared, "yelp_maxent_training.arff"),
        (held_out_prepared, "yelp_maxent_test.arff"),
    )
    for prepared, arff_path in outputs:
        maxent.buildARFFfile(prepared, arff_path, order)
Exemple #3
0
def main():
    """Generate ARFF feature files for the maxent model.

    Steps, in order: fetch 1000 tokenized and tagged reviews, slice them
    into ``[0:900]`` (training) and ``[900:1000]`` (test), build the
    vocabulary from the training slice, preprocess both slices, train
    ``Ngrams`` with N = 3 and compute PMI for sizes 2 and 3, then have
    ``Maxent`` build chunks and features and write one ARFF file per slice.
    """
    n_max = 3
    split_at, review_count = 900, 1000

    fetched = yelp_data.getReviewsTokenizedandTagged(review_count)
    (all_reviews, nlp_handle) = fetched
    train_part = all_reviews[0:split_at]
    test_part = all_reviews[split_at:review_count]

    word_vocab = yelp_data.buildVocab(train_part)
    prepared_train = yelp_data.preProcess(train_part, word_vocab)
    prepared_test = yelp_data.preProcess(test_part, word_vocab)

    pos_ngrams = Ngrams(nlp_handle)
    pos_ngrams.Train(prepared_train, n_max)
    top_k = 2800
    # Select the k POS bigrams with the highest PMI
    pos_ngrams.CalculateNgramPMI(top_k, 2)
    # Select the k POS trigrams with the highest PMI
    pos_ngrams.CalculateNgramPMI(top_k, 3)

    classifier = Maxent(word_vocab, nlp_handle)
    classifier.buildChunks(prepared_train)
    classifier.buildFeatures(pos_ngrams, n_max)
    classifier.buildARFFfile(prepared_train, "yelp_maxent_training.arff", n_max)
    classifier.buildARFFfile(prepared_test, "yelp_maxent_test.arff", n_max)