def main():
    """Run every Naive Bayes experiment listed in a JSON config file.

    The config path is taken from ``sys.argv[1]``; the original comment names
    ``naive-config.json`` as an example. The keys read from the config are:

    * ``"tests"``: list of test groups, each holding an ``"entries"`` list.
    * per entry: ``"N"`` (passed to preprocessing, training and prediction),
      ``"training"`` and ``"test"`` (each with ``"start"``/``"end"`` slice
      bounds into the review list), and ``"backoff"`` (truthy selects
      ``PredictPositiveStupidBackoff`` instead of ``PredictPositive``).

    For each entry a fresh ``NaiveBayes`` model is trained on the training
    slice, evaluated on the test slice, and the counts are handed to
    ``WriteResult``.

    Exits with a usage message when no config path is supplied.
    """
    # Fail with a readable message instead of an IndexError on sys.argv[1].
    if len(sys.argv) < 2:
        sys.exit("usage: pass the path to a JSON config file "
                 "(see naive-config.json) as the first argument")

    reviews = yelp_data.getReviewsTokenized()

    # The config lets several tests run in a single invocation.
    with open(sys.argv[1], 'rb') as f:
        config = json.load(f)

    for test_group in config["tests"]:
        WriteEntryHeader(test_group)
        for entry in test_group["entries"]:
            WriteResultHeader(entry)
            maxN = entry["N"]

            training_bounds = entry["training"]
            test_bounds = entry["test"]
            training_set = reviews[int(training_bounds["start"]):
                                   int(training_bounds["end"])]
            test_set = reviews[int(test_bounds["start"]):
                               int(test_bounds["end"])]

            # The vocabulary comes from the training slice only and is then
            # used to preprocess both slices.
            vocab = yelp_data.buildVocab(training_set)
            training_set_prep = yelp_data.preProcessN(training_set, vocab, maxN)
            test_set_prep = yelp_data.preProcessN(test_set, vocab, maxN)

            naiveBayes = NaiveBayes(vocab)
            naiveBayes.Train(training_set_prep, maxN)

            # Test accuracy. Counters stay floats, as in the original, since
            # they are passed straight to WriteResult.
            total = 0.0
            right = 0.0
            false_pos = 0.0
            false_neg = 0.0
            for review in test_set_prep:
                total += 1.0
                if entry["backoff"]:
                    is_positive = naiveBayes.PredictPositiveStupidBackoff(
                        review, maxN)
                else:
                    is_positive = naiveBayes.PredictPositive(review, maxN)

                # A review whose star value is in neither set is counted in
                # `total` only, exactly as before.
                stars = review[STARS]
                if stars in naiveBayes.positive:
                    if is_positive:
                        right += 1.0
                    else:
                        false_neg += 1.0
                elif stars in naiveBayes.negative:
                    if is_positive:
                        false_pos += 1.0
                    else:
                        right += 1.0

            WriteResult(total, right, false_pos, false_neg)
def main():
    """Prepare the maxent training and test ARFF files from Yelp reviews.

    Loads tokenized and tagged reviews, splits them 900/100 into training
    and test sets, trains the n-gram model on the training set, and has
    ``Maxent`` write ``yelp_maxent_training.arff`` and
    ``yelp_maxent_test.arff``.
    """
    max_order = 3
    review_limit = 1000   # presumably the number of reviews to load -- confirm
    split_at = 900
    pmi_top_k = 2800

    reviews, nlp = yelp_data.getReviewsTokenizedandTagged(review_limit)
    train_reviews = reviews[0:split_at]
    held_out_reviews = reviews[split_at:review_limit]

    # Vocabulary is built from the training reviews and reused for both sets.
    vocab = yelp_data.buildVocab(train_reviews)
    train_prep = yelp_data.preProcess(train_reviews, vocab)
    held_out_prep = yelp_data.preProcess(held_out_reviews, vocab)

    ngrams = Ngrams(nlp)
    ngrams.Train(train_prep, max_order)
    # Per the original comments: select the k POS bigrams, then the k POS
    # trigrams, with the highest PMI.
    for order in (2, 3):
        ngrams.CalculateNgramPMI(pmi_top_k, order)

    me = Maxent(vocab, nlp)
    me.buildChunks(train_prep)
    me.buildFeatures(ngrams, max_order)

    outputs = (
        (train_prep, "yelp_maxent_training.arff"),
        (held_out_prep, "yelp_maxent_test.arff"),
    )
    for dataset, arff_path in outputs:
        me.buildARFFfile(dataset, arff_path, max_order)
def main():
    """Generate ARFF feature files for the maxent sentiment experiment.

    Steps, in order: fetch the tokenized and tagged reviews, take the first
    900 as training data and the next 100 as test data, build the vocabulary
    from the training data, train the n-gram model and compute PMI for
    bigrams and trigrams, then build maxent features and write one ARFF file
    per data set.
    """
    ngram_order = 3
    top_k = 2800

    fetched = yelp_data.getReviewsTokenizedandTagged(1000)
    all_reviews, tagger = fetched

    train_part = all_reviews[0:900]
    test_part = all_reviews[900:1000]

    vocabulary = yelp_data.buildVocab(train_part)
    train_ready = yelp_data.preProcess(train_part, vocabulary)
    test_ready = yelp_data.preProcess(test_part, vocabulary)

    ngram_model = Ngrams(tagger)
    ngram_model.Train(train_ready, ngram_order)
    # Select the k POS bigrams with the highest PMI (original comment).
    ngram_model.CalculateNgramPMI(top_k, 2)
    # Select the k POS trigrams with the highest PMI (original comment).
    ngram_model.CalculateNgramPMI(top_k, 3)

    maxent = Maxent(vocabulary, tagger)
    maxent.buildChunks(train_ready)
    maxent.buildFeatures(ngram_model, ngram_order)
    maxent.buildARFFfile(train_ready, "yelp_maxent_training.arff", ngram_order)
    maxent.buildARFFfile(test_ready, "yelp_maxent_test.arff", ngram_order)