test_text.append(txt)

#print test_files
print len(test_files)

# Train a trigram model on successively larger slices of the training
# files and report the mean test-set perplexity at each slice size.
# NOTE: this assumes len(train_files) >= UPPER_LIMIT; otherwise the
# slice stops growing and the while loop never terminates.
total_train_files = []
TOTAL = INCREMENT
UPPER_LIMIT = 500
while len(total_train_files) < UPPER_LIMIT:
    total_train_files = train_files[:TOTAL]
    data_set_corpus = PlaintextCorpusReader(sys.argv[1], total_train_files)
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    lm = NgramModel(3, data_set_corpus.words(), estimator)
    #lm = NgramModel(2, data_set_corpus.words(), estimator)
    P = []
    for s in test_text:
        s_tokens = nltk.word_tokenize(s)
        if SENTENCE and len(s_tokens) <= 10:  # earlier threshold was > 3
            continue  # sentence-level run: skip short sentences
        P.append(lm.perplexity(s_tokens))
    TOTAL += INCREMENT
    if P:  # guard against division by zero when every sentence is filtered out
        print "%d %f" % (len(total_train_files), sum(P) / len(P))
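# --- Illustrative aside (not from the original script) ---
# A minimal, self-contained sketch of the train-then-score pattern used
# above, assuming the NLTK 2.x API (NgramModel was removed in NLTK 3).
# The Brown corpus and the 20,000-word training slice are stand-ins
# chosen for the example, not values from the original.
import nltk
from nltk.corpus import brown
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
# Train a smoothed trigram model, then score a held-out sentence;
# lower perplexity means the model is less surprised by the text.
lm = NgramModel(3, brown.words()[:20000], estimator=estimator)
print lm.perplexity(nltk.word_tokenize("the jury said the election was conducted"))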
print "Negative unigram model complete." neg_bigram_lm = NgramModel(2, neg_movie_reviews.words(), estimator) print "Negative bigram model complete." #neg_trigram_lm = NgramModel(3, neg_movie_reviews.words(), estimator) #read in the tweets tweets = [] tokenizer = utils.Tokenizer() neg_review_higher = 0 pos_review_higher = 0 with open(sys.argv[2], 'r') as tweets_file: tweets.extend(tweets_file.readlines()) for tweet in tweets: tokens = tokenizer.tokenize(tweet) pu = pos_unigram_lm.perplexity(tokens) nu = neg_unigram_lm.perplexity(tokens) pb = pos_bigram_lm.perplexity(tokens) nb = neg_bigram_lm.perplexity(tokens) #pt = pos_trigram_lm.perplexity(tokens) #nt = neg_trigram_lm.perplexity(tokens) #print pu, nu, pb, nb, pt, nt #print pu, nu line = "" if pu > nu: pos_review_higher += 1 line += "9002:1"