def split_review_corpus(self, document_class):
    """Split the yelp review corpus into training, dev, and test sets.

    Builds a ``ReviewCorpus`` from ``yelp_reviews.json`` using
    ``document_class``, shuffles it with a fixed seed, and slices it.

    Args:
        document_class: Document class forwarded to ``ReviewCorpus``.

    Returns:
        A ``(train, dev, test)`` tuple holding the first 10000 shuffled
        reviews, the next 1000, and the following 3000.
    """
    reviews = ReviewCorpus('yelp_reviews.json', document_class=document_class)
    # Seed with the string itself instead of hash("reviews"): str hashes are
    # salted per process on Python 3, so the hashed seed (and therefore the
    # split) changed on every run. Assumes `seed` is random.seed.
    seed("reviews")
    shuffle(reviews)
    return (reviews[:10000], reviews[10000:11000], reviews[11000:14000])
def split_review_corpus(self, document_class):
    """Split the yelp review corpus into training, dev, and test sets.

    Builds a ``ReviewCorpus`` from ``yelp_reviews.json`` (passing the total
    number of records needed as its second argument), shuffles it with a
    fixed seed, and slices it.

    Args:
        document_class: Document class forwarded to ``ReviewCorpus``.

    Returns:
        A ``(train, dev, test, label_set, features_set)`` tuple: 10000
        training reviews, 1000 dev reviews, 3000 test reviews, followed by
        the corpus' ``label_set`` and ``features_set`` attributes.
    """
    training_size = 10000
    dev_size = 1000
    test_size = 3000
    total_records = training_size + dev_size + test_size
    reviews = ReviewCorpus('yelp_reviews.json', total_records, document_class=document_class)
    # Seed with the string itself instead of hash("reviews"): str hashes are
    # salted per process on Python 3, so the hashed seed (and therefore the
    # split) changed on every run. Assumes `seed` is random.seed.
    seed("reviews")
    shuffle(reviews)
    # The dev slice must be bounded by dev_size; bounding it with test_size
    # handed 3000 records to dev and only 1000 to test.
    dev_end = training_size + dev_size
    return (
        reviews[:training_size],
        reviews[training_size:dev_end],
        reviews[dev_end:total_records],
        reviews.label_set,
        reviews.features_set,
    )
def split_review_corpus(self, document_class):
    """Split the yelp review corpus into a training set and a held-out set.

    Builds a ``ReviewCorpus`` from ``yelp_reviews.json`` using
    ``document_class``, shuffles it with a fixed seed, and slices it.

    Args:
        document_class: Document class forwarded to ``ReviewCorpus``.

    Returns:
        A ``(train, held_out)`` tuple holding the first 10000 shuffled
        reviews and the following 4000.
    """
    reviews = ReviewCorpus('yelp_reviews.json', document_class=document_class)
    # Seed with the string itself instead of hash("reviews"): str hashes are
    # salted per process on Python 3, so the hashed seed (and therefore the
    # split) changed on every run. Assumes `seed` is random.seed.
    seed("reviews")
    shuffle(reviews)
    return (reviews[:10000], reviews[10000:14000])
def get_corpus(document_class):
    """Load the yelp review corpus and return it shuffled with a fixed seed.

    Args:
        document_class: Document class forwarded to ``ReviewCorpus``.

    Returns:
        The ``ReviewCorpus`` built from ``yelp_reviews.json``, shuffled in
        place.
    """
    reviews = ReviewCorpus('yelp_reviews.json', document_class=document_class)
    # Seed with the string itself instead of hash("reviews"): str hashes are
    # salted per process on Python 3, so the hashed seed (and therefore the
    # order) changed on every run. Assumes `seed` is random.seed.
    seed("reviews")
    shuffle(reviews)
    return reviews
def features(self):
    """Trivially tokenized words."""
    # Returns the literal tag "bagofwords" rather than a token list.
    # NOTE(review): presumably a mode selector interpreted downstream --
    # confirm against the corpus/classifier code.
    return "bagofwords"


class Name(Document):
    """Document whose features are the first and last elements of its data."""

    def features(self):
        """Return ``['First=<x>', 'Last=<y>']`` built from ``self.data``.

        ``<x>`` is ``self.data[0]`` and ``<y>`` is ``self.data[-1]``.
        NOTE(review): presumably ``self.data`` is a non-empty name string --
        confirm; empty data would raise ``IndexError``.
        """
        name = self.data
        return ['First=%s' % name[0], 'Last=%s' % name[-1]]


class Bigram(Document):
    """Document whose ``features`` returns the literal tag ``"bigram"``."""

    def features(self):
        """Return the tag ``"bigram"``."""
        # Different feature-generating mode.
        # NOTE(review): presumably a mode selector interpreted downstream --
        # confirm against the corpus/classifier code.
        return "bigram"


# Module-level classifier instance; the disabled experiment below trains it.
classifier = MaxEnt()
# Corpus built at module level from 'yelp_reviews.json' with BagOfWords
# documents.
instances = ReviewCorpus('yelp_reviews.json',document_class=BagOfWords)

# Disabled experiment: train on increasing `maxlength` values, collect the
# score returned by each run, and plot score against length.
# ##experiment 1
# print('experiment 1')
# y1 = []
# x = []
# lengths = [1000,10000,50000,100000,len(instances.documents)]
# for length in lengths:
#     score = classifier.train(instances, maxlength=length, batch_size=30, l2_value=0.1, dev_instances=None)
#     print("score:",score)
#     y1.append(score)
#     x.append(str(length))
#
# plt.plot(x,y1)
# for xy in zip(x, y1):
#     plt.annotate("(%s,%s)" % xy, xy=xy, xytext=(-20, 10), textcoords='offset points')