from random import seed, shuffle  # ReviewCorpus is assumed importable from the project's corpus module

def split_review_corpus(self, document_class):
    """Split the Yelp review corpus into training, dev, and test sets."""
    reviews = ReviewCorpus('yelp_reviews.json',
                           document_class=document_class)
    # str hashes are salted per process in Python 3, so this seed is only
    # deterministic within a single run unless PYTHONHASHSEED is set.
    seed(hash("reviews"))
    shuffle(reviews)  # ReviewCorpus must behave like a mutable sequence
    return (reviews[:10000], reviews[10000:11000], reviews[11000:14000])
def split_review_corpus(self, document_class):
    """Split the Yelp review corpus into training, dev, and test sets."""
    training_size = 10000
    dev_size = 1000
    test_size = 3000
    total_records = training_size + dev_size + test_size
    reviews = ReviewCorpus('yelp_reviews.json',
                           total_records,
                           document_class=document_class)
    seed(hash("reviews"))
    shuffle(reviews)
    # train = first 10000, dev = next 1000, test = final 3000
    return (reviews[:training_size],
            reviews[training_size:training_size + dev_size],
            reviews[training_size + dev_size:total_records],
            reviews.label_set, reviews.features_set)
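A minimal usage sketch for the five-tuple variant above; `splitter` is a hypothetical name for whatever object defines split_review_corpus, and BagOfWords is the document class defined further down:

# Hypothetical caller -- splitter stands in for the object owning split_review_corpus.
train, dev, test, label_set, features_set = splitter.split_review_corpus(BagOfWords)
print(len(train), len(dev), len(test))  # 10000 1000 3000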
Example #3
def split_review_corpus(self, document_class):
    """Split the Yelp review corpus into training and test sets."""
    reviews = ReviewCorpus('yelp_reviews.json',
                           document_class=document_class)
    seed(hash("reviews"))
    shuffle(reviews)
    return (reviews[:10000], reviews[10000:14000])
Example #4
def get_corpus(document_class):
    """Load the Yelp review corpus and return it in shuffled order."""
    reviews = ReviewCorpus('yelp_reviews.json', document_class=document_class)
    seed(hash("reviews"))
    shuffle(reviews)
    return reviews
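Since get_corpus only shuffles and returns the whole corpus, the split happens at the call site. A sketch with illustrative sizes:

# Slice the shuffled corpus into train/dev/test (sizes are illustrative, not prescribed).
reviews = get_corpus(BagOfWords)
train, dev, test = reviews[:10000], reviews[10000:11000], reviews[11000:14000]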
class BagOfWords(Document):
    def features(self):
        """Trivially tokenized words."""
        return "bagofwords"  # feature-mode flag rather than a feature list

class Name(Document):
    def features(self):
        name = self.data
        # First and last characters of the name, e.g. 'Alice' -> ['First=A', 'Last=e']
        return ['First=%s' % name[0], 'Last=%s' % name[-1]]

class Bigram(Document):
    def features(self):
        # A different feature-generation mode: bigrams instead of bag of words
        return "bigram"

classifier = MaxEnt()
instances = ReviewCorpus('yelp_reviews.json', document_class=BagOfWords)

# ## Experiment 1: accuracy as a function of training-set size
# print('experiment 1')
# y1 = []
# x = []
# lengths = [1000, 10000, 50000, 100000, len(instances.documents)]
# for length in lengths:
#     score = classifier.train(instances, maxlength=length, batch_size=30, l2_value=0.1, dev_instances=None)
#     print("score:", score)
#     y1.append(score)
#     x.append(str(length))
#
# plt.plot(x, y1)  # requires: import matplotlib.pyplot as plt
# for xy in zip(x, y1):
#     plt.annotate("(%s,%s)" % xy, xy=xy, xytext=(-20, 10), textcoords='offset points')
# plt.show()