Ejemplo n.º 1
0
 def from_dict(d):
     """Build a Review from a dictionary.

     Reads the optional keys ``_id``, ``business_id`` and ``ratings``
     (each becomes ``None`` when absent) and the ``sentences`` key, which
     is indexed directly and must therefore be present. Every entry of
     ``d["sentences"]`` is converted with ``Sentence.from_dict``.

     Returns the populated Review instance.
     """
     review = Review()
     review.id = d.get("_id")
     # business_id is optional: some callers do not care about it
     review.business_id = d.get("business_id")
     review.ratings = d.get("ratings")

     converted = []
     for sent_dict in d["sentences"]:
         converted.append(Sentence.from_dict(sent_dict))
     review.sentences = converted

     return review
def load_sentences(dbname="tripadvisor_train", sentiment=3, size=120):
    """Print a random sample of sentences that carry a given sentiment.

    Opens a ``MongoClient()`` with default settings, runs a ``$match`` +
    ``$sample`` aggregation on the ``sentiment_sentences`` collection of
    ``dbname`` and prints, for every returned document, its aspect,
    sentiment, raw text and words (via ``Sentence.from_dict``).

    Args:
        dbname: Name of the database to read from.
        sentiment: Value the documents' ``sentiment`` field must equal.
        size: Sample size passed to the ``$sample`` stage.

    The defaults reproduce the previously hard-coded behaviour, so
    existing ``load_sentences()`` calls are unaffected.
    """
    client = MongoClient()
    try:
        collection = client[dbname]["sentiment_sentences"]
        pipeline = [
            {'$match': {'sentiment': sentiment}},
            {'$sample': {'size': size}},
        ]
        # Single-argument print(...) produces the same output as the
        # Python 2 print statement while also parsing under Python 3.
        for number, doc in enumerate(collection.aggregate(pipeline), 1):
            sent = Sentence.from_dict(doc)
            print("\n\n[{}] Aspect: {}, Sentiment: {}".format(
                number, sent.aspect, sent.sentiment))
            print(sent.raw)
            print("--------------")
            print(sent.words)
    finally:
        # Release the connection even if the query or conversion fails.
        client.close()
def sample_split(dbname, num_train, num_test):
    """Sample sentences from MongoDB and pickle a train/test split.

    Draws ``num_train + num_test`` random documents (``$sample``) from the
    ``sentiment_sentences`` collection of ``dbname``, prints the relative
    frequency of every aspect and sentiment value in the sample, shuffles
    the ``(words, sentiment)`` pairs and writes them to
    ``sentidata_train_raw.pkl`` (first ``num_train`` pairs) and
    ``sentidata_test_raw.pkl`` (the remainder) in the current directory.

    Args:
        dbname: Name of the database to read from.
        num_train: Number of samples for the training file.
        num_test: Number of samples for the test file.

    Note:
        The split is a plain slice, so if fewer documents come back than
        requested the test file is correspondingly shorter.
    """
    aspect_dist = nltk.FreqDist()
    sentiment_dist = nltk.FreqDist()
    all_samples = []

    # Load and count.
    client = MongoClient()
    try:
        collection = client[dbname].sentiment_sentences
        cursor = collection.aggregate(
            [{'$sample': {'size': num_train + num_test}}])
        for doc in cursor:
            sent = Sentence.from_dict(doc)
            all_samples.append((sent.words, sent.sentiment))

            aspect_dist[sent.aspect] += 1
            sentiment_dist[int(sent.sentiment)] += 1
    finally:
        # Release the connection even if loading fails half-way.
        client.close()

    # Show statistics: aspects first, then sentiments.
    # Single-argument print(...) behaves identically on Python 2 and 3.
    for dist in (aspect_dist, sentiment_dist):
        for key in dist:
            print('[{}]: {}'.format(key, dist.freq(key)))

    # Shuffle before slicing into the two splits.
    random.shuffle(all_samples)

    def _dump(filename, data):
        # Pickle `data` into `filename` (binary mode, default protocol).
        with open(filename, "wb") as outf:
            cPickle.dump(data, outf)

    _dump("sentidata_train_raw.pkl", all_samples[:num_train])
    _dump("sentidata_test_raw.pkl", all_samples[num_train:])