def from_dict(d):
    """Build a ``Review`` from its dictionary representation.

    Args:
        d: mapping describing one review.  The keys ``"_id"``,
            ``"business_id"`` and ``"ratings"`` are optional and default to
            ``None``; ``"sentences"`` is indexed directly and must be an
            iterable of dicts accepted by ``Sentence.from_dict``.

    Returns:
        A new ``Review`` with ``id``, ``business_id``, ``ratings`` and
        ``sentences`` populated.

    Raises:
        KeyError: if ``d`` is a plain dict without a ``"sentences"`` key.
    """
    review = Review()

    # (attribute on Review, key in the dict).  All of these are optional;
    # in particular business_id is often irrelevant to the caller.
    optional_fields = (
        ("id", "_id"),
        ("business_id", "business_id"),
        ("ratings", "ratings"),
    )
    for attr_name, key in optional_fields:
        setattr(review, attr_name, d.get(key))

    review.sentences = list(map(Sentence.from_dict, d["sentences"]))
    return review
def load_sentences(dbname="tripadvisor_train", sentiment=3, sample_size=120):
    """Print a random sample of stored sentences with a given sentiment.

    Opens a ``MongoClient()`` with default connection settings, samples
    documents from the ``sentiment_sentences`` collection whose
    ``sentiment`` field equals ``sentiment``, converts each one with
    ``Sentence.from_dict`` and prints its aspect, sentiment, ``raw`` and
    ``words`` attributes.

    Args:
        dbname: name of the database to read from.
        sentiment: value the ``sentiment`` field must equal.
        sample_size: size passed to the ``$sample`` aggregation stage.

    Returns:
        None.  Output goes to stdout only.
    """
    client = MongoClient()
    try:
        sentisent_collection = client[dbname]["sentiment_sentences"]

        # Alternative selection that was used previously (range query plus a
        # fixed window instead of a random sample):
        #   sentisent_collection.find({'sentiment': {'$lte': 2}}).skip(99).limit(120)
        cursor = sentisent_collection.aggregate([
            {'$match': {'sentiment': sentiment}},
            {'$sample': {'size': sample_size}},
        ])

        # Number the printed sentences starting from 1.
        for number, sentd in enumerate(cursor, 1):
            sent = Sentence.from_dict(sentd)
            print("\n\n[{}] Aspect: {}, Sentiment: {}".format(number, sent.aspect, sent.sentiment))
            print(sent.raw)
            print("--------------")
            print(sent.words)
    finally:
        # Release the connection even if the query, the conversion or a
        # print fails part-way through.
        client.close()
def sample_split(dbname, num_train, num_test):
    """Sample sentences from MongoDB and pickle a train/test split.

    Draws ``num_train + num_test`` documents from the
    ``sentiment_sentences`` collection of ``dbname`` with a ``$sample``
    aggregation stage, converts each with ``Sentence.from_dict``, prints
    the relative frequency of every aspect and sentiment value seen,
    shuffles the samples and writes them to two pickle files in the
    current working directory:

    * ``sentidata_train_raw.pkl`` -- the first ``num_train`` samples
    * ``sentidata_test_raw.pkl``  -- the remaining samples

    Each sample is a ``(sent.words, sent.sentiment)`` tuple.

    Args:
        dbname: name of the database to read from.
        num_train: number of samples for the training file.
        num_test: number of samples requested for the test file.

    Returns:
        None.

    NOTE(review): presumably ``$sample`` returns fewer documents when the
    collection is smaller than ``num_train + num_test``, in which case the
    test file would be short -- confirm against the MongoDB docs.
    """
    ################## load and count
    aspect_dist = nltk.FreqDist()
    sentiment_dist = nltk.FreqDist()
    all_samples = []

    client = MongoClient()
    try:
        sentisent_collection = client[dbname].sentiment_sentences
        cursor = sentisent_collection.aggregate([
            {'$sample': {'size': num_train + num_test}},
        ])
        for d in cursor:
            sent = Sentence.from_dict(d)
            all_samples.append((sent.words, sent.sentiment))
            aspect_dist[sent.aspect] += 1
            sentiment_dist[int(sent.sentiment)] += 1
    finally:
        # Close the connection even when loading or conversion fails.
        client.close()

    ################## show statistics
    for k in aspect_dist:
        print('[{}]: {}'.format(k, aspect_dist.freq(k)))

    for k in sentiment_dist:
        print('[{}]: {}'.format(k, sentiment_dist.freq(k)))

    ################## shuffle
    random.shuffle(all_samples)

    ################## split
    def _dump(filename, data):
        # Pickle `data` into `filename`, overwriting any existing file.
        with open(filename, "wb") as outf:
            cPickle.dump(data, outf)

    _dump("sentidata_train_raw.pkl", all_samples[:num_train])
    _dump("sentidata_test_raw.pkl", all_samples[num_train:])