def split_train_validation():
    """
    Move a random 30% of the reviews from the 'train' collection into the
    'validate' collection.

    All ids are loaded from 'train' through ReviewsDAL.load_ids, then
    int(total * 0.3) of them are drawn with random.sample after seeding the
    module-level generator with 999, so the same ids in the same order give
    the same draw.  Each drawn review is copied into 'validate' and then
    deleted from 'train'.  Progress and the final sizes of both collections
    are printed.

    Raises AssertionError if a delete does not remove exactly one document.
    """
    random.seed(999)
    valid_ratio = 0.3

    dal = ReviewsDAL()
    train_ids = list(dal.load_ids("train"))
    total_train = len(train_ids)
    print("originally, there are {} reviews in train set".format(total_train))

    valid_ids = random.sample(train_ids, int(total_train * valid_ratio))
    print("randomly draw {} samples to use as validation".format(len(valid_ids)))

    train_collect = dal._db['train']
    valid_collect = dal._db['validate']

    # Counted explicitly instead of reusing the loop variable afterwards:
    # with an empty sample the loop body never runs and the loop variable
    # would be unbound when the summary line is printed.
    transferred = 0
    for index, valid_id in enumerate(valid_ids):
        # load from train collection
        cursor = train_collect.find({'_id': valid_id})
        review_dict = next(cursor)

        # insert into validation collection first, so the review is never
        # absent from both collections at the same time
        valid_collect.insert_one(review_dict)

        # remove from train collection
        result = train_collect.delete_one({'_id': valid_id})
        assert result.deleted_count == 1, \
            "expected to delete exactly one review with _id {!r}".format(valid_id)
        transferred += 1

        # report progress on the 1st, 101st, 201st, ... review
        if index % 100 == 0:
            print("{} reviews transferred from train to validation".format(index + 1))

    print("*** totally {} reviews transferred from train to validation ***".format(transferred))
    print("now, train set has {} reviews".format(train_collect.count({})))
    print("now, validation set has {} reviews".format(valid_collect.count({})))
def test_load_ids():
    """
    Smoke check: build a ReviewsDAL and request the ids of the 'train'
    collection.  The result is bound to a local and not inspected further.
    """
    reviews_dal = ReviewsDAL()
    train_ids = reviews_dal.load_ids("train")