def splitColdStart(reviews, minRatings):
    """Partition reviews into the four cold-start scenarios.

    A user (or business) counts as "known" when its id is present in the
    corresponding dictionary returned by ``YelpPredictor`` and element
    ``[1]`` of its value is at least ``minRatings``.  Per the original
    comments that element is the review count -- confirm against
    ``YelpPredictor.getAverageUserStars`` / ``getAverageBusinessStars``.

    Parameters:
        reviews: sized sequence of entries where ``entry[0]`` is the user id
            and ``entry[1]`` is the business id.
        minRatings: threshold compared against element ``[1]`` of the
            user / business dictionary values.

    Returns:
        A list of four lists, in this order:
        ``[userBizSplit, userOnlySplit, bizOnlySplit, coldSplit]``.

    Side effects:
        Prints the count and share of every scenario, followed by the size
        of the input and of each split.
    """
    # Scenario indexes shared by the counters and the output buckets:
    #   0 = both user and business are known
    #   1 = only the user is known
    #   2 = only the business is known
    #   3 = neither user nor business is known
    coldStartCalc = [0, 0, 0, 0]

    userBizSplit = []
    userOnlySplit = []
    bizOnlySplit = []
    coldSplit = []
    outputSplitter = [userBizSplit, userOnlySplit, bizOnlySplit, coldSplit]

    bizDict = YelpPredictor.getAverageBusinessStars()
    userDict = YelpPredictor.getAverageUserStars()

    for entry in reviews:
        user_id = entry[0]
        business_id = entry[1]

        # "Known" means: present in the dictionary AND meets the threshold.
        # A missing id and an id below the threshold are treated the same.
        userKnown = (user_id in userDict
                     and userDict[user_id][1] >= minRatings)
        bizKnown = (business_id in bizDict
                    and bizDict[business_id][1] >= minRatings)

        if userKnown:
            scenario = 0 if bizKnown else 1
        else:
            scenario = 2 if bizKnown else 3

        coldStartCalc[scenario] += 1
        outputSplitter[scenario].append(entry)
    # End FOR each review

    # Single-argument print(...) behaves the same under Python 2 and 3.
    total = sum(coldStartCalc)
    labels = ('both user and biz: ', 'only user: ', 'only biz: ', 'neither: ')
    for label, count in zip(labels, coldStartCalc):
        # Guard against an empty input, which would otherwise divide by zero.
        share = float(count) / float(total) if total else 0.0
        print(label + str(count) + ', ' + str(share))

    print(str(len(reviews)) + ' input reviews')
    print(str(len(userBizSplit)) + ' userBizSplit')
    print(str(len(userOnlySplit)) + ' userOnlySplit')
    print(str(len(bizOnlySplit)) + ' bizOnlySplit')
    print(str(len(coldSplit)) + ' coldSplit')

    return outputSplitter
def simulateKaggleMix(reviews, minRatings):
    """Split reviews into four cold-start scenarios with a fixed target mix.

    Each review is first classified by its natural scenario: a user (or
    business) counts as "known" when its id is present in the dictionary
    returned by ``YelpPredictor`` and element ``[1]`` of its value is at
    least ``minRatings``.  Per the original comments that element is the
    review count -- confirm against ``YelpPredictor``.

    Every scenario has a target size of 33% / 11% / 41% / 15% of
    ``len(reviews)`` (truncated), which the original comments attribute to
    an analysis of the final Kaggle test set.  A review whose natural
    scenario is already full goes to an overflow pool; afterwards the pool
    tops up the user-only, business-only and cold splits (in that order)
    until each reaches its target or the pool is empty.

    NOTE(review): reviews still in the overflow pool after rebalancing are
    not returned in any split.  Because targets are truncated and the
    user+business split is never topped up, the splits may hold fewer
    entries than ``reviews`` -- confirm this is intended.

    Parameters:
        reviews: sized sequence of entries where ``entry[0]`` is the user id
            and ``entry[1]`` is the business id.
        minRatings: threshold compared against element ``[1]`` of the
            user / business dictionary values.

    Returns:
        A list of four lists, in this order:
        ``[userBizSplit, userOnlySplit, bizOnlySplit, coldSplit]``.

    Side effects:
        Prints the achieved count, share and target of every scenario,
        followed by the size of the input and of each split.
    """
    # Scenario indexes shared by counters, targets and output buckets:
    #   0 = both user and business are known
    #   1 = only the user is known
    #   2 = only the business is known
    #   3 = neither user nor business is known
    coldStartCalc = [0, 0, 0, 0]

    userBizSplit = []
    userOnlySplit = []
    bizOnlySplit = []
    coldSplit = []
    outputSplitter = [userBizSplit, userOnlySplit, bizOnlySplit, coldSplit]

    # Target number of reviews per scenario, truncated to whole reviews.
    totalReviews = len(reviews)
    targetCalc = [
        math.trunc(0.33 * totalReviews),
        math.trunc(0.11 * totalReviews),
        math.trunc(0.41 * totalReviews),
        math.trunc(0.15 * totalReviews),
    ]

    # Reviews whose natural scenario is already at its target; used below
    # to fill scenarios that did not reach their target naturally.
    rebalance = []

    bizDict = YelpPredictor.getAverageBusinessStars()
    userDict = YelpPredictor.getAverageUserStars()

    for entry in reviews:
        user_id = entry[0]
        business_id = entry[1]

        # "Known" means: present in the dictionary AND meets the threshold.
        # A missing id and an id below the threshold are treated the same.
        userKnown = (user_id in userDict
                     and userDict[user_id][1] >= minRatings)
        bizKnown = (business_id in bizDict
                    and bizDict[business_id][1] >= minRatings)

        if userKnown:
            scenario = 0 if bizKnown else 1
        else:
            scenario = 2 if bizKnown else 3

        if coldStartCalc[scenario] < targetCalc[scenario]:
            coldStartCalc[scenario] += 1
            outputSplitter[scenario].append(entry)
        else:
            rebalance.append(entry)
    # End FOR each review

    # Top up the scenarios that were not naturally completed, in the order
    # user-only, business-only, true cold start.  The user+business split
    # (index 0) is deliberately not topped up here, as in the original code.
    for scenario in (1, 2, 3):
        while coldStartCalc[scenario] < targetCalc[scenario] and rebalance:
            coldStartCalc[scenario] += 1
            outputSplitter[scenario].append(rebalance.pop())
        # End WHILE scenario is unbalanced and overflow stock is available

    # Single-argument print(...) behaves the same under Python 2 and 3.
    placed = sum(coldStartCalc)
    labels = ('both user and biz: ', 'only user: ', 'only biz: ', 'neither: ')
    targetLabels = ('33%', '11%', '41%', '15%')
    for scenario in range(4):
        count = coldStartCalc[scenario]
        # Nothing is placed for empty input, or when every truncated target
        # is 0 (1-2 reviews); avoid dividing by zero in that case.
        share = float(count) / float(placed) if placed else 0.0
        print(labels[scenario] + str(count) + ', ' + str(share)
              + '; Target: ' + str(targetCalc[scenario])
              + ', ' + targetLabels[scenario])

    print(str(len(reviews)) + ' input reviews')
    print(str(len(userBizSplit)) + ' userBizSplit')
    print(str(len(userOnlySplit)) + ' userOnlySplit')
    print(str(len(bizOnlySplit)) + ' bizOnlySplit')
    print(str(len(coldSplit)) + ' coldSplit')

    return outputSplitter