def splitColdStart(reviews, minRatings):
    """Partition reviews into the four cold-start scenarios.

    A user is treated as "known" when its id is a key of the dictionary
    returned by ``YelpPredictor.getAverageUserStars()`` and element ``[1]``
    of the stored value is at least ``minRatings``.  Businesses are handled
    the same way through ``YelpPredictor.getAverageBusinessStars()``.
    (Element ``[1]`` is presumably the number of reviews -- confirm against
    YelpPredictor.)

    Args:
        reviews: sized sequence of entries; ``entry[0]`` is the user id and
            ``entry[1]`` is the business id.  Entries are stored unchanged.
        minRatings: threshold compared against element ``[1]`` of the
            user / business statistics.

    Returns:
        A list of four lists, in this order:
        ``[userBizSplit, userOnlySplit, bizOnlySplit, coldSplit]`` i.e.
        both known, only user known, only business known, neither known.

    Side effects:
        Prints the size and share of every scenario.  The share is printed
        as 0.0 when no review was classified (empty input).
    """
    bizDict = YelpPredictor.getAverageBusinessStars()
    userDict = YelpPredictor.getAverageUserStars()

    userBizSplit = []
    userOnlySplit = []
    bizOnlySplit = []
    coldSplit = []
    # Index in this list == scenario number:
    #   0 = both user and biz known, 1 = only user known,
    #   2 = only business known,     3 = neither known
    outputSplitter = [userBizSplit, userOnlySplit, bizOnlySplit, coldSplit]

    for entry in reviews:
        user_id = entry[0]
        business_id = entry[1]

        # "Known" means: present in the dictionary AND meets the threshold.
        userKnown = (user_id in userDict
                     and userDict[user_id][1] >= minRatings)
        bizKnown = (business_id in bizDict
                    and bizDict[business_id][1] >= minRatings)

        if userKnown:
            scenario = 0 if bizKnown else 1
        else:
            scenario = 2 if bizKnown else 3
        outputSplitter[scenario].append(entry)
    # End FOR each review

    coldStartCalc = [len(split) for split in outputSplitter]
    total = sum(coldStartCalc)
    labels = ['both user and biz: ', 'only user: ', 'only biz: ', 'neither: ']
    for label, count in zip(labels, coldStartCalc):
        # Guard the ratio so an empty review list does not divide by zero.
        share = float(count) / float(total) if total else 0.0
        print(label + str(count) + ', ' + str(share))

    print(str(len(reviews)) + ' input reviews')
    print(str(len(userBizSplit)) + ' userBizSplit')
    print(str(len(userOnlySplit)) + ' userOnlySplit')
    print(str(len(bizOnlySplit)) + ' bizOnlySplit')
    print(str(len(coldSplit)) + ' coldSplit')

    return outputSplitter
def yelpCVTester(minRatings, cvNum=10):
    """Run a cross-validation style RMSE test of the cold-start predictor.

    The known ratings are split into the four cold-start scenarios by
    ``simulateKaggleMix``.  From every scenario ``cvNum`` equally sized
    groups are drawn at random without replacement, each group set is
    passed to ``YelpPredictor.yelpColdStartSplitCV`` and the RMSE is
    computed per scenario and for all four scenarios combined.

    Args:
        minRatings: threshold forwarded to ``simulateKaggleMix`` and to the
            predictor.
        cvNum: number of CV groups to build (defaults to 10, the value that
            used to be hard-coded).

    Returns:
        None.  Best / worst / mean / standard deviation of the RMSE values
        are printed.

    Raises:
        ValueError: if ``cvNum`` is smaller than 1.
    """
    if cvNum < 1:
        raise ValueError('cvNum must be at least 1')

    print('start yelp tester')
    scenarioCount = 4

    # Get map of known user->business->review/stars ratings
    userBizRatings = getUserBusinessRatings()

    # Get set of user/business pairs that have ratings so that the sets
    # can be randomized for RMSE testing of different methods
    userBiz = getUserBizReviewTuple(userBizRatings)

    # Get a mapping of review_id to rating to use in the RMSE calculation
    actualRatings = getReviewStarMap(userBizRatings)

    # Get balanced cold start scenarios
    # (MODIFY HERE FOR TESTING A DIFFERENT COLD START COMBO)
    coldStartScenarios = simulateKaggleMix(userBiz, minRatings)

    # Size of one CV group per scenario; leftovers that do not fill a
    # complete group are simply not used.
    cvGroupSize = [len(coldStartScenarios[idx]) // cvNum
                   for idx in range(scenarioCount)]

    # cvSet holds cvNum entries, each a list of 4 scenario lists, each
    # containing the entries drawn for that scenario.
    cvSet = []
    for _group in range(cvNum):
        tmpCVSet = []
        for idx in range(scenarioCount):
            pool = coldStartScenarios[idx]
            tmpCVScenario = []
            for _draw in range(cvGroupSize[idx]):
                # Draw without replacement: the chosen entry is removed
                # from the pool, whose length is re-read on every draw.
                tmpCVScenario.append(
                    pool.pop(random.randint(0, len(pool) - 1)))
            tmpCVSet.append(tmpCVScenario)
        cvSet.append(tmpCVSet)
    # End FOR creating each CV group

    # RMSE per scenario (one list per scenario) and for the combined set
    splitRMSE = [[] for _ in range(scenarioCount)]
    combinedRMSE = []

    for testSet in cvSet:
        # Other predictors noted by the original author for this slot:
        # yelpTedWeights, yelpUserCategoryAvg, yelpBizAvg, yelpUserAvg,
        # yelpRandom (all in YelpPredictor).
        predictions = YelpPredictor.yelpColdStartSplitCV(testSet, minRatings)

        # Separate RMSE for each cold start scenario
        tmpRMSE = [getRMSE(predictions[idx], actualRatings)
                   for idx in range(scenarioCount)]
        for idx in range(scenarioCount):
            splitRMSE[idx].append(tmpRMSE[idx])

        # Merge the four prediction mappings for the overall RMSE
        tmpCombinedPredictions = predictions[0].copy()
        for idx in range(1, scenarioCount):
            tmpCombinedPredictions.update(predictions[idx])
        tmpCombinedRMSE = getRMSE(tmpCombinedPredictions, actualRatings)

        print(str(tmpRMSE))
        print(str(tmpCombinedRMSE))
        combinedRMSE.append(tmpCombinedRMSE)
    # End FOR testing each cv set

    # Best, worst, mean and standard deviation of the RMSE values
    bestScenario = [min(values, key=float) for values in splitRMSE]
    worstScenario = [max(values, key=float) for values in splitRMSE]
    meanScenario = [sum(values) / float(len(values)) for values in splitRMSE]
    sdScenario = [numpy.std(values) for values in splitRMSE]
    bestCombined = min(combinedRMSE, key=float)
    worstCombined = max(combinedRMSE, key=float)

    print('num of CV tests: ' + str(cvNum))
    print('size of each CV set: ' + str(cvGroupSize))
    print('best scenarios: ' + str(bestScenario))
    print('worst scenarios: ' + str(worstScenario))
    print('mean scenarios: ' + str(meanScenario))
    print('sd scenarios: ' + str(sdScenario))
    print('best combined: ' + str(bestCombined))
    print('worst combined: ' + str(worstCombined))
    print('mean: ' + str(sum(combinedRMSE) / float(len(combinedRMSE))))
    print('sd: ' + str(numpy.std(combinedRMSE)))
    print('end yelp tester')
def simulateKaggleMix(reviews, minRatings):
    """Split reviews into cold-start scenarios with a fixed target mix.

    Every review is first classified exactly as in ``splitColdStart``: a
    user (business) is "known" when its id is a key of the dictionary from
    ``YelpPredictor.getAverageUserStars()``
    (``getAverageBusinessStars()``) and element ``[1]`` of the stored value
    is at least ``minRatings``.  A review is added to its natural scenario
    only while that scenario is below its target size; surplus reviews are
    collected and afterwards used to fill up the "only user", "only biz"
    and "neither" scenarios (in that order) until their targets are met or
    the surplus runs out.  The "both" scenario is never topped up.

    Target sizes are 33% / 11% / 41% / 15% of ``len(reviews)``, truncated
    to whole numbers (per the original comment: based on analysis of the
    final Kaggle test set).

    Args:
        reviews: sized sequence of entries; ``entry[0]`` is the user id and
            ``entry[1]`` is the business id.
        minRatings: threshold compared against element ``[1]`` of the
            user / business statistics.

    Returns:
        ``[userBizSplit, userOnlySplit, bizOnlySplit, coldSplit]``.
        Surplus reviews that were not needed for rebalancing are dropped.

    Side effects:
        Prints achieved size, share and target of every scenario.  The
        share is printed as 0.0 when no review was assigned at all.
    """
    bizDict = YelpPredictor.getAverageBusinessStars()
    userDict = YelpPredictor.getAverageUserStars()

    # Target number of reviews per scenario
    totalReviews = len(reviews)
    targetShare = [0.33, 0.11, 0.41, 0.15]
    targetCalc = [math.trunc(share * totalReviews) for share in targetShare]

    userBizSplit = []
    userOnlySplit = []
    bizOnlySplit = []
    coldSplit = []
    # Index in this list == scenario number:
    #   0 = both user and biz known, 1 = only user known,
    #   2 = only business known,     3 = neither known
    outputSplitter = [userBizSplit, userOnlySplit, bizOnlySplit, coldSplit]

    # Reviews whose natural scenario was already full
    rebalance = []

    for entry in reviews:
        user_id = entry[0]
        business_id = entry[1]

        # "Known" means: present in the dictionary AND meets the threshold.
        userKnown = (user_id in userDict
                     and userDict[user_id][1] >= minRatings)
        bizKnown = (business_id in bizDict
                    and bizDict[business_id][1] >= minRatings)

        if userKnown:
            scenario = 0 if bizKnown else 1
        else:
            scenario = 2 if bizKnown else 3

        bucket = outputSplitter[scenario]
        if len(bucket) < targetCalc[scenario]:
            bucket.append(entry)
        else:
            rebalance.append(entry)
    # End FOR each review

    # Fill up scenarios that were not naturally completed.  Entries are
    # taken from the end of the surplus list; scenario 0 is skipped.
    for scenario in (1, 2, 3):
        bucket = outputSplitter[scenario]
        while len(bucket) < targetCalc[scenario] and rebalance:
            bucket.append(rebalance.pop())

    coldStartCalc = [len(split) for split in outputSplitter]
    total = sum(coldStartCalc)
    labels = ['both user and biz: ', 'only user: ', 'only biz: ', 'neither: ']
    percents = ['33%', '11%', '41%', '15%']
    for idx in range(4):
        # Guard the ratio so tiny / empty inputs do not divide by zero.
        share = float(coldStartCalc[idx]) / float(total) if total else 0.0
        print(labels[idx] + str(coldStartCalc[idx]) + ', ' + str(share)
              + '; Target: ' + str(targetCalc[idx]) + ', ' + percents[idx])

    print(str(len(reviews)) + ' input reviews')
    print(str(len(userBizSplit)) + ' userBizSplit')
    print(str(len(userOnlySplit)) + ' userOnlySplit')
    print(str(len(bizOnlySplit)) + ' bizOnlySplit')
    print(str(len(coldSplit)) + ' coldSplit')

    return outputSplitter