コード例 #1
0
def splitColdStart(reviews, minRatings, bizDict=None, userDict=None):
    """Partition reviews into the four cold-start scenarios.

    A user (or business) counts as "known" when it has an entry in the
    corresponding lookup table and element ``[1]`` of that entry is at least
    ``minRatings``. An id that is missing from the table is treated exactly
    like one that is below the threshold.

    Args:
        reviews: Sized iterable of review records. ``entry[0]`` is read as
            the user id and ``entry[1]`` as the business id.
        minRatings: Threshold compared against element ``[1]`` of a lookup
            entry (presumably the number of ratings -- confirm against
            YelpPredictor).
        bizDict: Optional mapping of business id -> indexable record.
            Defaults to ``YelpPredictor.getAverageBusinessStars()``.
        userDict: Optional mapping of user id -> indexable record.
            Defaults to ``YelpPredictor.getAverageUserStars()``.

    Returns:
        A list of four lists, in this order:
        ``[userBizSplit, userOnlySplit, bizOnlySplit, coldSplit]``.
        Each input review appears in exactly one of them, in input order.
    """
    if bizDict is None:
        bizDict = YelpPredictor.getAverageBusinessStars()
    if userDict is None:
        userDict = YelpPredictor.getAverageUserStars()

    # Bucket index: 0 = user and biz known, 1 = only user known,
    # 2 = only biz known, 3 = neither known.
    outputSplitter = [[], [], [], []]

    for entry in reviews:
        userRevAry = userDict.get(entry[0])
        bizRevAry = bizDict.get(entry[1])
        userKnown = userRevAry is not None and userRevAry[1] >= minRatings
        bizKnown = bizRevAry is not None and bizRevAry[1] >= minRatings

        if userKnown:
            bucket = 0 if bizKnown else 1
        else:
            bucket = 2 if bizKnown else 3
        outputSplitter[bucket].append(entry)

    coldStartCalc = [len(split) for split in outputSplitter]
    total = sum(coldStartCalc)

    # Single-argument print(...) parses the same way as a Python 2 print
    # statement and as the Python 3 print function.
    labels = ('both user and biz: ', 'only user: ', 'only biz: ', 'neither: ')
    for label, count in zip(labels, coldStartCalc):
        # Guard the share computation: with no reviews there is nothing to
        # divide by, so report a share of 0.0 instead of raising.
        fraction = float(count) / float(total) if total else 0.0
        print(label + str(count) + ', ' + str(fraction))

    print(str(len(reviews)) + ' input reviews')
    splitNames = ('userBizSplit', 'userOnlySplit', 'bizOnlySplit', 'coldSplit')
    for splitName, count in zip(splitNames, coldStartCalc):
        print(str(count) + ' ' + splitName)

    return outputSplitter
コード例 #2
0
def simulateKaggleMix(reviews, minRatings, bizDict=None, userDict=None):
    """Split reviews into cold-start scenarios using a fixed target mix.

    Reviews are classified as in a plain cold-start split: a user or
    business is "known" when it has a lookup entry whose element ``[1]`` is
    at least ``minRatings``. Each scenario is capped at a target size of
    33% / 11% / 41% / 15% of the input (truncated to an integer), for
    user+biz / user only / biz only / neither respectively. Reviews that
    would exceed their scenario's cap are set aside and then used to top up
    the user-only, biz-only and neither scenarios, in that order.

    Args:
        reviews: Sized iterable of review records. ``entry[0]`` is read as
            the user id and ``entry[1]`` as the business id.
        minRatings: Threshold compared against element ``[1]`` of a lookup
            entry (presumably the number of ratings -- confirm against
            YelpPredictor).
        bizDict: Optional mapping of business id -> indexable record.
            Defaults to ``YelpPredictor.getAverageBusinessStars()``.
        userDict: Optional mapping of user id -> indexable record.
            Defaults to ``YelpPredictor.getAverageUserStars()``.

    Returns:
        A list of four lists, in this order:
        ``[userBizSplit, userOnlySplit, bizOnlySplit, coldSplit]``.
        Set-aside reviews that are not needed for a top-up are not returned,
        so the splits may hold fewer reviews in total than the input.
    """
    if bizDict is None:
        bizDict = YelpPredictor.getAverageBusinessStars()
    if userDict is None:
        userDict = YelpPredictor.getAverageUserStars()

    # Target share per scenario and the label printed for it. The comment in
    # the original code attributes these shares to an analysis of the final
    # Kaggle test set.
    targetShares = (0.33, 0.11, 0.41, 0.15)
    targetLabels = ('33%', '11%', '41%', '15%')

    totalReviews = len(reviews)
    # int() truncates toward zero, so each target is rounded down.
    targetCalc = [int(share * totalReviews) for share in targetShares]

    # Bucket index: 0 = user and biz known, 1 = only user known,
    # 2 = only biz known, 3 = neither known.
    outputSplitter = [[], [], [], []]
    # Reviews whose natural scenario is already full.
    rebalance = []

    for entry in reviews:
        userRevAry = userDict.get(entry[0])
        bizRevAry = bizDict.get(entry[1])
        userKnown = userRevAry is not None and userRevAry[1] >= minRatings
        bizKnown = bizRevAry is not None and bizRevAry[1] >= minRatings

        if userKnown:
            bucket = 0 if bizKnown else 1
        else:
            bucket = 2 if bizKnown else 3

        if len(outputSplitter[bucket]) < targetCalc[bucket]:
            outputSplitter[bucket].append(entry)
        else:
            rebalance.append(entry)

    # Top up the scenarios that did not fill naturally, taking set-aside
    # reviews from the end of the list. The user+biz scenario (bucket 0) is
    # never topped up.
    # NOTE(review): skipping bucket 0 looks deliberate -- confirm.
    for bucket in (1, 2, 3):
        split = outputSplitter[bucket]
        while len(split) < targetCalc[bucket] and rebalance:
            split.append(rebalance.pop())

    coldStartCalc = [len(split) for split in outputSplitter]
    placed = sum(coldStartCalc)

    # Single-argument print(...) parses the same way as a Python 2 print
    # statement and as the Python 3 print function.
    labels = ('both user and biz: ', 'only user: ', 'only biz: ', 'neither: ')
    for bucket, label in enumerate(labels):
        count = coldStartCalc[bucket]
        # Nothing is placed when the input is empty or so small that every
        # target truncates to zero; report 0.0 instead of dividing by zero.
        fraction = float(count) / float(placed) if placed else 0.0
        print(label + str(count) + ', ' + str(fraction)
              + '; Target: ' + str(targetCalc[bucket])
              + ', ' + targetLabels[bucket])

    print(str(len(reviews)) + ' input reviews')
    splitNames = ('userBizSplit', 'userOnlySplit', 'bizOnlySplit', 'coldSplit')
    for splitName, count in zip(splitNames, coldStartCalc):
        print(str(count) + ' ' + splitName)

    return outputSplitter