def load_user_data():
    """
    Load every user's behavior data from MongoDB into memory.

    Not recommended for frequent use -- takes roughly 45 minutes.

    :return: dict mapping user_id -> {'behavior_count': total number of
        behavior records, 'items': {item_id: behavior count for that item}}
    """
    mongodb_utils = MongodbUtils('10.108.192.165', 27017)
    db = mongodb_utils.get_db()
    userdb = db.train_user
    user_item_data = {}
    for user_id in userdb.distinct("user_id"):
        user_item = {}
        user_item_info = {}
        behavior_count = 0
        for behavior in userdb.find({"user_id": user_id}):
            behavior_count += 1
            item_id = behavior['item_id']
            # dict.get replaces the Python-2-only dict.has_key() and
            # avoids the double lookup of the original if/else.
            user_item[item_id] = user_item.get(item_id, 0) + 1
        user_item_info['behavior_count'] = behavior_count
        user_item_info['items'] = user_item
        user_item_data[user_id] = user_item_info
        logger.info(user_id + ' done!')
    return user_item_data
def get_predict_vecdata(set_timerange=('2014-12-18', '2014-12-19'),
                        behavior_timerange=('2014-12-12', '2014-12-19'),
                        predict_set_path='%s/predict/predict_set.csv' % (data_path),
                        predict_vectail_path='%s/predict/predict_vectail.csv' % (data_path),
                        csv_output_path='%s/predict/combined_vec_data.csv' % (data_path),
                        svm_output_path='%s/predict/svmdata.dat' % (data_path)):
    """
    Build the prediction data set; time ranges and output paths must be given.

    :param set_timerange: time range of candidate purchased items
    :param behavior_timerange: time range used when computing vector values
    :param predict_set_path: output path for the raw prediction set
    :param predict_vectail_path: output path for the tail vector values
    :param csv_output_path: output path for the combined CSV data
    :param svm_output_path: output path for the libsvm-format data
    """
    import MySQLdb
    from data_preprocess import generate_userset
    from data_preprocess.MongoDB_Utils import MongodbUtils
    logger.info('start get_predict_vecdata, set_timerange=%s to %s, behavior_timerange = %s to %s' % (
        set_timerange[0], set_timerange[1], behavior_timerange[0], behavior_timerange[1]))
    mysql_conn = MySQLdb.connect(host='10.108.192.119', user='******', passwd='tianchi_data', db='tianchi')
    train_user = MongodbUtils(db_address, 27017).get_db().train_user
    # Step 1: candidate user/item pairs.
    generate_userset.generate_predict_set(mysql_conn, set_timerange, predict_set_path)
    # Step 2: tail vector values over the behavior time range.
    cal_vecvalues_tail(train_user, predict_set_path, predict_vectail_path, behavior_timerange)
    # Step 3: per-user behavior features (written next to the prediction set).
    vecbehavior_path = predict_set_path.replace('.csv', '_calUserBehavior.csv')
    cal_user_behavior(mysql_conn, behavior_timerange, predict_set_path)
    # Step 4: merge everything into the CSV / libsvm outputs.
    combine_data(vecbehavior_path, predict_vectail_path, csv_output_path, svm_output_path)
def cal_useritem_behavior_rate(user_id, item_id, stoptime_str='2014-12-19 00'):
    """
    Share of a user's total behaviors (strictly before the cut-off time)
    that target the given item.

    :param user_id:
    :param item_id:
    :param stoptime_str: cut-off time, format '%Y-%m-%d %H'
    :return: float ratio, or 0 when the user has no behavior records
    """
    from data_preprocess.MongoDB_Utils import MongodbUtils
    collection = MongodbUtils(db_address, 27017).get_db().train_user
    cutoff = datetime.strptime(str(stoptime_str), '%Y-%m-%d %H')
    before_cutoff = {"$lt": cutoff}
    total = collection.find({"user_id": user_id, "time": before_cutoff}).count()
    on_item = collection.find({"user_id": user_id, "item_id": item_id, "time": before_cutoff}).count()
    # Guard against users with no records at all.
    if total == 0:
        return 0
    return float(on_item) / float(total)
def generate_positive_userset(foutpath='../data/positive_userset.json'):
    """
    Export users who bought something on 2014-12-18 together with the
    distinct items they bought, one line per user in the form
    ``user_id,item_id1,item_id2,...``.

    :param foutpath: output file path
    """
    # Imports kept inside the function because some servers have
    # incomplete package dependencies.
    from data_preprocess.MongoDB_Utils import MongodbUtils
    # Bug fix: the config file handle was previously never closed.
    with open('../conf/DB_Address.conf', 'r') as conf_file:
        db_address = json.loads(conf_file.read())['MongoDB_Address']
    logger.info('start generate_positive_userset')
    mongodb = MongodbUtils(db_address, 27017)
    train_user = mongodb.get_db().train_user
    startTime = datetime.strptime(str('2014-12-18 00'), '%Y-%m-%d %H')
    stopTime = datetime.strptime(str('2014-12-19 00'), '%Y-%m-%d %H')
    user_ids = train_user.find({"behavior_type": "4",
                                "time": {"$gt": startTime, "$lt": stopTime}}).distinct("user_id")
    # Bug fix: the output file handle was previously never closed;
    # ``with`` guarantees the buffer is flushed and the file closed.
    with open(foutpath, 'w') as fout:
        for userid in user_ids:
            fout.write(userid)
            bought_item_ids = train_user.find(
                {'user_id': userid, "behavior_type": "4",
                 "time": {"$gt": startTime, "$lt": stopTime}},
                {'item_id': 1, '_id': 0}).distinct("item_id")
            for itemid in bought_item_ids:
                fout.write(',' + itemid)
            fout.write('\n')
    logger.info('generate_positive_userset done,output path = ' + foutpath)
def get_train_vecdata(train_set_path='%s/train/train_set.csv' % data_path,
                      combined_out_path='%s/train/combined_out.csv' % data_path,
                      svmdata_out_path='%s/train/svmdata.dat' % data_path,
                      set_timerange=('2014-12-18', '2014-12-19'),
                      behavior_timerange=('2014-12-12', '2014-12-19')):
    """
    Build the training data set.

    :param train_set_path: output path for the raw training set
    :param combined_out_path: output path for the combined CSV data
    :param svmdata_out_path: output path for the libsvm-format data
    :param set_timerange: time range of candidate purchased items
    :param behavior_timerange: time range used when computing vector values
    """
    import MySQLdb
    from data_preprocess import generate_userset
    from data_preprocess.MongoDB_Utils import MongodbUtils
    logger.info('start get_train_vecdata, timerange=%s to %s' % (set_timerange[0], set_timerange[1]))
    mysql_conn = MySQLdb.connect(host='10.108.192.119', user='******', passwd='tianchi_data', db='tianchi')
    train_user = MongodbUtils(db_address, 27017).get_db().train_user
    # Raw training set, then the tail vector values over the behavior range.
    generate_userset.generate_train_set(mysql_conn, set_timerange, set_timerange, train_set_path)
    vectail_path = train_set_path.replace('.csv', '_vectail.csv')
    cal_vecvalues_tail(train_user, train_set_path, vectail_path, behavior_timerange)
    # Per-user behavior features, then merge everything into the outputs.
    behavior_path = train_set_path.replace('.csv', '_calUserBehavior.csv')
    cal_user_behavior(mysql_conn, behavior_timerange, train_set_path)
    combine_data(behavior_path, vectail_path, combined_out_path, svmdata_out_path)
def cal_user_desire(user_id, stoptime_str='2014-12-19 00'):
    """
    Purchase desire of a user: the share of this user's behaviors before
    the cut-off time that are purchases (behavior_type '4').

    :param user_id:
    :param stoptime_str: cut-off time, format '%Y-%m-%d %H'
    :return: float desire in [0, 1]; 0 when the user has no records
    """
    from data_preprocess.MongoDB_Utils import MongodbUtils
    collection = MongodbUtils(db_address, 27017).get_db().train_user
    cutoff = datetime.strptime(str(stoptime_str), '%Y-%m-%d %H')
    before_cutoff = {"$lt": cutoff}
    total = collection.find({"user_id": user_id, "time": before_cutoff}).count()
    bought = collection.find({"user_id": user_id, 'behavior_type': '4', "time": before_cutoff}).count()
    if total == 0:
        return 0
    return float(bought) / float(total)
def rate_items(output_path=None, write_while_calculate=False):
    """
    Walk every user and compute a global score for each item the user
    interacted with (behavior counts only; time and item category are
    not taken into account).

    Per-item score: A1**views + A2**(carts+favorites) + A3**purchases.

    :param output_path: file to stream results into while calculating
        (only used when ``write_while_calculate`` is True)
    :param write_while_calculate: True to write each user's scores as
        soon as they are computed
    :return: list of dicts {'user_id', 'item_count', 'items': {item_id: rate}}
    """
    A1 = 1.1  # base for views (behavior_type '1')
    A2 = 2    # base for carts / favorites ('2' and '3')
    A3 = 5    # base for purchases ('4')
    mongodb_utils = MongodbUtils(db_address, 27017)
    db = mongodb_utils.get_db()
    train_user = db.train_user
    json_root = []
    logger.info('loading...')
    usercount = 0
    ids = train_user.distinct("user_id")
    logger.info('start!')
    split_json = None
    if output_path is not None and write_while_calculate:
        split_json = open(output_path, 'w')
    try:
        for user_id in ids:
            user_rates_info = {'user_id': user_id}
            user_rates = {}
            itemcount = 0
            usercount += 1
            for item_id in train_user.find({"user_id": user_id}).distinct("item_id"):
                itemcount += 1
                count1 = 0
                count2 = 0
                count3 = 0
                for behavior in train_user.find({"user_id": user_id, "item_id": item_id}):
                    behavior_type = behavior['behavior_type']
                    if behavior_type == '1':
                        count1 += 1
                    elif behavior_type == '2' or behavior_type == '3':
                        count2 += 1
                    elif behavior_type == '4':
                        count3 += 1
                user_rates[item_id] = A1 ** count1 + A2 ** count2 + A3 ** count3
            user_rates_info['item_count'] = itemcount
            user_rates_info['items'] = user_rates
            # Optionally stream the finished user to disk.
            if split_json is not None:
                tempstr = json.dumps(user_rates_info)
                split_json.write(tempstr + '\n')
                logger.debug(str(usercount) + 'th user ' + user_id + ' written.')
            json_root.append(user_rates_info)
    finally:
        # Bug fix: the output file was previously never closed.
        if split_json is not None:
            split_json.close()
    logger.info('done!')
    return json_root
def generate_positive_userset(foutpath='../data/positive_userset.json'):
    """
    Export users who bought something on 2014-12-18 together with the
    distinct items they bought, one line per user:
    ``user_id,item_id1,item_id2,...``.

    :param foutpath: output file path
    """
    # Imports kept inside the function because some servers have
    # incomplete package dependencies.
    from data_preprocess.MongoDB_Utils import MongodbUtils
    # Bug fix: close the config file handle (was left open before).
    with open('../conf/DB_Address.conf', 'r') as conf_file:
        db_address = json.loads(conf_file.read())['MongoDB_Address']
    logger.info('start generate_positive_userset')
    mongodb = MongodbUtils(db_address, 27017)
    train_user = mongodb.get_db().train_user
    startTime = datetime.strptime(str('2014-12-18 00'), '%Y-%m-%d %H')
    stopTime = datetime.strptime(str('2014-12-19 00'), '%Y-%m-%d %H')
    user_ids = train_user.find({
        "behavior_type": "4",
        "time": {
            "$gt": startTime,
            "$lt": stopTime
        }
    }).distinct("user_id")
    # Bug fix: the output file handle was previously never closed.
    with open(foutpath, 'w') as fout:
        for userid in user_ids:
            fout.write(userid)
            bought_item_ids = train_user.find(
                {
                    'user_id': userid,
                    "behavior_type": "4",
                    "time": {
                        "$gt": startTime,
                        "$lt": stopTime
                    }
                }, {
                    'item_id': 1,
                    '_id': 0
                }).distinct("item_id")
            for itemid in bought_item_ids:
                fout.write(',' + itemid)
            fout.write('\n')
    logger.info('generate_positive_userset done,output path = ' + foutpath)
def cal_item_popularity(item_id, stoptime_str='2014-12-19 00'):
    """
    Item popularity as a shifted sigmoid of the item's purchase count
    before the cut-off time. The divisor (total purchases) is the same
    for every item, so it is dropped.

    :param item_id:
    :param stoptime_str: cut-off time, e.g. '2014-12-18 00'
    :return: float popularity in [0, 0.5)
    """
    from data_preprocess.MongoDB_Utils import MongodbUtils
    collection = MongodbUtils(db_address, 27017).get_db().train_user
    cutoff = datetime.strptime(str(stoptime_str), '%Y-%m-%d %H')
    bought = collection.find({'item_id': item_id, 'behavior_type': '4',
                              "time": {"$lt": cutoff}}).count()
    # sigmoid(count) - 0.5: 0 for never-bought items, approaches 0.5 for hits.
    return 1 / (1 + math.e ** (-bought)) - 0.5
def cal_user_desire(user_id, stoptime_str='2014-12-19 00'):
    """
    Purchase desire of a user: purchases (behavior_type '4') divided by
    all behaviors, both counted strictly before the cut-off time.

    :param user_id:
    :param stoptime_str: cut-off time, format '%Y-%m-%d %H'
    :return: float desire in [0, 1]; 0 when the user has no records
    """
    from data_preprocess.MongoDB_Utils import MongodbUtils
    train_user = MongodbUtils(db_address, 27017).get_db().train_user
    stoptime = datetime.strptime(str(stoptime_str), '%Y-%m-%d %H')
    time_filter = {"$lt": stoptime}
    total_count = train_user.find({"user_id": user_id, "time": time_filter}).count()
    buy_count = train_user.find({"user_id": user_id, 'behavior_type': '4', "time": time_filter}).count()
    # Avoid division by zero for users without any behavior records.
    return float(buy_count) / float(total_count) if total_count != 0 else 0
def cal_useritem_behavior_rate(user_id, item_id, stoptime_str='2014-12-19 00'):
    """
    Fraction of the user's behaviors before the cut-off time that are
    directed at the given item.

    :param user_id:
    :param item_id:
    :param stoptime_str: cut-off time, format '%Y-%m-%d %H'
    :return: float ratio, or 0 when the user has no behavior records
    """
    from data_preprocess.MongoDB_Utils import MongodbUtils
    train_user = MongodbUtils(db_address, 27017).get_db().train_user
    stoptime = datetime.strptime(str(stoptime_str), '%Y-%m-%d %H')
    time_filter = {"$lt": stoptime}
    all_count = train_user.find({"user_id": user_id, "time": time_filter}).count()
    item_count = train_user.find({"user_id": user_id, "item_id": item_id, "time": time_filter}).count()
    # Avoid division by zero for users without any behavior records.
    return float(item_count) / float(all_count) if all_count != 0 else 0
def get_predict_vecdata(set_timerange=('2014-12-18', '2014-12-19'),
                        behavior_timerange=('2014-12-12', '2014-12-19'),
                        predict_set_path='%s/predict/predict_set.csv' % (data_path),
                        predict_vectail_path='%s/predict/predict_vectail.csv' % (data_path),
                        csv_output_path='%s/predict/combined_vec_data.csv' % (data_path),
                        svm_output_path='%s/predict/svmdata.dat' % (data_path)):
    """
    Build the prediction data set; time ranges and output paths must be
    supplied.

    :param set_timerange: time range of candidate purchased items
    :param behavior_timerange: time range used when computing vector values
    :param predict_set_path: output path for the raw prediction set
    :param predict_vectail_path: output path for the tail vector values
    :param csv_output_path: output path for the combined CSV data
    :param svm_output_path: output path for the libsvm-format data
    """
    import MySQLdb
    from data_preprocess import generate_userset
    from data_preprocess.MongoDB_Utils import MongodbUtils
    logger.info(
        'start get_predict_vecdata, set_timerange=%s to %s, behavior_timerange = %s to %s'
        % (set_timerange[0], set_timerange[1], behavior_timerange[0], behavior_timerange[1]))
    db_conn = MySQLdb.connect(host='10.108.192.119', user='******', passwd='tianchi_data', db='tianchi')
    user_collection = MongodbUtils(db_address, 27017).get_db().train_user
    # Candidate pairs -> tail vectors -> behavior features -> merged outputs.
    generate_userset.generate_predict_set(db_conn, set_timerange, predict_set_path)
    cal_vecvalues_tail(user_collection, predict_set_path, predict_vectail_path, behavior_timerange)
    behavior_csv = predict_set_path.replace('.csv', '_calUserBehavior.csv')
    cal_user_behavior(db_conn, behavior_timerange, predict_set_path)
    combine_data(behavior_csv, predict_vectail_path, csv_output_path, svm_output_path)
def cal_item_popularity(item_id, stoptime_str='2014-12-19 00'):
    """
    Popularity of an item: a shifted sigmoid of how many times it was
    bought before the cut-off time (the common divisor over all items
    is omitted).

    :param item_id:
    :param stoptime_str: cut-off time, e.g. '2014-12-18 00'
    :return: float popularity in [0, 0.5)
    """
    from data_preprocess.MongoDB_Utils import MongodbUtils
    train_user = MongodbUtils(db_address, 27017).get_db().train_user
    stoptime = datetime.strptime(str(stoptime_str), '%Y-%m-%d %H')
    purchase_query = {'item_id': item_id, 'behavior_type': '4', "time": {"$lt": stoptime}}
    purchases = train_user.find(purchase_query).count()
    # Map the raw count onto [0, 0.5) via sigmoid(count) - 0.5.
    return 1 / (1 + math.e ** (-purchases)) - 0.5
result.append((u_id, i_id)) new_count += 1 if old_count % 10 == 0: # logger哨兵 logger.debug('No.%s origin recommend filtered' % old_count) logger.info('done! origin recommend num is %s, current recommend num is %s\nresult output path: %s' % ( old_count, new_count, fout_path)) return result if __name__ == '__main__': from data_preprocess.MongoDB_Utils import MongodbUtils db_address = json.loads(open('%s/conf/DB_Address.conf' % (project_path), 'r').read())['MongoDB_Address'] mongo_utils = MongodbUtils(db_address, 27017) train_user = mongo_utils.get_db().train_user train_item = mongo_utils.get_db().train_item #train_user = mongo_utils.get_db().train_user_new #train_item = mongo_utils.get_db().train_item_new #find_category_relationship(train_user, train_item, '%s/relationDict.json' % data_path, 3) #f_recommend = '%s/test_1206/RandomForest_recommend_intersect.csv' % (data_path) #generate_from_popularity_in_category(f_recommend, '2014-12-06', train_user) """ # find_category_relationship(train_user, train_item, json_output_path='%s/relationDict.json' % data_path, # csv_output_path='%s/relationDict.csv' % data_path) find_category_relationship(train_user, train_item,time_window=1) # 类内热门度调用示例
if old_count % 10 == 0: # logger哨兵 logger.debug('No.%s origin recommend filtered' % old_count) logger.info( 'done! origin recommend num is %s, current recommend num is %s\nresult output path: %s' % (old_count, new_count, fout_path)) return result if __name__ == '__main__': from data_preprocess.MongoDB_Utils import MongodbUtils db_address = json.loads( open('%s/conf/DB_Address.conf' % (project_path), 'r').read())['MongoDB_Address'] mongo_utils = MongodbUtils(db_address, 27017) train_user = mongo_utils.get_db().train_user train_item = mongo_utils.get_db().train_item #train_user = mongo_utils.get_db().train_user_new #train_item = mongo_utils.get_db().train_item_new #find_category_relationship(train_user, train_item, '%s/relationDict.json' % data_path, 3) #f_recommend = '%s/test_1206/RandomForest_recommend_intersect.csv' % (data_path) #generate_from_popularity_in_category(f_recommend, '2014-12-06', train_user) """ # find_category_relationship(train_user, train_item, json_output_path='%s/relationDict.json' % data_path, # csv_output_path='%s/relationDict.csv' % data_path) find_category_relationship(train_user, train_item,time_window=1) # 类内热门度调用示例 # print cal_popularity_in_category('166670035', '2014-12-19', train_user)
def cal_user_likehood(user_id1, user_id2, data=None):
    """
    Compute the similarity between two users. Uses the database by
    default; when ``data`` is given (the dict returned by
    get_user_item_data()) the in-memory data is used instead.

    :param user_id1:
    :param user_id2:
    :param data: dict as returned by get_user_item_data()
    :return: float similarity; 1.0 for identical ids, 0 when the users
        share no items
    """
    if user_id1 == user_id2:
        return 1.0
    if data is None:
        mongodb_utils = MongodbUtils('10.108.192.165', 27017)
        db = mongodb_utils.get_db()
        userdb = db.train_user
        items1 = userdb.find({"user_id": user_id1}).distinct("item_id")
        items2 = userdb.find({"user_id": user_id2}).distinct("item_id")
        maxcount1 = userdb.find({"user_id": user_id1}).count()
        maxcount2 = userdb.find({"user_id": user_id2}).count()
        fenzi = 0.0   # dot-product accumulator
        fenmu1 = 0.0  # squared norm of user 1's vector
        fenmu2 = 0.0  # squared norm of user 2's vector
        cos = 0
        for i in items1:
            if i in items2:
                x1 = userdb.find({"user_id": user_id1, "item_id": i}).count() / math.log(float(maxcount1))
                x2 = userdb.find({"user_id": user_id2, "item_id": i}).count() / math.log(float(maxcount2))
                fenzi += x1 * x2
                fenmu1 += x1 * x1
                fenmu2 += x2 * x2
        # Bug fix: the original tested ``fenzi is not 0.0`` (float identity)
        # and fell through to an implicit None return when it was False.
        if fenzi != 0.0:
            cos = math.log(fenzi) - 0.5 * (math.log(fenmu1) + math.log(fenmu2))
        return cos
    else:
        cos = 0.0
        user1 = data[user_id1]
        user2 = data[user_id2]
        itemlist1 = user1['items']
        itemlist2 = user2['items']
        items1 = itemlist1.keys()
        items2 = itemlist2.keys()
        maxcount1 = math.log(float(user1['behavior_count']))
        maxcount2 = math.log(float(user2['behavior_count']))
        # Initialise numerator/denominators with a correction term
        # (maxcount1 * maxcount2) so the similarity cannot get too close to 1.
        fenzi = maxcount1 * maxcount2
        temp = fenzi
        fenmu1 = maxcount1 * maxcount1
        fenmu2 = maxcount2 * maxcount2
        common_items1 = {}
        common_items2 = {}
        # Iterate over the smaller item set to minimise the outer loop.
        if len(items1) <= len(items2):
            outer, inner = items1, items2
        else:
            outer, inner = items2, items1
        for i in outer:
            if i in inner:
                common_items1[i] = itemlist1[i]
                common_items2[i] = itemlist2[i]
                x1 = itemlist1[i] / maxcount1
                x2 = itemlist2[i] / maxcount2
                fenzi += x1 * x2
                fenmu1 += x1 * x1
                fenmu2 += x2 * x2
        if fenzi != temp:
            cos = fenzi / math.sqrt(fenmu1 * fenmu2)
            if cos == 1.0:
                # Former debug ``print`` statements, routed through the logger.
                logger.debug(str(common_items1))
                logger.debug(str(common_items2))
        return cos
def cal_user_likehood(user_id1, user_id2, data=None):
    """
    Similarity between two users; the database is used by default, the
    in-memory ``data`` (from get_user_item_data()) when supplied.

    :param user_id1:
    :param user_id2:
    :param data: dict as returned by get_user_item_data()
    :return: float similarity; 1.0 for identical ids, 0 when the users
        share no items
    """
    if user_id1 == user_id2:
        return 1.0
    if data is None:
        mongodb_utils = MongodbUtils('10.108.192.165', 27017)
        db = mongodb_utils.get_db()
        userdb = db.train_user
        items1 = userdb.find({"user_id": user_id1}).distinct("item_id")
        items2 = userdb.find({"user_id": user_id2}).distinct("item_id")
        maxcount1 = userdb.find({"user_id": user_id1}).count()
        maxcount2 = userdb.find({"user_id": user_id2}).count()
        fenzi = 0.0   # dot-product accumulator
        fenmu1 = 0.0  # squared norm of user 1's vector
        fenmu2 = 0.0  # squared norm of user 2's vector
        cos = 0
        for i in items1:
            if i in items2:
                x1 = userdb.find({"user_id": user_id1, "item_id": i}).count() / math.log(float(maxcount1))
                x2 = userdb.find({"user_id": user_id2, "item_id": i}).count() / math.log(float(maxcount2))
                fenzi += x1 * x2
                fenmu1 += x1 * x1
                fenmu2 += x2 * x2
        # Bug fix: ``fenzi is not 0.0`` compared floats by identity and the
        # function implicitly returned None when the test was False.
        if fenzi != 0.0:
            cos = math.log(fenzi) - 0.5 * (math.log(fenmu1) + math.log(fenmu2))
        return cos
    else:
        cos = 0.0
        user1 = data[user_id1]
        user2 = data[user_id2]
        itemlist1 = user1['items']
        itemlist2 = user2['items']
        items1 = itemlist1.keys()
        items2 = itemlist2.keys()
        maxcount1 = math.log(float(user1['behavior_count']))
        maxcount2 = math.log(float(user2['behavior_count']))
        # Numerator/denominators start from a correction term
        # (maxcount1 * maxcount2) so similarities do not crowd 1.
        fenzi = maxcount1 * maxcount2
        temp = fenzi
        fenmu1 = maxcount1 * maxcount1
        fenmu2 = maxcount2 * maxcount2
        common_items1 = {}
        common_items2 = {}
        # Loop over whichever item set is smaller.
        if len(items1) <= len(items2):
            outer, inner = items1, items2
        else:
            outer, inner = items2, items1
        for i in outer:
            if i in inner:
                common_items1[i] = itemlist1[i]
                common_items2[i] = itemlist2[i]
                x1 = itemlist1[i] / maxcount1
                x2 = itemlist2[i] / maxcount2
                fenzi += x1 * x2
                fenmu1 += x1 * x1
                fenmu2 += x2 * x2
        if fenzi != temp:
            cos = fenzi / math.sqrt(fenmu1 * fenmu2)
            if cos == 1.0:
                # Former debug ``print`` statements, routed through the logger.
                logger.debug(str(common_items1))
                logger.debug(str(common_items2))
        return cos
def rate_items(output_path=None, write_while_calculate=False):
    """
    For every user, compute a global score for each item the user
    interacted with (only behavior counts are used; time and category
    are ignored).

    Per-item score: A1**views + A2**(carts+favorites) + A3**purchases.

    :param output_path: file to stream results into while calculating
        (only used when ``write_while_calculate`` is True)
    :param write_while_calculate: True to write each user's scores as
        soon as they are computed
    :return: list of dicts {'user_id', 'item_count', 'items': {item_id: rate}}
    """
    A1 = 1.1  # base for views (behavior_type '1')
    A2 = 2    # base for carts / favorites ('2' and '3')
    A3 = 5    # base for purchases ('4')
    mongodb_utils = MongodbUtils(db_address, 27017)
    train_user = mongodb_utils.get_db().train_user
    json_root = []
    logger.info('loading...')
    usercount = 0
    ids = train_user.distinct("user_id")
    logger.info('start!')
    split_json = None
    if output_path is not None and write_while_calculate:
        split_json = open(output_path, 'w')
    try:
        for user_id in ids:
            user_rates_info = {'user_id': user_id}
            user_rates = {}
            itemcount = 0
            usercount += 1
            for item_id in train_user.find({"user_id": user_id}).distinct("item_id"):
                itemcount += 1
                count1 = 0
                count2 = 0
                count3 = 0
                for behavior in train_user.find({"user_id": user_id, "item_id": item_id}):
                    behavior_type = behavior['behavior_type']
                    if behavior_type == '1':
                        count1 += 1
                    elif behavior_type == '2' or behavior_type == '3':
                        count2 += 1
                    elif behavior_type == '4':
                        count3 += 1
                user_rates[item_id] = A1 ** count1 + A2 ** count2 + A3 ** count3
            user_rates_info['item_count'] = itemcount
            user_rates_info['items'] = user_rates
            # Optionally stream the finished user to disk.
            if split_json is not None:
                tempstr = json.dumps(user_rates_info)
                split_json.write(tempstr + '\n')
                logger.debug(str(usercount) + 'th user ' + user_id + ' written.')
            json_root.append(user_rates_info)
    finally:
        # Bug fix: the output file was previously never closed.
        if split_json is not None:
            split_json.close()
    logger.info('done!')
    return json_root