def load_user_data():
    """
    Load all user behavior data from the database into memory.

    Not recommended for frequent use: takes roughly 45 minutes.

    :return: dict mapping user_id -> {'behavior_count': total number of
             behavior records, 'items': {item_id: count of behaviors on it}}
    """
    mongodb_utils = MongodbUtils('10.108.192.165', 27017)
    db = mongodb_utils.get_db()
    userdb = db.train_user
    user_item_data = {}
    for user_id in userdb.distinct("user_id"):
        user_item = {}
        user_item_info = {}
        behavior_count = 0
        for behavior in userdb.find({"user_id": user_id}):
            behavior_count += 1
            item_id = behavior['item_id']
            # dict.has_key() is Python-2-only and deprecated; the `in`
            # operator is equivalent and works on both Python 2 and 3.
            if item_id in user_item:
                user_item[item_id] += 1
            else:
                user_item[item_id] = 1
        user_item_info['behavior_count'] = behavior_count
        user_item_info['items'] = user_item
        user_item_data[user_id] = user_item_info
        logger.info(user_id + ' done!')
    return user_item_data
def get_predict_vecdata(set_timerange=('2014-12-18', '2014-12-19'),
                        behavior_timerange=('2014-12-12', '2014-12-19'),
                        predict_set_path='%s/predict/predict_set.csv' % (data_path),
                        predict_vectail_path='%s/predict/predict_vectail.csv' % (data_path),
                        csv_output_path='%s/predict/combined_vec_data.csv' % (data_path),
                        svm_output_path='%s/predict/svmdata.dat' % (data_path)):
    """
    Build the prediction data set; time ranges and output paths are configurable.

    :param set_timerange: time range used to pick candidate purchase items
    :param behavior_timerange: time range used when computing feature values
    """
    import MySQLdb
    from data_preprocess import generate_userset
    from data_preprocess.MongoDB_Utils import MongodbUtils

    logger.info('start get_predict_vecdata, set_timerange=%s to %s,  behavior_timerange = %s to %s' % (
        set_timerange[0], set_timerange[1], behavior_timerange[0], behavior_timerange[1]))

    # Two data sources: MySQL for set generation, MongoDB for behavior features.
    mysql_conn = MySQLdb.connect(host='10.108.192.119',
                                 user='******',
                                 passwd='tianchi_data',
                                 db='tianchi')
    train_user = MongodbUtils(db_address, 27017).get_db().train_user

    # Pipeline: candidate set -> tail features -> behavior features -> merge.
    generate_userset.generate_predict_set(mysql_conn, set_timerange, predict_set_path)
    cal_vecvalues_tail(train_user, predict_set_path, predict_vectail_path, behavior_timerange)
    cal_user_behavior(mysql_conn, behavior_timerange, predict_set_path)
    vecbehavior_path = predict_set_path.replace('.csv', '_calUserBehavior.csv')
    combine_data(vecbehavior_path, predict_vectail_path, csv_output_path, svm_output_path)
Exemple #3
0
def cal_useritem_behavior_rate(user_id, item_id, stoptime_str='2014-12-19 00'):
    """
    Ratio of a user's operations on one item to all of that user's operations
    before the cut-off time.

    :param user_id: user whose behavior is examined
    :param item_id: item of interest
    :return: float ratio; 0 when the user has no records at all
    """
    from data_preprocess.MongoDB_Utils import MongodbUtils

    train_user = MongodbUtils(db_address, 27017).get_db().train_user
    stoptime = datetime.strptime(str(stoptime_str), '%Y-%m-%d %H')
    before_stop = {"$lt": stoptime}
    total_ops = train_user.find({"user_id": user_id, "time": before_stop}).count()
    item_ops = train_user.find({"user_id": user_id, "item_id": item_id, "time": before_stop}).count()
    # Guard against division by zero for users with no records before stoptime.
    return float(item_ops) / float(total_ops) if total_ops else 0
def generate_positive_userset(foutpath='../data/positive_userset.json'):
    """
    Write one line per user who bought something on 2014-12-18, together with
    the item ids they bought, as "user_id,item1,item2,...".

    :param foutpath: path of the output file
    """
    # Import kept function-local because some servers have incomplete
    # package dependencies.
    from data_preprocess.MongoDB_Utils import MongodbUtils

    # Use `with` so the conf file handle is closed (the original leaked it).
    with open('../conf/DB_Address.conf', 'r') as conf_file:
        db_address = json.loads(conf_file.read())['MongoDB_Address']

    logger.info('start generate_positive_userset')
    mongodb = MongodbUtils(db_address, 27017)
    train_user = mongodb.get_db().train_user
    startTime = datetime.strptime(str('2014-12-18 00'), '%Y-%m-%d %H')
    stopTime = datetime.strptime(str('2014-12-19 00'), '%Y-%m-%d %H')
    # behavior_type "4" means a purchase.
    user_ids = train_user.find({"behavior_type": "4", "time": {"$gt": startTime, "$lt": stopTime}}).distinct("user_id")

    # `with` guarantees the output file is flushed and closed even on error
    # (the original never closed it).
    with open(foutpath, 'w') as fout:
        for userid in user_ids:
            fout.write(userid)
            bought_item_ids = train_user.find(
                {'user_id': userid, "behavior_type": "4", "time": {"$gt": startTime, "$lt": stopTime}},
                {'item_id': 1, '_id': 0}).distinct("item_id")
            for itemid in bought_item_ids:
                fout.write(',' + itemid)
            fout.write('\n')
    logger.info('generate_positive_userset done,output path = ' + foutpath)
def load_user_data():
    """
    Load all user behavior data from the database into memory.

    Not recommended for frequent use: takes roughly 45 minutes.

    :return: dict mapping user_id -> {'behavior_count': total number of
             behavior records, 'items': {item_id: count of behaviors on it}}
    """
    mongodb_utils = MongodbUtils('10.108.192.165', 27017)
    db = mongodb_utils.get_db()
    userdb = db.train_user
    user_item_data = {}
    for user_id in userdb.distinct("user_id"):
        user_item = {}
        user_item_info = {}
        behavior_count = 0
        for behavior in userdb.find({"user_id": user_id}):
            behavior_count += 1
            item_id = behavior['item_id']
            # dict.has_key() is Python-2-only and deprecated; the `in`
            # operator is equivalent and works on both Python 2 and 3.
            if item_id in user_item:
                user_item[item_id] += 1
            else:
                user_item[item_id] = 1
        user_item_info['behavior_count'] = behavior_count
        user_item_info['items'] = user_item
        user_item_data[user_id] = user_item_info
        logger.info(user_id + ' done!')
    return user_item_data
def get_train_vecdata(train_set_path='%s/train/train_set.csv' % data_path,
                      combined_out_path='%s/train/combined_out.csv' % data_path,
                      svmdata_out_path='%s/train/svmdata.dat' % data_path,
                      set_timerange=('2014-12-18', '2014-12-19'),
                      behavior_timerange=('2014-12-12', '2014-12-19')):
    """
    Build the training data set.
    """
    from data_preprocess import generate_userset
    from data_preprocess.MongoDB_Utils import MongodbUtils
    import MySQLdb

    logger.info('start get_train_vecdata, timerange=%s to %s' % (set_timerange[0], set_timerange[1]))

    # MySQL holds the raw records used for set generation and behavior stats.
    mysql_conn = MySQLdb.connect(host='10.108.192.119',
                                 user='******',
                                 passwd='tianchi_data',
                                 db='tianchi')
    # MongoDB holds the per-user behavior stream used for the tail features.
    train_user = MongodbUtils(db_address, 27017).get_db().train_user

    generate_userset.generate_train_set(mysql_conn, set_timerange, set_timerange, train_set_path)

    vectail_path = train_set_path.replace('.csv', '_vectail.csv')
    cal_vecvalues_tail(train_user, train_set_path, vectail_path, behavior_timerange)

    behavior_path = train_set_path.replace('.csv', '_calUserBehavior.csv')
    cal_user_behavior(mysql_conn, behavior_timerange, train_set_path)

    combine_data(behavior_path, vectail_path, combined_out_path, svmdata_out_path)
Exemple #7
0
def cal_user_desire(user_id, stoptime_str='2014-12-19 00'):
    """
    Compute a user's "desire to buy": purchases divided by all operations
    before the cut-off time.

    :param user_id: user to score
    :return: float desire value; 0 when the user has no records
    """
    from data_preprocess.MongoDB_Utils import MongodbUtils

    train_user = MongodbUtils(db_address, 27017).get_db().train_user
    stoptime = datetime.strptime(str(stoptime_str), '%Y-%m-%d %H')
    before_stop = {"$lt": stoptime}
    total_ops = train_user.find({"user_id": user_id, "time": before_stop}).count()
    # behavior_type '4' means a purchase.
    buys = train_user.find({"user_id": user_id, 'behavior_type': '4', "time": before_stop}).count()
    return float(buys) / float(total_ops) if total_ops else 0
def rate_items(output_path=None, write_while_calculate=False):
    """
    Walk all users and compute, for each user, a global rating of every item
    the user has interacted with (only behavior counts are considered; time
    and item category are ignored).

    Per-item rating formula: A1**views + A2**(collects or carts) + A3**buys.

    :param output_path: output file path if partial results should be written
        during the computation
    :param write_while_calculate: True to write each user's result as soon as
        it is computed
    :return: list with one rating dict per user
    """
    A1 = 1.1  # weight base for behavior_type '1' (view)
    A2 = 2    # weight base for behavior_type '2'/'3' (collect / add-to-cart)
    A3 = 5    # weight base for behavior_type '4' (buy)
    mongodb_utils = MongodbUtils(db_address, 27017)
    db = mongodb_utils.get_db()
    train_user = db.train_user
    json_root = []
    logger.info('loading...')
    usercount = 0
    ids = train_user.distinct("user_id")
    logger.info('start!')
    split_json = None
    if output_path is not None and write_while_calculate:
        split_json = open(output_path, 'w')
    try:
        for user_id in ids:
            user_rates_info = {'user_id': user_id}
            user_rates = {}
            itemcount = 0
            usercount += 1
            for item_id in train_user.find({"user_id": user_id}).distinct("item_id"):
                itemcount += 1
                count1 = 0
                count2 = 0
                count3 = 0
                for behavior in train_user.find({"user_id": user_id, "item_id": item_id}):
                    behavior_type = behavior['behavior_type']
                    if behavior_type == '1':
                        count1 += 1
                    elif behavior_type == '2' or behavior_type == '3':
                        count2 += 1
                    elif behavior_type == '4':
                        count3 += 1
                user_rates[item_id] = A1 ** count1 + A2 ** count2 + A3 ** count3
            user_rates_info['item_count'] = itemcount
            user_rates_info['items'] = user_rates
            # Optionally stream the finished user to disk right away.
            if split_json is not None:
                split_json.write(json.dumps(user_rates_info) + '\n')
                logger.debug(str(usercount) + 'th user ' + user_id + ' written.')
            json_root.append(user_rates_info)
    finally:
        # The original never closed this handle; close it so buffered lines
        # are actually flushed to disk.
        if split_json is not None:
            split_json.close()
    logger.info('done!')
    return json_root
Exemple #9
0
def generate_positive_userset(foutpath='../data/positive_userset.json'):
    """
    Write one line per user who bought something on 2014-12-18, together with
    the item ids they bought, as "user_id,item1,item2,...".

    :param foutpath: path of the output file
    """
    # Import kept function-local because some servers have incomplete
    # package dependencies.
    from data_preprocess.MongoDB_Utils import MongodbUtils

    # Use `with` so the conf file handle is closed (the original leaked it).
    with open('../conf/DB_Address.conf', 'r') as conf_file:
        db_address = json.loads(conf_file.read())['MongoDB_Address']

    logger.info('start generate_positive_userset')
    mongodb = MongodbUtils(db_address, 27017)
    train_user = mongodb.get_db().train_user
    startTime = datetime.strptime(str('2014-12-18 00'), '%Y-%m-%d %H')
    stopTime = datetime.strptime(str('2014-12-19 00'), '%Y-%m-%d %H')
    # behavior_type "4" means a purchase.
    user_ids = train_user.find({"behavior_type": "4", "time": {"$gt": startTime, "$lt": stopTime}}).distinct("user_id")

    # `with` guarantees the output file is flushed and closed even on error
    # (the original never closed it).
    with open(foutpath, 'w') as fout:
        for userid in user_ids:
            fout.write(userid)
            bought_item_ids = train_user.find(
                {'user_id': userid, "behavior_type": "4", "time": {"$gt": startTime, "$lt": stopTime}},
                {'item_id': 1, '_id': 0}).distinct("item_id")
            for itemid in bought_item_ids:
                fout.write(',' + itemid)
            fout.write('\n')
    logger.info('generate_positive_userset done,output path = ' + foutpath)
def cal_item_popularity(item_id, stoptime_str='2014-12-19 00'):
    """
    Popularity of an item, computed as a shifted sigmoid of its purchase
    count before the cut-off (the shared denominator is dropped, so the raw
    count is squashed through sigmoid(count) - 0.5 instead of normalised).

    :param item_id: item to score
    :param stoptime_str: cut-off time, formatted like '2014-12-18 00'
    :return: float popularity
    """
    from data_preprocess.MongoDB_Utils import MongodbUtils

    train_user = MongodbUtils(db_address, 27017).get_db().train_user
    stoptime = datetime.strptime(str(stoptime_str), '%Y-%m-%d %H')
    purchase_query = {'item_id': item_id, 'behavior_type': '4', "time": {"$lt": stoptime}}
    bought_count = train_user.find(purchase_query).count()
    # Shifted sigmoid: zero purchases maps to popularity 0.
    return 1 / (1 + math.e ** (-bought_count)) - 0.5
def cal_user_desire(user_id, stoptime_str='2014-12-19 00'):
    """
    Compute a user's "desire to buy": purchases divided by all operations
    before the cut-off time.

    :param user_id: user to score
    :return: float desire value; 0 when the user has no records
    """
    from data_preprocess.MongoDB_Utils import MongodbUtils

    train_user = MongodbUtils(db_address, 27017).get_db().train_user
    stoptime = datetime.strptime(str(stoptime_str), '%Y-%m-%d %H')
    before_stop = {"$lt": stoptime}
    total_ops = train_user.find({"user_id": user_id, "time": before_stop}).count()
    # behavior_type '4' means a purchase.
    buys = train_user.find({"user_id": user_id, 'behavior_type': '4', "time": before_stop}).count()
    return float(buys) / float(total_ops) if total_ops else 0
def cal_useritem_behavior_rate(user_id, item_id, stoptime_str='2014-12-19 00'):
    """
    Ratio of a user's operations on one item to all of that user's operations
    before the cut-off time.

    :param user_id: user whose behavior is examined
    :param item_id: item of interest
    :return: float ratio; 0 when the user has no records at all
    """
    from data_preprocess.MongoDB_Utils import MongodbUtils

    train_user = MongodbUtils(db_address, 27017).get_db().train_user
    stoptime = datetime.strptime(str(stoptime_str), '%Y-%m-%d %H')
    before_stop = {"$lt": stoptime}
    total_ops = train_user.find({"user_id": user_id, "time": before_stop}).count()
    item_ops = train_user.find({"user_id": user_id, "item_id": item_id, "time": before_stop}).count()
    # Guard against division by zero for users with no records before stoptime.
    return float(item_ops) / float(total_ops) if total_ops else 0
def get_predict_vecdata(
        set_timerange=('2014-12-18', '2014-12-19'),
        behavior_timerange=('2014-12-12', '2014-12-19'),
        predict_set_path='%s/predict/predict_set.csv' % (data_path),
        predict_vectail_path='%s/predict/predict_vectail.csv' % (data_path),
        csv_output_path='%s/predict/combined_vec_data.csv' % (data_path),
        svm_output_path='%s/predict/svmdata.dat' % (data_path)):
    """
    Build the prediction data set; time ranges and output paths are configurable.

    :param set_timerange: time range used to pick candidate purchase items
    :param behavior_timerange: time range used when computing feature values
    """
    import MySQLdb
    from data_preprocess import generate_userset
    from data_preprocess.MongoDB_Utils import MongodbUtils

    logger.info(
        'start get_predict_vecdata, set_timerange=%s to %s,  behavior_timerange = %s to %s'
        % (set_timerange[0], set_timerange[1], behavior_timerange[0],
           behavior_timerange[1]))

    # Two data sources: MySQL for set generation, MongoDB for behavior features.
    mysql_conn = MySQLdb.connect(host='10.108.192.119',
                                 user='******',
                                 passwd='tianchi_data',
                                 db='tianchi')
    train_user = MongodbUtils(db_address, 27017).get_db().train_user

    # Pipeline: candidate set -> tail features -> behavior features -> merge.
    generate_userset.generate_predict_set(mysql_conn, set_timerange, predict_set_path)
    cal_vecvalues_tail(train_user, predict_set_path, predict_vectail_path, behavior_timerange)
    cal_user_behavior(mysql_conn, behavior_timerange, predict_set_path)
    vecbehavior_path = predict_set_path.replace('.csv', '_calUserBehavior.csv')
    combine_data(vecbehavior_path, predict_vectail_path, csv_output_path, svm_output_path)
Exemple #14
0
def cal_item_popularity(item_id, stoptime_str='2014-12-19 00'):
    """
    Popularity of an item, computed as a shifted sigmoid of its purchase
    count before the cut-off (the shared denominator is dropped, so the raw
    count is squashed through sigmoid(count) - 0.5 instead of normalised).

    :param item_id: item to score
    :param stoptime_str: cut-off time, formatted like '2014-12-18 00'
    :return: float popularity
    """
    from data_preprocess.MongoDB_Utils import MongodbUtils

    train_user = MongodbUtils(db_address, 27017).get_db().train_user
    stoptime = datetime.strptime(str(stoptime_str), '%Y-%m-%d %H')
    purchase_query = {'item_id': item_id, 'behavior_type': '4', "time": {"$lt": stoptime}}
    bought_count = train_user.find(purchase_query).count()
    # Shifted sigmoid: zero purchases maps to popularity 0.
    return 1 / (1 + math.e ** (-bought_count)) - 0.5
def get_train_vecdata(train_set_path='%s/train/train_set.csv' % data_path,
                      combined_out_path='%s/train/combined_out.csv' %
                      data_path,
                      svmdata_out_path='%s/train/svmdata.dat' % data_path,
                      set_timerange=('2014-12-18', '2014-12-19'),
                      behavior_timerange=('2014-12-12', '2014-12-19')):
    """
    Build the training data set.
    """
    from data_preprocess import generate_userset
    from data_preprocess.MongoDB_Utils import MongodbUtils
    import MySQLdb

    logger.info('start get_train_vecdata, timerange=%s to %s' %
                (set_timerange[0], set_timerange[1]))

    # MySQL holds the raw records used for set generation and behavior stats.
    mysql_conn = MySQLdb.connect(host='10.108.192.119',
                                 user='******',
                                 passwd='tianchi_data',
                                 db='tianchi')
    # MongoDB holds the per-user behavior stream used for the tail features.
    train_user = MongodbUtils(db_address, 27017).get_db().train_user

    generate_userset.generate_train_set(mysql_conn, set_timerange, set_timerange, train_set_path)

    vectail_path = train_set_path.replace('.csv', '_vectail.csv')
    cal_vecvalues_tail(train_user, train_set_path, vectail_path, behavior_timerange)

    behavior_path = train_set_path.replace('.csv', '_calUserBehavior.csv')
    cal_user_behavior(mysql_conn, behavior_timerange, train_set_path)

    combine_data(behavior_path, vectail_path, combined_out_path, svmdata_out_path)
                    result.append((u_id, i_id))
                    new_count += 1

            if old_count % 10 == 0:  # logger哨兵
                logger.debug('No.%s origin recommend filtered' % old_count)
    logger.info('done! origin recommend num is %s, current recommend num is %s\nresult output path: %s' % (
        old_count, new_count, fout_path))
    return result


if __name__ == '__main__':
    from data_preprocess.MongoDB_Utils import MongodbUtils

    db_address = json.loads(open('%s/conf/DB_Address.conf' % (project_path), 'r').read())['MongoDB_Address']

    mongo_utils = MongodbUtils(db_address, 27017)
    train_user = mongo_utils.get_db().train_user
    train_item = mongo_utils.get_db().train_item
    #train_user = mongo_utils.get_db().train_user_new
    #train_item = mongo_utils.get_db().train_item_new

    #find_category_relationship(train_user, train_item, '%s/relationDict.json' % data_path, 3)
    #f_recommend = '%s/test_1206/RandomForest_recommend_intersect.csv' % (data_path)
    #generate_from_popularity_in_category(f_recommend, '2014-12-06', train_user)

    """
    # find_category_relationship(train_user, train_item, json_output_path='%s/relationDict.json' % data_path,
    # csv_output_path='%s/relationDict.csv' % data_path)
    find_category_relationship(train_user, train_item,time_window=1)

    # 类内热门度调用示例
Exemple #17
0
            if old_count % 10 == 0:  # logger哨兵
                logger.debug('No.%s origin recommend filtered' % old_count)
    logger.info(
        'done! origin recommend num is %s, current recommend num is %s\nresult output path: %s'
        % (old_count, new_count, fout_path))
    return result


if __name__ == '__main__':
    from data_preprocess.MongoDB_Utils import MongodbUtils

    db_address = json.loads(
        open('%s/conf/DB_Address.conf' % (project_path),
             'r').read())['MongoDB_Address']

    mongo_utils = MongodbUtils(db_address, 27017)
    train_user = mongo_utils.get_db().train_user
    train_item = mongo_utils.get_db().train_item
    #train_user = mongo_utils.get_db().train_user_new
    #train_item = mongo_utils.get_db().train_item_new

    #find_category_relationship(train_user, train_item, '%s/relationDict.json' % data_path, 3)
    #f_recommend = '%s/test_1206/RandomForest_recommend_intersect.csv' % (data_path)
    #generate_from_popularity_in_category(f_recommend, '2014-12-06', train_user)
    """
    # find_category_relationship(train_user, train_item, json_output_path='%s/relationDict.json' % data_path,
    # csv_output_path='%s/relationDict.csv' % data_path)
    find_category_relationship(train_user, train_item,time_window=1)

    # 类内热门度调用示例
    # print cal_popularity_in_category('166670035', '2014-12-19', train_user)
def cal_user_likehood(user_id1, user_id2, data=None):
    """
    Compute the similarity between two users.  By default the behavior data
    is read from MongoDB; pass `data` to use an in-memory dict instead.

    :param user_id1: first user id
    :param user_id2: second user id
    :param data: dict as returned by get_user_item_data(); when given, no
        database access is performed
    :return: 1.0 for identical ids; otherwise a cosine-style similarity, or
        0 when the two users share no items
    """
    if user_id1 == user_id2:
        return 1.0
    if data is None:
        mongodb_utils = MongodbUtils('10.108.192.165', 27017)
        db = mongodb_utils.get_db()
        userdb = db.train_user
        items1 = userdb.find({"user_id": user_id1}).distinct("item_id")
        items2 = userdb.find({"user_id": user_id2}).distinct("item_id")
        maxcount1 = userdb.find({"user_id": user_id1}).count()
        maxcount2 = userdb.find({"user_id": user_id2}).count()
        fenzi = 0.0   # numerator (dot product over common items)
        fenmu1 = 0.0  # squared norm of user 1's vector
        fenmu2 = 0.0  # squared norm of user 2's vector
        cos = 0
        for i in items1:
            if i in items2:
                x1 = userdb.find({"user_id": user_id1, "item_id": i}).count() / math.log(float(maxcount1))
                x2 = userdb.find({"user_id": user_id2, "item_id": i}).count() / math.log(float(maxcount2))
                fenzi += x1 * x2
                fenmu1 += x1 * x1
                fenmu2 += x2 * x2
        # BUG FIX: the original used `fenzi is not 0.0`, an identity test that
        # is effectively always true, so math.log(0.0) raised ValueError when
        # the two users shared no items.  Compare by value instead.
        if fenzi != 0.0:
            # log-space cosine: log(fenzi) - 0.5 * (log(fenmu1) + log(fenmu2))
            cos = math.log(fenzi) - 0.5 * (math.log(fenmu1) + math.log(fenmu2))
        return cos
    else:
        cos = 0.0
        user1 = data[user_id1]
        user2 = data[user_id2]
        itemlist1 = user1['items']
        itemlist2 = user2['items']
        items1 = itemlist1.keys()
        items2 = itemlist2.keys()
        maxcount1 = math.log(float(user1['behavior_count']))
        maxcount2 = math.log(float(user2['behavior_count']))
        # Numerator/denominators start from a correction term
        # (maxcount1 * maxcount2) so the similarity does not get too close to 1.
        fenzi = maxcount1 * maxcount2
        temp = fenzi  # remember the initial value to detect "no common items"
        fenmu1 = maxcount1 * maxcount1
        fenmu2 = maxcount2 * maxcount2
        common_items1 = {}
        common_items2 = {}
        # Iterate over the smaller item set to minimise the outer loop.
        if len(items1) <= len(items2):
            for i in items1:
                if i in items2:
                    common_items1[i] = itemlist1[i]
                    common_items2[i] = itemlist2[i]
                    x1 = itemlist1[i] / maxcount1
                    x2 = itemlist2[i] / maxcount2
                    fenzi += x1 * x2
                    fenmu1 += x1 * x1
                    fenmu2 += x2 * x2
        else:
            for i in items2:
                if i in items1:
                    common_items1[i] = itemlist1[i]
                    common_items2[i] = itemlist2[i]
                    x1 = itemlist1[i] / maxcount1
                    x2 = itemlist2[i] / maxcount2
                    fenzi += x1 * x2
                    fenmu1 += x1 * x1
                    fenmu2 += x2 * x2
        if fenzi != temp:
            cos = fenzi / math.sqrt(fenmu1 * fenmu2)
            if cos == 1.0:
                # Debug output: similarity of exactly 1 usually means the
                # correction term was ineffective; dump the shared items.
                print(common_items1)
                print(common_items2)
        return cos
def cal_user_likehood(user_id1, user_id2, data=None):
    """
    Compute the similarity between two users.  By default the behavior data
    is read from MongoDB; pass `data` to use an in-memory dict instead.

    :param user_id1: first user id
    :param user_id2: second user id
    :param data: dict as returned by get_user_item_data(); when given, no
        database access is performed
    :return: 1.0 for identical ids; otherwise a cosine-style similarity, or
        0 when the two users share no items
    """
    if user_id1 == user_id2:
        return 1.0
    if data is None:
        mongodb_utils = MongodbUtils('10.108.192.165', 27017)
        db = mongodb_utils.get_db()
        userdb = db.train_user
        items1 = userdb.find({"user_id": user_id1}).distinct("item_id")
        items2 = userdb.find({"user_id": user_id2}).distinct("item_id")
        maxcount1 = userdb.find({"user_id": user_id1}).count()
        maxcount2 = userdb.find({"user_id": user_id2}).count()
        fenzi = 0.0   # numerator (dot product over common items)
        fenmu1 = 0.0  # squared norm of user 1's vector
        fenmu2 = 0.0  # squared norm of user 2's vector
        cos = 0
        for i in items1:
            if i in items2:
                x1 = userdb.find({"user_id": user_id1, "item_id": i}).count() / math.log(float(maxcount1))
                x2 = userdb.find({"user_id": user_id2, "item_id": i}).count() / math.log(float(maxcount2))
                fenzi += x1 * x2
                fenmu1 += x1 * x1
                fenmu2 += x2 * x2
        # BUG FIX: the original used `fenzi is not 0.0`, an identity test that
        # is effectively always true, so math.log(0.0) raised ValueError when
        # the two users shared no items.  Compare by value instead.
        if fenzi != 0.0:
            # log-space cosine: log(fenzi) - 0.5 * (log(fenmu1) + log(fenmu2))
            cos = math.log(fenzi) - 0.5 * (math.log(fenmu1) + math.log(fenmu2))
        return cos
    else:
        cos = 0.0
        user1 = data[user_id1]
        user2 = data[user_id2]
        itemlist1 = user1['items']
        itemlist2 = user2['items']
        items1 = itemlist1.keys()
        items2 = itemlist2.keys()
        maxcount1 = math.log(float(user1['behavior_count']))
        maxcount2 = math.log(float(user2['behavior_count']))
        # Numerator/denominators start from a correction term
        # (maxcount1 * maxcount2) so the similarity does not get too close to 1.
        fenzi = maxcount1 * maxcount2
        temp = fenzi  # remember the initial value to detect "no common items"
        fenmu1 = maxcount1 * maxcount1
        fenmu2 = maxcount2 * maxcount2
        common_items1 = {}
        common_items2 = {}
        # Iterate over the smaller item set to minimise the outer loop.
        if len(items1) <= len(items2):
            for i in items1:
                if i in items2:
                    common_items1[i] = itemlist1[i]
                    common_items2[i] = itemlist2[i]
                    x1 = itemlist1[i] / maxcount1
                    x2 = itemlist2[i] / maxcount2
                    fenzi += x1 * x2
                    fenmu1 += x1 * x1
                    fenmu2 += x2 * x2
        else:
            for i in items2:
                if i in items1:
                    common_items1[i] = itemlist1[i]
                    common_items2[i] = itemlist2[i]
                    x1 = itemlist1[i] / maxcount1
                    x2 = itemlist2[i] / maxcount2
                    fenzi += x1 * x2
                    fenmu1 += x1 * x1
                    fenmu2 += x2 * x2
        if fenzi != temp:
            cos = fenzi / math.sqrt(fenmu1 * fenmu2)
            if cos == 1.0:
                # Debug output: similarity of exactly 1 usually means the
                # correction term was ineffective; dump the shared items.
                print(common_items1)
                print(common_items2)
        return cos
Exemple #20
0
def rate_items(output_path=None, write_while_calculate=False):
    """
    Walk all users and compute, for each user, a global rating of every item
    the user has interacted with (only behavior counts are considered; time
    and item category are ignored).

    Per-item rating formula: A1**views + A2**(collects or carts) + A3**buys.

    :param output_path: output file path if partial results should be written
        during the computation
    :param write_while_calculate: True to write each user's result as soon as
        it is computed
    :return: list with one rating dict per user
    """
    A1 = 1.1  # weight base for behavior_type '1' (view)
    A2 = 2    # weight base for behavior_type '2'/'3' (collect / add-to-cart)
    A3 = 5    # weight base for behavior_type '4' (buy)
    mongodb_utils = MongodbUtils(db_address, 27017)
    db = mongodb_utils.get_db()
    train_user = db.train_user
    json_root = []
    logger.info('loading...')
    usercount = 0
    ids = train_user.distinct("user_id")
    logger.info('start!')
    split_json = None
    if output_path is not None and write_while_calculate:
        split_json = open(output_path, 'w')
    try:
        for user_id in ids:
            user_rates_info = {'user_id': user_id}
            user_rates = {}
            itemcount = 0
            usercount += 1
            for item_id in train_user.find({"user_id": user_id}).distinct("item_id"):
                itemcount += 1
                count1 = 0
                count2 = 0
                count3 = 0
                for behavior in train_user.find({"user_id": user_id, "item_id": item_id}):
                    behavior_type = behavior['behavior_type']
                    if behavior_type == '1':
                        count1 += 1
                    elif behavior_type == '2' or behavior_type == '3':
                        count2 += 1
                    elif behavior_type == '4':
                        count3 += 1
                user_rates[item_id] = A1 ** count1 + A2 ** count2 + A3 ** count3
            user_rates_info['item_count'] = itemcount
            user_rates_info['items'] = user_rates
            # Optionally stream the finished user to disk right away.
            if split_json is not None:
                split_json.write(json.dumps(user_rates_info) + '\n')
                logger.debug(str(usercount) + 'th user ' + user_id + ' written.')
            json_root.append(user_rates_info)
    finally:
        # The original never closed this handle; close it so buffered lines
        # are actually flushed to disk.
        if split_json is not None:
            split_json.close()
    logger.info('done!')
    return json_root