Beispiel #1
0
def calculate_similarity(user_feature, article_feature):
    """Return the user/article cosine similarity as a scaled integer.

    The similarity is read from element ``[0, 0]`` of the
    ``_cosine_similarity`` result, multiplied by 1000 and truncated with
    ``int``.

    Args:
        user_feature: Feature matrix of the user, or ``None`` when the user
            has no features.
        article_feature: Feature matrix of the article.

    Returns:
        int: ``0`` when ``user_feature`` is ``None`` or no similarity value
        was produced, otherwise the scaled similarity.
    """
    # NOTE: user_feature is a matrix and does not support bool operations,
    # so it is compared against None explicitly.
    if user_feature is None:
        return 0
    rst = _cosine_similarity(user_feature, article_feature)
    # The result is a matrix too (assuming an array-like return value):
    # ``not rst`` raises ValueError as soon as it holds more than one
    # element, so test for "nothing computed" explicitly instead.
    if rst is None or rst.size == 0:
        return 0
    return int(rst[0, 0] * 1000)
Beispiel #2
0
def batch_calculate_similarity(user_feature, article_matrix):
    """Return one similarity value per row of ``article_matrix``.

    Args:
        user_feature: Feature matrix of the user, or ``None`` when the user
            has no features.
        article_matrix: Matrix with one article per row; only its
            ``shape[0]`` is used when ``user_feature`` is ``None``.

    Returns:
        list: When ``user_feature`` is ``None``, the placeholder values
        ``0 .. n_articles - 1`` (one per article row). Otherwise the first
        row of the ``_cosine_similarity`` result converted with ``tolist()``.
    """
    # NOTE: user_feature is a matrix and does not support bool operations,
    # so it is compared against None explicitly.
    if user_feature is None:
        # ``xrange`` does not exist on Python 3; ``list(range(...))`` yields
        # the same values on both Python 2 and 3 and matches the list type
        # returned by the regular path below.
        return list(range(article_matrix.shape[0]))
    sims_matrix = _cosine_similarity(user_feature, article_matrix)
    return sims_matrix[0,].tolist()
Beispiel #3
0
def cosine_similarity(x, y):
    """Return element ``[0, 0]`` of ``_cosine_similarity(x, y)``."""
    similarity_matrix = _cosine_similarity(x, y)
    return similarity_matrix[0, 0]
Beispiel #4
0
def _similarity_weighted_deviation(curr_user, user_rest, feature_cols, ratings, rest_average):
    """Return the similarity-weighted mean of ``ratings - rest_average``.

    Args:
        curr_user: Row of the user being scored; ``feature_cols`` is read
            from it.
        user_rest: Frame of the other users who reviewed the restaurant.
        feature_cols: Columns used to compare ``curr_user`` with
            ``user_rest``.
        ratings: Ratings given by the other users (aligned with the
            similarities through the index — presumably user ids; verify
            against ``aggregate``).
        rest_average: Average rating of the restaurant.

    Returns:
        The weighted deviation, or ``0`` when no user has a similarity
        above 0.5 (the weights would sum to zero).
    """
    user_sim = _cosine_similarity(curr_user[feature_cols].values.reshape(1, -1), user_rest[feature_cols])
    user_sim = _pd.Series(data=user_sim[0], index=user_rest.index)
    # Only sufficiently similar users (> 0.5) contribute; the rest get weight 0.
    user_sim = user_sim.where(user_sim > 0.5, 0)
    denominator = user_sim.sum()
    if denominator == 0:
        # No similar user: dividing would yield NaN (0/0). Fall back to
        # "no adjustment", like the case where there are no other reviewers.
        return 0
    numerator = (user_sim * (ratings - rest_average)).sum()
    return numerator / denominator


def sub_set_coll_scores(review_set, review_hist, users, restaurants):
    """Fill the collaborative score columns of ``review_set`` in place.

    For every review row, the score is the user's historical cuisine average
    plus the similarity-weighted deviation of the other reviewers' ratings
    from the restaurant average. Three variants are written:
    ``coll_score``, ``coll_score_bin`` and ``coll_score_real``.

    Args:
        review_set: Reviews to score; read row by row and updated in place.
        review_hist: Historical reviews, combined with ``review_set`` to
            find the other reviewers of each restaurant.
        users: User frame indexed by user id.
        restaurants: Restaurant frame indexed by business id.

    Returns:
        None. Results are written into ``review_set``.
    """
    tot = review_set.shape[0]
    rest_id = None
    out_cols = ['coll_score', 'coll_score_bin', 'coll_score_real']

    for count, (rid, row) in enumerate(review_set.iterrows(), start=1):
        user_id = row['user_id']
        curr_user = users.loc[user_id]
        old_rest_id = rest_id
        rest_id = row['business_id']

        # The per-restaurant frames are rebuilt only when the restaurant
        # changes between consecutive rows (presumably the caller sorts
        # review_set by business_id — verify against caller).
        if old_rest_id != rest_id:
            review_rest_new = review_set.loc[review_set.business_id == rest_id, rev_cols]
            review_rest_old = review_hist.loc[review_hist.business_id == rest_id, rev_cols]
            tmp_review_rest = _pd.concat([review_rest_new, review_rest_old])
            tmp_review_rest = tmp_review_rest.groupby('user_id').apply(aggregate)
            tmp_user_rest = users.loc[users.index.isin(tmp_review_rest.index)]

        # Exclude the current user from their own neighbourhood.
        review_rest = tmp_review_rest.drop(user_id)
        user_rest = tmp_user_rest.drop(user_id)
        assert review_rest.shape[0] == user_rest.shape[0], "different shapes: " + str(review_rest.shape) + " vs " + str(user_rest.shape)

        a_u = row['cuisine_av_hist']
        a_u_bin = row['cuisine_av_hist_bin']
        a_u_real = row['cuisine_av_hist_real']

        if user_rest.empty:
            # Nobody else reviewed this restaurant: no adjustment.
            res = 0
            res_bin = 0
            res_real = 0
        else:
            a_r = restaurants.loc[rest_id, 'average_stars']
            res = _similarity_weighted_deviation(
                curr_user, user_rest, cols_std, review_rest['stars'], a_r)

            a_r_bin = restaurants.loc[rest_id, 'average_stars_bin']
            # Only the "bin" variant replaces missing ratings with the
            # restaurant average before weighting.
            a_u_r_bin = review_rest['stars_bin'].fillna(a_r_bin)
            res_bin = _similarity_weighted_deviation(
                curr_user, user_rest, cols_bin, a_u_r_bin, a_r_bin)

            a_r_real = restaurants.loc[rest_id, 'average_stars_real']
            res_real = _similarity_weighted_deviation(
                curr_user, user_rest, cols_real, review_rest['stars_real'], a_r_real)

        vals = [a_u + res, a_u_bin + res_bin, a_u_real + res_real]
        review_set.loc[rid, out_cols] = vals

        # Progress report every 1000 rows.
        if count % 1000 == 0:
            percent = (count / tot) * 100
            print("process {4}\t- row {0}/{1}\t- {2:.3f}%\t- {3}"
                  .format(count, tot, percent, _time.asctime(), _os.getpid()))