def regularized_correlation(size, dot_product, rating_sum,
                            rating2sum, rating_norm_squared,
                            rating2_norm_squared, virtual_cont,
                            prior_correlation):
    '''
    Return the correlation between two vectors A, B shrunk toward a prior.

    RegularizedCorrelation = w * ActualCorrelation + (1 - w) * PriorCorrelation
        where w = # actualPairs / (# actualPairs + # virtualPairs).

    size is the number of actual pairs and virtual_cont the number of
    virtual pairs backing the prior. The first six arguments are forwarded
    unchanged to correlation() to obtain the actual correlation.

    With plain numeric inputs a ZeroDivisionError is raised when
    size + virtual_cont == 0.
    '''
    actual_correlation = correlation(size, dot_product, rating_sum,
                                     rating2sum, rating_norm_squared,
                                     rating2_norm_squared)

    # float() keeps the division true division even for two ints.
    w = size / float(size + virtual_cont)

    # The (1 - w) share belongs to the prior: it has to be scaled by
    # prior_correlation, otherwise the estimate is pulled toward 1.0 no
    # matter which prior the caller asked for.
    return w * actual_correlation + (1.0 - w) * prior_correlation
def calculate_similarity(self, pair_key, lines):
    '''
    Reduce all co-ratings of one item pair to its similarity scores.

    pair_key is unpacked into the two item names. lines is walked exactly
    once; every element is unpacked into the pair of ratings given to the
    first and the second item.

    Yields a single record:
        (item_x, item_y), (correlation, cosine, regularized correlation,
                           jaccard, number of co-ratings)
    The jaccard slot is always 0.0 in this method.

    NOTE(review): earlier documentation stated that the similarities are
    normalized to the [0,1] scale for numerical sorting; no rescaling is
    done in this method, so confirm whether the helpers take care of it.
    '''
    name_x, name_y = pair_key

    total_x = total_y = 0.0
    total_xx = total_yy = total_xy = 0.0
    pair_count = 0

    # Single pass: lines may be a one-shot iterator, so every running sum
    # is accumulated in the same loop.
    for pair_count, (rating_x, rating_y) in enumerate(lines, 1):
        total_x += rating_x
        total_y += rating_y
        total_xx += rating_x * rating_x
        total_yy += rating_y * rating_y
        total_xy += rating_x * rating_y

    # Argument order shared by both correlation helpers.
    moments = (pair_count, total_xy, total_x, total_y, total_xx, total_yy)

    pearson = correlation(*moments)

    priors = (PRIOR_COUNT, PRIOR_CORRELATION)
    shrunk_pearson = regularized_correlation(*(moments + priors))

    cosine_score = cosine(total_xy, sqrt(total_xx), sqrt(total_yy))

    jaccard_score = 0.0

    scores = (pearson, cosine_score, shrunk_pearson, jaccard_score,
              pair_count)
    yield (name_x, name_y), scores