def calcular_similitud(self, pair_key, lines):
    """Sum the co-rating components of an item pair over every user who
    rated both items, then yield the pair's similarities.

    The similarities are normalized to [0, 1] because a numeric sort is
    applied downstream.

    Yields ((item_x, item_y), (pearson, cosine, regularized_pearson,
    jaccard, n)) where n is the number of co-ratings.
    """
    item_xname, item_yname = pair_key
    sum_x = sum_y = sum_xx = sum_yy = sum_xy = 0.0
    n = 0
    for rating_x, rating_y in lines:
        sum_x += rating_x
        sum_y += rating_y
        sum_xx += rating_x * rating_x
        sum_yy += rating_y * rating_y
        sum_xy += rating_x * rating_y
        n += 1
    corr_sim = correlation(n, sum_xy, sum_x, sum_y, sum_xx, sum_yy)
    reg_corr_sim = regularized_correlation(
        n, sum_xy, sum_x, sum_y, sum_xx, sum_yy,
        PRIOR_COUNT, PRIOR_CORRELATION)
    cos_sim = cosine(sum_xy, sqrt(sum_xx), sqrt(sum_yy))
    # Per-item rating totals are not available in this variant, so the
    # Jaccard slot is a constant placeholder.
    jaccard_sim = 0.0
    yield (item_xname, item_yname), (corr_sim, cos_sim, reg_corr_sim,
                                     jaccard_sim, n)
def Cal_similarity(self, itemids, ratings):
    """Compute similarity measures for one item pair from its co-ratings.

    Parameters
    ----------
    itemids : tuple
        The (item_x, item_y) pair key, e.g. ``(1, 2)``.
    ratings : iterable of tuple
        Co-rating pairs ``(x_rating, y_rating)`` from users who rated
        both items, e.g. ``(5.0, 4.5)``.

    Yields ``((item_x, item_y), (cosine_sim, euclidean_dist,
    pearson_dist, n))``, but only for pairs supported by more than 20
    co-ratings.
    """
    itemx, itemy = itemids
    sum_xx = sum_yy = sum_xy = sum_x = sum_y = sumx_y = 0.0
    n = 0
    xs = []
    ys = []
    for x, y in ratings:
        sumx_y += (x - y) * (x - y)
        sum_xx += x * x
        sum_yy += y * y
        # BUG FIX: the original used '=', so sum_xy held only the last
        # product instead of the accumulated dot product fed to cosine().
        sum_xy += x * y
        sum_x += x
        sum_y += y
        xs.append(x)
        ys.append(y)
        n += 1
    dev_x = np.asarray(xs, dtype=float)
    dev_y = np.asarray(ys, dtype=float)
    if n:  # avoid np.mean warning on an empty pair
        dev_x = dev_x - np.mean(dev_x)
        dev_y = dev_y - np.mean(dev_y)
    # Centered norms for the Pearson denominator.
    sumx2 = float(np.dot(dev_x, dev_x))
    sumy2 = float(np.dot(dev_y, dev_y))
    # BUG FIX: the original numerator summed (x-mx)^2 * (y-my)^2, which
    # is not the Pearson numerator; use the centered cross product.
    # NOTE(review): assumes pearsondistance(num, |x|, |y|) expects the
    # standard Pearson numerator — confirm against its definition.
    cross = float(np.dot(dev_x, dev_y))
    cos_sim = cosine(sum_xy, sqrt(sum_xx), sqrt(sum_yy))
    pdis = pearsondistance(cross, sqrt(sumx2), sqrt(sumy2))
    edis = EuclideanDistance(n, sumx_y)
    if n > 20:
        yield (itemx, itemy), (cos_sim, edis, pdis, n)
def Cal_similarity(self, itemids, ratings):
    """Compute similarity measures for one item pair from its co-ratings.

    Parameters
    ----------
    itemids : tuple
        The (item_x, item_y) pair key, e.g. ``(1, 2)``.
    ratings : iterable of tuple
        Co-rating pairs ``(x_rating, y_rating)`` from users who rated
        both items, e.g. ``(5.0, 4.5)``.

    Yields ``((item_x, item_y), (cosine_sim, euclidean_dist,
    pearson_dist, n))``, but only for pairs supported by more than 20
    co-ratings.
    """
    itemx, itemy = itemids
    sum_xx = sum_yy = sum_xy = sum_x = sum_y = sumx_y = 0.0
    n = 0
    xs = []
    ys = []
    for x, y in ratings:
        sumx_y += (x - y) * (x - y)
        sum_xx += x * x
        sum_yy += y * y
        # BUG FIX: the original used '=', so sum_xy held only the last
        # product instead of the accumulated dot product fed to cosine().
        sum_xy += x * y
        sum_x += x
        sum_y += y
        xs.append(x)
        ys.append(y)
        n += 1
    dev_x = np.asarray(xs, dtype=float)
    dev_y = np.asarray(ys, dtype=float)
    if n:  # avoid np.mean warning on an empty pair
        dev_x = dev_x - np.mean(dev_x)
        dev_y = dev_y - np.mean(dev_y)
    # Centered norms for the Pearson denominator.
    sumx2 = float(np.dot(dev_x, dev_x))
    sumy2 = float(np.dot(dev_y, dev_y))
    # BUG FIX: the original numerator summed (x-mx)^2 * (y-my)^2, which
    # is not the Pearson numerator; use the centered cross product.
    # NOTE(review): assumes pearsondistance(num, |x|, |y|) expects the
    # standard Pearson numerator — confirm against its definition.
    cross = float(np.dot(dev_x, dev_y))
    cos_sim = cosine(sum_xy, sqrt(sum_xx), sqrt(sum_yy))
    pdis = pearsondistance(cross, sqrt(sumx2), sqrt(sumy2))
    edis = EuclideanDistance(n, sumx_y)
    if n > 20:
        yield (itemx, itemy), (cos_sim, edis, pdis, n)
def calculate_similarity(self, pair_key, lines):
    """Aggregate co-rating sums for an item pair across every user who
    rated both items, then yield the pair's similarities and co-rating
    count, normalized to [0, 1] so a numeric sort works downstream.

    Example output::

        19,21 0.4,2
        21,19 0.4,2
        19,70 0.6,1
        70,19 0.6,1
        21,70 0.1,1
        70,21 0.1,1
    """
    item_xname, item_yname = pair_key
    n = 0
    sum_x = sum_y = sum_xx = sum_yy = sum_xy = 0.0
    for rating_x, rating_y in lines:
        n += 1
        sum_x += rating_x
        sum_y += rating_y
        sum_xx += rating_x * rating_x
        sum_yy += rating_y * rating_y
        sum_xy += rating_x * rating_y
    corr_sim = correlation(n, sum_xy, sum_x, sum_y, sum_xx, sum_yy)
    reg_corr_sim = regularized_correlation(
        n, sum_xy, sum_x, sum_y, sum_xx, sum_yy,
        PRIOR_COUNT, PRIOR_CORRELATION)
    cos_sim = cosine(sum_xy, sqrt(sum_xx), sqrt(sum_yy))
    # Per-item rating totals are not available in this variant, so the
    # Jaccard slot is a constant placeholder.
    jaccard_sim = 0.0
    yield (item_xname, item_yname), (corr_sim, cos_sim, reg_corr_sim,
                                     jaccard_sim, n)
def calculate_similarity(self, pair_key, lines):
    """Sum the co-rating components of an item pair and yield its
    similarities (Pearson, cosine, regularized Pearson) together with
    the number of co-ratings n.
    """
    item_xname, item_yname = pair_key
    n = 0
    sum_x = sum_y = sum_xx = sum_yy = sum_xy = 0.0
    for rating_x, rating_y in lines:
        n += 1
        sum_x += rating_x
        sum_y += rating_y
        sum_xx += rating_x * rating_x
        sum_yy += rating_y * rating_y
        sum_xy += rating_x * rating_y
    corr_sim = correlation(n, sum_xy, sum_x, sum_y, sum_xx, sum_yy)
    reg_corr_sim = regularized_correlation(
        n, sum_xy, sum_x, sum_y, sum_xx, sum_yy,
        PRIOR_COUNT, PRIOR_CORRELATION)
    cos_sim = cosine(sum_xy, sqrt(sum_xx), sqrt(sum_yy))
    # Per-item rating totals are not available in this variant, so the
    # Jaccard slot is a constant placeholder.
    jaccard_sim = 0.0
    yield (item_xname, item_yname), (corr_sim, cos_sim, reg_corr_sim,
                                     jaccard_sim, n)
def calculate_similarity(self, pair_key, lines):
    """Sum the co-rating components of an item pair and yield Pearson,
    cosine, regularized-Pearson and Jaccard similarities plus the
    co-rating count n.

    Each record in ``lines`` is ``(item_x_rating, item_y_rating,
    nx_count, ny_count)`` where nx_count/ny_count carry the per-item
    rating totals needed for the Jaccard similarity.
    """
    item_xname, item_yname = pair_key
    sum_x = sum_y = sum_xx = sum_yy = sum_xy = 0.0
    n = 0
    n_x, n_y = 0, 0
    for item_x, item_y, nx_count, ny_count in lines:
        sum_xx += item_x * item_x
        sum_yy += item_y * item_y
        sum_xy += item_x * item_y
        sum_y += item_y
        sum_x += item_x
        n += 1
        # BUG FIX: the original crossed these assignments
        # (n_x = int(ny_count); n_y = int(nx_count)), which is only
        # harmless if jaccard() is symmetric in its last two arguments.
        n_x = int(nx_count)
        n_y = int(ny_count)
    corr_sim = correlation(n, sum_xy, sum_x, sum_y, sum_xx, sum_yy)
    reg_corr_sim = regularized_correlation(
        n, sum_xy, sum_x, sum_y, sum_xx, sum_yy,
        PRIOR_COUNT, PRIOR_CORRELATION)
    cos_sim = cosine(sum_xy, sqrt(sum_xx), sqrt(sum_yy))
    jaccard_sim = jaccard(n, n_x, n_y)
    yield (item_xname, item_yname), (corr_sim, cos_sim, reg_corr_sim,
                                     jaccard_sim, n)