def fit(self, topK=50, shrink=100, similarity='cosine', feature_weighting="none", **similarity_args):
    """Compute a top-K similarity matrix from ``URM_train`` and store it in ``W_sparse``.

    Parameters
    ----------
    topK : int
        Number of neighbours requested. It is incremented by one before use
        because similaripy also returns the self similarity, which is set to
        zero afterwards; ``self.topK`` stores the incremented value.
    shrink : number
        Shrink term forwarded to the similaripy function.
    similarity : str
        One of ``"cosine"``, ``"jaccard"``, ``"dice"``, ``"tversky"`` or
        ``"splus"``.
    feature_weighting : str
        Must be one of ``self.FEATURE_WEIGHTING_VALUES``. ``"BM25"`` and
        ``"TF-IDF"`` replace ``self.URM_train`` with a re-weighted float32
        CSR matrix before the similarity is computed.
    **similarity_args
        Extra keyword arguments forwarded unchanged to the similaripy function.

    Raises
    ------
    ValueError
        If ``feature_weighting`` or ``similarity`` is not recognised.
    """
    # Similaripy returns also self similarity, which will be set to 0 afterwards
    topK += 1
    self.topK = topK
    self.shrink = shrink

    if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError(
            "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
            .format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

    # Public similarity name -> attribute name on the similaripy module.
    # The original if/elif chain tested "jaccard" twice, which made the
    # tversky branch unreachable.
    similarity_functions = {
        "cosine": "cosine",
        "jaccard": "jaccard",
        "dice": "dice",
        "tversky": "tversky",
        "splus": "s_plus",
    }
    # Reject an unknown name before URM_train is modified below.
    if similarity not in similarity_functions:
        raise ValueError(
            "Unknown value '{}' for similarity".format(similarity))

    if feature_weighting == "BM25":
        self.URM_train = self.URM_train.astype(np.float32)
        self.URM_train = okapi_BM_25(self.URM_train.T).T
        self.URM_train = check_matrix(self.URM_train, 'csr')
    elif feature_weighting == "TF-IDF":
        self.URM_train = self.URM_train.astype(np.float32)
        self.URM_train = TF_IDF(self.URM_train.T).T
        self.URM_train = check_matrix(self.URM_train, 'csr')

    compute_similarity = getattr(sim, similarity_functions[similarity])
    self.W_sparse = compute_similarity(self.URM_train, k=topK, shrink=shrink, **similarity_args)

    # Drop the self similarity that the extra neighbour was reserved for.
    self.W_sparse.setdiag(0)
    self.W_sparse = self.W_sparse.transpose().tocsr()
def fit(self, topK=50, shrink=100, similarity='cosine', feature_weighting="none"):
    """Compute a top-K similarity matrix from ``ICM_train`` and store it in ``W_sparse``.

    Parameters
    ----------
    topK : int
        Value passed as ``k`` to the similaripy function; stored in ``self.topK``.
    shrink : number
        Shrink term passed to the similaripy function; stored in ``self.shrink``.
    similarity : str
        One of ``"cosine"``, ``"s_plus"``, ``"dice"``, ``"rp3beta"``,
        ``"p3alpha"`` or ``"jaccard"``. ``"rp3beta"`` is always called with
        ``alpha=0.3`` and ``beta=0.61``.
    feature_weighting : str
        ``"bm25"``, ``"bm25plus"`` or ``"tfidf"`` replace ``self.ICM_train``
        with the corresponding ``similaripy.normalization`` output; any other
        value leaves it untouched.

    Raises
    ------
    ValueError
        If ``similarity`` is not one of the supported names.
    """
    self.topK = topK
    self.shrink = shrink

    # similarity name -> (similaripy function name, metric-specific keyword arguments)
    similarity_specs = {
        "cosine": ("cosine", {}),
        "s_plus": ("s_plus", {}),
        "dice": ("dice", {}),
        "rp3beta": ("rp3beta", {"alpha": 0.3, "beta": 0.61}),
        "p3alpha": ("p3alpha", {}),
        "jaccard": ("jaccard", {}),
    }
    # Fail early: previously an unknown name crashed with UnboundLocalError
    # only after ICM_train had already been overwritten.
    if similarity not in similarity_specs:
        raise ValueError(
            "Unknown value '{}' for similarity".format(similarity))

    # NOTE(review): the weighting overwrites self.ICM_train, so calling fit
    # twice weights the matrix twice - confirm this is intended.
    if feature_weighting == "bm25":
        self.ICM_train = similaripy.normalization.bm25(self.ICM_train)
    elif feature_weighting == "bm25plus":
        self.ICM_train = similaripy.normalization.bm25plus(self.ICM_train)
    elif feature_weighting == "tfidf":
        self.ICM_train = similaripy.normalization.tfidf(self.ICM_train)

    function_name, extra_kwargs = similarity_specs[similarity]
    compute_similarity = getattr(similaripy, function_name)
    similarity_matrix = compute_similarity(
        self.ICM_train,
        k=self.topK,
        shrink=self.shrink,
        binary=False,
        verbose=False,
        **extra_kwargs
    )

    self.W_sparse = similarity_matrix.transpose().tocsr()
    self.W_sparse = check_matrix(self.W_sparse, format='csr')
def fit(self, topK=50, shrink=100, similarity='cosine', pre_normalization="none", post_normalization="none", **similarity_args):
    """Compute a top-K similarity matrix from ``URM_train`` and store it in ``W_sparse``.

    Parameters
    ----------
    topK : int
        Value passed as ``k`` to the similaripy function; stored in ``self.topK``.
    shrink : number
        Shrink term passed to the similaripy function; stored in ``self.shrink``.
    similarity : str
        One of ``"cosine"``, ``"s_plus"``, ``"dice"``, ``"rp3beta"``
        (``alpha=0.3``, ``beta=0.61``), ``"asym"`` (asymmetric cosine,
        ``alpha=0.5``) or ``"jaccard"``.
    pre_normalization : str
        ``"bm25plus"`` or ``"tfidf"`` normalise a copy of the interactions
        used only for the similarity computation; any other value uses
        ``self.URM_train`` as is.
    post_normalization : str
        ``"bm25plus_once"``, ``"bm25plus_twice"``, ``"tfidf"`` or ``"bm25"``
        replace ``self.URM_train`` after the similarity has been computed;
        any other value leaves it untouched.
    **similarity_args
        Accepted but currently not used by this method.

    Raises
    ------
    ValueError
        If ``similarity`` is not one of the supported names.
    """
    self.topK = topK
    self.shrink = shrink

    # similarity name -> (similaripy function name, metric-specific keyword arguments)
    similarity_specs = {
        "cosine": ("cosine", {}),
        "s_plus": ("s_plus", {}),
        "dice": ("dice", {}),
        "rp3beta": ("rp3beta", {"alpha": 0.3, "beta": 0.61}),
        "asym": ("asymmetric_cosine", {"alpha": 0.5}),
        "jaccard": ("jaccard", {}),
    }
    # Fail early: previously an unknown name surfaced as UnboundLocalError
    # at the very end, after URM_train could already have been replaced.
    if similarity not in similarity_specs:
        raise ValueError(
            "Unknown value '{}' for similarity".format(similarity))

    interactions = self.URM_train
    if pre_normalization == "bm25plus":
        interactions = similaripy.normalization.bm25plus(
            self.URM_train, axis=1, k1=1.2, b=0.75, delta=0.85,
            tf_mode="raw", idf_mode="bm25", inplace=False)
    elif pre_normalization == "tfidf":
        interactions = similaripy.normalization.tfidf(self.URM_train, axis=1)

    function_name, extra_kwargs = similarity_specs[similarity]
    compute_similarity = getattr(similaripy, function_name)
    similarity_matrix = compute_similarity(
        interactions,
        k=self.topK,
        shrink=self.shrink,
        binary=False,
        verbose=False,
        **extra_kwargs
    )
    similarity_matrix = similarity_matrix.transpose().tocsr()

    # Post-normalisation only touches URM_train; the similarity is already fixed.
    post_bm25plus_kwargs = dict(axis=1, k1=1.2, b=0.75, delta=0.8,
                                tf_mode='raw', idf_mode='bm25', inplace=False)
    if post_normalization == "bm25plus_once":
        self.URM_train = similaripy.normalization.bm25plus(self.URM_train, **post_bm25plus_kwargs)
    elif post_normalization == "bm25plus_twice":
        self.URM_train = similaripy.normalization.bm25plus(self.URM_train, **post_bm25plus_kwargs)
        self.URM_train = similaripy.normalization.bm25plus(self.URM_train, **post_bm25plus_kwargs)
    elif post_normalization == "tfidf":
        self.URM_train = similaripy.normalization.tfidf(self.URM_train, axis=1)
    elif post_normalization == "bm25":
        self.URM_train = similaripy.normalization.bm25(self.URM_train, axis=1)

    self.W_sparse = similarity_matrix
    self.W_sparse = check_matrix(self.W_sparse, format='csr')
choices=['thresh', 'knn'], type=str, required=True) args = parser.parse_args() # first load the data df_train = pd.read_csv(f'../dataset/{args.split}/train.csv', escapechar="\\") df_test = pd.read_csv(f'../dataset/{args.split}/test.csv', escapechar="\\") # ALWAYS sort the data by record_id df_train = df_train.sort_values(by=['record_id']).reset_index(drop=True) df_test = df_test.sort_values(by=['record_id']).reset_index(drop=True) df_train.address = df_train.address.fillna('').astype(str) df_test.address = df_test.address.fillna('').astype(str) corpus = list(df_train.address) + list(df_test.address) vectorizer = CountVectorizer(preprocessor=remove_spaces, analyzer=remove_spaces) X = vectorizer.fit_transform(corpus) X_train = X[:df_train.shape[0], :] X_test = X[df_train.shape[0]:, :] if args.mode == 'thresh': cosmatrixxx = sim.jaccard(X_test, X_train.T, k=2000) cosmatrixxx.data[cosmatrixxx.data <= 0.3] = 0 else: cosmatrixxx = sim.jaccard(X_test, X_train.T, k=300) if not os.path.isdir(f"../dataset/{args.split}/similarities"): os.makedirs(f"../dataset/{args.split}/similarities") save_npz( f'../dataset/{args.split}/similarities/jaccard_uncleaned_address_300k_{args.split}_2ngrams.npz', cosmatrixxx.tocsr())
def fit(self, topK=50, shrink=100, similarity='cosine', normalization="none", feature_weighting="none", **similarity_args):
    """Compute a top-K similarity matrix from the transposed URM stacked with the ICM.

    The ICM is obtained from ``DataReader().load_icm()``. The matrix given to
    similaripy is ``hstack((URM_train.T, icm))`` and the result is stored in
    ``self.W_sparse`` in CSR format.

    Parameters
    ----------
    topK : int
        Value passed as ``k`` to the similaripy function; stored in ``self.topK``.
    shrink : number
        Shrink term passed to the similaripy function; stored in ``self.shrink``.
    similarity : str
        One of ``"cosine"``, ``"dice"``, ``"jaccard"``, ``"asym"``
        (asymmetric cosine) or ``"rp3beta"`` (``alpha=0.3``, ``beta=0.61``).
    normalization : str
        ``"bm25"``, ``"tfidf"`` or ``"bm25plus"`` replace ``self.URM_train``
        with the normalised matrix; any other value leaves it untouched.
    feature_weighting : str
        ``"bm25"``, ``"tfidf"`` or ``"bm25plus"`` normalise the loaded ICM;
        any other value leaves it untouched.
    **similarity_args
        Accepted but currently not used by this method.

    Raises
    ------
    ValueError
        If ``similarity`` is not one of the supported names.
    """
    self.topK = topK
    self.shrink = shrink

    # similarity name -> (similaripy function name, metric-specific keyword arguments)
    similarity_specs = {
        "cosine": ("cosine", {}),
        "dice": ("dice", {}),
        "jaccard": ("jaccard", {}),
        "asym": ("asymmetric_cosine", {}),
        "rp3beta": ("rp3beta", {"alpha": 0.3, "beta": 0.61}),
    }
    # Fail before loading the ICM or touching URM_train: previously an unknown
    # name only surfaced as UnboundLocalError after all of that work.
    if similarity not in similarity_specs:
        raise ValueError(
            "Unknown value '{}' for similarity".format(similarity))

    reader = DataReader()
    icm = reader.load_icm()

    if normalization == "bm25":
        self.URM_train = similaripy.normalization.bm25(self.URM_train, axis=1)
    elif normalization == "tfidf":
        self.URM_train = similaripy.normalization.tfidf(self.URM_train, axis=1)
    elif normalization == "bm25plus":
        self.URM_train = similaripy.normalization.bm25plus(self.URM_train, axis=1)

    if feature_weighting == "bm25":
        icm = similaripy.normalization.bm25(icm, axis=1)
    elif feature_weighting == "tfidf":
        icm = similaripy.normalization.tfidf(icm, axis=1)
    elif feature_weighting == "bm25plus":
        icm = similaripy.normalization.bm25plus(icm, axis=1)

    matrix = sps.hstack((self.URM_train.transpose().tocsr(), icm))

    function_name, extra_kwargs = similarity_specs[similarity]
    compute_similarity = getattr(similaripy, function_name)
    similarity_matrix = compute_similarity(
        matrix,
        k=self.topK,
        shrink=self.shrink,
        binary=False,
        threshold=0,
        **extra_kwargs
    )

    self.W_sparse = similarity_matrix
    self.W_sparse = check_matrix(self.W_sparse, format='csr')
def check_similarity(m, k, rtol=0.0001, full=False):
    """Compare each ``sim.*`` similarity of ``m`` with its ``py_*`` counterpart.

    For every metric the ``check_sum`` of both results must agree within
    ``rtol``. When ``full`` is true, ``check_full`` must additionally return 0
    for every pair. A failed comparison raises through ``np.testing``.
    Returns ``None``.
    """
    # Results from the sim module, in the order they are checked.
    fast_results = [
        sim.dot_product(m, k=k),
        sim.cosine(m, k=k),
        sim.asymmetric_cosine(m, alpha=0.2, k=k),
        sim.jaccard(m, k=k),
        sim.dice(m, k=k),
        sim.tversky(m, alpha=0.8, beta=0.4, k=k),
        sim.p3alpha(m, alpha=0.8, k=k),
        sim.rp3beta(m, alpha=0.8, beta=0.4, k=k),
    ]

    # Results from the py_* implementations, in the same order.
    reference_results = [
        py_dot(m, k),
        py_cosine(m, k).tocsr(),
        py_asy_cosine(m, 0.2, k=k),
        py_jaccard(m, k),
        py_dice(m, k),
        py_tversky(m, alpha=0.8, beta=0.4, k=k),
        py_p3alpha(m, alpha=0.8, k=k),
        py_rp3beta(m, alpha=0.8, beta=0.4, k=k),
    ]

    failure_messages = [
        'dot error',
        'cosine error',
        'asy_cosine error',
        'jaccard error',
        'dice error',
        'tversky error',
        'p3alpha error',
        'rp3beta error',
    ]

    triples = list(zip(fast_results, reference_results, failure_messages))

    # Aggregate comparison first, for every metric.
    for fast, reference, message in triples:
        np.testing.assert_allclose(check_sum(fast), check_sum(reference),
                                   rtol=rtol, err_msg=message)

    # Optional full-row comparison.
    if not full:
        return
    for fast, reference, message in triples:
        np.testing.assert_(check_full(fast, reference, rtol) == 0, msg=message)
    return
def fit(self, matrix, k, distance, shrink=0, threshold=0, implicit=True, alpha=None, beta=None, l=None, c=None, verbose=False):
    """
    Initialize the model and compute the similarity matrix S with a distance metric.
    Access the similarity matrix using: self._sim_matrix

    Parameters
    ----------
    matrix : csr_matrix
        A sparse matrix. For example, it can be the URM of shape (number_users, number_items).
    k : int
        K nearest neighbours to consider.
    distance : str
        One of the supported distance metrics (the ``SIM_*`` constants of this class).
    shrink : float, optional
        Shrink term used in the normalization.
    threshold : float, optional
        All the values under this value are cut from the final result.
    implicit : bool, optional
        If true, treat the URM as implicit, otherwise consider explicit ratings
        (real values) in the URM. Passed to similaripy as ``binary``.
    alpha : float, optional, included in [0,1]
    beta : float, optional, included in [0,1]
    l : float, optional
        Balance coefficient used in s_plus distance, included in [0,1].
    c : float, optional
        Cosine coefficient, included in [0,1].
    verbose : bool, optional
        Currently not used by this method.

    Returns
    -------
    The computed similarity matrix, or ``None`` (after logging an error) when
    the parameters or the distance metric are invalid.
    """
    # Remember which coefficients were omitted BEFORE they are replaced by the
    # -1 sentinel; testing `alpha is None` after the replacement can never be
    # true, which silently disabled the p3alpha / rp3beta checks.
    alpha_missing = alpha is None
    beta_missing = beta is None

    # -1 lies outside [0, 1], so the range checks below reject unset values.
    alpha = -1 if alpha is None else alpha
    beta = -1 if beta is None else beta
    l = -1 if l is None else l
    c = -1 if c is None else c

    if distance == self.SIM_ASYMCOSINE and not (0 <= alpha <= 1):
        log.error(
            'Invalid parameter alpha in asymmetric cosine similarity!')
        return
    if distance == self.SIM_TVERSKY and not (0 <= alpha <= 1 and 0 <= beta <= 1):
        log.error('Invalid parameter alpha/beta in tversky similarity!')
        return
    if distance == self.SIM_P3ALPHA and alpha_missing:
        log.error('Invalid parameter alpha in p3alpha similarity')
        return
    # NOTE(review): as originally written this only rejects the call when
    # BOTH alpha and beta are missing - confirm whether `or` was intended.
    if distance == self.SIM_RP3BETA and alpha_missing and beta_missing:
        log.error('Invalid parameter alpha/beta in rp3beta similarity')
        return
    if distance == self.SIM_SPLUS and not (0 <= l <= 1 and 0 <= c <= 1 and 0 <= alpha <= 1 and 0 <= beta <= 1):
        log.error('Invalid parameter alpha/beta/l/c in s_plus similarity')
        return

    # compute and store the similarity matrix using one of the distance metrics: S = R•R'
    if distance == self.SIM_COSINE:
        self._sim_matrix = sim.cosine(matrix, k=k, shrink=shrink, threshold=threshold, binary=implicit)
    elif distance == self.SIM_ASYMCOSINE:
        self._sim_matrix = sim.asymmetric_cosine(matrix, k=k, shrink=shrink, threshold=threshold, binary=implicit, alpha=alpha)
    elif distance == self.SIM_JACCARD:
        self._sim_matrix = sim.jaccard(matrix, k=k, shrink=shrink, threshold=threshold, binary=implicit)
    elif distance == self.SIM_DICE:
        self._sim_matrix = sim.dice(matrix, k=k, shrink=shrink, threshold=threshold, binary=implicit)
    elif distance == self.SIM_TVERSKY:
        self._sim_matrix = sim.tversky(matrix, k=k, shrink=shrink, threshold=threshold, binary=implicit, alpha=alpha, beta=beta)
    elif distance == self.SIM_P3ALPHA:
        self._sim_matrix = sim.p3alpha(matrix, k=k, shrink=shrink, threshold=threshold, binary=implicit, alpha=alpha)
    elif distance == self.SIM_RP3BETA:
        self._sim_matrix = sim.rp3beta(matrix, k=k, shrink=shrink, threshold=threshold, binary=implicit, alpha=alpha, beta=beta)
    elif distance == self.SIM_SPLUS:
        # alpha and beta are passed to s_plus as t1 and t2.
        self._sim_matrix = sim.s_plus(matrix, k=k, shrink=shrink, threshold=threshold, binary=implicit, l=l, t1=alpha, t2=beta, c=c)
    else:
        log.error('Invalid distance metric: {}'.format(distance))
        # Nothing was computed: do not hand back a matrix left over from an
        # earlier fit (or fail on a missing attribute).
        return

    return self._sim_matrix
# Command-line interface: the only option selects the dataset split.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-s",
    "--split",
    help="The dataset split to use",
    choices=['original', 'validation'],
    type=str,
    required=True,
)
args = parser.parse_args()

# Load the train and test partitions of the chosen split.
df_train = pd.read_csv(f"../dataset/{args.split}/train.csv", escapechar="\\")
df_test = pd.read_csv(f"../dataset/{args.split}/test.csv", escapechar="\\")

# ALWAYS keep the rows ordered by record_id, with a fresh 0..n-1 index.
df_train = df_train.sort_values(by=['record_id'])
df_train = df_train.reset_index(drop=True)
df_test = df_test.sort_values(by=['record_id'])
df_test = df_test.reset_index(drop=True)

# Force the address column to strings in both partitions.
df_train['address'] = df_train['address'].astype(str)
df_test['address'] = df_test['address'].astype(str)

# One list with every address (train first, then test) to fit TF-IDF on.
all_adds = [*df_train.address, *df_test.address]

# TF-IDF over the tokens produced by the `ngrams` analyzer.
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(all_adds)

# Split the rows back into train / test
# (691440 is the length of the train set).
train_rows = len(df_train)
tf_idf_train = tf_idf_matrix[:train_rows, :]
tf_idf_test = tf_idf_matrix[train_rows:, :]

# Keep the 300 best train matches for each test row and save them as CSR.
jac = sim.jaccard(tf_idf_test, tf_idf_train.T, k=300)
save_npz(f'jaccard_tfidf_address_{args.split}.npz', jac.tocsr())
def similarity(matrix, k=100, sim_type='cosine', shrink=0, threshold=0, implicit=True, alpha=None, beta=None, l=None, c=None):
    """Compute a top-``k`` similarity on the transpose of ``matrix``.

    ``sim_type`` selects the ``sim`` function: ``'cosine'``, ``'asymcosine'``,
    ``'jaccard'``, ``'tversky'``, ``'p3alpha'``, ``'rp3beta'``, ``'splus'`` or
    ``'dice'``. ``alpha`` / ``beta`` are forwarded only to the metrics that
    take them; ``l`` and ``c`` are accepted but not forwarded (s_plus is
    called without them). ``implicit`` is passed as ``binary``.

    Returns the matrix produced by ``sim``; for an unrecognised ``sim_type``
    an error line is printed and ``None`` is returned.
    """
    transposed = matrix.T
    # Keyword arguments shared by every metric.
    common = dict(k=k, shrink=shrink, threshold=threshold, binary=implicit)

    if sim_type == 'cosine':
        return sim.cosine(transposed, **common)
    if sim_type == 'asymcosine':
        return sim.asymmetric_cosine(transposed, alpha=alpha, **common)
    if sim_type == 'jaccard':
        return sim.jaccard(transposed, **common)
    if sim_type == 'tversky':
        return sim.tversky(transposed, alpha=alpha, beta=beta, **common)
    if sim_type == 'p3alpha':
        return sim.p3alpha(transposed, alpha=alpha, **common)
    if sim_type == 'rp3beta':
        return sim.rp3beta(transposed, alpha=alpha, beta=beta, **common)
    if sim_type == 'splus':
        # l, t1, t2 and c are intentionally left at the library defaults here.
        return sim.s_plus(transposed, **common)
    if sim_type == 'dice':
        return sim.dice(transposed, **common)

    print('Error wrong distance metric')
    return None
def fit(self):
    """Compute the similarity matrix selected by ``self.distance``.

    Reads ``self.matrix``, ``self.k``, ``self.shrink``, ``self.threshold``,
    ``self.implicit`` and the coefficients ``self.alpha``, ``self.beta``,
    ``self.l`` and ``self.c``. Coefficients that are ``None`` are replaced on
    the instance by ``-1`` so that the [0, 1] range checks reject them.

    The result is stored in ``self._sim_matrix`` and returned. For the s_plus
    metric the result is additionally L2-normalised along axis 0.

    Returns ``None`` (after logging an error) when a coefficient required by
    the chosen metric is invalid or the metric is unknown.
    """
    # These two checks depend on a coefficient being unset, so they must run
    # BEFORE None is replaced by the -1 sentinel; after the replacement
    # `self.alpha is None` can never be true and the checks were dead code.
    if self.distance == self.SIM_P3ALPHA and self.alpha is None:
        log.error('Invalid parameter alpha in p3alpha Similarity_MFD')
        return
    # NOTE(review): as originally written this only rejects the call when
    # BOTH alpha and beta are missing - confirm whether `or` was intended.
    if self.distance == self.SIM_RP3BETA and self.alpha is None and self.beta is None:
        log.error('Invalid parameter alpha/beta in rp3beta Similarity_MFD')
        return

    # -1 lies outside [0, 1], so the range checks below reject unset values.
    self.alpha = -1 if self.alpha is None else self.alpha
    self.beta = -1 if self.beta is None else self.beta
    self.l = -1 if self.l is None else self.l
    self.c = -1 if self.c is None else self.c

    if self.distance == self.SIM_ASYMCOSINE and not (0 <= self.alpha <= 1):
        log.error(
            'Invalid parameter alpha in asymmetric cosine Similarity_MFD!')
        return
    if self.distance == self.SIM_TVERSKY and not (0 <= self.alpha <= 1 and 0 <= self.beta <= 1):
        log.error(
            'Invalid parameter alpha/beta in tversky Similarity_MFD!')
        return
    if self.distance == self.SIM_SPLUS and not (
            0 <= self.l <= 1 and 0 <= self.c <= 1 and 0 <= self.alpha <= 1 and 0 <= self.beta <= 1):
        log.error(
            'Invalid parameter alpha/beta/l/c in s_plus Similarity_MFD')
        return

    # compute and store the Similarity_MFD matrix using one of the distance metrics: S = R•R'
    if self.distance == self.SIM_COSINE:
        self._sim_matrix = sim.cosine(self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold,
                                      binary=self.implicit)
    elif self.distance == self.SIM_ASYMCOSINE:
        self._sim_matrix = sim.asymmetric_cosine(self.matrix, k=self.k, shrink=self.shrink,
                                                 threshold=self.threshold, binary=self.implicit,
                                                 alpha=self.alpha)
    elif self.distance == self.SIM_JACCARD:
        self._sim_matrix = sim.jaccard(self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold,
                                       binary=self.implicit)
    elif self.distance == self.SIM_DICE:
        self._sim_matrix = sim.dice(self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold,
                                    binary=self.implicit)
    elif self.distance == self.SIM_TVERSKY:
        self._sim_matrix = sim.tversky(self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold,
                                       binary=self.implicit, alpha=self.alpha, beta=self.beta)
    elif self.distance == self.SIM_P3ALPHA:
        self._sim_matrix = sim.p3alpha(self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold,
                                       binary=self.implicit, alpha=self.alpha)
    elif self.distance == self.SIM_RP3BETA:
        self._sim_matrix = sim.rp3beta(self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold,
                                       binary=self.implicit, alpha=self.alpha, beta=self.beta)
    elif self.distance == self.SIM_SPLUS:
        # alpha and beta are passed to s_plus as t1 and t2.
        self._sim_matrix = prep.normalize(
            sim.s_plus(self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold,
                       binary=self.implicit, l=self.l, t1=self.alpha, t2=self.beta, c=self.c),
            norm='l2', axis=0)
    else:
        log.error('Invalid distance metric: {}'.format(self.distance))
        # Nothing was computed: do not hand back a matrix left over from an
        # earlier fit (or fail on a missing attribute).
        return

    return self._sim_matrix
string = re.sub(r'[,-./]',r'', string) ngrams = zip(*[string[i:] for i in range(n)]) return [''.join(ngram) for ngram in ngrams] #setup parser parser = argparse.ArgumentParser() parser.add_argument("-s","--split", help="The dataset split to use", choices=['original','validation'], type=str, required=True) args = parser.parse_args() # first load the data df_train = pd.read_csv(f"../dataset/{args.split}/train.csv", escapechar="\\") df_test = pd.read_csv(f"../dataset/{args.split}/test.csv", escapechar="\\") # ALWAYS sort the data by record_id df_train = df_train.sort_values(by=['record_id']).reset_index(drop=True) df_test = df_test.sort_values(by=['record_id']).reset_index(drop=True) df_train.name = df_train.name.astype(str) df_test.name = df_test.name.astype(str) # mi serve una colonna con tutti i nomi su cui fare tfidf all_names = list(df_train.name) + list(df_test.name) # daje con tfidf vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams) tf_idf_matrix = vectorizer.fit_transform(all_names) # split tf_idf_train = tf_idf_matrix[:df_train.shape[0],:] # 691440 è la lunghezza del train tf_idf_test = tf_idf_matrix[df_train.shape[0]:,:] cos_tfidf = sim.jaccard(tf_idf_test, tf_idf_train.T, k=300) save_npz(f'jaccard_tfidf_name_{args.split}_noclean.npz', cos_tfidf.tocsr())