def resample(self):
    from sklearn.neighbors import NearestNeighbors

    # Start with the minority class
    minx = self.x[self.y == self.minc]
    miny = self.y[self.y == self.minc]

    # Find the NNs for all samples in the data set.
    print("Finding the %i nearest neighbours..." % self.m, end="")
    NN = NearestNeighbors(n_neighbors=self.m + 1)
    NN.fit(self.x)
    print("done!")

    # Boolean array with True for minority samples in danger
    index = asarray([in_danger(x, self.y, self.m, miny[0], NN)
                     for x in minx])

    # If all minority samples are safe, return the original data set.
    if not any(index):
        print('There are no samples in danger. No borderline synthetic '
              'samples created.')
        return self.x, self.y

    # Find the NNs among the minority class
    NN.set_params(**{'n_neighbors': self.k + 1})
    NN.fit(minx)
    nns = NN.kneighbors(minx[index], return_distance=False)[:, 1:]

    # Create synthetic samples for borderline points.
    sx, sy = make_samples(minx[index], minx, miny[0], nns,
                          int(self.ratio * len(miny)),
                          random_state=self.rs)

    # Concatenate the newly generated samples to the original data set
    ret_x = concatenate((self.x, sx), axis=0)
    ret_y = concatenate((self.y, sy), axis=0)

    return ret_x, ret_y
def _compute_mi_cc(x, y, n_neighbors):
    """Compute mutual information between two continuous variables.

    I lifted this from scikit-learn.

    Parameters
    ----------
    x, y : ndarray, shape (n_samples,)
        Samples of two continuous random variables; they must have an
        identical shape.

    n_neighbors : int
        Number of nearest neighbors to search for each point, see [1]_.

    Returns
    -------
    mi : float
        Estimated mutual information. If it turns out to be negative, it
        is replaced by 0.

    Notes
    -----
    True mutual information can't be negative. If its estimate by a
    numerical method is negative, it means (provided the method is
    adequate) that the mutual information is close to 0, and replacing it
    by 0 is a reasonable strategy.
    """
    n_samples = x.size

    x = x.reshape((-1, 1))
    y = y.reshape((-1, 1))
    xy = np.hstack((x, y))

    # Here we rely on NearestNeighbors to select the fastest algorithm.
    nn = NearestNeighbors(metric='chebyshev', n_neighbors=n_neighbors)

    nn.fit(xy)
    radius = nn.kneighbors()[0]
    radius = np.nextafter(radius[:, -1], 0)

    # Algorithm is selected explicitly to allow passing an array as radius
    # later (not all algorithms support this).
    nn.set_params(algorithm='kd_tree')

    nn.fit(x)
    ind = nn.radius_neighbors(radius=radius, return_distance=False)
    nx = np.array([i.size for i in ind])

    nn.fit(y)
    ind = nn.radius_neighbors(radius=radius, return_distance=False)
    ny = np.array([i.size for i in ind])

    mi = (digamma(n_samples) + digamma(n_neighbors) -
          np.mean(digamma(nx + 1)) - np.mean(digamma(ny + 1)))

    return max(0, mi)
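# A minimal usage sketch for _compute_mi_cc above (assumes the function
# lives in a module where np, digamma and NearestNeighbors are imported,
# as the body requires; the toy data is an illustrative assumption). The
# estimate should be clearly positive for dependent variables and close
# to zero for independent ones.
import numpy as np
from scipy.special import digamma
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
x = rng.standard_normal(1000)
y_dep = 0.8 * x + 0.2 * rng.standard_normal(1000)  # strongly dependent on x
y_ind = rng.standard_normal(1000)                  # independent of x

print(_compute_mi_cc(x, y_dep, n_neighbors=3))  # noticeably > 0
print(_compute_mi_cc(x, y_ind, n_neighbors=3))  # approximately 0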
def model_param(n_neighbors, algorithm, metric, n_jobs=None):
    model = NearestNeighbors()
    if n_jobs and (n_jobs > 1 or n_jobs == -1):
        os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
    model.set_params(
        **{
            'n_neighbors': n_neighbors,
            'algorithm': algorithm,
            'metric': metric,
            'n_jobs': n_jobs
        })
    return model
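# A minimal usage sketch for model_param (the feature matrix and parameter
# values here are illustrative assumptions, not from the source).
import os
import numpy as np
from sklearn.neighbors import NearestNeighbors

items = np.random.rand(100, 20)  # 100 items described by 20 features

model = model_param(n_neighbors=5, algorithm='brute',
                    metric='cosine', n_jobs=-1)
model.fit(items)
distances, indices = model.kneighbors(items[:1])  # 5 neighbours of item 0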
def resample(self):
    from sklearn.neighbors import NearestNeighbors

    # Start with the minority class
    minx = self.x[self.y == self.minc]
    miny = self.y[self.y == self.minc]

    # Find the NNs for all samples in the data set.
    print("Finding the %i nearest neighbours..." % self.m, end="")
    NN = NearestNeighbors(n_neighbors=self.m + 1)
    NN.fit(self.x)
    print("done!")

    # Boolean array with True for minority samples in danger
    index = asarray(
        [in_danger(x, self.y, self.m, self.minc, NN) for x in minx])

    # If all minority samples are safe, return the original data set.
    if not any(index):
        print('There are no samples in danger. No borderline synthetic '
              'samples created.')
        return self.x, self.y

    # Find the NNs among the minority class
    NN.set_params(**{'n_neighbors': self.k + 1})
    NN.fit(minx)
    nns = NN.kneighbors(minx[index], return_distance=False)[:, 1:]

    # Split the number of synthetic samples between only minority
    # (type 1), or minority and majority (with reduced step size)
    # (type 2).
    seed(self.rs)
    fractions = min(max(gauss(0.5, 0.1), 0), 1)

    # Only minority
    sx1, sy1 = make_samples(minx[index], minx, self.minc, nns,
                            fractions * (int(self.ratio * len(miny)) + 1),
                            step_size=1,
                            random_state=self.rs)

    # Only majority with smaller step size
    sx2, sy2 = make_samples(minx[index], self.x[self.y != self.minc],
                            self.minc, nns,
                            (1 - fractions) * int(self.ratio * len(miny)),
                            step_size=0.5,
                            random_state=self.rs)

    # Concatenate the newly generated samples to the original data set
    ret_x = concatenate((self.x, sx1, sx2), axis=0)
    ret_y = concatenate((self.y, sy1, sy2), axis=0)

    return ret_x, ret_y
def load():
    # API calls
    request = requests.get(
        'http://api.beerless.be/api/tastingprofiles/averages?access_token=smCLVeBjK79ywuPJFRI599qiu1JFFgKVJrVCq9mtzV0Nus5j5IYB9B8B9uthSTc6'
    )
    x = request.json()
    test = pd.DataFrame(x)
    df_tastingprofiles = test[[
        'beerId', 'malty', 'sweet', 'sour', 'hoppy', 'bitter', 'fruity'
    ]]

    # Pivot and create tastingprofile matrix
    df_tastingprofile_features = df_tastingprofiles.set_index('beerId')

    # Configure Google Cloud Storage
    client = storage.Client()
    bucket = client.get_bucket("beerless-scripts-1.appspot.com")
    beerIDPickle = bucket.blob("beerID.pickle")

    # Upload pickle dump
    beerIDPickle.upload_from_string(
        pickle.dumps(df_tastingprofiles, protocol=pickle.HIGHEST_PROTOCOL))

    # Create matrix
    mat_tastingprofile_features = csr_matrix(
        df_tastingprofile_features.values)

    # Save data
    dataPickle = bucket.blob("data.pickle")
    dataPickle.upload_from_string(
        pickle.dumps(mat_tastingprofile_features,
                     protocol=pickle.HIGHEST_PROTOCOL))

    # Create the model
    model = NearestNeighbors()

    # Add parameters to the model
    model.set_params(n_neighbors=20,
                     algorithm='brute',
                     metric='cosine',
                     n_jobs=-1)

    # Fit
    model.fit(mat_tastingprofile_features)

    # Save the model to file
    modelPickle = bucket.blob("model.pickle")
    modelPickle.upload_from_string(
        pickle.dumps(model, protocol=pickle.HIGHEST_PROTOCOL))

    # Clean up
    del df_tastingprofiles, df_tastingprofile_features
def runprogram():
    model = NearestNeighbors()
    model.set_params(**{
        'n_neighbors': 20,
        'algorithm': 'brute',
        'metric': 'cosine',
        'n_jobs': -1
    })
    p = str(input())
    while p != '-1':
        P_U_SP, H_MAP = preprocess()
        ans = recSystem(p, model, P_U_SP, H_MAP)
        for s in ans:
            print(s)
            sleep(0.6)
        print('\n')
        p = str(input())
def _instantiate_nearest_neighbors_object(self):
    backend = self.knn_backend
    if backend == "sklearn":
        backend_instance = NearestNeighbors(algorithm="auto")
    elif callable(backend):
        backend_instance = backend()
        self.metric = backend_instance.metric
    elif hasattr(backend, "fit") and hasattr(backend, "kneighbors"):
        backend_instance = sk_clone(backend)
        self.metric = backend_instance.metric
    else:
        raise NotImplementedError(
            "`knn_backend` must be either a NearestNeighbors-like object,"
            " a callable returning such an object, or the string \"sklearn\"")
    backend_instance.set_params(**self._get_metric_dict())
    return backend_instance
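# Illustration of the three accepted `knn_backend` forms. `Matcher` is a
# hypothetical owner of _instantiate_nearest_neighbors_object above; only
# the argument shapes matter here.
from sklearn.neighbors import NearestNeighbors

matcher = Matcher(knn_backend="sklearn")                      # string form
matcher = Matcher(knn_backend=lambda: NearestNeighbors(algorithm="ball_tree"))  # callable form
matcher = Matcher(knn_backend=NearestNeighbors(metric="euclidean"))  # estimator form, will be cloned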
def networkBuildKnn(X_net, Y_net, knn, labels=False):
    g = nx.Graph()
    lnNet = len(X_net)
    g.graph["lnNet"] = lnNet
    g.graph["classNames"] = list(set(Y_net))
    for index, instance in enumerate(X_net):
        g.add_node(str(index), value=instance, typeNode='net',
                   label=Y_net[index])

    values = X_net
    # NearestNeighbors expects 2-D input; wrap scalar features in lists.
    if isinstance(values[0], (int, float, str)):
        values = [[e] for e in values]

    nbrs = NearestNeighbors(n_neighbors=knn + 1, metric='euclidean')
    nbrs.fit(values)
    distances, indices = nbrs.kneighbors(values)
    # Drop the first column: each point is its own nearest neighbour.
    indices = indices[:, 1:]
    distances = distances[:, 1:]

    eRadius = np.quantile(distances, 0.5)
    nbrs.set_params(radius=eRadius)

    # kNN edges, optionally restricted to same-label pairs.
    for indiceNode, indicesNode in enumerate(indices):
        for tmpi, indice in enumerate(indicesNode):
            if (g.nodes()[str(indice)]["label"]
                    == g.nodes()[str(indiceNode)]["label"] or not labels):
                g.add_edge(str(indice), str(indiceNode),
                           weight=distances[indiceNode][tmpi])

    # Epsilon-radius edges for every node.
    distances, indices = nbrs.radius_neighbors(values)
    for indiceNode, indicesNode in enumerate(indices):
        for tmpi, indice in enumerate(indicesNode):
            if str(indice) != str(indiceNode):
                if (g.nodes()[str(indice)]["label"]
                        == g.nodes()[str(indiceNode)]["label"]
                        or not labels):
                    g.add_edge(str(indice), str(indiceNode),
                               weight=distances[indiceNode][tmpi])

    g.graph["index"] = lnNet
    return g, nbrs
def mi_3(x, y, z, n_neighbors):
    """Compute mutual information between three continuous variables.

    I lifted this from scikit-learn.
    """
    n_samples = x.size

    x = x.reshape((-1, 1))
    y = y.reshape((-1, 1))
    z = z.reshape((-1, 1))
    xyz = np.hstack((x, y, z))

    # Here we rely on NearestNeighbors to select the fastest algorithm.
    nn = NearestNeighbors(metric='chebyshev', n_neighbors=n_neighbors)

    nn.fit(xyz)
    radius = nn.kneighbors()[0]
    radius = np.nextafter(radius[:, -1], 0)

    # Algorithm is selected explicitly to allow passing an array as radius
    # later (not all algorithms support this).
    nn.set_params(algorithm='kd_tree')

    nn.fit(x)
    ind = nn.radius_neighbors(radius=radius, return_distance=False)
    nx = np.array([i.size for i in ind])

    nn.fit(y)
    ind = nn.radius_neighbors(radius=radius, return_distance=False)
    ny = np.array([i.size for i in ind])

    nn.fit(z)
    ind = nn.radius_neighbors(radius=radius, return_distance=False)
    nz = np.array([i.size for i in ind])

    mi = (digamma(n_samples) + digamma(n_neighbors) -
          np.mean(digamma(nx + 1)) - np.mean(digamma(ny + 1)) -
          np.mean(digamma(nz + 1)))

    return max(0, mi)
def _compute_mi_cd(c, d, n_neighbors):
    n_samples = c.shape[0]
    c = c.reshape((-1, 1))

    radius = np.empty(n_samples)
    label_counts = np.empty(n_samples)
    k_all = np.empty(n_samples)
    nn = NearestNeighbors()
    for label in np.unique(d):
        mask = d == label
        count = np.sum(mask)
        if count > 1:
            k = min(n_neighbors, count - 1)
            nn.set_params(n_neighbors=k)
            nn.fit(c[mask])
            r = nn.kneighbors()[0]
            radius[mask] = np.nextafter(r[:, -1], 0)
            k_all[mask] = k
        label_counts[mask] = count

    # Ignore points with unique labels.
    mask = label_counts > 1
    n_samples = np.sum(mask)
    label_counts = label_counts[mask]
    k_all = k_all[mask]
    c = c[mask]
    radius = radius[mask]

    nn.set_params(algorithm='kd_tree')
    nn.fit(c)
    ind = nn.radius_neighbors(radius=radius, return_distance=False)
    m_all = np.array([i.size for i in ind])

    mi = (digamma(n_samples) + np.mean(digamma(k_all)) -
          np.mean(digamma(label_counts)) -
          np.mean(digamma(m_all + 1)))

    return max(0, mi)
def resample(self):
    from sklearn.svm import SVC
    from sklearn.neighbors import NearestNeighbors

    svc = SVC()
    svc.set_params(**self.svm_args)

    # Fit SVM and find the support vectors
    svc.fit(self.x, self.y)
    support_index = svc.support_[self.y[svc.support_] == self.minc]
    support_vector = self.x[support_index]

    # Start with the minority class
    minx = self.x[self.y == self.minc]

    # First, find the NN of all the samples to identify samples in danger
    # and noisy ones
    print("Finding the %i nearest neighbours..." % self.m, end="")
    NN = NearestNeighbors(n_neighbors=self.m + 1)
    NN.fit(self.x)
    print("done!")

    # Now, get rid of noisy support vectors
    # Boolean array with True for noisy support vectors
    noise_bool = asarray(
        [is_noise(x, self.y, self.m, self.minc, NN)
         for x in support_vector])

    # Remove noisy support vectors
    support_vector = support_vector[logical_not(noise_bool)]

    # Find support vectors that are in danger (interpolation) or not
    # (extrapolation)
    danger_bool = asarray(
        [in_danger(x, self.y, self.m, self.minc, NN)
         for x in support_vector])
    safety_bool = logical_not(danger_bool)

    print_stats = (len(support_vector), nsum(noise_bool),
                   nsum(danger_bool), nsum(safety_bool))
    print("Out of %i support vectors, %i are noisy, "
          "%i are in danger and %i are safe." % print_stats)

    # Proceed to find support vectors NNs among the minority class
    print("Finding the %i nearest neighbours..." % self.k, end="")
    NN.set_params(**{'n_neighbors': self.k + 1})
    NN.fit(minx)
    print("done!")

    print("Creating synthetic samples...", end="")

    # Split the number of synthetic samples between interpolation and
    # extrapolation
    seed(self.rs)
    fractions = min(max(gauss(0.5, 0.1), 0), 1)

    # Interpolate samples in danger
    nns = NN.kneighbors(support_vector[danger_bool],
                        return_distance=False)[:, 1:]
    sx1, sy1 = make_samples(support_vector[danger_bool], minx, self.minc,
                            nns,
                            fractions * (int(self.ratio * len(minx)) + 1),
                            step_size=1,
                            random_state=self.rs)

    # Extrapolate safe samples
    nns = NN.kneighbors(support_vector[safety_bool],
                        return_distance=False)[:, 1:]
    sx2, sy2 = make_samples(support_vector[safety_bool], minx, self.minc,
                            nns,
                            (1 - fractions) * int(self.ratio * len(minx)),
                            step_size=-self.out_step,
                            random_state=self.rs)
    print("done!")

    # Concatenate the newly generated samples to the original data set
    ret_x = concatenate((self.x, sx1, sx2), axis=0)
    ret_y = concatenate((self.y, sy1, sy2), axis=0)

    return ret_x, ret_y
class KnnRecommender:
    """
    This is an item-based collaborative filtering recommender with
    KNN implemented by sklearn
    """
    def __init__(self, path_movies, path_ratings):
        """
        Recommender requires path to data: movies data and ratings data

        Parameters
        ----------
        path_movies: str, movies data file path

        path_ratings: str, ratings data file path
        """
        self.path_movies = path_movies
        self.path_ratings = path_ratings
        self.movie_rating_thres = 0
        self.user_rating_thres = 0
        self.model = NearestNeighbors()

    def set_filter_params(self, movie_rating_thres, user_rating_thres):
        """
        set rating frequency thresholds to filter less-known movies and
        less active users

        Parameters
        ----------
        movie_rating_thres: int, minimum number of ratings received by a movie

        user_rating_thres: int, minimum number of ratings a user gives
        """
        self.movie_rating_thres = movie_rating_thres
        self.user_rating_thres = user_rating_thres

    def set_model_params(self, n_neighbors, algorithm, metric, n_jobs=None):
        """
        set model params for sklearn.neighbors.NearestNeighbors

        Parameters
        ----------
        n_neighbors: int, optional (default = 5)

        algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional

        metric: string or callable, default 'minkowski', or one of
            ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']

        n_jobs: int or None, optional (default=None)
        """
        if n_jobs and (n_jobs > 1 or n_jobs == -1):
            os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
        self.model.set_params(
            **{
                'n_neighbors': n_neighbors,
                'algorithm': algorithm,
                'metric': metric,
                'n_jobs': n_jobs
            })

    def _prep_data(self):
        """
        prepare data for recommender

        1. movie-user scipy sparse matrix
        2. hashmap of movie to row index in movie-user scipy sparse matrix
        """
        # read data
        df_movies = pd.read_csv(os.path.join(self.path_movies),
                                usecols=['movieId', 'title'],
                                dtype={'movieId': 'int32', 'title': 'str'})
        df_ratings = pd.read_csv(os.path.join(self.path_ratings),
                                 usecols=['userId', 'movieId', 'rating'],
                                 dtype={'userId': 'int32',
                                        'movieId': 'int32',
                                        'rating': 'float32'})

        # filter data
        df_movies_cnt = pd.DataFrame(df_ratings.groupby('movieId').size(),
                                     columns=['count'])
        popular_movies = list(
            set(df_movies_cnt.query(
                'count >= @self.movie_rating_thres').index))  # noqa
        movies_filter = df_ratings.movieId.isin(popular_movies).values

        df_users_cnt = pd.DataFrame(df_ratings.groupby('userId').size(),
                                    columns=['count'])
        active_users = list(
            set(df_users_cnt.query(
                'count >= @self.user_rating_thres').index))  # noqa
        users_filter = df_ratings.userId.isin(active_users).values

        df_ratings_filtered = df_ratings[movies_filter & users_filter]

        # pivot and create movie-user matrix
        movie_user_mat = df_ratings_filtered.pivot(
            index='movieId', columns='userId', values='rating').fillna(0)
        # create mapper from movie title to index
        hashmap = {
            movie: i for i, movie in enumerate(
                list(df_movies.set_index('movieId').loc[
                    movie_user_mat.index].title))  # noqa
        }
        # transform matrix to scipy sparse matrix
        movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

        # clean up
        del df_movies, df_movies_cnt, df_users_cnt
        del df_ratings, df_ratings_filtered, movie_user_mat
        gc.collect()
        return movie_user_mat_sparse, hashmap

    # Check whether the requested movie exists in the search index
    def _fuzzy_matching(self, hashmap, fav_movie):
        """
        return the closest match via fuzzy ratio.
        If no match found, return None

        Parameters
        ----------
        hashmap: dict, map movie title name to index of the movie in data

        fav_movie: str, name of user input movie

        Return
        ------
        index of the closest match
        """
        match_tuple = []
        # get match
        for title, idx in hashmap.items():
            ratio = fuzz.ratio(title.lower(), fav_movie.lower())
            if ratio >= 60:
                match_tuple.append((title, idx, ratio))
        # sort
        match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
        if not match_tuple:
            print('Oops! No match is found')
            return None
        print('Found possible matches in our database: '
              '{0}\n'.format([x[0] for x in match_tuple]))
        return match_tuple[0][1]

    def _inference(self, model, data, hashmap, fav_movie, n_recommendations):
        """
        return top n similar movie recommendations based on user's input movie

        Parameters
        ----------
        model: sklearn model, knn model

        data: movie-user matrix

        hashmap: dict, map movie title name to index of the movie in data

        fav_movie: str, name of user input movie

        n_recommendations: int, top n recommendations

        Return
        ------
        list of top n similar movie recommendations
        """
        # fit
        model.fit(data)
        # get input movie index
        print('You have input movie:', fav_movie)
        idx = self._fuzzy_matching(hashmap, fav_movie)
        # inference
        print('Recommendation system start to make inference')
        print('......\n')
        t0 = time.time()
        distances, indices = model.kneighbors(
            data[idx], n_neighbors=n_recommendations + 1)
        # get list of raw idx of recommendations
        raw_recommends = sorted(
            list(zip(indices.squeeze().tolist(),
                     distances.squeeze().tolist())),
            key=lambda x: x[1])[:0:-1]
        print('It took my system {:.2f}s to make inference \n'
              ''.format(time.time() - t0))
        # return recommendation (movieId, distance)
        return raw_recommends

    def make_recommendations(self, fav_movie, n_recommendations):
        """
        make top n movie recommendations

        Parameters
        ----------
        fav_movie: str, name of user input movie

        n_recommendations: int, top n recommendations
        """
        # get data
        movie_user_mat_sparse, hashmap = self._prep_data()
        # get recommendations
        raw_recommends = self._inference(self.model, movie_user_mat_sparse,
                                         hashmap, fav_movie,
                                         n_recommendations)
        # print results
        reverse_hashmap = {v: k for k, v in hashmap.items()}
        print('Recommendations for {}:'.format(fav_movie))
        for i, (idx, dist) in enumerate(raw_recommends):
            print('{0}: {1}, with distance '
                  'of {2}'.format(i + 1, reverse_hashmap[idx], dist))
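# A minimal usage sketch for KnnRecommender (the MovieLens-style CSV paths,
# thresholds and query title are illustrative assumptions, not from the
# source).
recommender = KnnRecommender('data/movies.csv', 'data/ratings.csv')
recommender.set_filter_params(movie_rating_thres=50, user_rating_thres=50)
recommender.set_model_params(n_neighbors=20, algorithm='brute',
                             metric='cosine', n_jobs=-1)
recommender.make_recommendations('Iron Man', n_recommendations=10)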
def _compute_mi_cd(c, d, n_neighbors):
    """Compute mutual information between continuous and discrete variables.

    Parameters
    ----------
    c : ndarray, shape (n_samples,)
        Samples of a continuous random variable.

    d : ndarray, shape (n_samples,)
        Samples of a discrete random variable.

    n_neighbors : int
        Number of nearest neighbors to search for each point, see [1]_.

    Returns
    -------
    mi : float
        Estimated mutual information. If it turns out to be negative, it
        is replaced by 0.

    Notes
    -----
    True mutual information can't be negative. If its estimate by a
    numerical method is negative, it means (provided the method is
    adequate) that the mutual information is close to 0, and replacing it
    by 0 is a reasonable strategy.

    References
    ----------
    .. [1] B. C. Ross "Mutual Information between Discrete and Continuous
       Data Sets". PLoS ONE 9(2), 2014.
    """
    n_samples = c.shape[0]
    if len(c.shape) == 1:
        c = c.reshape([-1, 1])

    radius = np.empty(n_samples)
    label_counts = np.empty(n_samples)
    k_all = np.empty(n_samples)
    nn = NearestNeighbors()
    for label in np.unique(d, axis=0):
        mask = np.all(d == label, axis=-1)
        count = np.sum(mask)
        if count > 1:
            k = min(n_neighbors, count - 1)
            nn.set_params(n_neighbors=k)
            nn.fit(c[mask])
            r = nn.kneighbors()[0]
            radius[mask] = np.nextafter(r[:, -1], 0)
            k_all[mask] = k
        label_counts[mask] = count

    # Ignore points with unique labels.
    mask = label_counts > 1
    n_samples = np.sum(mask)
    label_counts = label_counts[mask]
    k_all = k_all[mask]
    c = c[mask]
    radius = radius[mask]

    nn.set_params(algorithm='kd_tree')
    nn.fit(c)
    ind = nn.radius_neighbors(radius=radius, return_distance=False)
    m_all = np.array([i.size for i in ind])

    mi = (digamma(n_samples) + np.mean(digamma(k_all)) -
          np.mean(digamma(label_counts)) -
          np.mean(digamma(m_all + 1)))

    return max(0, mi)
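# A minimal usage sketch for this _compute_mi_cd variant (assumes np,
# digamma and NearestNeighbors are imported as above; toy data is an
# illustrative assumption). Note that its np.unique(d, axis=0) handling
# expects the discrete variable as a 2-D array of shape
# (n_samples, n_discrete_features).
import numpy as np

rng = np.random.RandomState(0)
d = rng.randint(0, 3, size=(1000, 1))            # discrete labels, 2-D
c = d.ravel() + 0.5 * rng.standard_normal(1000)  # continuous, depends on d

print(_compute_mi_cd(c, d, n_neighbors=3))  # noticeably > 0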
def make_inference(data, hashmap, target_title, n_recommendations,
                   n_neighbors, algorithm, metric, n_jobs):
    """
    return top n similar item recommendations based on user's input item

    Parameters
    ----------
    data: item-user matrix

    hashmap: dict, map item title name to index of the item in data

    target_title: str, name of user input item

    n_recommendations: int, top n recommendations

    n_neighbors: int, optional (default = 5)

    algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional

    metric: string or callable, default 'minkowski', or one of
        ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']

    n_jobs: int or None, optional (default=None)

    Return
    ------
    list of top n similar item recommendations
    """
    # set model params for sklearn.neighbors.NearestNeighbors
    model = NearestNeighbors()
    model.set_params(
        **{
            'n_neighbors': n_neighbors,
            'algorithm': algorithm,
            'metric': metric,
            'n_jobs': n_jobs
        })
    model.fit(data)

    idx = fuzzy_matching(hashmap, target_title)

    # Inference
    # print('Recommendation system start to make inference')
    # print('......\n')
    t0 = time.time()
    distances, indices = model.kneighbors(
        data[idx], n_neighbors=n_recommendations + 1)
    raw_recommends = sorted(
        list(zip(indices.squeeze().tolist(),
                 distances.squeeze().tolist())),
        key=lambda x: x[1])[:0:-1]
    # print('It took my system {:.2f}s to make inference \n'
    #       .format(time.time() - t0))

    # return recommendation (movieId, distance)
    return raw_recommends
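# A usage sketch for make_inference. The item-user CSR matrix, the
# title-to-row hashmap and fuzzy_matching are assumed to be built
# elsewhere; the names below are placeholders, not from the source.
recs = make_inference(item_user_mat_sparse, title_to_row, 'Some Item',
                      n_recommendations=5, n_neighbors=20,
                      algorithm='brute', metric='cosine', n_jobs=-1)
for row_idx, distance in recs:
    print(row_idx, distance)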
class KnnRecommender:
    """
    This is an item-based collaborative filtering recommender with
    KNN implemented by sklearn
    """
    def __init__(self):
        """
        Recommender loads the movies and movie_ratings data via
        MovieRatingData
        """
        self.model = NearestNeighbors()
        self.model.set_params(
            **{
                'n_neighbors': 20,
                'algorithm': 'brute',
                'metric': 'cosine',
                'n_jobs': -1
            })
        self.data = MovieRatingData()
        data = self.data.movie_user_mat_sparse
        self.model.fit(data)

    def _fuzzy_matching(self, movie):
        """
        return the closest match via fuzzy ratio.
        If no match found, return None

        Parameters
        ----------
        movie: str, name of user input movie

        Return
        ------
        index of the closest match
        """
        match_tuples = []
        # get match
        for title, idx in self.data.movies_to_csr_indices.items():
            ratio = fuzz.ratio(title.lower(), movie.lower())
            if ratio >= 60:
                match_tuples.append((title, idx, ratio))
        # sort
        return None if not match_tuples else sorted(
            match_tuples, key=itemgetter(2), reverse=True)[0][1]

    def _inference(self, movie):
        """
        return top n similar movie recommendations based on user's input movie

        Parameters
        ----------
        movie: str, name of user input movie

        Return
        ------
        list of top n similar movie recommendations
        """
        data = self.data.movie_user_mat_sparse
        movie_idx = self._fuzzy_matching(movie)
        distances, indices = self.model.kneighbors(data[movie_idx],
                                                   n_neighbors=6)
        distances, indices = (distances.squeeze().tolist(),
                              indices.squeeze().tolist())
        raw_recommends = sorted(list(zip(indices, distances)),
                                key=itemgetter(1))[:0:-1]
        # return recommendation (movieId, distance)
        return raw_recommends

    def make_recommendations(self, movie):
        """
        make top n movie recommendations

        Parameters
        ----------
        movie: str, name of user input movie
        """
        # get recommendations
        raw_recommends = self._inference(movie)
        indices_to_movies = {
            v: k for k, v in self.data.movies_to_csr_indices.items()
        }
        movie_names = [indices_to_movies[i[0]] for i in raw_recommends]
        return movie_names
class SMOTE(OverSampler):
    """Class to perform over-sampling using SMOTE.

    This object is an implementation of SMOTE - Synthetic Minority
    Over-sampling Technique, and the variants Borderline SMOTE 1, 2 and
    SVM-SMOTE.

    Parameters
    ----------
    ratio : str or float, optional (default='auto')
        If 'auto', the ratio will be defined automatically to balance
        the dataset. Otherwise, the ratio is defined as the number of
        samples in the minority class over the number of samples in the
        majority class.

    random_state : int or None, optional (default=None)
        Seed for random number generation.

    verbose : bool, optional (default=True)
        Whether or not to print information about the processing.

    k : int, optional (default=5)
        Number of nearest neighbours to use to construct synthetic
        samples.

    m : int, optional (default=10)
        Number of nearest neighbours to use to determine if a minority
        sample is in danger.

    out_step : float, optional (default=0.5)
        Step size when extrapolating.

    kind : str, optional (default='regular')
        The type of SMOTE algorithm to use, one of the following
        options: 'regular', 'borderline1', 'borderline2', 'svm'.

    Attributes
    ----------
    ratio : str or float
        If 'auto', the ratio will be defined automatically to balance
        the dataset. Otherwise, the ratio is defined as the number of
        samples in the minority class over the number of samples in the
        majority class.

    random_state : int or None
        Seed for random number generation.

    min_c_ : str or int
        The identifier of the minority class.

    max_c_ : str or int
        The identifier of the majority class.

    stats_c_ : dict of str/int : int
        A dictionary in which the number of occurrences of each class is
        reported.

    X_shape_ : tuple of int
        Shape of the data `X` during fitting.

    Notes
    -----
    See the original papers [1]_, [2]_, [3]_ for more details.

    It does not support multiple classes automatically, but can be
    called multiple times.

    References
    ----------
    .. [1] N. V. Chawla, K. W. Bowyer, L. O. Hall, W. P. Kegelmeyer,
       "SMOTE: synthetic minority over-sampling technique," Journal of
       Artificial Intelligence Research, 321-357, 2002.

    .. [2] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new
       over-sampling method in imbalanced data sets learning," Advances
       in Intelligent Computing, 878-887, 2005.

    .. [3] H. M. Nguyen, E. W. Cooper, K. Kamei, "Borderline
       over-sampling for imbalanced data classification," International
       Journal of Knowledge Engineering and Soft Data Paradigms, 3(1),
       pp. 4-21, 2001.
    """

    def __init__(self, ratio='auto', random_state=None, verbose=True,
                 k=5, m=10, out_step=0.5, kind='regular', n_jobs=-1,
                 **kwargs):
        """Initialisation of SMOTE object.

        Parameters
        ----------
        ratio : str or float, optional (default='auto')
            If 'auto', the ratio will be defined automatically to
            balance the dataset. Otherwise, the ratio is defined as the
            number of samples in the minority class over the number of
            samples in the majority class.

        random_state : int or None, optional (default=None)
            Seed for random number generation.

        verbose : bool, optional (default=True)
            Whether or not to print information about the processing.

        k : int, optional (default=5)
            Number of nearest neighbours to use to construct synthetic
            samples.

        m : int, optional (default=10)
            Number of nearest neighbours to use to determine if a
            minority sample is in danger.

        out_step : float, optional (default=0.5)
            Step size when extrapolating.

        kind : str, optional (default='regular')
            The type of SMOTE algorithm to use, one of the following
            options: 'regular', 'borderline1', 'borderline2', 'svm'.

        n_jobs : int, optional (default=-1)
            Number of threads to run the algorithm when it is possible.
        """
        super(SMOTE, self).__init__(ratio=ratio,
                                    random_state=random_state,
                                    verbose=verbose)

        # --- The type of smote
        # This object can perform regular smote over-sampling, borderline 1,
        # borderline 2 and svm smote. Since the algorithms are fairly simple
        # they share most methods.
        possible_kind = ('regular', 'borderline1', 'borderline2', 'svm')
        if kind in possible_kind:
            self.kind = kind
        else:
            raise ValueError('Unknown kind for SMOTE algorithm.')

        self.k = k
        self.m = m
        self.out_step = out_step
        self.verbose = verbose
        self.kwargs = kwargs
        self.n_jobs = n_jobs

        # --- NN object
        # Import the NN object from the scikit-learn library. Since in the
        # smote variations we must first find samples that are in danger, we
        # initialize the NN object differently depending on the method chosen
        if kind == 'regular':
            # Regular smote does not look for samples in danger, instead it
            # creates synthetic samples directly from the k-th nearest
            # neighbours with no filtering
            self.nearest_neighbour = NearestNeighbors(n_neighbors=self.k + 1,
                                                      n_jobs=self.n_jobs)
        else:
            # Borderline1, 2 and SVM variations of smote must first look for
            # samples that could be considered noise and samples that live
            # near the boundary between the classes. Therefore, before
            # creating synthetic samples from the k-th nns, it first looks
            # for m nearest neighbors to decide whether or not a sample is
            # noise or near the boundary.
            self.nearest_neighbour = NearestNeighbors(n_neighbors=self.m + 1,
                                                      n_jobs=self.n_jobs)

        # --- SVM smote
        # Unlike the borderline variations, the SVM variation uses the support
        # vectors to decide which samples are in danger (near the boundary).
        # Additionally it also introduces extrapolation for samples that are
        # considered safe (far from the boundary) and interpolation for
        # samples in danger (near the boundary). The level of extrapolation
        # is controlled by the out_step.
        if kind == 'svm':
            # Store SVM object with any parameters
            self.svm = SVC(random_state=self.random_state, **self.kwargs)

    def fit(self, X, y):
        """Find the classes statistics before performing sampling.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        self : object,
            Return self.
        """
        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        # Call the parent function
        super(SMOTE, self).fit(X, y)

        return self

    def _in_danger_noise(self, samples, y, kind='danger'):
        """Estimate if a set of samples are in danger or noise.

        Parameters
        ----------
        samples : ndarray, shape (n_samples, n_features)
            The samples to check if either they are in danger or not.

        y : ndarray, shape (n_samples, )
            The true label in order to check the neighbour labels.

        kind : str, optional (default='danger')
            The type of classification to use. Can be either:

            - If 'danger', check if samples are in danger,
            - If 'noise', check if samples are noise.

        Returns
        -------
        output : ndarray, shape (n_samples, )
            A boolean array where True refers to samples in danger or
            noise.
        """
        # Find the NN for each sample
        # Exclude the sample itself
        x = self.nearest_neighbour.kneighbors(samples,
                                              return_distance=False)[:, 1:]

        # Count how many NN belong to the minority class
        # Find the class corresponding to the label in x
        nn_label = (y[x] != self.min_c_).astype(int)

        # Compute the number of majority samples in the NN
        n_maj = np.sum(nn_label, axis=1)

        if kind == 'danger':
            # Samples are in danger for m/2 <= m' < m
            return np.bitwise_and(n_maj >= float(self.m) / 2.,
                                  n_maj < self.m)
        elif kind == 'noise':
            # Samples are noise for m = m'
            return n_maj == self.m
        else:
            raise NotImplementedError

    def _make_samples(self, X, y_type, nn_data, nn_num, n_samples,
                      step_size=1.):
        """A support function that returns artificial samples constructed
        along the line connecting nearest neighbours.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Points from which the new points will be created.

        y_type : str or int
            The minority target value, just so the function can return the
            target values for the synthetic variables with correct length in
            a clear format.

        nn_data : ndarray, shape (n_samples_all, n_features)
            Data set carrying all the neighbours to be used

        nn_num : ndarray, shape (n_samples_all, k_nearest_neighbours)
            The nearest neighbours of each sample in nn_data.

        n_samples : int
            The number of samples to generate.

        step_size : float, optional (default=1.)
            The step size to create samples.

        Returns
        -------
        X_new : ndarray, shape (n_samples_new, n_features)
            Synthetically generated samples.

        y_new : ndarray, shape (n_samples_new, )
            Target values for synthetic samples.
        """
        # Check the consistency of X
        X = check_array(X)

        # A matrix to store the synthetic samples
        X_new = np.zeros((n_samples, X.shape[1]))

        # Set seeds
        np.random.seed(self.random_state)
        seeds = np.random.randint(low=0,
                                  high=100 * len(nn_num.flatten()),
                                  size=n_samples)

        # Randomly pick samples to construct neighbours from
        np.random.seed(self.random_state)
        samples = np.random.randint(low=0,
                                    high=len(nn_num.flatten()),
                                    size=n_samples)

        # Loop over the NN matrix and create new samples
        for i, n in enumerate(samples):
            # NN lines relate to original sample, columns to its
            # nearest neighbours
            row, col = divmod(n, nn_num.shape[1])

            # Take a step of random size (0,1) in the direction of the
            # n nearest neighbours
            if self.random_state is None:
                np.random.seed(seeds[i])
            else:
                np.random.seed(self.random_state)
            step = step_size * np.random.uniform()

            # Construct synthetic sample
            X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]])

        # The returned target vector is simply a repetition of the
        # minority label
        y_new = np.array([y_type] * len(X_new))

        if self.verbose:
            print("Generated {} new samples ...".format(len(X_new)))

        return X_new, y_new

    def sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`
        """
        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        # Call the parent function
        super(SMOTE, self).sample(X, y)

        # Define the number of samples to create.
        # We handle only the two-class problem for the moment.
        if self.ratio == 'auto':
            num_samples = (self.stats_c_[self.maj_c_] -
                           self.stats_c_[self.min_c_])
        else:
            num_samples = int((self.ratio * self.stats_c_[self.maj_c_]) -
                              self.stats_c_[self.min_c_])

        # Start by separating minority class features and target values.
        X_min = X[y == self.min_c_]

        # If regular SMOTE is to be performed
        if self.kind == 'regular':
            # Print if verbose is true
            if self.verbose:
                print('Finding the {} nearest neighbours...'.format(self.k))

            # Look for k-th nearest neighbours, excluding, of course, the
            # point itself.
            self.nearest_neighbour.fit(X_min)

            # Matrix with k-th nearest neighbours indexes for each minority
            # element.
            nns = self.nearest_neighbour.kneighbors(
                X_min, return_distance=False)[:, 1:]

            # Print status if verbose is true
            if self.verbose:
                print("done!")
                print("Creating synthetic samples...", end="")

            # --- Generating synthetic samples
            # Use static method make_samples to generate minority samples
            X_new, y_new = self._make_samples(X_min, self.min_c_, X_min,
                                              nns, num_samples, 1.0)

            if self.verbose:
                print("done!")

            # Concatenate the newly generated samples to the original data set
            X_resampled = np.concatenate((X, X_new), axis=0)
            y_resampled = np.concatenate((y, y_new), axis=0)

            return X_resampled, y_resampled

        if self.kind == 'borderline1' or self.kind == 'borderline2':
            if self.verbose:
                print("Finding the {} nearest neighbours...".format(self.m))

            # Find the NNs for all samples in the data set.
            self.nearest_neighbour.fit(X)

            if self.verbose:
                print("done!")

            # Boolean array with True for minority samples in danger
            danger_index = self._in_danger_noise(X_min, y, kind='danger')

            # If all minority samples are safe, return the original data set.
            if not any(danger_index):
                if self.verbose:
                    print('There are no samples in danger. No borderline '
                          'synthetic samples created.')
                # All are safe, nothing to be done here.
                return X, y

            # If we got here, it is because some samples are in danger; we
            # need to find the NNs among the minority class to create the
            # new synthetic samples.
            #
            # We start by changing the number of NNs to consider from m + 1
            # to k + 1
            self.nearest_neighbour.set_params(**{'n_neighbors': self.k + 1})
            self.nearest_neighbour.fit(X_min)

            # nns...
            nns = self.nearest_neighbour.kneighbors(
                X_min[danger_index], return_distance=False)[:, 1:]

            # B1 and B2 types diverge here!!!
            if self.kind == 'borderline1':
                # Create synthetic samples for borderline points.
                X_new, y_new = self._make_samples(X_min[danger_index],
                                                  self.min_c_, X_min, nns,
                                                  num_samples)

                # Concatenate the newly generated samples to the original
                # dataset
                X_resampled = np.concatenate((X, X_new), axis=0)
                y_resampled = np.concatenate((y, y_new), axis=0)

                # Reset the k-neighbours to m+1 neighbours
                self.nearest_neighbour.set_params(
                    **{'n_neighbors': self.m + 1})

                return X_resampled, y_resampled

            else:
                # Split the number of synthetic samples between only minority
                # (type 1), or minority and majority (with reduced step size)
                # (type 2).
                np.random.seed(self.random_state)

                # The fraction is sampled from a beta distribution centered
                # around 0.5 with variance ~0.01
                fractions = beta(10, 10)

                # Only minority
                X_new_1, y_new_1 = self._make_samples(
                    X_min[danger_index], self.min_c_, X_min, nns,
                    int(fractions * (num_samples + 1)), step_size=1.)

                # Only majority with smaller step size
                X_new_2, y_new_2 = self._make_samples(
                    X_min[danger_index], self.min_c_, X[y != self.min_c_],
                    nns, int((1 - fractions) * num_samples), step_size=0.5)

                # Concatenate the newly generated samples to the original
                # data set
                X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0)
                y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0)

                # Reset the k-neighbours to m+1 neighbours
                self.nearest_neighbour.set_params(
                    **{'n_neighbors': self.m + 1})

                return X_resampled, y_resampled

        if self.kind == 'svm':
            # The SVM smote model fits a support vector machine classifier
            # to the data and uses the support vectors to provide a notion
            # of boundary, unlike regular smote, where such a notion relies
            # on the proportion of nearest neighbours belonging to each
            # class.

            # Fit SVM to the full data
            self.svm.fit(X, y)

            # Find the support vectors and their corresponding indexes
            support_index = self.svm.support_[y[self.svm.support_] ==
                                              self.min_c_]
            support_vector = X[support_index]

            # First, find the nn of all the samples to identify samples
            # in danger and noisy ones
            if self.verbose:
                print("Finding the {} nearest neighbours...".format(self.m))

            # As usual, fit a nearest neighbour model to the data
            self.nearest_neighbour.fit(X)

            if self.verbose:
                print("done!")

            # Now, get rid of noisy support vectors
            noise_bool = self._in_danger_noise(support_vector, y,
                                               kind='noise')

            # Remove noisy support vectors
            support_vector = support_vector[np.logical_not(noise_bool)]
            danger_bool = self._in_danger_noise(support_vector, y,
                                                kind='danger')
            safety_bool = np.logical_not(danger_bool)

            if self.verbose:
                print("Out of {0} support vectors, {1} are noisy, "
                      "{2} are in danger "
                      "and {3} are safe.".format(
                          support_vector.shape[0],
                          noise_bool.sum().astype(int),
                          danger_bool.sum().astype(int),
                          safety_bool.sum().astype(int)))

                # Proceed to find support vectors NNs among the minority
                # class
                print("Finding the {} nearest neighbours...".format(self.k))

            self.nearest_neighbour.set_params(**{'n_neighbors': self.k + 1})
            self.nearest_neighbour.fit(X_min)

            if self.verbose:
                print("done!")
                print("Creating synthetic samples...", end="")

            # Split the number of synthetic samples between interpolation
            # and extrapolation.
            # The fractions are sampled from a beta distribution with mean
            # 0.5 and variance 0.01
            np.random.seed(self.random_state)
            fractions = beta(10, 10)

            # Interpolate samples in danger
            if np.count_nonzero(danger_bool) > 0:
                nns = self.nearest_neighbour.kneighbors(
                    support_vector[danger_bool],
                    return_distance=False)[:, 1:]

                X_new_1, y_new_1 = self._make_samples(
                    support_vector[danger_bool], self.min_c_, X_min, nns,
                    int(fractions * (num_samples + 1)), step_size=1.)

            # Extrapolate safe samples
            if np.count_nonzero(safety_bool) > 0:
                nns = self.nearest_neighbour.kneighbors(
                    support_vector[safety_bool],
                    return_distance=False)[:, 1:]

                X_new_2, y_new_2 = self._make_samples(
                    support_vector[safety_bool], self.min_c_, X_min, nns,
                    int((1 - fractions) * num_samples),
                    step_size=-self.out_step)

            if self.verbose:
                print("done!")

            # Concatenate the newly generated samples to the original data set
            if (np.count_nonzero(danger_bool) > 0 and
                    np.count_nonzero(safety_bool) > 0):
                X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0)
                y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0)
            # not any support vectors in danger
            elif np.count_nonzero(danger_bool) == 0:
                X_resampled = np.concatenate((X, X_new_2), axis=0)
                y_resampled = np.concatenate((y, y_new_2), axis=0)
            # all the support vectors are in danger
            elif np.count_nonzero(safety_bool) == 0:
                X_resampled = np.concatenate((X, X_new_1), axis=0)
                y_resampled = np.concatenate((y, y_new_1), axis=0)

            # Reset the k-neighbours to m+1 neighbours
            self.nearest_neighbour.set_params(**{'n_neighbors': self.m + 1})

            return X_resampled, y_resampled
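# A minimal usage sketch for the SMOTE class above (assumes its
# module-level imports and the OverSampler base class are available;
# the toy data set is an illustrative assumption, not from the source).
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_classes=2,
                           weights=[0.9, 0.1], random_state=42)

sm = SMOTE(ratio='auto', kind='borderline1', random_state=42)
sm.fit(X, y)
X_res, y_res = sm.sample(X, y)  # minority class over-sampled towards balance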
class KnnRecommender:
    """
    This is an item-based collaborative filtering recommender with
    KNN implemented by sklearn
    """
    def __init__(self):
        self.model = NearestNeighbors()
        self.item_user_mat_sparse, self.hashmap = self._prep_data()
        self.set_model_params(10, 'brute', 'cosine', -1)

    def set_model_params(self, n_neighbors, algorithm, metric, n_jobs=None):
        """
        set model params for sklearn.neighbors.NearestNeighbors

        Parameters
        ----------
        n_neighbors: int, optional (default = 5)

        algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional

        metric: string or callable, default 'minkowski', or one of
            ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']

        n_jobs: int or None, optional (default=None)
        """
        if n_jobs and (n_jobs > 1 or n_jobs == -1):
            os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
        self.model.set_params(
            **{
                'n_neighbors': n_neighbors,
                'algorithm': algorithm,
                'metric': metric,
                'n_jobs': n_jobs
            })

    def _prep_data(self):
        """
        prepare data for recommender

        1. item-user scipy sparse matrix
        2. hashmap of row index in item-user scipy sparse matrix to itemId
        """
        connection = create_connection()
        with connection:
            # read data
            cursor = connection.cursor()
            cursor.execute("SELECT * FROM review")
            reviews = cursor.fetchall()
            cursor.close()

        columns = ['id', 'userId', 'itemId', 'rating', 'reviewTime']
        df_ratings = pd.DataFrame(reviews, columns=columns)

        # pivot and create item-user matrix
        item_user_mat = df_ratings.pivot(index='itemId',
                                         columns='userId',
                                         values='rating').fillna(0)

        # hashmap of row index in item-user scipy sparse matrix to itemId
        hashmap = {}
        index = 0
        for i in item_user_mat.index:
            hashmap[index] = i
            index = index + 1

        # transform matrix to scipy sparse matrix
        item_user_mat_sparse = csr_matrix(item_user_mat.values)

        # clean up
        del df_ratings, item_user_mat
        return item_user_mat_sparse, hashmap

    def _inference(self, model, data, itemId, n_recommendations):
        """
        return top n similar item recommendations

        Parameters
        ----------
        model: sklearn model, knn model

        data: item-user matrix

        itemId: id of item in matrix

        n_recommendations: int, top n recommendations

        Return
        ------
        list of top n similar item recommendations
        """
        # fit
        model.fit(data)
        # inference
        distances, indices = model.kneighbors(
            data[itemId], n_neighbors=n_recommendations + 1)
        # get list of raw idx of recommendations
        raw_recommends = sorted(
            list(zip(indices.squeeze().tolist(),
                     distances.squeeze().tolist())),
            key=lambda x: x[1])[:0:-1]
        # return recommendation (itemId, distance)
        return raw_recommends

    def make_recommendations(self, itemId, n_recommendations):
        """
        make top n item recommendations

        Parameters
        ----------
        itemId: raw id of item

        n_recommendations: int, top n recommendations
        """
        reverse_hashmap = {v: k for k, v in self.hashmap.items()}
        idx = reverse_hashmap[itemId]
        # get recommendation
        raw_recommends = self._inference(self.model,
                                         self.item_user_mat_sparse, idx,
                                         n_recommendations)
        # collect results
        recommended_item_id = []
        # print('Recommendations for {}:'.format(itemId))
        for i, (idx, dist) in enumerate(raw_recommends):
            # print('{0}: {1}, with distance '
            #       'of {2}'.format(i + 1, self.hashmap[idx], dist))
            recommended_item_id.insert(0, self.hashmap[idx])
        del raw_recommends
        return recommended_item_id
def networkBuildKnn(
    X_net,
    Y_net,
    knn=5,
    e_percentile=None,
    class_connected=False,
    metric="euclidean",
    neighbors=True,
    colors=[],
):
    g = nx.Graph()
    g.graph["knn"] = knn
    g.graph["e_percentile"] = e_percentile
    g.graph["class_connected"] = class_connected
    g.graph["metric"] = metric
    g.graph["neighbors"] = neighbors
    lnNet = len(X_net)
    g.graph["class_names"] = list(set(Y_net))
    g.graph["colors"] = colors
    class_nodes = [[] for i in g.graph["class_names"]]
    for index, instance in enumerate(X_net):
        label = Y_net[index]
        index_label = g.graph["class_names"].index(label)
        class_nodes[index_label].append(str(index))
        g.add_node(str(index), value=instance, type_node="net", label=label)
    g.graph["class_nodes"] = class_nodes

    values = X_net
    if values.ndim == 1:
        values = np.reshape(values, (-1, 1))

    nbrs = NearestNeighbors(n_neighbors=knn + 1, metric=metric)
    nbrs.fit(values)
    distances, indices = nbrs.kneighbors(values)
    # Drop the first column: each point is its own nearest neighbour.
    indices = indices[:, 1:]
    distances = distances[:, 1:]

    # kNN edges, optionally restricted to same-label pairs.
    for indice_node, indices_node in enumerate(indices):
        for tmpi, indice in enumerate(indices_node):
            if (g.nodes()[str(indice)]["label"]
                    == g.nodes()[str(indice_node)]["label"]
                    or class_connected):
                g.add_edge(str(indice), str(indice_node),
                           weight=distances[indice_node][tmpi])

    # Optional epsilon-radius edges on top of the kNN edges.
    if e_percentile is not None:
        eRadius = np.quantile(distances, e_percentile)
        nbrs.set_params(radius=eRadius)
        distances, indices = nbrs.radius_neighbors(values)
        for indice_node, indices_node in enumerate(indices):
            for tmpi, indice in enumerate(indices_node):
                if str(indice) != str(indice_node):
                    if (g.nodes()[str(indice)]["label"]
                            == g.nodes()[str(indice_node)]["label"]
                            or class_connected):
                        g.add_edge(
                            str(indice),
                            str(indice_node),
                            weight=distances[indice_node][tmpi],
                        )

    g.graph["index"] = lnNet
    if neighbors:
        g.graph["nbrs"] = nbrs
    return g
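# A minimal usage sketch for networkBuildKnn (toy data; numpy, networkx
# and NearestNeighbors are assumed imported as above).
import numpy as np

X_toy = np.random.rand(30, 2)  # 30 points in 2-D
Y_toy = [0] * 15 + [1] * 15    # two classes

g = networkBuildKnn(X_toy, Y_toy, knn=3, e_percentile=0.5)
print(g.number_of_nodes(), g.number_of_edges())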
class KnnRecommender:
    def __init__(self, path_movies, path_ratings):
        self.path_movies = path_movies
        self.path_ratings = path_ratings
        self.movie_rating_thres = 0
        self.user_rating_thres = 0
        self.model = NearestNeighbors()

    def set_filter_params(self, movie_rating_thres, user_rating_thres):
        self.movie_rating_thres = movie_rating_thres
        self.user_rating_thres = user_rating_thres

    def _prep_data(self):
        myclient = pymongo.MongoClient("mongodb://*****:*****@")  # credentials/host redacted in source
        # NOTE: the source elides how movies_df and ratings_df are loaded
        # from MongoDB here; the filtering below assumes both DataFrames
        # exist.
        df_movies_cnt = pd.DataFrame(ratings_df.groupby('movieId').size(),
                                     columns=['count'])
        popular_movies = list(
            set(df_movies_cnt.query(
                'count >= @self.movie_rating_thres').index))  # noqa
        movies_filter = ratings_df.movieId.isin(popular_movies).values

        df_users_cnt = pd.DataFrame(ratings_df.groupby('userId').size(),
                                    columns=['count'])
        active_users = list(
            set(df_users_cnt.query(
                'count >= @self.user_rating_thres').index))  # noqa
        users_filter = ratings_df.userId.isin(active_users).values

        df_ratings_filtered = ratings_df[movies_filter & users_filter]

        # pivot and create movie-user matrix
        movie_user_mat = df_ratings_filtered.pivot(
            index='movieId', columns='userId', values='rating').fillna(0)
        # create mapper from movie title to index
        hashmap = {
            movie: i for i, movie in enumerate(
                list(movies_df.set_index('movieId').loc[
                    movie_user_mat.index].title))  # noqa
        }
        # transform matrix to scipy sparse matrix
        movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

        # clean up
        # del df_movies, df_movies_cnt, df_users_cnt
        # del df_ratings, df_ratings_filtered, movie_user_mat
        # gc.collect()
        # print("Movie user sparse matrix \n", movie_user_mat_sparse)
        # print("Hashmap \n", hashmap)
        return movie_user_mat_sparse, hashmap

    def set_model_params(self, n_neighbors, algorithm, metric, n_jobs=None):
        if n_jobs and (n_jobs > 1 or n_jobs == -1):
            os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
        self.model.set_params(
            **{
                'n_neighbors': n_neighbors,
                'algorithm': algorithm,
                'metric': metric,
                'n_jobs': n_jobs
            })

    def _fuzzy_matching(self, hashmap, fav_movie):
        match_tuple = []
        # get match
        for title, idx in hashmap.items():
            ratio = fuzz.ratio(title.lower(), fav_movie.lower())
            if ratio >= 60:
                match_tuple.append((title, idx, ratio))
        # sort
        match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
        if not match_tuple:
            print('Oops! No match is found')
            return None
        print('Found possible matches in our database: '
              '{0}\n'.format([x[0] for x in match_tuple]))
        return match_tuple[0][1]

    def _inference(self, model, data, hashmap, fav_movie, n_recommendations):
        # fit (reuse a pickled model if one already exists)
        if os.path.exists('knnpickle_file'):
            print("Model exists")
            model = pickle.load(open('knnpickle_file', 'rb'))
        else:
            print("Model doesn't exist")
            model.fit(data)
            knnPickle = open('knnpickle_file', 'wb')  # source, destination
            pickle.dump(model, knnPickle)
            knnPickle.close()

        # get input movie index
        print('You have input movie:', fav_movie)
        idx = self._fuzzy_matching(hashmap, fav_movie)
        # print(data[idx])

        # inference
        print('Recommendation system start to make inference')
        print('......\n')
        t0 = time.time()
        distances, indices = model.kneighbors(
            data[idx], n_neighbors=n_recommendations + 1)
        # get list of raw idx of recommendations
        raw_recommends = sorted(
            list(zip(indices.squeeze().tolist(),
                     distances.squeeze().tolist())),
            key=lambda x: x[1])[:0:-1]
        print('It took my system {:.2f}s to make inference \n'
              ''.format(time.time() - t0))
        # return recommendation (movieId, distance)
        return raw_recommends

    def make_recommendations(self, fav_movie, n_recommendations):
        # get data
        movie_user_mat_sparse, hashmap = self._prep_data()
        # get recommendations
        raw_recommends = self._inference(self.model, movie_user_mat_sparse,
                                         hashmap, fav_movie,
                                         n_recommendations)
        # print(raw_recommends)
        # print results
        reverse_hashmap = {v: k for k, v in hashmap.items()}
        # print('Recommendations for {}:'.format(fav_movie))
        tempList = []
        for i, (idx, dist) in enumerate(raw_recommends):
            tempList.append(reverse_hashmap[idx])
            # print('{0}: {1}, with distance '
            #       'of {2}'.format(i + 1, reverse_hashmap[idx], dist))
        return tempList
class SMOTE(OverSampler): """Class to perform over-sampling using SMOTE. This object is an implementation of SMOTE - Synthetic Minority Over-sampling Technique, and the variants Borderline SMOTE 1, 2 and SVM-SMOTE. Parameters ---------- ratio : str or float, optional (default='auto') If 'auto', the ratio will be defined automatically to balance the dataset. Otherwise, the ratio is defined as the number of samples in the minority class over the the number of samples in the majority class. random_state : int or None, optional (default=None) Seed for random number generation. verbose : bool, optional (default=True) Whether or not to print information about the processing. k : int, optional (default=5) Number of nearest neighbours to used to construct synthetic samples. m : int, optional (default=10) Number of nearest neighbours to use to determine if a minority sample is in danger. out_step : float, optional (default=0.5) Step size when extrapolating. kind : str, optional (default='regular') The type of SMOTE algorithm to use one of the following options: 'regular', 'borderline1', 'borderline2', 'svm'. Attributes ---------- ratio : str or float If 'auto', the ratio will be defined automatically to balance the dataset. Otherwise, the ratio is defined as the number of samples in the minority class over the the number of samples in the majority class. random_state : int or None Seed for random number generation. min_c_ : str or int The identifier of the minority class. max_c_ : str or int The identifier of the majority class. stats_c_ : dict of str/int : int A dictionary in which the number of occurences of each class is reported. X_shape_ : tuple of int Shape of the data `X` during fitting. Notes ----- See the original papers: [1]_, [2]_, [3]_ for more details. It does not support multiple classes automatically, but can be called multiple times. References ---------- .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer, "SMOTE: synthetic minority over-sampling technique," Journal of artificial intelligence research, 321-357, 2002. .. [2] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new over-sampling method in imbalanced data sets learning," Advances in intelligent computing, 878-887, 2005. .. [3] H. M. Nguyen, E. W. Cooper, K. Kamei, "Borderline over-sampling for imbalanced data classification," International Journal of Knowledge Engineering and Soft Data Paradigms, 3(1), pp.4-21, 2001. """ def __init__(self, ratio='auto', random_state=None, verbose=True, k=5, m=10, out_step=0.5, kind='regular', n_jobs=-1, **kwargs): """Initialisation of SMOTE object. Parameters ---------- ratio : str or float, optional (default='auto') If 'auto', the ratio will be defined automatically to balance the dataset. Otherwise, the ratio is defined as the number of samples in the minority class over the the number of samples in the majority class. random_state : int or None, optional (default=None) Seed for random number generation. verbose : bool, optional (default=True) Whether or not to print information about the processing. k : int, optional (default=5) Number of nearest neighbours to used to construct synthetic samples. m : int, optional (default=10) Number of nearest neighbours to use to determine if a minority sample is in danger. out_step : float, optional (default=0.5) Step size when extrapolating. kind : str, optional (default='regular') The type of SMOTE algorithm to use one of the following options: 'regular', 'borderline1', 'borderline2', 'svm'. 
n_jobs : int, optional (default=-1) Number of threads to run the algorithm when it is possible. """ super(SMOTE, self).__init__(ratio=ratio, random_state=random_state, verbose=verbose) # --- The type of smote # This object can perform regular smote over-sampling, borderline 1, # borderline 2 and svm smote. Since the algorithms are fairly simple # they share most methods. possible_kind = ('regular', 'borderline1', 'borderline2', 'svm') if kind in possible_kind: self.kind = kind else: raise ValueError('Unknown kind for SMOTE algorithm.') self.k = k self.m = m self.out_step = out_step self.verbose = verbose self.kwargs = kwargs self.n_jobs = n_jobs # --- NN object # Import the NN object from scikit-learn library. Since in the smote # variations we must first find samples that are in danger, we # initialize the NN object differently depending on the method chosen if kind == 'regular': # Regular smote does not look for samples in danger, instead it # creates synthetic samples directly from the k-th nearest # neighbours with not filtering self.nearest_neighbour = NearestNeighbors(n_neighbors=self.k + 1, n_jobs=self.n_jobs) else: # Borderline1, 2 and SVM variations of smote must first look for # samples that could be considered noise and samples that live # near the boundary between the classes. Therefore, before # creating synthetic samples from the k-th nns, it first look # for m nearest neighbors to decide whether or not a sample is # noise or near the boundary. self.nearest_neighbour = NearestNeighbors(n_neighbors=self.m + 1, n_jobs=self.n_jobs) # --- SVM smote # Unlike the borderline variations, the SVM variation uses the support # vectors to decide which samples are in danger (near the boundary). # Additionally it also introduces extrapolation for samples that are # considered safe (far from boundary) and interpolation for samples # in danger (near the boundary). The level of extrapolation is # controled by the out_step. if kind == 'svm': # Store SVM object with any parameters self.svm = SVC(random_state=self.random_state, **self.kwargs) def fit(self, X, y): """Find the classes statistics before to perform sampling. Parameters ---------- X : ndarray, shape (n_samples, n_features) Matrix containing the data which have to be sampled. y : ndarray, shape (n_samples, ) Corresponding label for each sample in X. Returns ------- self : object, Return self. """ # Check the consistency of X and y X, y = check_X_y(X, y) # Call the parent function super(SMOTE, self).fit(X, y) return self def _in_danger_noise(self, samples, y, kind='danger'): """Estimate if a set of sample are in danger or noise. Parameters ---------- samples : ndarray, shape (n_samples, n_features) The samples to check if either they are in danger or not. y : ndarray, shape (n_samples, ) The true label in order to check the neighbour labels. kind : str, optional (default='danger') The type of classification to use. Can be either: - If 'danger', check if samples are in danger, - If 'noise', check if samples are noise. Returns ------- output : ndarray, shape (n_samples, ) A boolean array where True refer to samples in danger or noise. 
""" # Find the NN for each samples # Exclude the sample itself x = self.nearest_neighbour.kneighbors(samples, return_distance=False)[:, 1:] # Count how many NN belong to the minority class # Find the class corresponding to the label in x nn_label = (y[x] != self.min_c_).astype(int) # Compute the number of majority samples in the NN n_maj = np.sum(nn_label, axis=1) if kind == 'danger': # Samples are in danger for m/2 <= m' < m return np.bitwise_and(n_maj >= float(self.m) / 2., n_maj < self.m) elif kind == 'noise': # Samples are noise for m = m' return n_maj == self.m else: raise NotImplementedError def _make_samples(self, X, y_type, nn_data, nn_num, n_samples, step_size=1.): """A support function that returns artificial samples constructed along the line connecting nearest neighbours. Parameters ---------- X : ndarray, shape (n_samples, n_features) Points from which the points will be created. y_type : str or int The minority target value, just so the function can return the target values for the synthetic variables with correct length in a clear format. nn_data : ndarray, shape (n_samples_all, n_features) Data set carrying all the neighbours to be used nn_num : ndarray, shape (n_samples_all, k_nearest_neighbours) The nearest neighbours of each sample in nn_data. n_samples : int The number of samples to generate. step_size : float, optional (default=1.) The step size to create samples. Returns ------- X_new : ndarray, shape (n_samples_new, n_features) Synthetically generated samples. y_new : ndarray, shape (n_samples_new, ) Target values for synthetic samples. """ # Check the consistency of X X = check_array(X) # A matrix to store the synthetic samples X_new = np.zeros((n_samples, X.shape[1])) # Set seeds np.random.seed(self.random_state) seeds = np.random.randint(low=0, high=100*len(nn_num.flatten()), size=n_samples) # Randomly pick samples to construct neighbours from np.random.seed(self.random_state) samples = np.random.randint(low=0, high=len(nn_num.flatten()), size=n_samples) # Loop over the NN matrix and create new samples for i, n in enumerate(samples): # NN lines relate to original sample, columns to its # nearest neighbours row, col = divmod(n, nn_num.shape[1]) # Take a step of random size (0,1) in the direction of the # n nearest neighbours if self.random_state is None: np.random.seed(seeds[i]) else: np.random.seed(self.random_state) step = step_size * np.random.uniform() # Construct synthetic sample X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]]) # The returned target vector is simply a repetition of the # minority label y_new = np.array([y_type] * len(X_new)) if self.verbose: print("Generated {} new samples ...".format(len(X_new))) return X_new, y_new def sample(self, X, y): """Resample the dataset. Parameters ---------- X : ndarray, shape (n_samples, n_features) Matrix containing the data which have to be sampled. y : ndarray, shape (n_samples, ) Corresponding label for each sample in X. Returns ------- X_resampled : ndarray, shape (n_samples_new, n_features) The array containing the resampled data. y_resampled : ndarray, shape (n_samples_new) The corresponding label of `X_resampled` """ # Check the consistency of X and y X, y = check_X_y(X, y) # Call the parent function super(SMOTE, self).sample(X, y) # Define the number of sample to create # We handle only two classes problem for the moment. 
if self.ratio == 'auto': num_samples = (self.stats_c_[self.maj_c_] - self.stats_c_[self.min_c_]) else: num_samples = int((self.ratio * self.stats_c_[self.maj_c_]) - self.stats_c_[self.min_c_]) # Start by separating minority class features and target values. X_min = X[y == self.min_c_] # If regular SMOTE is to be performed if self.kind == 'regular': # Print if verbose is true if self.verbose: print('Finding the {} nearest neighbours...'.format(self.k)) # Look for k-th nearest neighbours, excluding, of course, the # point itself. self.nearest_neighbour.fit(X_min) # Matrix with k-th nearest neighbours indexes for each minority # element. nns = self.nearest_neighbour.kneighbors( X_min, return_distance=False)[:, 1:] # Print status if verbose is true if self.verbose: print("done!") print("Creating synthetic samples...", end="") # --- Generating synthetic samples # Use the support method _make_samples to generate minority samples X_new, y_new = self._make_samples(X_min, self.min_c_, X_min, nns, num_samples, 1.0) if self.verbose: print("done!") # Concatenate the newly generated samples to the original data set X_resampled = np.concatenate((X, X_new), axis=0) y_resampled = np.concatenate((y, y_new), axis=0) return X_resampled, y_resampled if self.kind == 'borderline1' or self.kind == 'borderline2': if self.verbose: print("Finding the {} nearest neighbours...".format(self.m)) # Find the NNs for all samples in the data set. self.nearest_neighbour.fit(X) if self.verbose: print("done!") # Boolean array with True for minority samples in danger danger_index = self._in_danger_noise(X_min, y, kind='danger') # If all minority samples are safe, return the original data set. if not any(danger_index): if self.verbose: print('There are no samples in danger. No borderline ' 'synthetic samples created.') # All are safe, nothing to be done here. return X, y # If we got here, it is because some samples are in danger; we need # to find the NNs among the minority class to create the new # synthetic samples. # # We start by changing the number of NNs to consider from m + 1 # to k + 1 self.nearest_neighbour.set_params(**{'n_neighbors': self.k + 1}) self.nearest_neighbour.fit(X_min) # Indexes of the k NNs of each sample in danger nns = self.nearest_neighbour.kneighbors( X_min[danger_index], return_distance=False)[:, 1:] # B1 and B2 types diverge here if self.kind == 'borderline1': # Create synthetic samples for borderline points. X_new, y_new = self._make_samples(X_min[danger_index], self.min_c_, X_min, nns, num_samples) # Concatenate the newly generated samples to the original # dataset X_resampled = np.concatenate((X, X_new), axis=0) y_resampled = np.concatenate((y, y_new), axis=0) # Reset the k-neighbours to m+1 neighbours self.nearest_neighbour.set_params(**{'n_neighbors': self.m+1}) return X_resampled, y_resampled else: # Split the number of synthetic samples between only minority # (type 1), or minority and majority (with reduced step size) # (type 2). np.random.seed(self.random_state) # The fraction is sampled from a beta distribution centered # around 0.5 with variance ~0.01 fractions = beta(10, 10) # Only minority X_new_1, y_new_1 = self._make_samples(X_min[danger_index], self.min_c_, X_min, nns, int(fractions * (num_samples + 1)), step_size=1.) 
# Only majority with smaller step size X_new_2, y_new_2 = self._make_samples(X_min[danger_index], self.min_c_, X[y != self.min_c_], nns, int((1 - fractions) * num_samples), step_size=0.5) # Concatenate the newly generated samples to the original # data set X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0) y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0) # Reset the k-neighbours to m+1 neighbours self.nearest_neighbour.set_params(**{'n_neighbors': self.m+1}) return X_resampled, y_resampled if self.kind == 'svm': # The SVM smote model fits a support vector machine classifier to # the data and uses the support vectors to provide a notion of # boundary, unlike regular smote, where such a notion relies on # the proportion of nearest neighbours belonging to each class. # Fit SVM to the full data self.svm.fit(X, y) # Find the support vectors and their corresponding indexes support_index = self.svm.support_[y[self.svm.support_] == self.min_c_] support_vector = X[support_index] # First, find the nn of all the samples to identify samples # in danger and noisy ones if self.verbose: print("Finding the {} nearest neighbours...".format(self.m)) # As usual, fit a nearest neighbour model to the data self.nearest_neighbour.fit(X) if self.verbose: print("done!") # Now, get rid of noisy support vectors noise_bool = self._in_danger_noise(support_vector, y, kind='noise') # Remove noisy support vectors support_vector = support_vector[np.logical_not(noise_bool)] danger_bool = self._in_danger_noise(support_vector, y, kind='danger') safety_bool = np.logical_not(danger_bool) if self.verbose: print("Out of {0} support vectors, {1} are noisy, " "{2} are in danger " "and {3} are safe.".format(support_vector.shape[0], noise_bool.sum().astype(int), danger_bool.sum().astype(int), safety_bool.sum().astype(int))) # Proceed to find support vectors NNs among the minority class print("Finding the {} nearest neighbours...".format(self.k)) self.nearest_neighbour.set_params(**{'n_neighbors': self.k + 1}) self.nearest_neighbour.fit(X_min) if self.verbose: print("done!") print("Creating synthetic samples...", end="") # Split the number of synthetic samples between interpolation and # extrapolation # The fraction is sampled from a beta distribution with mean # 0.5 and variance ~0.01 np.random.seed(self.random_state) fractions = beta(10, 10) # Interpolate samples in danger if np.count_nonzero(danger_bool) > 0: nns = self.nearest_neighbour.kneighbors( support_vector[danger_bool], return_distance=False)[:, 1:] X_new_1, y_new_1 = self._make_samples( support_vector[danger_bool], self.min_c_, X_min, nns, int(fractions * (num_samples + 1)), step_size=1.) 
# Extrapolate safe samples if np.count_nonzero(safety_bool) > 0: nns = self.nearest_neighbour.kneighbors( support_vector[safety_bool], return_distance=False)[:, 1:] X_new_2, y_new_2 = self._make_samples( support_vector[safety_bool], self.min_c_, X_min, nns, int((1 - fractions) * num_samples), step_size=-self.out_step) if self.verbose: print("done!") # Concatenate the newly generated samples to the original data set if (np.count_nonzero(danger_bool) > 0 and np.count_nonzero(safety_bool) > 0): X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0) y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0) # No support vectors in danger elif np.count_nonzero(danger_bool) == 0: X_resampled = np.concatenate((X, X_new_2), axis=0) y_resampled = np.concatenate((y, y_new_2), axis=0) # All the support vectors are in danger elif np.count_nonzero(safety_bool) == 0: X_resampled = np.concatenate((X, X_new_1), axis=0) y_resampled = np.concatenate((y, y_new_1), axis=0) # Reset the k-neighbours to m+1 neighbours self.nearest_neighbour.set_params(**{'n_neighbors': self.m+1}) return X_resampled, y_resampled
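# Below is a minimal usage sketch (not part of the original class) showing how
# this SMOTE object is meant to be driven, assuming its OverSampler parent
# fills in min_c_, maj_c_ and stats_c_ during fit(); the toy dataset is
# hypothetical.
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_classes=2,
                           weights=[0.9, 0.1], random_state=42)
smote = SMOTE(ratio='auto', kind='borderline1', random_state=42)
smote.fit(X, y)
X_resampled, y_resampled = smote.sample(X, y)
# With ratio='auto' the minority class is topped up towards the majority
# count, so y_resampled should be close to balanced.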
class KNNRegressor: def __init__(self, k, strategy='my_own', metric='euclidean', mode='uniform'): if not isinstance(k, int) or k < 1: raise AttributeError('Incorrect "k" parameter') if not isinstance(mode, str) or mode != 'uniform' and mode != 'distance': raise AttributeError('Mode parameter can be uniform or distance only') self.mode = mode self.k = k self.strategy = strategy self.metric = metric self.training_labels = None if strategy == 'my_own': self.training_data = None else: self.nn = NearestNeighbors(n_neighbors=k, algorithm=strategy, leaf_size=30, metric=metric) def fit(self, x, y): if x.shape[0] != y.shape[0]: raise AttributeError('Mismatch between training set and its labels') self.training_labels = y if self.strategy == 'my_own': self.training_data = x else: self.nn.fit(x) def find_kneighbors(self, x, return_distance=True): if not isinstance(return_distance, bool): raise AttributeError('Incorrect "return_distance" parameter') if self.strategy == 'my_own': if self.metric == 'euclidean': dist_matrix = distances.euclidean_distance(x, self.training_data) elif self.metric == 'cosine': dist_matrix = distances.cosine_distance(x, self.training_data) else: dist_matrix = self.metric(self.training_data, x).astype(np.float64).T if not return_distance: res_index = np.empty(dist_matrix.shape[0], dtype=np.int64) tmp_index = np.empty(dist_matrix.shape[0], dtype=np.int64) np.argmin(dist_matrix, axis=1, out=res_index) dist_matrix[np.arange(dist_matrix.shape[0]), res_index] = np.inf res_index = res_index.reshape((-1, 1)) for i in range(self.k - 1): np.argmin(dist_matrix, axis=1, out=tmp_index) dist_matrix[np.arange(dist_matrix.shape[0]), tmp_index] = np.inf res_index = np.hstack((res_index, tmp_index[:, np.newaxis])) return res_index else: res_index = np.empty(dist_matrix.shape[0], dtype=np.int64) tmp_index = np.empty(dist_matrix.shape[0], dtype=np.int64) np.argmin(dist_matrix, axis=1, out=res_index) res_dist = dist_matrix[np.arange(dist_matrix.shape[0]), res_index] dist_matrix[np.arange(dist_matrix.shape[0]), res_index] = np.inf res_index = res_index.reshape((-1, 1)) res_dist = res_dist.reshape((-1, 1)) for i in range(self.k - 1): np.argmin(dist_matrix, axis=1, out=tmp_index) res_dist = np.hstack((res_dist, dist_matrix[np.arange(dist_matrix.shape[0]), tmp_index][:, np.newaxis])) dist_matrix[np.arange(dist_matrix.shape[0]), tmp_index] = np.inf res_index = np.hstack((res_index, tmp_index[:, np.newaxis])) return res_dist, res_index else: return self.nn.kneighbors(x, return_distance=return_distance) def predict(self, x, k=None): if k is not None: if not isinstance(k, int) or k < 1: raise AttributeError('Incorrect "k" parameter') else: if self.strategy == 'my_own': self.k = k else: params = self.nn.get_params() params['n_neighbors'] = k self.nn = self.nn.set_params(**params) if self.mode == 'uniform': nn_index = self.training_labels[self.find_kneighbors(x, return_distance=False)] return np.mean(nn_index, axis=1) else: vec_weight = np.vectorize(lambda z: 1 / (z + 0.00001)) nn_dist, nn_index = self.find_kneighbors(x) nn_index = self.training_labels[nn_index] nn_dist = vec_weight(nn_dist) return np.sum(nn_index * nn_dist, axis=1) / np.sum(nn_dist, axis=1)
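# A quick sanity check for KNNRegressor (a sketch, not part of the original
# code): with an sklearn-backed strategy such as 'brute' and mode='uniform',
# predict() is a plain mean over the k nearest targets, so it should agree
# with sklearn's KNeighborsRegressor on the same data (up to tie-breaking).
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

rng = np.random.RandomState(0)
x_train, y_train = rng.rand(100, 3), rng.rand(100)
x_test = rng.rand(10, 3)

mine = KNNRegressor(k=5, strategy='brute', mode='uniform')
mine.fit(x_train, y_train)
ref = KNeighborsRegressor(n_neighbors=5).fit(x_train, y_train)
assert np.allclose(mine.predict(x_test), ref.predict(x_test))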
class KnnRecommender: """ This is an item-based collaborative filtering recommender with KNN implemented by sklearn """ def __init__(self, train_file, test_file, do_five_fold_cs): self.train_file = train_file self.test_file = test_file self.do_five_fold_cs = do_five_fold_cs self.model = NearestNeighbors() def set_model_params(self, n_neighbors, algorithm, metric, n_jobs=None): if n_jobs and (n_jobs > 1 or n_jobs == -1): os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp' self.model.set_params( **{ 'n_neighbors': n_neighbors, 'algorithm': algorithm, 'metric': metric, 'n_jobs': n_jobs }) def make_recommendations(self, n_recommendations): # get data data_train, data_test, hashmap, _ = du.read_train_and_val_data_to_index( self.train_file, self.test_file) train_data_one_hot = du.to_one_hot_train(data_train, len(hashmap)) self.model.fit(train_data_one_hot) test_one_hot_gen = du.to_one_hot_with_gt_generator( data_test, len(hashmap), False) correct = 0 for i in range(len(data_test)): test_case, gt = next(test_one_hot_gen) test_case = test_case.reshape((1, -1)) distances, indices = self.model.kneighbors(test_case, n_neighbors=100) distances = distances.flatten() indices = indices.flatten() test_case = test_case.flatten().astype(float) icd_pred = [] case_pred = np.zeros_like(test_case) for j, idx in enumerate(indices): case_pred += ( (train_data_one_hot[idx, :].astype(float) - test_case) / distances[j]) pred_idx = case_pred.argsort()[-5:][::-1] for idx in pred_idx: icd_pred.append(hashmap[idx]) gt_idx = np.array(np.where(gt == 1)).item(0) gt_icd = hashmap[gt_idx] c = False if gt_icd in icd_pred: c = True correct += 1 print( str(i) + " Predicted: " + str(icd_pred) + " GT: " + gt_icd + " " + str(c) + " " + str(float(correct) / float(i + 1))) print("Top 5 Acc: " + str(float(correct) / float(len(data_test))))
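# The per-neighbour loop in make_recommendations can be expressed as one
# vectorised operation; this is an equivalent sketch (assuming dense arrays
# and strictly positive distances; a zero distance would need an epsilon
# guard before dividing).
import numpy as np

def score_codes(train_one_hot, test_case, indices, distances):
    # Sum of (neighbour - query) difference vectors, weighted by 1/distance.
    diffs = train_one_hot[indices].astype(float) - test_case.astype(float)
    return (diffs / distances[:, None]).sum(axis=0)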
class KnnClass: def __init__(self, movies_path, ratings_path): # Path for movie csv containing movies data self.movies_path = movies_path # Path for ratings csv containing ratings data self.ratings_path = ratings_path # Movie rating (0-5) self.movie_rating_thres = 0 # Number of user ratings for a movie self.user_rating_thres = 0 self.model = NearestNeighbors() # Create t0 to calculate estimated finish time self.t0 = 0 def SetFilterParams(self, movie_rating_thres, user_rating_thres): # Set movie and user rating frequency thresholds self.movie_rating_thres = movie_rating_thres self.user_rating_thres = user_rating_thres # Start the timer self.t0 = time.time() def SetModelParams(self, n_neighbors, algorithm, metric, jobs=None): # Set up the model parameters for the sklearn NearestNeighbors if jobs and (jobs > 1 or jobs == -1): os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp' self.model.set_params( **{ 'n_neighbors': n_neighbors, 'algorithm': algorithm, 'metric': metric, 'n_jobs': jobs }) def PrepareData(self): ### Prepare the data for the recommender # Read the data from movies csv movies = pd.read_csv(os.path.join(self.movies_path), usecols=['movieId', 'title'], dtype={ 'movieId': 'int32', 'title': 'str' }) ratings = pd.read_csv(os.path.join(self.ratings_path), usecols=['userId', 'movieId', 'rating'], dtype={ 'userId': 'int32', 'movieId': 'int32', 'rating': 'float32' }) # Filter the data movies_count = pd.DataFrame(ratings.groupby('movieId').size(), columns=['count']) popular_movies = list( set(movies_count.query('count >= @self.movie_rating_thres').index)) movies_filter = ratings.movieId.isin(popular_movies).values users_count = pd.DataFrame(ratings.groupby('userId').size(), columns=['count']) active_users = list( set(users_count.query('count >= @self.user_rating_thres').index)) users_filter = ratings.userId.isin(active_users).values ratings_filtered = ratings[movies_filter & users_filter] # Pivot and create the movie/user matrix movie_user_mat = ratings_filtered.pivot(index='movieId', columns='userId', values='rating').fillna(0) # Create mapper from movie title to index hashmap = { movie: i for i, movie in enumerate( list( movies.set_index('movieId').loc[ movie_user_mat.index].title)) } # Transform matrix to scipy sparse matrix movie_user_mat_sparse = csr_matrix(movie_user_mat.values) # Clean up the memory del movies, movies_count, users_count del ratings, ratings_filtered, movie_user_mat gc.collect() return movie_user_mat_sparse, hashmap def FindMovieMatch(self, hashmap, user_move_input): ### Transform the movie name inputted by the user to lower case ### Map movie title name to index of the movie in data ### And use the fuzz library ratio function to find a match match = [] # get match for move_title, index in hashmap.items(): ratio = fuzz.ratio(move_title.lower(), user_move_input.lower()) if ratio >= 60: match.append((move_title, index, ratio)) # sort match = sorted(match, key=lambda x: x[2])[::-1] if not match: print('No match is found') else: print('Found matches in our database: ' '{0}\n'.format([x[0] for x in match])) return match[0][1] def FindData(self, model, data, hashmap, movie_chosen, n_recommendations): ### Return top movies that are similar to the user's movie input # Fit the data to our model model.fit(data) # Get movie index print('You have input movie:', movie_chosen) index = self.FindMovieMatch(hashmap, movie_chosen) # FindData print('Finding movies..') print('......\n') distances, indices = model.kneighbors(data[index], n_neighbors=n_recommendations + 1) # Get the list of raw indexes of the recommendations raw_recommends = \ sorted( list( zip( indices.squeeze().tolist(), distances.squeeze().tolist() ) ), key=lambda x: x[1] )[:0:-1] self.timeNeeded = 'It took {:.2f}s to finish \n\ '.format(time.time() - self.t0) # return recommendation (movieId, distance) return raw_recommends def Recommend(self, movie_chosen, recommendations_count): # Prepare the data, load the sparse matrix and the hashmap movieUserMatSparse, hashmap = self.PrepareData() # Find recommendations rawRecommends = self.FindData(self.model, movieUserMatSparse, hashmap, movie_chosen, recommendations_count) # Create the return string of the recommended movies recommended_movies = "" # Print the results reversed_hashmap = {v: k for k, v in hashmap.items()} print('Recommendations for {}:'.format(movie_chosen)) for i, (index, dist) in enumerate(rawRecommends): print('{0}: {1}, with distance ' 'of {2}'.format(recommendations_count - i, reversed_hashmap[index], dist)) recommended_movies = '{0}: {1}'.format( recommendations_count - i, reversed_hashmap[index]) + "\n" + recommended_movies return recommended_movies + '\n\n' + str(self.timeNeeded)
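# Hypothetical end-to-end use of KnnClass; the csv paths, thresholds and the
# movie title below are made up for illustration and assume MovieLens-style
# files.
recommender = KnnClass('movies.csv', 'ratings.csv')
recommender.SetFilterParams(movie_rating_thres=50, user_rating_thres=50)
recommender.SetModelParams(n_neighbors=20, algorithm='brute',
                           metric='cosine', jobs=-1)
print(recommender.Recommend('Toy Story', recommendations_count=10))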
class KnnRecommender: def __init__(self, path_movies, path_ratings): self.path_movies = path_movies self.path_ratings = path_ratings self.movie_rating_thres = 0 self.user_rating_thres = 0 self.model = NearestNeighbors() def _prep_data(self): df_movies = pd.read_csv(self.path_movies, usecols=['movieId', 'title'], dtype={ 'movieId': 'int32', 'title': 'str' }) df_ratings = pd.read_csv(self.path_ratings, usecols=['userId', 'movieId', 'rating'], dtype={ 'userId': 'int32', 'movieId': 'int32', 'rating': 'float32' }) # filter data # count the number of ratings each movie received df_movies_cnt = pd.DataFrame(df_ratings.groupby('movieId').size(), columns=['count']) popular_movies = list( set( df_movies_cnt.query( 'count >= @self.movie_rating_thres').index)) # noqa movies_filter = df_ratings.movieId.isin(popular_movies).values df_users_cnt = pd.DataFrame(df_ratings.groupby('userId').size(), columns=['count']) active_users = list( set(df_users_cnt.query( 'count >= @self.user_rating_thres').index)) # noqa users_filter = df_ratings.userId.isin(active_users).values df_ratings_filtered = df_ratings[movies_filter & users_filter] # pivot and create movie-user matrix movie_user_mat = df_ratings_filtered.pivot(index='movieId', columns='userId', values='rating').fillna(0) # create mapper from movie title to index hashmap = { movie: i for i, movie in enumerate( list( df_movies.set_index('movieId').loc[ movie_user_mat.index].title)) # noqa } # transform matrix to scipy sparse matrix movie_user_mat_sparse = csr_matrix(movie_user_mat.values) # clean up del df_movies, df_movies_cnt, df_users_cnt del df_ratings, df_ratings_filtered, movie_user_mat gc.collect() return movie_user_mat_sparse, hashmap def set_filter_params(self, movie_rating_thres, user_rating_thres): """ set the rating frequency thresholds to filter out less-known movies and less active users Parameters ---------- movie_rating_thres: int, minimum number of ratings received by a movie user_rating_thres: int, minimum number of ratings a user gives """ self.movie_rating_thres = movie_rating_thres self.user_rating_thres = user_rating_thres def set_model_params(self, n_neighbors, algorithm, metric, n_jobs=None): if n_jobs and (n_jobs > 1 or n_jobs == -1): os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp' self.model.set_params( **{ 'n_neighbors': n_neighbors, 'algorithm': algorithm, 'metric': metric, 'n_jobs': n_jobs }) def _fuzzy_matching(self, hashmap, fav_movie): match_tuple = [] # get match for title, idx in hashmap.items(): ratio = fuzz.ratio(title.lower(), fav_movie.lower()) if ratio >= 60: match_tuple.append((title, idx, ratio)) # sort match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1] if not match_tuple: print('Oops! 
No match is found') else: print( 'Found possible matches in our database: ' '{0}\n'.format([x[0] for x in match_tuple])) return match_tuple[0][1] def _inference(self, model, data, hashmap, fav_movie, n_recommendations): # fit model.fit(data) # get input movie index print('You have input movie:', fav_movie) idx = self._fuzzy_matching(hashmap, fav_movie) # inference print('Recommendation system starts to make inference') print('......\n') t0 = time.time() distances, indices = model.kneighbors(data[idx], n_neighbors=n_recommendations + 1) # get list of raw idx of recommendations raw_recommends = \ sorted( list( zip( indices.squeeze().tolist(), distances.squeeze().tolist() ) ), key=lambda x: x[1] )[:0:-1] print('It took my system {:.2f}s to make inference \n\ '.format(time.time() - t0)) # return recommendation (movieId, distance) return raw_recommends def make_recommendations(self, fav_movie, n_recommendations): filmesRecomendados = [] # get data movie_user_mat_sparse, hashmap = self._prep_data() # get recommendations raw_recommends = self._inference(self.model, movie_user_mat_sparse, hashmap, fav_movie, n_recommendations) # collect the recommended titles reverse_hashmap = {v: k for k, v in hashmap.items()} for i, (idx, dist) in enumerate(raw_recommends): filmesRecomendados.append(reverse_hashmap[idx]) return filmesRecomendados
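# fuzzywuzzy's ratio() drives the title matching in _fuzzy_matching above;
# a tiny sketch of the >= 60 acceptance threshold it applies.
from fuzzywuzzy import fuzz

print(fuzz.ratio('toy story (1995)', 'toy story'))  # high score: accepted
print(fuzz.ratio('toy story (1995)', 'heat'))       # low score: rejected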
class RecommendationSystem: """ item-based collaborative filtering for movie-lens dataset using sklearn's nearest neighbors """ def __init__(self,movie_th,user_th,movie_path,ratings_path,users_path): """ Description: 1. Initialization of threshold values 2. Initialization of file paths 3. Initialization of models """ self.movie_th = movie_th self.user_th = user_th self.movies_path = movie_path self.ratings_path = ratings_path self.path = pathlib.Path().absolute() / 'ml-1m' self.users_path = users_path self.model = NearestNeighbors() self.data_matrix = None self.dic_movie_name = None self.dic_movie_id = None def define_model_parameters(self,n_neighbors,algorithm,metric,jobs=1): """ Args: n_neighbors : number of neighbors used for calculating similarity algorithm : type of algorithm used for building the model, i.e. brute force, KDTree, BallTree metric : how similarity is measured, i.e. cosine, l1, l2 norm jobs : number of processors to be used in parallel """ self.model.set_params(**{ 'n_neighbors':n_neighbors, 'algorithm': algorithm, 'metric': metric, 'n_jobs' : jobs }) def read_clean_prepare(self): # the '::'-separated ml-1m files need the python parser engine self.ratings = pd.read_csv( self.path/self.ratings_path, sep='::', engine='python', names=['userId','movieId','rating','timestamp'], usecols = ['userId','movieId','rating'], dtype= {'userId':np.int32,'movieId':np.int32,'rating':np.float32} ) self.movies = pd.read_csv( self.path/self.movies_path, sep='::', engine='python', names=['movieId','title','genres'], usecols = ['movieId','title'], dtype= {'movieId':np.int32,'title':str} ) self.movies.title = self.movies.title.str.lower() self.movies['name'] = self.movies.title.str.replace(r'\(\d{4}\)',"",regex=True) self.movies.name = self.movies.name.str.rstrip() self.dic_movie_name = dict([(row.movieId,row.title) for row in self.movies.itertuples()]) self.dic_movie_id = dict([(row.name,row.movieId) for row in self.movies.itertuples()]) temp = self.ratings.groupby('movieId').agg({'userId':len}) self.pop_movies = temp[temp['userId']>=self.movie_th].index self.data_matrix = self.ratings.pivot(index = 'movieId',columns = 'userId', values = 'rating').fillna(0) self.data_matrix = csr_matrix(self.data_matrix) time.sleep(3) self.define_model_parameters(20,'brute','cosine') self.model.fit(self.data_matrix) def plot_rating_freq(self): temp = self.ratings.groupby('movieId').agg({'userId':len}) temp = temp.sort_values(by=['userId'],ascending=False) temp.index = pd.RangeIndex(0,temp.shape[0]) plt.plot(temp.index,temp['userId']) plt.title('movie rating frequency') plt.xlabel("movies") plt.ylabel("number of ratings") def recommend(self,fav,how_many): """ Args: fav: name of favorite movie how_many : number of similar movies to be recommended Returns: Exits if the movie is not in the database; otherwise prints the names of the recommended movies. """ if fav not in self.dic_movie_id: print("Movie not found in the database\n") exit() fav_id = self.dic_movie_id[fav] _,recommendations = self.model.kneighbors(self.data_matrix[fav_id],n_neighbors=how_many+1) recommendations = [self.dic_movie_name[x] for x in recommendations[0][1:]] print("Top {} recommendations are:\n".format(how_many)) for i in range(how_many): print("{}. {}".format(i+1,recommendations[i])) return
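# The core of read_clean_prepare in isolation (a sketch with a made-up
# ratings frame): pivot ratings into an item-by-user matrix, sparsify it,
# and query cosine neighbours.
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

ratings = pd.DataFrame({'userId': [1, 1, 2, 2, 3],
                        'movieId': [10, 20, 10, 30, 20],
                        'rating': [5.0, 3.0, 4.0, 2.0, 5.0]})
mat = csr_matrix(ratings.pivot(index='movieId', columns='userId',
                               values='rating').fillna(0).values)
nn = NearestNeighbors(n_neighbors=2, algorithm='brute',
                      metric='cosine').fit(mat)
print(nn.kneighbors(mat[0]))  # distances and indices of the 2 closest items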
class KnnRecommender: """ This is an item-based collaborative filtering recommender with KNN implemented by sklearn """ # Initialiser; the data paths path_movies and path_ratings are passed in def __init__(self, path_movies, path_ratings): self.path_movies = path_movies self.path_ratings = path_ratings self.movie_rating_thres = 0 self.user_rating_thres = 0 # model is a NearestNeighbors instance from here on self.model = NearestNeighbors() self.train_data, self.test_data = self._prep_data() def set_filter_params(self, movie_rating_thres, user_rating_thres): """ movie_rating_thres: int, minimum number of ratings received by users user_rating_thres: int, minimum number of ratings a user gives """ # Set rating frequency thresholds to filter out little-known movies # (roughly the top 25% of movies are kept) and less active users # (roughly the top 40% of users) self.movie_rating_thres = movie_rating_thres self.user_rating_thres = user_rating_thres def set_model_params(self, n_neighbors, algorithm, metric, n_jobs=None): """ Configure the NearestNeighbors instance that the sparse matrix will be fitted to. By specifying metric='cosine', the model measures the similarity between item vectors with cosine similarity. Model parameters for sklearn.neighbors.NearestNeighbors ---------- n_neighbors: int, optional (default = 5) algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional metric: string or callable, default 'minkowski', or one of ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan'] n_jobs: int or None, optional (default=None) """ if n_jobs and (n_jobs > 1 or n_jobs == -1): os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp' self.model.set_params( **{ 'n_neighbors': n_neighbors, 'algorithm': algorithm, 'metric': metric, 'n_jobs': n_jobs }) # Load the data def _prep_data(self): """ prepare data for recommender 1. movie-user scipy sparse matrix 2. hashmap of movie to row index in movie-user scipy sparse matrix """ # Read in the data and join/process the tables df_movies = pd.read_csv(os.path.join(self.path_movies), usecols=['movieId', 'title'], dtype={ 'movieId': 'int32', 'title': 'str' }) df_ratings = pd.read_csv(os.path.join(self.path_ratings), usecols=['userId', 'movieId', 'rating'], dtype={ 'userId': 'int32', 'movieId': 'int32', 'rating': 'float32' }) # filter data df_movies_cnt = pd.DataFrame(df_ratings.groupby('movieId').size(), columns=['count']) popular_movies = list( set( df_movies_cnt.query( 'count >= @self.movie_rating_thres').index)) # noqa movies_filter = df_ratings.movieId.isin(popular_movies).values df_users_cnt = pd.DataFrame(df_ratings.groupby('userId').size(), columns=['count']) active_users = list( set(df_users_cnt.query( 'count >= @self.user_rating_thres').index)) # noqa users_filter = df_ratings.userId.isin(active_users).values df_ratings_filtered = df_ratings[movies_filter & users_filter] # pivot and create movie-user matrix movie_user_mat = df_ratings_filtered.pivot(index='movieId', columns='userId', values='rating').fillna(0) # create mapper from movie title to index ''' hashmap = { movie: i for i, movie in enumerate(list(df_movies.set_index('movieId').loc[movie_user_mat.index].title)) # noqa } # transform matrix to scipy sparse matrix ''' # Convert the data above to a sparse matrix, since linear-algebra # operations will be performed on it movie_user_mat_sparse = csr_matrix(movie_user_mat.values) # clean up del df_movies, df_movies_cnt, df_users_cnt del df_ratings, df_ratings_filtered, movie_user_mat gc.collect() # Build the train and test sets: a random 70%/30% split of the rows # (scipy sparse matrices have no randomSplit; sklearn's # train_test_split accepts sparse input) train_data, test_data = train_test_split(movie_user_mat_sparse, train_size=0.7, random_state=0) # Note: movie_user_mat_sparse is already a matrix, so train and test # remain matrices return train_data, test_data # Added code: define the cross-validation def Cross_validation(self): # Cross-validate on the 70% training data # To find the best n_neighbors (the K value) directly, use the # GridSearchCV hyper-parameter search # GridSearchCV works by training our model repeatedly over the # parameter range we specify, so that the model is tested with each candidate
# value and the optimal one is found, giving the best accuracy # create a new knn model knn2 = KNeighborsClassifier() # create a dictionary of all values we want to test for n_neighbors param_grid = {'n_neighbors': np.arange(1, 25)} # use grid search to test all values for n_neighbors # the grid search takes the new k-NN classifier, the param_grid and a # cross-validation value of 5 in order to find the best value for # 'n_neighbors' knn_gscv = GridSearchCV(knn2, param_grid, cv=5) # fit model to data # A supervised target is required here; the ratings matrix alone has # none, so a label vector is assumed to be available as # self.train_labels (hypothetical, it is not built in _prep_data) X, Y = self.train_data, self.train_labels # fit model to data knn_gscv.fit(X, Y) # check which of the tested values for 'n_neighbors' performed best # knn_gscv.best_params_ returns a dict such as {'n_neighbors': 14}; # the best K is the value stored under that key k = knn_gscv.best_params_['n_neighbors'] return k # Two more methods def accuracy(self): # check the mean score for the top-performing value of n_neighbors; # best_score_ outputs the mean accuracy of the scores obtained # through cross-validation knn2 = KNeighborsClassifier() param_grid = {'n_neighbors': np.arange(1, 25)} knn_gscv = GridSearchCV(knn2, param_grid, cv=5) # as in Cross_validation, a label vector is assumed to be available # as self.train_labels (the original fit against self.test_data was # a bug: test rows are not class labels) knn_gscv.fit(self.train_data, self.train_labels) accuracy = knn_gscv.best_score_ return accuracy # The movie recommendation starts below; the same structure carries over # to the current project (furniture recommendation), with the conditions # and parameters still to be adapted def _fuzzy_matching(self, hashmap, fav_movie): """ return the closest match via fuzzy ratio. If no match found, return None Parameters ---------- hashmap: dict, map movie title name to index of the movie in data fav_movie: str, name of user input movie Return ------ index of the closest match """ match_tuple = [] # get match for title, idx in hashmap.items(): ratio = fuzz.ratio(title.lower(), fav_movie.lower()) if ratio >= 60: match_tuple.append((title, idx, ratio)) # sort match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1] if not match_tuple: print('Oops! 
No match is found') else: print('Found possible matches in our database: ' '{0}\n'.format([x[0] for x in match_tuple])) return match_tuple[0][1] def _inference(self, model, data, hashmap, fav_movie, n_recommendations): """ return top n similar movie recommendations based on user's input movie Parameters ---------- model: sklearn model, knn model data: movie-user matrix hashmap: dict, map movie title name to index of the movie in data fav_movie: str, name of user input movie n_recommendations: int, top n recommendations Return ------ list of top n similar movie recommendations """ # fit model.fit(data) # get input movie index print('You have input movie:', fav_movie) idx = self._fuzzy_matching(hashmap, fav_movie) # inference print('Recommendation system starts to make inference') print('......\n') t0 = time.time() distances, indices = model.kneighbors(data[idx], n_neighbors=n_recommendations + 1) # get list of raw idx of recommendations raw_recommends = \ sorted( list( zip( indices.squeeze().tolist(), distances.squeeze().tolist() ) ), key=lambda x: x[1] )[:0:-1] print('It took my system {:.2f}s to make inference \n\ '.format(time.time() - t0)) # return recommendation (movieId, distance) return raw_recommends def make_recommendations(self, fav_movie, n_recommendations): """ make top n movie recommendations Parameters ---------- fav_movie: str, name of user input movie n_recommendations: int, top n recommendations """ # get data movie_user_mat_sparse, hashmap = self._prep_data() # get recommendations raw_recommends = self._inference(self.model, movie_user_mat_sparse, hashmap, fav_movie, n_recommendations) # print results reverse_hashmap = {v: k for k, v in hashmap.items()} print('Recommendations for {}:'.format(fav_movie)) for i, (idx, dist) in enumerate(raw_recommends): print('{0}: {1}, with distance ' 'of {2}'.format(i + 1, reverse_hashmap[idx], dist))
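# A standalone illustration (not from the original code) of the GridSearchCV
# pattern used in Cross_validation above, on a stock sklearn toy dataset:
# the best k is read out of the best_params_ dict, and best_score_ gives
# the mean cross-validated accuracy for it.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

X, y = load_iris(return_X_y=True)
search = GridSearchCV(KNeighborsClassifier(),
                      {'n_neighbors': np.arange(1, 25)}, cv=5)
search.fit(X, y)
print(search.best_params_['n_neighbors'], search.best_score_)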
class SMOTE(UnbalancedDataset): """ This object is an implementation of SMOTE - Synthetic Minority Over-sampling Technique, and the variations Borderline SMOTE 1, 2 and SVM-SMOTE. See the original papers: [1], [2], [3] for more details. * It does not support multiple classes automatically, but can be called multiple times """ def __init__(self, k=5, m=10, out_step=0.5, ratio=1, random_state=None, kind='regular', verbose=False, **kwargs): """ SMOTE over sampling algorithm and variations. Choose one of the following options: 'regular', 'borderline1', 'borderline2', 'svm' :param k: Number of nearest neighbours to use to construct synthetic samples. :param m: The number of nearest neighbours to use to determine if a minority sample is in danger. :param out_step: Step size when extrapolating :param ratio: Fraction of the number of minority samples to synthetically generate. :param random_state: Seed for random number generation :param kind: The type of smote algorithm to use, one of the following options: 'regular', 'borderline1', 'borderline2', 'svm' :param verbose: Whether or not to print status information :param kwargs: Additional arguments passed to sklearn SVC object """ # Parent class methods UnbalancedDataset.__init__(self, ratio=ratio, random_state=random_state) # --- The type of smote # This object can perform regular smote over-sampling, borderline 1, # borderline 2 and svm smote. Since the algorithms are fairly simple # they share most methods. self.kind = kind # --- Verbose # Control whether or not status and progress information should be # printed self.verbose = verbose # --- Nearest Neighbours for synthetic samples # The smote algorithm uses the k-th nearest neighbours of a minority # sample to generate new synthetic samples. self.k = k # --- NN object # Import the NN object from scikit-learn library. Since in the smote # variations we must first find samples that are in danger, we # initialize the NN object differently depending on the method chosen from sklearn.neighbors import NearestNeighbors if kind == 'regular': # Regular smote does not look for samples in danger, instead it # creates synthetic samples directly from the k-th nearest # neighbours without filtering self.nearest_neighbour_ = NearestNeighbors(n_neighbors=k + 1) else: # Borderline1, 2 and SVM variations of smote must first look for # samples that could be considered noise and samples that live # near the boundary between the classes. Therefore, before # creating synthetic samples from the k-th nns, it first looks # for m nearest neighbors to decide whether or not a sample is # noise or near the boundary. self.nearest_neighbour_ = NearestNeighbors(n_neighbors=m + 1) # --- Nearest Neighbours for noise and boundary (in danger) # Before creating synthetic samples we must first decide if # a given entry is noise or in danger. We use m nns in this step self.m = m # --- SVM smote # Unlike the borderline variations, the SVM variation uses the support # vectors to decide which samples are in danger (near the boundary). # Additionally it also introduces extrapolation for samples that are # considered safe (far from boundary) and interpolation for samples # in danger (near the boundary). The level of extrapolation is # controlled by the out_step. if kind == 'svm': # As usual, use a scikit-learn object from sklearn.svm import SVC # Store extrapolation size self.out_step = out_step # Store SVM object with any parameters self.svm_ = SVC(**kwargs) def resample(self): """ Main method of all children classes. 
:return: Over-sampled data set. """ # Start by separating minority class features and target values. minx = self.x[self.y == self.minc] miny = self.y[self.y == self.minc] # If regular SMOTE is to be performed if self.kind == 'regular': # Print if verbose is true if self.verbose: print("Finding the %i nearest neighbours..." % self.k, end="") # Look for k-th nearest neighbours, excluding, of course, the # point itself. self.nearest_neighbour_.fit(minx) # Matrix with k-th nearest neighbours indexes for each minority # element. nns = self.nearest_neighbour_.kneighbors(minx, return_distance=False)[:, 1:] # Print status if verbose is true if self.verbose: print("done!") print("Creating synthetic samples...", end="") # --- Generating synthetic samples # Use the static method make_samples to generate minority samples # TODO: refactor this call sx, sy = self.make_samples(x=minx, nn_data=minx, y_type=self.minc, nn_num=nns, n_samples=int(self.ratio * len(miny)), step_size=1.0, random_state=self.rs, verbose=self.verbose) if self.verbose: print("done!") # Concatenate the newly generated samples to the original data set ret_x = concatenate((self.x, sx), axis=0) ret_y = concatenate((self.y, sy), axis=0) return ret_x, ret_y if (self.kind == 'borderline1') or (self.kind == 'borderline2'): if self.verbose: print("Finding the %i nearest neighbours..." % self.m, end="") # Find the NNs for all samples in the data set. self.nearest_neighbour_.fit(self.x) if self.verbose: print("done!") # Boolean array with True for minority samples in danger danger_index = [self.in_danger(x, self.y, self.m, miny[0], self.nearest_neighbour_) for x in minx] # Turn into numpy array danger_index = asarray(danger_index) # If all minority samples are safe, return the original data set. if not any(danger_index): if self.verbose: print('There are no samples in danger. No borderline ' 'synthetic samples created.') # All are safe, nothing to be done here. return self.x, self.y # If we got here, it is because some samples are in danger; we need # to find the NNs among the minority class to create the new # synthetic samples. # # We start by changing the number of NNs to consider from m + 1 # to k + 1 self.nearest_neighbour_.set_params(**{'n_neighbors': self.k + 1}) self.nearest_neighbour_.fit(minx) # Indexes of the k NNs of each sample in danger nns = self.nearest_neighbour_.kneighbors(minx[danger_index], return_distance=False)[:, 1:] # B1 and B2 types diverge here if self.kind == 'borderline1': # Create synthetic samples for borderline points. sx, sy = self.make_samples(minx[danger_index], minx, miny[0], nns, int(self.ratio * len(miny)), random_state=self.rs, verbose=self.verbose) # Concatenate the newly generated samples to the original data set ret_x = concatenate((self.x, sx), axis=0) ret_y = concatenate((self.y, sy), axis=0) return ret_x, ret_y else: # Split the number of synthetic samples between only minority # (type 1), or minority and majority (with reduced step size) # (type 2). 
np.random.seed(self.rs) # The fraction is sampled from a beta distribution centered # around 0.5 with variance ~0.01 fractions = betavariate(alpha=10, beta=10) # Only minority sx1, sy1 = self.make_samples(minx[danger_index], minx, self.minc, nns, fractions * (int(self.ratio * len(miny)) + 1), step_size=1, random_state=self.rs, verbose=self.verbose) # Only majority with smaller step size sx2, sy2 = self.make_samples(minx[danger_index], self.x[self.y != self.minc], self.minc, nns, (1 - fractions) * int(self.ratio * len(miny)), step_size=0.5, random_state=self.rs, verbose=self.verbose) # Concatenate the newly generated samples to the original data set ret_x = np.concatenate((self.x, sx1, sx2), axis=0) ret_y = np.concatenate((self.y, sy1, sy2), axis=0) return ret_x, ret_y if self.kind == 'svm': # The SVM smote model fits a support vector machine classifier to # the data and uses the support vectors to provide a notion of # boundary, unlike regular smote, where such a notion relies on # the proportion of nearest neighbours belonging to each class. # Fit SVM to the full data self.svm_.fit(self.x, self.y) # Find the support vectors and their corresponding indexes support_index = self.svm_.support_[self.y[self.svm_.support_] == self.minc] support_vector = self.x[support_index] # First, find the nn of all the samples to identify samples in danger # and noisy ones if self.verbose: print("Finding the %i nearest neighbours..." % self.m, end="") # As usual, fit a nearest neighbour model to the data self.nearest_neighbour_.fit(self.x) if self.verbose: print("done!") # Now, get rid of noisy support vectors # Boolean array with True for noisy support vectors noise_bool = [] for x in support_vector: noise_bool.append(self.is_noise(x, self.y, self.minc, self.nearest_neighbour_)) # Turn into array noise_bool = asarray(noise_bool) # Remove noisy support vectors support_vector = support_vector[np.logical_not(noise_bool)] # Find support vectors that are in danger (interpolation) or not # (extrapolation) danger_bool = [self.in_danger(x, self.y, self.m, self.minc, self.nearest_neighbour_) for x in support_vector] # Turn into array danger_bool = asarray(danger_bool) # Safe support vectors are the complement of those in danger safety_bool = np.logical_not(danger_bool) if self.verbose: print("Out of {0} support vectors, {1} are noisy, " "{2} are in danger " "and {3} are safe.".format(support_vector.shape[0], noise_bool.sum().astype(int), danger_bool.sum().astype(int), safety_bool.sum().astype(int))) # Proceed to find support vectors NNs among the minority class print("Finding the %i nearest neighbours..." 
% self.k, end="") self.nearest_neighbour_.set_params(**{'n_neighbors': self.k + 1}) self.nearest_neighbour_.fit(minx) if self.verbose: print("done!") print("Creating synthetic samples...", end="") # Split the number of synthetic samples between interpolation and # extrapolation # The fraction are sampled from a beta distribution with mean # 0.5 and variance 0.01# np.random.seed(self.rs) fractions = betavariate(alpha=10, beta=10) # Interpolate samples in danger nns = self.nearest_neighbour_.kneighbors(support_vector[danger_bool], return_distance=False)[:, 1:] sx1, sy1 = self.make_samples(support_vector[danger_bool], minx, self.minc, nns, fractions * (int(self.ratio * len(minx)) + 1), step_size=1, random_state=self.rs, verbose=self.verbose) # Extrapolate safe samples nns = self.nearest_neighbour_.kneighbors(support_vector[safety_bool], return_distance=False)[:, 1:] sx2, sy2 = self.make_samples(support_vector[safety_bool], minx, self.minc, nns, (1 - fractions) * int(self.ratio * len(minx)), step_size=-self.out_step, random_state=self.rs, verbose=self.verbose) if self.verbose: print("done!") # Concatenate the newly generated samples to the original data set ret_x = concatenate((self.x, sx1, sx2), axis=0) ret_y = concatenate((self.y, sy1, sy2), axis=0) return ret_x, ret_y
class KnnRecommender: def __init__(self, path_movies, path_ratings): self.path_movies = path_movies self.path_ratings = path_ratings self.movie_rating_thres = 0 self.user_rating_thres = 0 self.model = NearestNeighbors() def set_filter_params(self, movie_rating_thres, user_rating_thres): self.movie_rating_thres = movie_rating_thres self.user_rating_thres = user_rating_thres def set_model_params(self, n_neighbors, algorithm, metric, n_jobs=None): if n_jobs and (n_jobs > 1 or n_jobs == -1): os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp' self.model.set_params( **{ 'n_neighbors': n_neighbors, 'algorithm': algorithm, 'metric': metric, 'n_jobs': n_jobs }) def _prep_data(self): df_movies = pd.read_csv(os.path.join(self.path_movies), usecols=['movieId', 'title'], dtype={ 'movieId': 'int32', 'title': 'str' }) df_ratings = pd.read_csv(os.path.join(self.path_ratings), usecols=['userId', 'movieId', 'rating'], dtype={ 'userId': 'int32', 'movieId': 'int32', 'rating': 'float32' }) df_movies_cnt = pd.DataFrame(df_ratings.groupby('movieId').size(), columns=['count']) popular_movies = list( set( df_movies_cnt.query( 'count >= @self.movie_rating_thres').index)) # noqa movies_filter = df_ratings.movieId.isin(popular_movies).values df_users_cnt = pd.DataFrame(df_ratings.groupby('userId').size(), columns=['count']) active_users = list( set(df_users_cnt.query( 'count >= @self.user_rating_thres').index)) # noqa users_filter = df_ratings.userId.isin(active_users).values df_ratings_filtered = df_ratings[movies_filter & users_filter] movie_user_mat = df_ratings_filtered.pivot(index='movieId', columns='userId', values='rating').fillna(0) print(movie_user_mat) hashmap = { movie: i for i, movie in enumerate( list( df_movies.set_index('movieId').loc[ movie_user_mat.index].title)) # noqa } print("hash->", hashmap) movie_user_mat_sparse = csr_matrix(movie_user_mat.values) del df_movies, df_movies_cnt, df_users_cnt del df_ratings, df_ratings_filtered, movie_user_mat gc.collect() return movie_user_mat_sparse, hashmap def _fuzzy_matching(self, hashmap, fav_movie): match_tuple = [] # get match for title, idx in hashmap.items(): ratio = fuzz.ratio(title.lower(), fav_movie.lower()) if ratio >= 60: match_tuple.append((title, idx, ratio)) # sort match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1] if not match_tuple: print('Oops! 
No match is found') else: print('Found possible matches in our database: ' '{0}\n'.format([x[0] for x in match_tuple])) return match_tuple[0][1] def _inference(self, model, data, hashmap, fav_movie, n_recommendations): # fit model.fit(data) # get input movie index print('You have input movie:', fav_movie) idx = self._fuzzy_matching(hashmap, fav_movie) print("-------->", idx, fav_movie) # inference print('Recommendation system start to make inference') print('......\n') t0 = time.time() distances, indices = model.kneighbors(data[idx], n_neighbors=n_recommendations + 1) # get list of raw idx of recommendations raw_recommends = \ sorted( list( zip( indices.squeeze().tolist(), distances.squeeze().tolist() ) ), key=lambda x: x[1] )[:0:-1] print('It took my system {:.2f}s to make inference \n\ '.format(time.time() - t0)) # return recommendation (movieId, distance) return raw_recommends def make_recommendations(self, fav_movie, n_recommendations): # get data movie_user_mat_sparse, hashmap = self._prep_data() # get recommendations raw_recommends = self._inference(self.model, movie_user_mat_sparse, hashmap, fav_movie, n_recommendations) # print results reverse_hashmap = {v: k for k, v in hashmap.items()} print('Recommendations for {}:'.format(fav_movie)) for i, (idx, dist) in enumerate(raw_recommends): print('{0}: {1}, with distance ' 'of {2}'.format(i + 1, reverse_hashmap[idx], dist))
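# What the raw_recommends slice in _inference does: after sorting the
# (index, distance) pairs by distance, [:0:-1] walks the list backwards and
# stops before element 0, dropping the query item itself (distance 0) and
# yielding the remaining neighbours from farthest to nearest.
pairs = sorted([(7, 0.0), (3, 0.4), (9, 0.1), (2, 0.8)], key=lambda x: x[1])
print(pairs[:0:-1])  # [(2, 0.8), (3, 0.4), (9, 0.1)] -- (7, 0.0) removed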
class kNN: def __init__(self, dataObj, kNeighbors, distMetric='minkowski', p=2, metric_params=None, n_jobs=None): # Data object with training/test data self.data = dataObj if kNeighbors == -1: self.useAllNeighbors = True else: self.useAllNeighbors = False # Create kNN classifier self.nn = NearestNeighbors(n_neighbors=kNeighbors, metric=distMetric, p=p, metric_params=metric_params, n_jobs=n_jobs) def fit(self, featType, nSplits=3, randState=1010101, applyTransform=False, customData=None): if featType == 'train': if customData is None: train, validation = self.data.getData('train') trainData = train[0] else: trainData = customData self.nn.fit(trainData) self.krank = self.nn.kneighbors(trainData)[1] return elif featType == 'test': query, gallery = self.data.getData('test') elif featType == 'validation': query, gallery = self.data.getData(featType) if self.useAllNeighbors: self.nn.set_params(n_neighbors=len(gallery[0])) if applyTransform: query[0] = matmul(query[0], self.U.T) gallery[0] = matmul(gallery[0], self.U.T) self.nn.fit(gallery[0]) self.krank = self.nn.kneighbors(query[0])[1] k = len(self.krank[0]) # Drop gallery entries that share both label and camera with the # query (same-camera matches do not count in re-identification) for i in range(len(self.krank)): qLab = query[1][i] qCam = query[2][i] labs = gallery[1][self.krank[i]] cams = gallery[2][self.krank[i]] correctPos = where( logical_not(logical_and(labs == qLab, cams == qCam))) self.krank[i] = append(self.krank[i][correctPos], array([-1] * (k - len(correctPos[0])))) maxEmpty = max(sum(self.krank == -1, axis=1)) self.krank = self.krank[:, 0:k - maxEmpty] def modParams(self, kNeighbors, distMetric='minkowski', p=2, metric_params=None, n_jobs=None): # (the defaults were previously attached to the wrong parameters) self.nn.set_params(n_neighbors=kNeighbors, metric=distMetric, p=p, metric_params=metric_params, n_jobs=n_jobs) def setTransform(self, A, isPickle=False): if isPickle: with open(A, 'rb') as f: A = pickle.load(f) self.U = cholesky(A, lower=False) def setTransMat(self, A, isPickle=False): if isPickle: with open(A, 'rb') as f: A = pickle.load(f) self.transMat = A def calcScore(self, rank, plot=False): self.rankAccs = zeros(rank) for i in range(len(self.krank)): for j in range(rank): matches = self.data.labelsGallery[ self.krank[i][0:j + 1]] == self.data.labelsQuery[i] positiveMatches = sum(matches) if positiveMatches > 0: self.rankAccs[j:rank] += 1 break self.rankAccs = self.rankAccs / len(self.krank) if plot: self.plotAccs(self.rankAccs) return self.rankAccs def calcMAP(self): self.rankMAp = 0 for i in range(len(self.krank)): self.rankMAp += self.calcAP(self.data.labelsQuery[i], self.data.labelsGallery[self.krank[i]]) self.rankMAp = self.rankMAp / len(self.krank) return self.rankMAp def plotAccs(self, rank): fig, axs = pyplot.subplots() axs.plot(rank) pyplot.show() def calcAP(self, trueLabel, neighbors): nNeighbors = len(neighbors) nMatches = sum(trueLabel == neighbors) precision = zeros(nNeighbors) recall = zeros(nNeighbors) if nMatches == 0: return 0 recallInc = 1 / nMatches nPoints = 11 interp = zeros(nPoints) for i in range(len(precision)): precision[i] = mean(trueLabel == neighbors[0:i + 1]) if i == 0: recall[i] = recallInc * (trueLabel == neighbors[i]) else: recall[i] = recall[i - 1] if trueLabel == neighbors[i]: recall[i] = recall[i] + 
recallInc #recall = matmul((trueLabel==neighbors)*recallInc, tril(ones((len(neighbors), len(neighbors))))) recall = around(recall, 10) for i in range(nPoints): idx = min(where(i * 0.1 <= recall)[0]) interp[i] = max(precision[idx:len(precision)]) return mean(interp) def kernelFit(self, featType, kernel, applyTransform=False): self.krank = [] if featType == 'train': pass elif featType == 'test': query, gallery = self.data.getData('test') if applyTransform: query[0] = matmul(query[0], self.U.T) gallery[0] = matmul(gallery[0], self.U.T) for i in range(len(query[0])): dist = kernel.transform(query[0][i].reshape(1, -1), gallery[0]) idx = argsort(dist, axis=0) #idx = argsort(dist, axis=0)[::-1] qLab = query[1][i] qCam = query[2][i] labs = gallery[1][idx] cams = gallery[2][idx] correctPos = where( logical_not(logical_and(labs == qLab, cams == qCam)).flatten()) self.krank.append( append( idx[correctPos], array([-1] * (len(gallery[0]) - len(correctPos[0]))))) self.krank = array(self.krank) maxEmpty = max(sum(self.krank == -1, axis=1)) self.krank = self.krank[:, 0:len(gallery[0]) - maxEmpty]
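# A toy check (a sketch, assuming numpy names are star-imported as in the
# class above) of calcAP's 11-point interpolated average precision; calcAP
# only touches its arguments, so __init__ can be skipped. One query with
# true label 1 and ranked neighbour labels [1, 0, 1] gives precision
# [1, 1/2, 2/3] at recall [0.5, 0.5, 1.0], hence
# AP = (6*1 + 5*(2/3)) / 11, roughly 0.848.
import numpy as np

knn = kNN.__new__(kNN)  # bypass __init__; calcAP is self-contained
print(knn.calcAP(1, np.array([1, 0, 1])))  # ~0.8485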
class KnnRecommender: """ This is an item-based collaborative filtering recommender with KNN implemented by sklearn """ def __init__(self, sparse_matrix_file, mapper_file): """ Recommender requires paths to pre-built data files Parameters ---------- sparse_matrix_file: str, path to the game-user sparse matrix (.npz) mapper_file: str, path to the pickled game-name-to-index mapper """ self.sparse_matrix_file = sparse_matrix_file self.mapper_file = mapper_file self.model = NearestNeighbors() def set_model_params(self, n_neighbors, algorithm, metric, n_jobs=None): """ set model params for sklearn.neighbors.NearestNeighbors Parameters ---------- n_neighbors: int, optional (default = 5) algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional metric: string or callable, default 'minkowski', or one of ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan'] n_jobs: int or None, optional (default=None) """ if n_jobs and (n_jobs > 1 or n_jobs == -1): os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp' self.model.set_params(**{ 'n_neighbors': n_neighbors, 'algorithm': algorithm, 'metric': metric, 'n_jobs': n_jobs}) def _prep_data(self): game_user_mat_sparse = scipy.sparse.load_npz(os.path.join(self.sparse_matrix_file)) with open(os.path.join(self.mapper_file), 'rb') as fp: hashmap = pickle.load(fp) return game_user_mat_sparse, hashmap def _fuzzy_matching(self, hashmap, game_name): """ return the closest match via fuzzy ratio. If no match found, return None Parameters ---------- hashmap: dict, map game name to index of the game in data game_name: str, name of user input game Return ------ index of the closest match """ match_tuple = [] # get match for name, idx in hashmap.items(): ratio = fuzz.ratio(name.lower(), game_name.lower()) if ratio >= 60: match_tuple.append((name, idx, ratio)) # sort match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1] if not match_tuple: print('Oops! 
No match is found') else: print('Found possible matches in our database: ' '{0}\n'.format([x[0] for x in match_tuple])) return match_tuple[0][1] def _inference(self, model, data, hashmap, game_name, n_recommendations): """ return top n similar game recommendations based on user's input game Parameters ---------- model: sklearn model, knn model data: game-user matrix hashmap: dict, map game name to index of the game in data game_name: str, name of user input game n_recommendations: int, top n recommendations Return ------ list of top n similar game recommendations """ # fit model.fit(data) # get input game index print('You have input game:', game_name) idx = self._fuzzy_matching(hashmap, game_name) # inference print('Recommendation system starts to make inference') print('......\n') t0 = time.time() distances, indices = model.kneighbors( data[idx], n_neighbors=n_recommendations+1) # get list of raw idx of recommendations raw_recommends = \ sorted( list( zip( indices.squeeze().tolist(), distances.squeeze().tolist() ) ), key=lambda x: x[1] )[:0:-1] print('It took my system {:.2f}s to make inference \n\ '.format(time.time() - t0)) # return recommendation (gameId, distance) return raw_recommends def make_recommendations(self, game_name, n_recommendations): """ make top n game recommendations Parameters ---------- game_name: str, name of user input game n_recommendations: int, top n recommendations """ # get data game_user_mat_sparse, hashmap = self._prep_data() # get recommendations raw_recommends = self._inference( self.model, game_user_mat_sparse, hashmap, game_name, n_recommendations) # print results reverse_hashmap = {v: k for k, v in hashmap.items()} print('Recommendations for {}:'.format(game_name)) for i, (idx, dist) in enumerate(raw_recommends): print('{0}: {1}'.format(i+1, reverse_hashmap[idx]))
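# _prep_data above expects a pre-built sparse matrix (.npz) plus a pickled
# name-to-row-index mapper; this sketch produces compatible files with
# made-up game data (the file names are hypothetical).
import pickle
import scipy.sparse
from scipy.sparse import csr_matrix

game_user_mat = csr_matrix([[5.0, 0.0], [0.0, 3.0]])  # 2 games x 2 users
scipy.sparse.save_npz('game_user_mat.npz', game_user_mat)
with open('game_mapper.pkl', 'wb') as fp:
    pickle.dump({'Portal': 0, 'Half-Life': 1}, fp)

rec = KnnRecommender('game_user_mat.npz', 'game_mapper.pkl')
rec.set_model_params(n_neighbors=1, algorithm='brute', metric='cosine',
                     n_jobs=1)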
import os
import time

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors


class recommender_knn:
    # Item-based CF recommender class with KNN implemented by sklearn
    def __init__(self, path_test, path_rat, mov_rat_thres, use_rat_thres,
                 n_nei, alg, met, job=None):
        self.path_test = path_test  # path to test data ((user, movie) pairs to rate)
        self.path_rat = path_rat  # path to ratings data
        self.mov_rat_thres = mov_rat_thres  # minimum ratings a movie must have
        self.use_rat_thres = use_rat_thres  # minimum ratings a user must have given
        self.k_nn = n_nei
        # get the NearestNeighbors model
        # n_neighbors: int,
        # algorithm: brute,
        # metric: (default 'minkowski') = 'cosine',
        # n_jobs: int or None
        self.model = NearestNeighbors()
        if job and (job > 1 or job == -1):
            os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
        self.model.set_params(**{
            'n_neighbors': self.k_nn,
            'algorithm': alg,
            'metric': met,
            'n_jobs': job})

    def preprocess_data(self):
        # set up the movie-user matrix:
        # read data from a .dat file and save as a pandas DataFrame
        df_test = pd.read_table(os.path.join(self.path_test),
                                delimiter=' ',
                                usecols=['userID', 'movieID'],
                                dtype={'userID': 'int32',
                                       'movieID': 'int32'})
        df_rat = pd.read_table(os.path.join(self.path_rat),
                               delimiter=' ',
                               usecols=['userID', 'movieID', 'rating'],
                               dtype={'userID': 'int32',
                                      'movieID': 'int32',
                                      'rating': 'float32'})
        # filter data
        df_mov_coun = pd.DataFrame(df_rat.groupby('movieID').size(),
                                   columns=['count'])
        # get the list of movies which received enough ratings
        pop_mov = list(
            set(df_mov_coun.query('count >= @self.mov_rat_thres').index))  # noqa
        mov_fil = df_rat.movieID.isin(pop_mov).values
        # print out all columns of a pandas DataFrame
        pd.set_option('display.max_columns', None)
        # print out all elements of an array
        np.set_printoptions(threshold=np.inf)
        # only consider users who rated more movies than the threshold
        df_use_coun = pd.DataFrame(df_rat.groupby('userID').size(),
                                   columns=['count'])
        act_use = list(
            set(df_use_coun.query('count >= @self.use_rat_thres').index))
        use_fil = df_rat.userID.isin(act_use).values
        # drop the inactive users and unpopular movies
        df_rat_fil = df_rat[mov_fil & use_fil]
        # create the movie-user matrix with the pivot function
        mov_use_mat = df_rat_fil.pivot(index='movieID',
                                       columns='userID',
                                       values='rating').fillna(0)
        # transform to a sparse matrix
        mov_use_spa_mat = csr_matrix(mov_use_mat.values)
        # clean up
        del df_rat, df_rat_fil
        del df_mov_coun, df_use_coun
        return mov_use_spa_mat, mov_use_mat, df_test

    def _rating(self, model, data, sparse_data, rating_movies, n_recom):
        # Predict a rating score for every (user, movie) pair in rating_movies.
        # model: sklearn knn model, data: movie-user matrix (DataFrame),
        # sparse_data: the same matrix as a sparse matrix,
        # rating_movies: DataFrame of (userID, movieID) pairs to rate,
        # n_recom: number of neighbours used for each prediction
        # Return: list of predicted rating scores
        # fit
        model.fit(sparse_data)
        print('You have a list of movies that need to be rated')
        t0 = time.time()
        print('The rating score for movies and user:')
        # run for all pairs in the test file
        score_list = []
        for i in range(len(rating_movies)):
            print(i)
            movieID = rating_movies.loc[i][1]
            userID = rating_movies.loc[i][0]
            # get the row position of the movie in the matrix
            idx = next(iter(np.where(data.index == movieID)[0]), 'not matched')
            if idx == 'not matched':
                score = 0.0
                score_list.append(score)
            else:
                # the first returned neighbour is the point itself
                distances, indices = model.kneighbors(
                    sparse_data[idx], n_neighbors=self.k_nn + 1)
                # calculate the score for the item
                count = 0
                mean_list = np.empty(self.k_nn)  # rating mean of each neighbour item
                mean_rate_item = 0.0  # rating mean of the current item
                user_item_rate = np.empty(self.k_nn)  # the user's rating of each neighbour item
                nomin = np.empty(self.k_nn)
                denom = np.empty(self.k_nn)
                # accumulate the elements of the scoring formula
                for index in indices[0]:
                    if count > 0:
                        mean_list[count - 1] = sparse_data.mean(
                            axis=1)[index].squeeze().squeeze()
                        user_item_rate[count - 1] = data.loc[
                            data.iloc[[index]].index[0], userID]
                        # the denominator: the weight of this neighbour
                        denom[count - 1] = distances.squeeze()[count]
                        # the numerator: weight times the mean-centred rating
                        nomin[count - 1] = distances.squeeze()[count] * (
                            user_item_rate[count - 1] - mean_list[count - 1])
                    else:
                        mean_rate_item = sparse_data.mean(
                            axis=1)[index].squeeze().squeeze()
                    count += 1
                # calculate the score and clip it to the valid rating range
                score1 = nomin.sum() / denom.sum() + mean_rate_item[0, 0]
                if score1 < 0:
                    score_list.append(0)
                elif score1 > 5:
                    score_list.append(5)
                else:
                    score_list.append(score1)
        print('It took {:.2f}s to make inference\n'.format(time.time() - t0))
        return score_list

    def make_predictions(self):
        # predict a rating for every (user, movie) pair in the test file
        # get data
        mov_use_spa_mat, data, movies_list_rating = self.preprocess_data()
        # get predictions
        score_list = self._rating(self.model, data, mov_use_spa_mat,
                                  movies_list_rating, self.k_nn)
        return score_list
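# A minimal usage sketch for recommender_knn; the .dat file names, the
# thresholds and the neighbour count below are hypothetical placeholders.
# The files are expected to be space-delimited with userID/movieID
# (and rating) header columns, matching what preprocess_data reads.
if __name__ == '__main__':
    rec = recommender_knn('test.dat', 'ratings.dat',
                          mov_rat_thres=50, use_rat_thres=50,
                          n_nei=20, alg='brute', met='cosine', job=-1)
    scores = rec.make_predictions()
    # one predicted score per (user, movie) pair in test.dat
    print(scores[:10])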