def resample(self):
        from sklearn.neighbors import NearestNeighbors

        # Start with the minority class
        minx = self.x[self.y == self.minc]
        miny = self.y[self.y == self.minc]

        # Find the NNs for all samples in the data set.
        print("Finding the %i nearest neighbours..." % self.m, end = "")
        NN = NearestNeighbors(n_neighbors = self.m + 1)
        NN.fit(self.x)

        print("done!")

        # Boolean array with True for minority samples in danger
        index = asarray([in_danger(x, self.y, self.m, miny[0], NN) for x in minx])

        # If all minority samples are safe, return the original data set.
        if not any(index):
            print('There are no samples in danger. No borderline synthetic samples created.')
            return self.x, self.y

        # Find the NNs among the minority class
        NN.set_params(**{'n_neighbors' : self.k + 1})
        NN.fit(minx)
        nns = NN.kneighbors(minx[index], return_distance=False)[:, 1:]

        # Create synthetic samples for borderline points.
        sx, sy = make_samples(minx[index], minx, miny[0], nns, int(self.ratio * len(miny)), random_state=self.rs)

        # Concatenate the newly generated samples to the original data set
        ret_x = concatenate((self.x, sx), axis = 0)
        ret_y = concatenate((self.y, sy), axis = 0)

        return ret_x, ret_y
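The resample() above leans on one NearestNeighbors idiom: fit an m-NN model on the whole data set to flag borderline minority samples, then reuse the same estimator via set_params to run k-NN on the minority class only. A minimal standalone sketch of that idiom (the array sizes and the m/k values below are made up for illustration):

import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
X = rng.rand(30, 2)                 # stand-in for self.x
X_min = X[:12]                      # stand-in for the minority samples minx
m, k = 10, 5

nn = NearestNeighbors(n_neighbors=m + 1)
nn.fit(X)                           # m-NN over the whole data set

nn.set_params(n_neighbors=k + 1)    # same estimator, switched to k-NN ...
nn.fit(X_min)                       # ... fitted on the minority class only
# drop column 0: each query point is its own nearest neighbour
nns = nn.kneighbors(X_min[:5], return_distance=False)[:, 1:]
print(nns.shape)                    # (5, 5)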
def _compute_mi_cc(x, y, n_neighbors):
    """Compute mutual information between two continuous variables.

    I lifted this from SKLEARN

    Parameters
    ----------
    x, y : ndarray, shape (n_samples,)
        Samples of two continuous random variables, must have an identical
        shape.

    n_neighbors : int
        Number of nearest neighbors to search for each point, see [1]_.

    Returns
    -------
    mi : float
        Estimated mutual information. If the estimate turns out to be negative,
        it is replaced by 0.

    Notes
    -----
    True mutual information can't be negative. If its estimate by a numerical
    method is negative, it means (providing the method is adequate) that the
    mutual information is close to 0 and replacing it by 0 is a reasonable
    strategy. """

    n_samples = x.size

    x = x.reshape((-1, 1))
    y = y.reshape((-1, 1))
    xy = np.hstack((x, y))

    # Here we rely on NearestNeighbors to select the fastest algorithm.
    nn = NearestNeighbors(metric='chebyshev', n_neighbors=n_neighbors)

    nn.fit(xy)
    radius = nn.kneighbors()[0]
    radius = np.nextafter(radius[:, -1], 0)

    # Algorithm is selected explicitly to allow passing an array as radius
    # later (not all algorithms support this).
    nn.set_params(algorithm='kd_tree')

    nn.fit(x)
    ind = nn.radius_neighbors(radius=radius, return_distance=False)
    nx = np.array([i.size for i in ind])

    nn.fit(y)
    ind = nn.radius_neighbors(radius=radius, return_distance=False)
    ny = np.array([i.size for i in ind])

    mi = (digamma(n_samples) + digamma(n_neighbors) -
          np.mean(digamma(nx + 1)) - np.mean(digamma(ny + 1)))

    return max(0, mi)
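A quick sanity check for the estimator above on synthetic data (a hedged sketch: it assumes numpy as np, scipy.special.digamma and NearestNeighbors are already imported in this module, as the function itself requires):

rng = np.random.RandomState(0)
a = rng.normal(size=1000)
b = a + 0.1 * rng.normal(size=1000)          # strongly dependent on a
u = rng.normal(size=1000)                    # independent of a

print(_compute_mi_cc(a, b, n_neighbors=3))   # clearly positive
print(_compute_mi_cc(a, u, n_neighbors=3))   # close to 0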
Example #4
def model_param(n_neighbors, algorithm, metric, n_jobs=None):
    model = NearestNeighbors()
    if n_jobs and (n_jobs > 1 or n_jobs == -1):
        os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
    model.set_params(
        **{
            'n_neighbors': n_neighbors,
            'algorithm': algorithm,
            'metric': metric,
            'n_jobs': n_jobs
        })
    return model
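A hedged usage sketch for the helper above: build a brute-force cosine model and query it on a small random matrix (the data and parameter values are illustrative; os and NearestNeighbors are assumed to be imported, as the helper requires):

import numpy as np

X = np.random.RandomState(42).rand(50, 8)
knn = model_param(n_neighbors=5, algorithm='brute', metric='cosine', n_jobs=-1)
knn.fit(X)
distances, indices = knn.kneighbors(X[:1])   # neighbours of the first row
print(indices)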
Example #5
    def resample(self):
        from sklearn.neighbors import NearestNeighbors

        # Start with the minority class
        minx = self.x[self.y == self.minc]
        miny = self.y[self.y == self.minc]

        # Find the NNs for all samples in the data set.
        print("Finding the %i nearest neighbours..." % self.m, end="")
        NN = NearestNeighbors(n_neighbors=self.m + 1)
        NN.fit(self.x)

        print("done!")

        # Boolean array with True for minority samples in danger
        index = asarray(
            [in_danger(x, self.y, self.m, self.minc, NN) for x in minx])

        # If all minority samples are safe, return the original data set.
        if not any(index):
            print(
                'There are no samples in danger. No borderline synthetic samples created.'
            )
            return self.x, self.y

        # Find the NNs among the minority class
        NN.set_params(**{'n_neighbors': self.k + 1})
        NN.fit(minx)
        nns = NN.kneighbors(minx[index], return_distance=False)[:, 1:]

        # Split the number of synthetic samples between only minority (type 1), or
        # minority and majority (with reduced step size) (type 2).
        Pyseed(self.rs)
        fractions = min(max(gauss(0.5, 0.1), 0), 1)

        # Only minority
        sx1, sy1 = make_samples(minx[index], minx, self.minc, nns,\
                                fractions * (int(self.ratio * len(miny)) + 1),\
                                step_size=1,\
                                random_state=self.rs)

        # Only majority with smaller step size
        sx2, sy2 = make_samples(minx[index], self.x[self.y != self.minc], self.minc, nns,\
                                (1 - fractions) * int(self.ratio * len(miny)),\
                                step_size=0.5,\
                                random_state=self.rs)

        # Concatenate the newly generated samples to the original data set
        ret_x = concatenate((self.x, sx1, sx2), axis=0)
        ret_y = concatenate((self.y, sy1, sy2), axis=0)

        return ret_x, ret_y
Example #6
    def load():
        #API calls
        request = requests.get(
            'http://api.beerless.be/api/tastingprofiles/averages?access_token=smCLVeBjK79ywuPJFRI599qiu1JFFgKVJrVCq9mtzV0Nus5j5IYB9B8B9uthSTc6'
        )
        x = request.json()
        test = pd.DataFrame(x)
        df_tastingprofiles = test[[
            'beerId', 'malty', 'sweet', 'sour', 'hoppy', 'bitter', 'fruity'
        ]]

        # pivot and create tastingprofile matrix
        df_tastingprofile_features = df_tastingprofiles.set_index('beerId')

        #Configuring Google Cloud storage
        client = storage.Client()
        bucket = client.get_bucket("beerless-scripts-1.appspot.com")
        beerIDPickle = bucket.blob("beerID.pickle")

        # Upload pickle dump
        beerIDPickle.upload_from_string(
            pickle.dumps(df_tastingprofiles, protocol=pickle.HIGHEST_PROTOCOL))

        #Creating matrix
        mat_tastingprofile_features = csr_matrix(
            df_tastingprofile_features.values)

        #Saving data
        dataPickle = bucket.blob("data.pickle")
        dataPickle.upload_from_string(
            pickle.dumps(mat_tastingprofile_features,
                         protocol=pickle.HIGHEST_PROTOCOL))

        #creating models
        model = NearestNeighbors()

        #adding parameters to model
        model.set_params(n_neighbors=20,
                         algorithm='brute',
                         metric='cosine',
                         n_jobs=-1)

        # fit
        model.fit(mat_tastingprofile_features)

        #saving model to file
        modelPickle = bucket.blob("model.pickle")
        modelPickle.upload_from_string(
            pickle.dumps(model, protocol=pickle.HIGHEST_PROTOCOL))

        # clean up
        del df_tastingprofiles, df_tastingprofile_features
    def resample(self):
        from sklearn.neighbors import NearestNeighbors

        # Start with the minority class
        minx = self.x[self.y == self.minc]
        miny = self.y[self.y == self.minc]

        # Find the NNs for all samples in the data set.
        print("Finding the %i nearest neighbours..." % self.m, end = "")
        NN = NearestNeighbors(n_neighbors = self.m + 1)
        NN.fit(self.x)

        print("done!")

        # Boolean array with True for minority samples in danger
        index = asarray([in_danger(x, self.y, self.m, self.minc, NN) for x in minx])

        # If all minority samples are safe, return the original data set.
        if not any(index):
            print('There are no samples in danger. No borderline synthetic samples created.')
            return self.x, self.y

        # Find the NNs among the minority class
        NN.set_params(**{'n_neighbors' : self.k + 1})
        NN.fit(minx)
        nns = NN.kneighbors(minx[index], return_distance=False)[:, 1:]


        # Split the number of synthetic samples between only minority (type 1), or
        # minority and majority (with reduced step size) (type 2).
        Pyseed(self.rs)
        fractions = min(max(gauss(0.5, 0.1), 0), 1)

        # Only minority
        sx1, sy1 = make_samples(minx[index], minx, self.minc, nns,\
                                fractions * (int(self.ratio * len(miny)) + 1),\
                                step_size=1,\
                                random_state=self.rs)

        # Only majority with smaller step size
        sx2, sy2 = make_samples(minx[index], self.x[self.y != self.minc], self.minc, nns,\
                                (1 - fractions) * int(self.ratio * len(miny)),\
                                step_size=0.5,\
                                random_state=self.rs)

        # Concatenate the newly generated samples to the original data set
        ret_x = concatenate((self.x, sx1, sx2), axis = 0)
        ret_y = concatenate((self.y, sy1, sy2), axis = 0)

        return ret_x, ret_y
def runprogram():
    model = NearestNeighbors()
    model.set_params(**{
        'n_neighbors': 20,
        'algorithm': 'brute',
        'metric': 'cosine',
        'n_jobs': -1
    })
    p = str(input())
    while p != '-1':
        P_U_SP, H_MAP = preprocess()
        ans = recSystem(p, model, P_U_SP, H_MAP)
        for s in ans:
            print(s)
            sleep(0.6)
        print('\n')
        p = str(input())
Example #9
    def _instantiate_nearest_neighbors_object(self):
        backend = self.knn_backend
        if backend == "sklearn":
            backend_instance = NearestNeighbors(algorithm="auto")
        elif callable(backend):
            backend_instance = backend()
            self.metric = backend_instance.metric
        elif hasattr(backend, "fit") and hasattr(backend, "kneighbors"):
            backend_instance = sk_clone(backend)
            self.metric = backend_instance.metric
        else:
            raise NotImplementedError(
                "`knn_backend` must be either a NearestNeighbors-like object,"
                " a callable returning such an object, or the string \"sklearn\""
            )
        backend_instance.set_params(**self._get_metric_dict())
        return backend_instance
Example #10
def networkBuildKnn(X_net, Y_net, knn, labels=False):
    g = nx.Graph()
    lnNet = len(X_net)
    g.graph["lnNet"] = lnNet
    g.graph["classNames"] = list(set(Y_net))
    for index, instance in enumerate(X_net):
        g.add_node(str(index),
                   value=instance,
                   typeNode='net',
                   label=Y_net[index])
    values = X_net

    if (isinstance(values[0], (int, float, str))):
        values = [e[0] for e in values]

    nbrs = NearestNeighbors(n_neighbors=knn + 1, metric='euclidean')
    nbrs.fit(values)

    distances, indices = nbrs.kneighbors(values)
    indices = indices[:, 1:]
    distances = distances[:, 1:]
    eRadius = np.quantile(distances, 0.5)
    nbrs.set_params(radius=eRadius)

    for indiceNode, indicesNode in enumerate(indices):
        for tmpi, indice in enumerate(indicesNode):
            if (g.nodes()[str(indice)]["label"]
                    == g.nodes()[str(indiceNode)]["label"] or not labels):
                g.add_edge(str(indice),
                           str(indiceNode),
                           weight=distances[indiceNode][tmpi])

    distances, indices = nbrs.radius_neighbors([instance])
    for indiceNode, indicesNode in enumerate(indices):
        for tmpi, indice in enumerate(indicesNode):
            if (not str(indice) == str(indiceNode)):
                if (g.nodes()[str(indice)]["label"]
                        == g.nodes()[str(indiceNode)]["label"] or not labels):
                    g.add_edge(str(indice),
                               str(indiceNode),
                               weight=distances[indiceNode][tmpi])
    g.graph["index"] = lnNet
    return g, nbrs
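A minimal driver for networkBuildKnn on toy data (a sketch assuming numpy, networkx and NearestNeighbors are imported as the function requires; the features and labels below are made up):

import numpy as np

rng = np.random.RandomState(0)
X_toy = rng.rand(20, 2).tolist()    # 20 two-dimensional points
y_toy = [0] * 10 + [1] * 10         # two balanced classes

g, nbrs = networkBuildKnn(X_toy, y_toy, knn=3, labels=True)
print(g.number_of_nodes(), g.number_of_edges())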
def mi_3(x, y, z, n_neighbors):
    """Compute mutual information between three continuous variables.

    I lifted this from SKLEARN """

    n_samples = x.size

    x = x.reshape((-1, 1))
    y = y.reshape((-1, 1))
    z = z.reshape((-1, 1))
    xyz = np.hstack((x, y, z))

    # Here we rely on NearestNeighbors to select the fastest algorithm.
    nn = NearestNeighbors(metric='chebyshev', n_neighbors=n_neighbors)

    nn.fit(xyz)
    radius = nn.kneighbors()[0]
    radius = np.nextafter(radius[:, -1], 0)

    # Algorithm is selected explicitly to allow passing an array as radius
    # later (not all algorithms support this).
    nn.set_params(algorithm='kd_tree')

    nn.fit(x)
    ind = nn.radius_neighbors(radius=radius, return_distance=False)
    nx = np.array([i.size for i in ind])

    nn.fit(y)
    ind = nn.radius_neighbors(radius=radius, return_distance=False)
    ny = np.array([i.size for i in ind])

    nn.fit(z)
    ind = nn.radius_neighbors(radius=radius, return_distance=False)
    nz = np.array([i.size for i in ind])

    mi = (digamma(n_samples) + digamma(n_neighbors) -
          np.mean(digamma(nx + 1)) - np.mean(digamma(ny + 1)) -
          np.mean(digamma(nz + 1)))

    return max(0, mi)
def _compute_mi_cd(c, d, n_neighbors):

    n_samples = c.shape[0]
    c = c.reshape((-1, 1))

    radius = np.empty(n_samples)
    label_counts = np.empty(n_samples)
    k_all = np.empty(n_samples)
    nn = NearestNeighbors()
    for label in np.unique(d):
        mask = d == label
        count = np.sum(mask)
        if count > 1:
            k = min(n_neighbors, count - 1)
            nn.set_params(n_neighbors=k)
            nn.fit(c[mask])
            r = nn.kneighbors()[0]
            radius[mask] = np.nextafter(r[:, -1], 0)
            k_all[mask] = k
        label_counts[mask] = count

    # Ignore points with unique labels.
    mask = label_counts > 1
    n_samples = np.sum(mask)
    label_counts = label_counts[mask]
    k_all = k_all[mask]
    c = c[mask]
    radius = radius[mask]

    nn.set_params(algorithm='kd_tree')
    nn.fit(c)
    ind = nn.radius_neighbors(radius=radius, return_distance=False)
    m_all = np.array([i.size for i in ind])

    mi = (digamma(n_samples) + np.mean(digamma(k_all)) -
          np.mean(digamma(label_counts)) - np.mean(digamma(m_all + 1)))

    return max(0, mi)
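A quick check of the continuous-discrete estimator above (a hedged sketch assuming numpy as np, scipy.special.digamma and NearestNeighbors are imported in this module):

rng = np.random.RandomState(0)
d = rng.randint(0, 2, size=600)              # binary labels
c_dep = rng.normal(size=600) + 2.0 * d       # shifted by the label, so dependent
c_ind = rng.normal(size=600)                 # independent of the labels

print(_compute_mi_cd(c_dep, d, n_neighbors=3))   # clearly positive
print(_compute_mi_cd(c_ind, d, n_neighbors=3))   # close to 0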
    def resample(self):
        from sklearn.svm import SVC
        from sklearn.neighbors import NearestNeighbors

        svc = SVC()
        svc.set_params(**self.svm_args)

        # Fit SVM and find the support vectors
        svc.fit(self.x, self.y)
        support_index = svc.support_[self.y[svc.support_] == self.minc]
        support_vector = self.x[support_index]

        # Start with the minority class
        minx = self.x[self.y == self.minc]

        # First, find the NN of all the samples to identify samples in danger and noisy ones
        print("Finding the %i nearest neighbours..." % self.m, end = "")
        NN = NearestNeighbors(n_neighbors = self.m + 1)
        NN.fit(self.x)
        print("done!")

        # Now, get rid of noisy support vectors

        # Boolean array with True for noisy support vectors
        noise_bool = asarray([is_noise(x, self.y, self.m, self.minc, NN) for x in support_vector])

        # Remove noisy support vectors
        support_vector = support_vector[logical_not(noise_bool)]

        # Find support vectors that are in danger (interpolation) or not (extrapolation)
        danger_bool = asarray([in_danger(x, self.y, self.m, self.minc, NN) for x in support_vector])
        safety_bool = logical_not(danger_bool)

        print_stats = (len(support_vector), nsum(noise_bool), nsum(danger_bool), nsum(safety_bool))
        print("Out of %i support vectors, %i are noisy, %i are in danger and %i are safe." % print_stats)

        # Proceed to find support vectors NNs among the minority class
        print("Finding the %i nearest neighbours..." % self.k, end = "")
        NN.set_params(**{'n_neighbors' : self.k + 1})
        NN.fit(minx)
        print("done!")


        print("Creating synthetic samples...", end = "")
        # Split the number of synthetic samples between interpolation and extrapolation
        Pyseed(self.rs)
        fractions = min(max(gauss(0.5, 0.1), 0), 1)

        # Interpolate samples in danger
        nns = NN.kneighbors(support_vector[danger_bool], return_distance=False)[:, 1:]

        sx1, sy1 = make_samples(support_vector[danger_bool], minx, self.minc, nns,\
                                fractions * (int(self.ratio * len(minx)) + 1),\
                                step_size=1,\
                                random_state=self.rs)

        # Extrapolate safe samples
        nns = NN.kneighbors(support_vector[safety_bool], return_distance=False)[:, 1:]

        sx2, sy2 = make_samples(support_vector[safety_bool], minx, self.minc, nns,\
                                (1 - fractions) * int(self.ratio * len(minx)),\
                                step_size=-self.out_step,\
                                random_state=self.rs)

        print("done!")

        # Concatenate the newly generated samples to the original data set
        ret_x = concatenate((self.x, sx1, sx2), axis=0)
        ret_y = concatenate((self.y, sy1, sy2), axis=0)

        return ret_x, ret_y
class KnnRecommender:
    """
    This is an item-based collaborative filtering recommender with
    KNN implemented by sklearn
    """
    def __init__(self, path_movies, path_ratings):
        """
        Recommender requires path to data: movies data and ratings data

        Parameters
        ----------
        path_movies: str, movies data file path

        path_ratings: str, ratings data file path
        """
        self.path_movies = path_movies
        self.path_ratings = path_ratings
        self.movie_rating_thres = 0
        self.user_rating_thres = 0
        self.model = NearestNeighbors()

    def set_filter_params(self, movie_rating_thres, user_rating_thres):
        """
        set rating frequency threshold to filter less-known movies and
        less active users

        Parameters
        ----------
        movie_rating_thres: int, minimum number of ratings a movie has received

        user_rating_thres: int, minimum number of ratings a user gives
        """
        self.movie_rating_thres = movie_rating_thres
        self.user_rating_thres = user_rating_thres

    def set_model_params(self, n_neighbors, algorithm, metric, n_jobs=None):
        """
        set model params for sklearn.neighbors.NearestNeighbors

        Parameters
        ----------
        n_neighbors: int, optional (default = 5)

        algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional

        metric: string or callable, default 'minkowski', or one of
            ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']

        n_jobs: int or None, optional (default=None)
        """
        if n_jobs and (n_jobs > 1 or n_jobs == -1):
            os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
        self.model.set_params(
            **{
                'n_neighbors': n_neighbors,
                'algorithm': algorithm,
                'metric': metric,
                'n_jobs': n_jobs
            })

    def _prep_data(self):
        """
        prepare data for recommender

        1. movie-user scipy sparse matrix
        2. hashmap of movie to row index in movie-user scipy sparse matrix
        """
        # read data
        df_movies = pd.read_csv(os.path.join(self.path_movies),
                                usecols=['movieId', 'title'],
                                dtype={
                                    'movieId': 'int32',
                                    'title': 'str'
                                })
        df_ratings = pd.read_csv(os.path.join(self.path_ratings),
                                 usecols=['userId', 'movieId', 'rating'],
                                 dtype={
                                     'userId': 'int32',
                                     'movieId': 'int32',
                                     'rating': 'float32'
                                 })
        # filter data
        df_movies_cnt = pd.DataFrame(df_ratings.groupby('movieId').size(),
                                     columns=['count'])
        popular_movies = list(
            set(
                df_movies_cnt.query(
                    'count >= @self.movie_rating_thres').index))  # noqa
        movies_filter = df_ratings.movieId.isin(popular_movies).values

        df_users_cnt = pd.DataFrame(df_ratings.groupby('userId').size(),
                                    columns=['count'])
        active_users = list(
            set(df_users_cnt.query(
                'count >= @self.user_rating_thres').index))  # noqa
        users_filter = df_ratings.userId.isin(active_users).values

        df_ratings_filtered = df_ratings[movies_filter & users_filter]

        # pivot and create movie-user matrix
        movie_user_mat = df_ratings_filtered.pivot(index='movieId',
                                                   columns='userId',
                                                   values='rating').fillna(0)
        # create mapper from movie title to index
        hashmap = {
            movie: i
            for i, movie in enumerate(
                list(
                    df_movies.set_index('movieId').loc[
                        movie_user_mat.index].title))  # noqa
        }
        # transform matrix to scipy sparse matrix
        movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

        # clean up
        del df_movies, df_movies_cnt, df_users_cnt
        del df_ratings, df_ratings_filtered, movie_user_mat
        gc.collect()
        return movie_user_mat_sparse, hashmap

    # Check whether the movie being searched for exists in the retrieval library
    def _fuzzy_matching(self, hashmap, fav_movie):
        """
        return the closest match via fuzzy ratio.
        If no match found, return None

        Parameters
        ----------
        hashmap: dict, map movie title name to index of the movie in data

        fav_movie: str, name of user input movie

        Return
        ------
        index of the closest match
        """
        match_tuple = []
        # get match
        for title, idx in hashmap.items():
            ratio = fuzz.ratio(title.lower(), fav_movie.lower())
            if ratio >= 60:
                match_tuple.append((title, idx, ratio))
        # sort
        match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
        if not match_tuple:
            print('Oops! No match is found')
        else:
            print('Found possible matches in our database: '
                  '{0}\n'.format([x[0] for x in match_tuple]))
            return match_tuple[0][1]

    def _inference(self, model, data, hashmap, fav_movie, n_recommendations):
        """
        return top n similar movie recommendations based on user's input movie

        Parameters
        ----------
        model: sklearn model, knn model

        data: movie-user matrix

        hashmap: dict, map movie title name to index of the movie in data

        fav_movie: str, name of user input movie

        n_recommendations: int, top n recommendations

        Return
        ------
        list of top n similar movie recommendations
        """
        # fit
        model.fit(data)
        # get input movie index
        print('You have input movie:', fav_movie)
        idx = self._fuzzy_matching(hashmap, fav_movie)
        # inference
        print('Recommendation system start to make inference')
        print('......\n')
        t0 = time.time()
        distances, indices = model.kneighbors(data[idx],
                                              n_neighbors=n_recommendations +
                                              1)
        # get list of raw idx of recommendations
        raw_recommends = \
            sorted(
                list(
                    zip(
                        indices.squeeze().tolist(),
                        distances.squeeze().tolist()
                    )
                ),
                key=lambda x: x[1]
            )[:0:-1]
        print('It took my system {:.2f}s to make inference \n\
              '.format(time.time() - t0))
        # return recommendation (movieId, distance)
        return raw_recommends

    def make_recommendations(self, fav_movie, n_recommendations):
        """
        make top n movie recommendations

        Parameters
        ----------
        fav_movie: str, name of user input movie

        n_recommendations: int, top n recommendations
        """
        # get data
        movie_user_mat_sparse, hashmap = self._prep_data()
        # get recommendations
        raw_recommends = self._inference(self.model, movie_user_mat_sparse,
                                         hashmap, fav_movie, n_recommendations)
        # print results
        reverse_hashmap = {v: k for k, v in hashmap.items()}
        print('Recommendations for {}:'.format(fav_movie))
        for i, (idx, dist) in enumerate(raw_recommends):
            print('{0}: {1}, with distance '
                  'of {2}'.format(i + 1, reverse_hashmap[idx], dist))
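A hedged usage sketch for the recommender above; the CSV paths and the movie title are placeholders (e.g. the MovieLens files), not data shipped with this snippet:

if __name__ == '__main__':
    recommender = KnnRecommender('data/movies.csv', 'data/ratings.csv')
    recommender.set_filter_params(movie_rating_thres=50, user_rating_thres=50)
    recommender.set_model_params(n_neighbors=20, algorithm='brute',
                                 metric='cosine', n_jobs=-1)
    recommender.make_recommendations('Toy Story', 10)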
Example #15
def _compute_mi_cd(c, d, n_neighbors):
    """Compute mutual information between continuous and discrete variables.

    Parameters
    ----------
    c : ndarray, shape (n_samples,)
        Samples of a continuous random variable.

    d : ndarray, shape (n_samples,)
        Samples of a discrete random variable.

    n_neighbors : int
        Number of nearest neighbors to search for each point, see [1]_.

    Returns
    -------
    mi : float
        Estimated mutual information. If the estimate turns out to be negative,
        it is replaced by 0.

    Notes
    -----
    True mutual information can't be negative. If its estimate by a numerical
    method is negative, it means (providing the method is adequate) that the
    mutual information is close to 0 and replacing it by 0 is a reasonable
    strategy.

    References
    ----------
    .. [1] B. C. Ross "Mutual Information between Discrete and Continuous
       Data Sets". PLoS ONE 9(2), 2014.
    """
    n_samples = c.shape[0]
    if len(c.shape) == 1:
        c = c.reshape([-1, 1])
    radius = np.empty(n_samples)
    label_counts = np.empty(n_samples)
    k_all = np.empty(n_samples)
    nn = NearestNeighbors()

    for label in np.unique(d, axis=0):
        mask = np.all(d == label, axis=-1)
        count = np.sum(mask)
        if count > 1:
            k = min(n_neighbors, count - 1)
            nn.set_params(n_neighbors=k)
            nn.fit(c[mask])
            r = nn.kneighbors()[0]
            radius[mask] = np.nextafter(r[:, -1], 0)
            k_all[mask] = k
        label_counts[mask] = count

    # Ignore points with unique labels.
    mask = label_counts > 1
    n_samples = np.sum(mask)
    label_counts = label_counts[mask]
    k_all = k_all[mask]
    c = c[mask]
    radius = radius[mask]

    nn.set_params(algorithm='kd_tree')
    nn.fit(c)
    ind = nn.radius_neighbors(radius=radius, return_distance=False)
    m_all = np.array([i.size for i in ind])

    mi = (digamma(n_samples) + np.mean(digamma(k_all)) -
          np.mean(digamma(label_counts)) - np.mean(digamma(m_all + 1)))
    return max(0, mi)
def make_inferece(data, hashmap, target_title, n_recommendations, n_neighbors,
                  algorithm, metric, n_jobs):
    """
        return top n similar item recommendations based on user's input item
        Parameters
        ----------
        data: item-user matrix
        hashmap: dict, map item title name to index of the item in data
        target_title: str, name of user input item
        n_recommendations: int, top n recommendations
        Return
        ------
        list of top n similar item recommendations
    """
    model = NearestNeighbors()
    """
        set model params for sklearn.neighbors.NearestNeighbors
        Parameters
        ----------
        n_neighbors: int, optional (default = 5)
        algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        metric: string or callable, default 'minkowski', or one of
            ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
        n_jobs: int or None, optional (default=None)
    """

    model.set_params(
        **{
            'n_neighbors': n_neighbors,
            'algorithm': algorithm,
            'metric': metric,
            'n_jobs': n_jobs
        })

    model.fit(data)

    idx = fuzzy_matching(hashmap, target_title)

    # Inference
    #print('Recommendation system start to make inference')
    #print('......\n')

    t0 = time.time()
    distances, indices = model.kneighbors(data[idx],
                                          n_neighbors=n_recommendations + 1)
    raw_recommends = \
        sorted(
            list(
                zip(
                    indices.squeeze().tolist(),
                    distances.squeeze().tolist()
                )
            ),
            key = lambda x: x[1]
        )[:0:-1]
    #print('It took my system {:.2f}s to make inference \n'.format(time.time() - t0))

    # return recommendation (movieId, distance)

    return raw_recommends
Example #17
class KnnRecommender:
    """
    This is an item-based collaborative filtering recommender with
    KNN implemented by sklearn
    """
    def __init__(self):
        """
        Recommender requires path to data: movies data and movie_ratings data
        """

        self.model = NearestNeighbors()
        self.model.set_params(
            **{
                'n_neighbors': 20,
                'algorithm': 'brute',
                'metric': 'cosine',
                'n_jobs': -1
            })
        self.data = MovieRatingData()
        data = self.data.movie_user_mat_sparse
        self.model.fit(data)

    def _fuzzy_matching(self, movie):
        """
        return the closest match via fuzzy ratio.
        If no match found, return None
        Parameters
        ----------
        hashmap: dict, map movie title name to index of the movie in data
        fav_movie: str, name of user input movie
        Return
        ------
        index of the closest match
        """
        match_tuples = []
        # get match
        for title, idx in self.data.movies_to_csr_indices.items():
            ratio = fuzz.ratio(title.lower(), movie.lower())
            if ratio >= 60:
                match_tuples.append((title, idx, ratio))
        # sort
        return None if not match_tuples else sorted(
            match_tuples, key=itemgetter(2), reverse=True)[0][1]

    def _inference(self, movie):
        """
        return top n similar movie recommendations based on user's input movie
        Parameters
        ----------
        model: sklearn model, knn model
        movie: str, name of user input movie
        Return
        ------
        list of top n similar movie recommendations
        """
        data = self.data.movie_user_mat_sparse
        movie_idx = self._fuzzy_matching(movie)

        distances, indices = self.model.kneighbors(data[movie_idx],
                                                   n_neighbors=6)
        distances, indices = distances.squeeze().tolist(), indices.squeeze(
        ).tolist()
        raw_recommends = sorted(list(zip(indices, distances)),
                                key=itemgetter(1))[:0:-1]

        # return recommendation (movieId, distance)
        return raw_recommends

    def make_recommendations(self, movie):
        """
        make top n movie recommendations
        Parameters
        ----------
        movie: str, name of user input movie
        n_recommendations: int, top n recommendations
        """
        # get recommendations
        raw_recommends = self._inference(movie)
        indices_to_movies = {
            v: k
            for k, v in self.data.movies_to_csr_indices.items()
        }
        movie_names = [indices_to_movies[i[0]] for i in raw_recommends]
        return movie_names
Example #18
class SMOTE(OverSampler):
    """Class to perform over-sampling using SMOTE.

    This object is an implementation of SMOTE - Synthetic Minority
    Over-sampling Technique, and the variants Borderline SMOTE 1, 2 and
    SVM-SMOTE.

    Parameters
    ----------
    ratio : str or float, optional (default='auto')
        If 'auto', the ratio will be defined automatically to balance
        the dataset. Otherwise, the ratio is defined as the number
        of samples in the minority class over the number of samples
        in the majority class.

    random_state : int or None, optional (default=None)
        Seed for random number generation.

    verbose : bool, optional (default=True)
        Whether or not to print information about the processing.

    k : int, optional (default=5)
        Number of nearest neighbours used to construct synthetic samples.

    m : int, optional (default=10)
        Number of nearest neighbours to use to determine if a minority sample
        is in danger.

    out_step : float, optional (default=0.5)
        Step size when extrapolating.

    kind : str, optional (default='regular')
        The type of SMOTE algorithm to use, one of the following options:
        'regular', 'borderline1', 'borderline2', 'svm'.

    Attributes
    ----------
    ratio : str or float
        If 'auto', the ratio will be defined automatically to balance
        the dataset. Otherwise, the ratio is defined as the number
        of samples in the minority class over the number of samples
        in the majority class.

    random_state : int or None
        Seed for random number generation.

    min_c_ : str or int
        The identifier of the minority class.

    max_c_ : str or int
        The identifier of the majority class.

    stats_c_ : dict of str/int : int
        A dictionary in which the number of occurrences of each class is
        reported.

    X_shape_ : tuple of int
        Shape of the data `X` during fitting.

    Notes
    -----
    See the original papers: [1]_, [2]_, [3]_ for more details.

    It does not support multiple classes automatically, but can be called
    multiple times.

    References
    ----------
    .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer, "SMOTE:
       synthetic minority over-sampling technique," Journal of artificial
       intelligence research, 321-357, 2002.

    .. [2] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new
       over-sampling method in imbalanced data sets learning," Advances in
       intelligent computing, 878-887, 2005.

    .. [3] H. M. Nguyen, E. W. Cooper, K. Kamei, "Borderline over-sampling for
       imbalanced data classification," International Journal of Knowledge
       Engineering and Soft Data Paradigms, 3(1), pp.4-21, 2001.

    """
    def __init__(self,
                 ratio='auto',
                 random_state=None,
                 verbose=True,
                 k=5,
                 m=10,
                 out_step=0.5,
                 kind='regular',
                 n_jobs=-1,
                 **kwargs):
        """Initialisation of SMOTE object.

        Parameters
        ----------
        ratio : str or float, optional (default='auto')
            If 'auto', the ratio will be defined automatically to balance
            the dataset. Otherwise, the ratio is defined as the number
            of samples in the minority class over the number of samples
            in the majority class.

        random_state : int or None, optional (default=None)
            Seed for random number generation.

        verbose : bool, optional (default=True)
            Whether or not to print information about the processing.

        k : int, optional (default=5)
            Number of nearest neighbours used to construct synthetic
            samples.

        m : int, optional (default=10)
            Number of nearest neighbours to use to determine if a minority
            sample is in danger.

        out_step : float, optional (default=0.5)
            Step size when extrapolating.

        kind : str, optional (default='regular')
            The type of SMOTE algorithm to use, one of the following
            options: 'regular', 'borderline1', 'borderline2', 'svm'.

        n_jobs : int, optional (default=-1)
            Number of threads to run the algorithm when it is possible.

        """
        super(SMOTE, self).__init__(ratio=ratio,
                                    random_state=random_state,
                                    verbose=verbose)

        # --- The type of smote
        # This object can perform regular smote over-sampling, borderline 1,
        # borderline 2 and svm smote. Since the algorithms are fairly simple
        # they share most methods.
        possible_kind = ('regular', 'borderline1', 'borderline2', 'svm')
        if kind in possible_kind:
            self.kind = kind
        else:
            raise ValueError('Unknown kind for SMOTE algorithm.')

        self.k = k
        self.m = m
        self.out_step = out_step
        self.verbose = verbose
        self.kwargs = kwargs
        self.n_jobs = n_jobs

        # --- NN object
        # Import the NN object from scikit-learn library. Since in the smote
        # variations we must first find samples that are in danger, we
        # initialize the NN object differently depending on the method chosen
        if kind == 'regular':
            # Regular smote does not look for samples in danger, instead it
            # creates synthetic samples directly from the k-th nearest
            # neighbours without any filtering
            self.nearest_neighbour = NearestNeighbors(n_neighbors=self.k + 1,
                                                      n_jobs=self.n_jobs)
        else:
            # Borderline1, 2 and SVM variations of smote must first look for
            # samples that could be considered noise and samples that live
            # near the boundary between the classes. Therefore, before
            # creating synthetic samples from the k-th nns, it first looks
            # for m nearest neighbors to decide whether or not a sample is
            # noise or near the boundary.
            self.nearest_neighbour = NearestNeighbors(n_neighbors=self.m + 1,
                                                      n_jobs=self.n_jobs)

        # --- SVM smote
        # Unlike the borderline variations, the SVM variation uses the support
        # vectors to decide which samples are in danger (near the boundary).
        # Additionally it also introduces extrapolation for samples that are
        # considered safe (far from boundary) and interpolation for samples
        # in danger (near the boundary). The level of extrapolation is
        # controlled by the out_step.
        if kind == 'svm':
            # Store SVM object with any parameters
            self.svm = SVC(random_state=self.random_state, **self.kwargs)

    def fit(self, X, y):
        """Find the classes statistics before to perform sampling.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        self : object,
            Return self.

        """
        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        # Call the parent function
        super(SMOTE, self).fit(X, y)

        return self

    def _in_danger_noise(self, samples, y, kind='danger'):
        """Estimate if a set of sample are in danger or noise.

        Parameters
        ----------
        samples : ndarray, shape (n_samples, n_features)
            The samples to check if either they are in danger or not.

        y : ndarray, shape (n_samples, )
            The true label in order to check the neighbour labels.

        kind : str, optional (default='danger')
            The type of classification to use. Can be either:

            - If 'danger', check if samples are in danger,
            - If 'noise', check if samples are noise.

        Returns
        -------
        output : ndarray, shape (n_samples, )
            A boolean array where True refers to samples in danger or noise.

        """

        # Find the NN for each samples
        # Exclude the sample itself
        x = self.nearest_neighbour.kneighbors(samples,
                                              return_distance=False)[:, 1:]

        # Check which of the neighbours do not belong to the minority class
        nn_label = (y[x] != self.min_c_).astype(int)
        # Compute the number of majority samples among the NN
        n_maj = np.sum(nn_label, axis=1)

        if kind == 'danger':
            # Samples are in danger for m/2 <= m' < m
            return np.bitwise_and(n_maj >= float(self.m) / 2., n_maj < self.m)
        elif kind == 'noise':
            # Samples are noise for m = m'
            return n_maj == self.m
        else:
            raise NotImplementedError

    def _make_samples(self,
                      X,
                      y_type,
                      nn_data,
                      nn_num,
                      n_samples,
                      step_size=1.):
        """A support function that returns artificial samples constructed along
        the line connecting nearest neighbours.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Points from which the new samples will be created.

        y_type : str or int
            The minority target value, just so the function can return the
            target values for the synthetic variables with correct length in
            a clear format.

        nn_data : ndarray, shape (n_samples_all, n_features)
            Data set carrying all the neighbours to be used

        nn_num : ndarray, shape (n_samples_all, k_nearest_neighbours)
            The nearest neighbours of each sample in nn_data.

        n_samples : int
            The number of samples to generate.

        step_size : float, optional (default=1.)
            The step size to create samples.

        Returns
        -------
        X_new : ndarray, shape (n_samples_new, n_features)
            Synthetically generated samples.

        y_new : ndarray, shape (n_samples_new, )
            Target values for synthetic samples.

        """

        # Check the consistency of X
        X = check_array(X)

        # A matrix to store the synthetic samples
        X_new = np.zeros((n_samples, X.shape[1]))

        # Set seeds
        np.random.seed(self.random_state)
        seeds = np.random.randint(low=0,
                                  high=100 * len(nn_num.flatten()),
                                  size=n_samples)

        # Randomly pick samples to construct neighbours from
        np.random.seed(self.random_state)
        samples = np.random.randint(low=0,
                                    high=len(nn_num.flatten()),
                                    size=n_samples)

        # Loop over the NN matrix and create new samples
        for i, n in enumerate(samples):
            # NN lines relate to original sample, columns to its
            # nearest neighbours
            row, col = divmod(n, nn_num.shape[1])

            # Take a step of random size (0,1) in the direction of the
            # n nearest neighbours
            if self.random_state is None:
                np.random.seed(seeds[i])
            else:
                np.random.seed(self.random_state)
            step = step_size * np.random.uniform()

            # Construct synthetic sample
            X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]])

        # The returned target vector is simply a repetition of the
        # minority label
        y_new = np.array([y_type] * len(X_new))

        if self.verbose:
            print("Generated {} new samples ...".format(len(X_new)))

        return X_new, y_new

    def sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        """
        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        # Call the parent function
        super(SMOTE, self).sample(X, y)

        # Define the number of samples to create
        # We handle only two classes problem for the moment.
        if self.ratio == 'auto':
            num_samples = (self.stats_c_[self.maj_c_] -
                           self.stats_c_[self.min_c_])
        else:
            num_samples = int((self.ratio * self.stats_c_[self.maj_c_]) -
                              self.stats_c_[self.min_c_])

        # Start by separating minority class features and target values.
        X_min = X[y == self.min_c_]

        # If regular SMOTE is to be performed
        if self.kind == 'regular':

            # Print if verbose is true
            if self.verbose:
                print('Finding the {} nearest neighbours...'.format(self.k))

            # Look for k-th nearest neighbours, excluding, of course, the
            # point itself.
            self.nearest_neighbour.fit(X_min)

            # Matrix with k-th nearest neighbours indexes for each minority
            # element.
            nns = self.nearest_neighbour.kneighbors(X_min,
                                                    return_distance=False)[:,
                                                                           1:]

            # Print status if verbose is true
            if self.verbose:
                print("done!")
                print("Creating synthetic samples...", end="")

            # --- Generating synthetic samples
            # Use static method make_samples to generate minority samples
            X_new, y_new = self._make_samples(X_min, self.min_c_, X_min, nns,
                                              num_samples, 1.0)

            if self.verbose:
                print("done!")

            # Concatenate the newly generated samples to the original data set
            X_resampled = np.concatenate((X, X_new), axis=0)
            y_resampled = np.concatenate((y, y_new), axis=0)

            return X_resampled, y_resampled

        if self.kind == 'borderline1' or self.kind == 'borderline2':

            if self.verbose:
                print("Finding the {} nearest neighbours...".format(self.m))

            # Find the NNs for all samples in the data set.
            self.nearest_neighbour.fit(X)

            if self.verbose:
                print("done!")

            # Boolean array with True for minority samples in danger
            danger_index = self._in_danger_noise(X_min, y, kind='danger')

            # If all minority samples are safe, return the original data set.
            if not any(danger_index):
                if self.verbose:
                    print('There are no samples in danger. No borderline '
                          'synthetic samples created.')

                # All are safe, nothing to be done here.
                return X, y

            # If we got here is because some samples are in danger, we need to
            # find the NNs among the minority class to create the new synthetic
            # samples.
            #
            # We start by changing the number of NNs to consider from m + 1
            # to k + 1
            self.nearest_neighbour.set_params(**{'n_neighbors': self.k + 1})
            self.nearest_neighbour.fit(X_min)

            # Indexes of the k nearest neighbours of each sample in danger
            nns = self.nearest_neighbour.kneighbors(X_min[danger_index],
                                                    return_distance=False)[:,
                                                                           1:]

            # B1 and B2 types diverge here!!!
            if self.kind == 'borderline1':
                # Create synthetic samples for borderline points.
                X_new, y_new = self._make_samples(X_min[danger_index],
                                                  self.min_c_, X_min, nns,
                                                  num_samples)

                # Concatenate the newly generated samples to the original
                # dataset
                X_resampled = np.concatenate((X, X_new), axis=0)
                y_resampled = np.concatenate((y, y_new), axis=0)

                # Reset the k-neighbours to m+1 neighbours
                self.nearest_neighbour.set_params(
                    **{'n_neighbors': self.m + 1})

                return X_resampled, y_resampled

            else:
                # Split the number of synthetic samples between only minority
                # (type 1), or minority and majority (with reduced step size)
                # (type 2).
                np.random.seed(self.random_state)
                # The fraction is sampled from a beta distribution centered
                # around 0.5 with variance ~0.01
                fractions = beta(10, 10)

                # Only minority
                X_new_1, y_new_1 = self._make_samples(X_min[danger_index],
                                                      self.min_c_,
                                                      X_min,
                                                      nns,
                                                      int(fractions *
                                                          (num_samples + 1)),
                                                      step_size=1.)

                # Only majority with smaller step size
                X_new_2, y_new_2 = self._make_samples(X_min[danger_index],
                                                      self.min_c_,
                                                      X[y != self.min_c_],
                                                      nns,
                                                      int((1 - fractions) *
                                                          num_samples),
                                                      step_size=0.5)

                # Concatenate the newly generated samples to the original
                # data set
                X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0)
                y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0)

                # Reset the k-neighbours to m+1 neighbours
                self.nearest_neighbour.set_params(
                    **{'n_neighbors': self.m + 1})

                return X_resampled, y_resampled

        if self.kind == 'svm':
            # The SVM smote model fits a support vector machine
            # classifier to the data and uses the support vector to
            # provide a notion of boundary. Unlike regular smote, where
            # such notion relies on proportion of nearest neighbours
            # belonging to each class.

            # Fit SVM to the full data
            self.svm.fit(X, y)

            # Find the support vectors and their corresponding indexes
            support_index = self.svm.support_[y[self.svm.support_] ==
                                              self.min_c_]
            support_vector = X[support_index]

            # First, find the nn of all the samples to identify samples
            # in danger and noisy ones
            if self.verbose:
                print("Finding the {} nearest neighbours...".format(self.m))

            # As usual, fit a nearest neighbour model to the data
            self.nearest_neighbour.fit(X)

            if self.verbose:
                print("done!")

            # Now, get rid of noisy support vectors

            noise_bool = self._in_danger_noise(support_vector, y, kind='noise')

            # Remove noisy support vectors
            support_vector = support_vector[np.logical_not(noise_bool)]
            danger_bool = self._in_danger_noise(support_vector,
                                                y,
                                                kind='danger')
            safety_bool = np.logical_not(danger_bool)

            if self.verbose:
                print("Out of {0} support vectors, {1} are noisy, "
                      "{2} are in danger "
                      "and {3} are safe.".format(
                          support_vector.shape[0],
                          noise_bool.sum().astype(int),
                          danger_bool.sum().astype(int),
                          safety_bool.sum().astype(int)))

                # Proceed to find support vectors NNs among the minority class
                print("Finding the {} nearest neighbours...".format(self.k))

            self.nearest_neighbour.set_params(**{'n_neighbors': self.k + 1})
            self.nearest_neighbour.fit(X_min)

            if self.verbose:
                print("done!")
                print("Creating synthetic samples...", end="")

            # Split the number of synthetic samples between interpolation and
            # extrapolation

            # The fraction is sampled from a beta distribution with mean
            # 0.5 and variance ~0.01
            np.random.seed(self.random_state)
            fractions = beta(10, 10)

            # Interpolate samples in danger
            if np.count_nonzero(danger_bool) > 0:
                nns = self.nearest_neighbour.kneighbors(
                    support_vector[danger_bool], return_distance=False)[:, 1:]

                X_new_1, y_new_1 = self._make_samples(
                    support_vector[danger_bool],
                    self.min_c_,
                    X_min,
                    nns,
                    int(fractions * (num_samples + 1)),
                    step_size=1.)

            # Extrapolate safe samples
            if np.count_nonzero(safety_bool) > 0:
                nns = self.nearest_neighbour.kneighbors(
                    support_vector[safety_bool], return_distance=False)[:, 1:]

                X_new_2, y_new_2 = self._make_samples(
                    support_vector[safety_bool],
                    self.min_c_,
                    X_min,
                    nns,
                    int((1 - fractions) * num_samples),
                    step_size=-self.out_step)

            if self.verbose:
                print("done!")

            # Concatenate the newly generated samples to the original data set
            if (np.count_nonzero(danger_bool) > 0
                    and np.count_nonzero(safety_bool) > 0):
                X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0)
                y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0)
            # No support vectors in danger
            elif np.count_nonzero(danger_bool) == 0:
                X_resampled = np.concatenate((X, X_new_2), axis=0)
                y_resampled = np.concatenate((y, y_new_2), axis=0)
            # All support vectors are in danger
            elif np.count_nonzero(safety_bool) == 0:
                X_resampled = np.concatenate((X, X_new_1), axis=0)
                y_resampled = np.concatenate((y, y_new_1), axis=0)

            # Reset the k-neighbours to m+1 neighbours
            self.nearest_neighbour.set_params(**{'n_neighbors': self.m + 1})

            return X_resampled, y_resampled
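The SVM-SMOTE fragment above splits its synthetic-sample budget between interpolation (support vectors in danger) and extrapolation (safe support vectors) with a single Beta(10, 10) draw. Below is a minimal, standalone sketch of that split; num_samples and the seed are hypothetical, and beta is assumed to be numpy.random.beta as used in the snippet above.

import numpy as np
from numpy.random import beta

np.random.seed(42)                    # hypothetical seed
num_samples = 100                     # hypothetical number of synthetic samples to create
fractions = beta(10, 10)              # draw centred on 0.5, variance ~0.01
n_interpolated = int(fractions * (num_samples + 1))   # budget for support vectors in danger
n_extrapolated = int((1 - fractions) * num_samples)   # budget for safe support vectors
print(fractions, n_interpolated, n_extrapolated)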
class KnnRecommender:
    """
    This is an item-based collaborative filtering recommender with
    KNN implemented with sklearn
    """
    def __init__(self):
        self.model = NearestNeighbors()
        self.item_user_mat_sparse, self.hashmap = self._prep_data()
        self.set_model_params(10, 'brute', 'cosine', -1)

    def set_model_params(self, n_neighbors, algorithm, metric, n_jobs=None):
        """
        set model params for sklearn.neighbors.NearestNeighbors
        Parameters
        ----------
        n_neighbors: int, optional (default = 5)
        algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        metric: string or callable, default 'minkowski'; commonly one of
            ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
        n_jobs: int or None, optional (default=None)
        """
        if n_jobs and (n_jobs > 1 or n_jobs == -1):
            os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
        self.model.set_params(
            **{
                'n_neighbors': n_neighbors,
                'algorithm': algorithm,
                'metric': metric,
                'n_jobs': n_jobs
            })

    def _prep_data(self):
        """
        prepare data for recommender
        1. item-user scipy sparse matrix
        2. hashmap of row index in the item-user sparse matrix to itemId
        """
        connection = create_connection()

        with connection:
            # read data
            cursor = connection.cursor()
            cursor.execute("SELECT * FROM review")
            reviews = cursor.fetchall()
            cursor.close()

            columns = ['id', 'userId', 'itemId', 'rating', 'reviewTime']
            df_ratings = pd.DataFrame(reviews, columns=columns)
            # pivot and create movie-user matrix
            item_user_mat = df_ratings.pivot(index='itemId',
                                             columns='userId',
                                             values='rating').fillna(0)
            # hashmap of row index in the item-user sparse matrix to itemId
            hashmap = {index: item_id
                       for index, item_id in enumerate(item_user_mat.index)}

            # transform matrix to scipy sparse matrix
            item_user_mat_sparse = csr_matrix(item_user_mat.values)

            # clean up
            del df_ratings, item_user_mat
            return item_user_mat_sparse, hashmap

    def _inference(self, model, data, itemId, n_recommendations):
        """
        return top n similar item recommendations
        Parameters
        ----------
        model: sklearn model, knn model
        data: item-user matrix
        itemId: id of item in matrix
        n_recommendations: int, top n recommendations
        Return
        ------
        list of top n similar item recommendations
        """
        # fit
        model.fit(data)

        # inference
        distances, indices = model.kneighbors(data[itemId],
                                              n_neighbors=n_recommendations +
                                              1)
        # get list of raw idx of recommendations
        raw_recommends = \
            sorted(
                list(
                    zip(
                        indices.squeeze().tolist(),
                        distances.squeeze().tolist()
                    )
                ),
                key=lambda x: x[1]
            )[:0:-1]

        # return recommendation (itemId, distance)
        return raw_recommends

    def make_recommendations(self, itemId, n_recommendations):
        """
        make top n movie recommendations
        Parameters
        ----------
        itemId: raw id of item
        n_recommendations: int, top n recommendations
        """
        reverse_hashmap = {v: k for k, v in self.hashmap.items()}
        idx = reverse_hashmap[itemId]
        # get recommendation
        raw_recommends = self._inference(self.model, self.item_user_mat_sparse,
                                         idx, n_recommendations)
        # print results
        recommended_item_id = []
        # print('Recommendations for {}:'.format(itemId))
        for i, (idx, dist) in enumerate(raw_recommends):
            # print('{0}: {1}, with distance '
            #       'of {2}'.format(i + 1, self.hashmap[idx], dist))
            recommended_item_id.insert(0, self.hashmap[idx])
        del raw_recommends
        return recommended_item_id
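At its core, the KnnRecommender above is a cosine-metric NearestNeighbors lookup over an item-user matrix. A minimal, self-contained sketch of that lookup on a toy ratings matrix (standing in for the database-backed _prep_data):

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# toy item-user rating matrix: 3 items rated by 3 users
item_user = csr_matrix(np.array([[5., 0., 3.],
                                 [4., 0., 2.],
                                 [0., 2., 5.]]))
knn = NearestNeighbors(n_neighbors=3, algorithm='brute', metric='cosine', n_jobs=1)
knn.fit(item_user)

# neighbours of item 0; the first hit is item 0 itself at distance ~0, so drop it
distances, indices = knn.kneighbors(item_user[0], n_neighbors=3)
print(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist()))[1:])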
Example #20
def networkBuildKnn(
    X_net,
    Y_net,
    knn=5,
    e_percentile=None,
    class_connected=False,
    metric="euclidean",
    neighbors=True,
    colors=[],
):
    g = nx.Graph()
    g.graph["knn"] = knn
    g.graph["e_percentile"] = e_percentile
    g.graph["class_connected"] = class_connected
    g.graph["metric"] = metric
    g.graph["neighbors"] = neighbors

    lnNet = len(X_net)
    g.graph["class_names"] = list(set(Y_net))
    g.graph["colors"] = colors
    class_nodes = [[] for i in g.graph["class_names"]]

    for index, instance in enumerate(X_net):
        label = Y_net[index]
        index_label = g.graph["class_names"].index(label)
        class_nodes[index_label].append(str(index))
        g.add_node(str(index), value=instance, type_node="net", label=label)
    g.graph["class_nodes"] = class_nodes

    values = X_net
    if values.ndim == 1:
        values = np.reshape(values, (-1, 1))

    nbrs = NearestNeighbors(n_neighbors=knn + 1, metric=metric)
    nbrs.fit(values)

    distances, indices = nbrs.kneighbors(values)
    indices = indices[:, 1:]
    distances = distances[:, 1:]

    for indice_node, indices_node in enumerate(indices):
        for tmpi, indice in enumerate(indices_node):
            if (g.nodes()[str(indice)]["label"]
                    == g.nodes()[str(indice_node)]["label"]
                    or class_connected):
                g.add_edge(str(indice),
                           str(indice_node),
                           weight=distances[indice_node][tmpi])

    if e_percentile is not None:
        eRadius = np.quantile(distances, e_percentile)
        nbrs.set_params(radius=eRadius)
        distances, indices = nbrs.radius_neighbors(values)

        for indice_node, indices_node in enumerate(indices):
            for tmpi, indice in enumerate(indices_node):
                if not str(indice) == str(indice_node):
                    if (g.nodes()[str(indice)]["label"]
                            == g.nodes()[str(indice_node)]["label"]
                            or class_connected):
                        g.add_edge(
                            str(indice),
                            str(indice_node),
                            weight=distances[indice_node][tmpi],
                        )
    g.graph["index"] = lnNet
    if neighbors:
        g.graph["nbrs"] = nbrs

    return g
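A minimal usage sketch for networkBuildKnn on a toy two-class data set; it assumes numpy, networkx (nx) and NearestNeighbors are imported as the function itself expects:

import numpy as np

rng = np.random.RandomState(0)
X_net = np.vstack([rng.rand(10, 2), rng.rand(10, 2) + 1.0])   # two clusters
Y_net = np.array([0] * 10 + [1] * 10)

g = networkBuildKnn(X_net, Y_net, knn=3, e_percentile=0.5, class_connected=False)
print(g.number_of_nodes(), g.number_of_edges(), g.graph["class_names"])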
class KnnRecommender:
    def __init__(self, path_movies, path_ratings):
        self.path_movies = path_movies
        self.path_ratings = path_ratings
        self.movie_rating_thres = 0
        self.user_rating_thres = 0
        self.model = NearestNeighbors()

    def set_filter_params(self, movie_rating_thres, user_rating_thres):
        self.movie_rating_thres = movie_rating_thres
        self.user_rating_thres = user_rating_thres

    def _prep_data(self):
        myclient = pymongo.MongoClient("mongodb://*****:*****@self.movie_rating_thres').index))  # noqa
        movies_filter = ratings_df.movieId.isin(popular_movies).values

        df_users_cnt = pd.DataFrame(ratings_df.groupby('userId').size(),
                                    columns=['count'])

        active_users = list(
            set(df_users_cnt.query(
                'count >= @self.user_rating_thres').index))  # noqa
        users_filter = ratings_df.userId.isin(active_users).values

        df_ratings_filtered = ratings_df[movies_filter & users_filter]

        # pivot and create movie-user matrix
        movie_user_mat = df_ratings_filtered.pivot(index='movieId',
                                                   columns='userId',
                                                   values='rating').fillna(0)
        # create mapper from movie title to index
        hashmap = {
            movie: i
            for i, movie in enumerate(
                list(
                    movies_df.set_index('movieId').loc[
                        movie_user_mat.index].title))  # noqa
        }
        # transform matrix to scipy sparse matrix
        movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

        # clean up
        # del df_movies, df_movies_cnt, df_users_cnt
        # del df_ratings, df_ratings_filtered, movie_user_mat
        # gc.collect()
        # print("Movie user sparse matrix \n",movie_user_mat_sparse)
        # print("Hashmap \n",hashmap)
        return movie_user_mat_sparse, hashmap

    def set_model_params(self, n_neighbors, algorithm, metric, n_jobs=None):
        if n_jobs and (n_jobs > 1 or n_jobs == -1):
            os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
        self.model.set_params(
            **{
                'n_neighbors': n_neighbors,
                'algorithm': algorithm,
                'metric': metric,
                'n_jobs': n_jobs
            })

    def _fuzzy_matching(self, hashmap, fav_movie):
        match_tuple = []
        # get match
        for title, idx in hashmap.items():
            ratio = fuzz.ratio(title.lower(), fav_movie.lower())
            if ratio >= 60:
                match_tuple.append((title, idx, ratio))
        # sort
        match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
        if not match_tuple:
            print('Oops! No match is found')
        else:
            print('Found possible matches in our database: '
                  '{0}\n'.format([x[0] for x in match_tuple]))
            return match_tuple[0][1]

    def _inference(self, model, data, hashmap, fav_movie, n_recommendations):
        # fit
        if (os.path.exists('knnpickle_file')):
            print("Model exist")
            model = pickle.load(open('knnpickle_file', 'rb'))
        else:
            print("Model doesn't exist")
            model.fit(data)
            # persist the fitted model for later reuse
            with open('knnpickle_file', 'wb') as knn_pickle:
                pickle.dump(model, knn_pickle)

        # get input movie index
        print('You have input movie:', fav_movie)
        idx = self._fuzzy_matching(hashmap, fav_movie)
        # print(data[idx])
        # inference
        print('Recommendation system start to make inference')
        print('......\n')
        t0 = time.time()
        distances, indices = model.kneighbors(data[idx],
                                              n_neighbors=n_recommendations +
                                              1)
        # get list of raw idx of recommendations
        raw_recommends = \
            sorted(
                list(
                    zip(
                        indices.squeeze().tolist(),
                        distances.squeeze().tolist()
                    )
                ),
                key=lambda x: x[1]
            )[:0:-1]
        print('It took my system {:.2f}s to make inference \n\
              '.format(time.time() - t0))
        # return recommendation (movieId, distance)
        return raw_recommends

    def make_recommendations(self, fav_movie, n_recommendations):
        # get data
        movie_user_mat_sparse, hashmap = self._prep_data()
        # get recommendations
        raw_recommends = self._inference(self.model, movie_user_mat_sparse,
                                         hashmap, fav_movie, n_recommendations)
        # print(raw_recommends)
        # print results
        reverse_hashmap = {v: k for k, v in hashmap.items()}
        # print('Recommendations for {}:'.format(fav_movie))
        tempList = []
        for i, (idx, dist) in enumerate(raw_recommends):
            tempList.append(reverse_hashmap[idx])
            # print('{0}: {1}, with distance '
            #       'of {2}'.format(i+1, reverse_hashmap[idx], dist))

        return tempList
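Both recommenders above resolve a free-text title to a matrix row with fuzz.ratio. A minimal sketch of that matching step, assuming fuzz comes from the fuzzywuzzy package (the real import lives outside these snippets) and using a hypothetical three-title hashmap:

from fuzzywuzzy import fuzz   # assumed source of the fuzz object used above

hashmap = {'toy story (1995)': 0, 'jumanji (1995)': 1, 'heat (1995)': 2}   # hypothetical
fav_movie = 'Toy Story'

match_tuple = [(title, idx, fuzz.ratio(title.lower(), fav_movie.lower()))
               for title, idx in hashmap.items()]
match_tuple = sorted([m for m in match_tuple if m[2] >= 60],
                     key=lambda x: x[2])[::-1]
print(match_tuple[0] if match_tuple else 'Oops! No match is found')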
Example #22
class SMOTE(OverSampler):
    """Class to perform over-sampling using SMOTE.

    This object is an implementation of SMOTE - Synthetic Minority
    Over-sampling Technique, and the variants Borderline SMOTE 1, 2 and
    SVM-SMOTE.

    Parameters
    ----------
    ratio : str or float, optional (default='auto')
        If 'auto', the ratio will be defined automatically to balance
        the dataset. Otherwise, the ratio is defined as the number
        of samples in the minority class over the number of samples
        in the majority class.

    random_state : int or None, optional (default=None)
        Seed for random number generation.

    verbose : bool, optional (default=True)
        Whether or not to print information about the processing.

    k : int, optional (default=5)
        Number of nearest neighbours to use to construct synthetic samples.

    m : int, optional (default=10)
        Number of nearest neighbours to use to determine if a minority sample
        is in danger.

    out_step : float, optional (default=0.5)
        Step size when extrapolating.

    kind : str, optional (default='regular')
        The type of SMOTE algorithm to use, one of the following options:
        'regular', 'borderline1', 'borderline2', 'svm'.

    Attributes
    ----------
    ratio : str or float
        If 'auto', the ratio will be defined automatically to balance
        the dataset. Otherwise, the ratio is defined as the number
        of samples in the minority class over the number of samples
        in the majority class.

    random_state : int or None
        Seed for random number generation.

    min_c_ : str or int
        The identifier of the minority class.

    max_c_ : str or int
        The identifier of the majority class.

    stats_c_ : dict of str/int : int
        A dictionary in which the number of occurrences of each class is
        reported.

    X_shape_ : tuple of int
        Shape of the data `X` during fitting.

    Notes
    -----
    See the original papers: [1]_, [2]_, [3]_ for more details.

    It does not support multiple classes automatically, but can be called
    multiple times.

    References
    ----------
    .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer, "SMOTE:
       synthetic minority over-sampling technique," Journal of artificial
       intelligence research, 321-357, 2002.

    .. [2] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new
       over-sampling method in imbalanced data sets learning," Advances in
       intelligent computing, 878-887, 2005.

    .. [3] H. M. Nguyen, E. W. Cooper, K. Kamei, "Borderline over-sampling for
       imbalanced data classification," International Journal of Knowledge
       Engineering and Soft Data Paradigms, 3(1), pp.4-21, 2001.

    """

    def __init__(self, ratio='auto', random_state=None, verbose=True,
                 k=5, m=10, out_step=0.5, kind='regular', n_jobs=-1, **kwargs):
        """Initialisation of SMOTE object.

        Parameters
        ----------
        ratio : str or float, optional (default='auto')
            If 'auto', the ratio will be defined automatically to balance
            the dataset. Otherwise, the ratio is defined as the number
            of samples in the minority class over the number of samples
            in the majority class.

        random_state : int or None, optional (default=None)
            Seed for random number generation.

        verbose : bool, optional (default=True)
            Whether or not to print information about the processing.

        k : int, optional (default=5)
            Number of nearest neighbours to use to construct synthetic
            samples.

        m : int, optional (default=10)
            Number of nearest neighbours to use to determine if a minority
            sample is in danger.

        out_step : float, optional (default=0.5)
            Step size when extrapolating.

        kind : str, optional (default='regular')
            The type of SMOTE algorithm to use, one of the following
            options: 'regular', 'borderline1', 'borderline2', 'svm'.

        n_jobs : int, optional (default=-1)
            Number of threads to run the algorithm when it is possible.

        """
        super(SMOTE, self).__init__(ratio=ratio,
                                    random_state=random_state,
                                    verbose=verbose)

        # --- The type of smote
        # This object can perform regular smote over-sampling, borderline 1,
        # borderline 2 and svm smote. Since the algorithms are fairly simple
        # they share most methods.
        possible_kind = ('regular', 'borderline1', 'borderline2', 'svm')
        if kind in possible_kind:
            self.kind = kind
        else:
            raise ValueError('Unknown kind for SMOTE algorithm.')

        self.k = k
        self.m = m
        self.out_step = out_step
        self.verbose = verbose
        self.kwargs = kwargs
        self.n_jobs = n_jobs

        # --- NN object
        # Import the NN object from scikit-learn library. Since in the smote
        # variations we must first find samples that are in danger, we
        # initialize the NN object differently depending on the method chosen
        if kind == 'regular':
            # Regular smote does not look for samples in danger; instead it
            # creates synthetic samples directly from the k nearest
            # neighbours with no filtering
            self.nearest_neighbour = NearestNeighbors(n_neighbors=self.k + 1,
                                                      n_jobs=self.n_jobs)
        else:
            # Borderline1, 2 and SVM variations of smote must first look for
            # samples that could be considered noise and samples that live
            # near the boundary between the classes. Therefore, before
            # creating synthetic samples from the k nns, it first looks
            # for the m nearest neighbours to decide whether or not a sample is
            # noise or near the boundary.
            self.nearest_neighbour = NearestNeighbors(n_neighbors=self.m + 1,
                                                      n_jobs=self.n_jobs)

        # --- SVM smote
        # Unlike the borderline variations, the SVM variation uses the support
        # vectors to decide which samples are in danger (near the boundary).
        # Additionally it also introduces extrapolation for samples that are
        # considered safe (far from boundary) and interpolation for samples
        # in danger (near the boundary). The level of extrapolation is
        # controlled by the out_step.
        if kind == 'svm':
            # Store SVM object with any parameters
            self.svm = SVC(random_state=self.random_state, **self.kwargs)

    def fit(self, X, y):
        """Find the classes statistics before to perform sampling.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        self : object,
            Return self.

        """
        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        # Call the parent function
        super(SMOTE, self).fit(X, y)

        return self

    def _in_danger_noise(self, samples, y, kind='danger'):
        """Estimate if a set of sample are in danger or noise.

        Parameters
        ----------
        samples : ndarray, shape (n_samples, n_features)
            The samples to check whether they are in danger or noise.

        y : ndarray, shape (n_samples, )
            The true label in order to check the neighbour labels.

        kind : str, optional (default='danger')
            The type of classification to use. Can be either:

            - If 'danger', check if samples are in danger,
            - If 'noise', check if samples are noise.

        Returns
        -------
        output : ndarray, shape (n_samples, )
            A boolean array where True refers to samples in danger or noise.

        """

        # Find the NNs for each sample
        # Exclude the sample itself
        x = self.nearest_neighbour.kneighbors(samples,
                                              return_distance=False)[:, 1:]

        # Count how many of the NNs do not belong to the minority class
        # (i.e. flag each neighbour whose label differs from the minority one)
        nn_label = (y[x] != self.min_c_).astype(int)
        # Compute the number of majority samples in the NN
        n_maj = np.sum(nn_label, axis=1)

        if kind == 'danger':
            # Samples are in danger for m/2 <= m' < m
            return np.bitwise_and(n_maj >= float(self.m) / 2.,
                                  n_maj < self.m)
        elif kind == 'noise':
            # Samples are noise for m = m'
            return n_maj == self.m
        else:
            raise NotImplementedError

    def _make_samples(self, X, y_type, nn_data, nn_num, n_samples,
                      step_size=1.):
        """A support function that returns artificial samples constructed along
        the line connecting nearest neighbours.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Points from which the new samples will be created.

        y_type : str or int
            The minority target value, just so the function can return the
            target values for the synthetic samples with the correct length in
            a clear format.

        nn_data : ndarray, shape (n_samples_all, n_features)
            Data set carrying all the neighbours to be used

        nn_num : ndarray, shape (n_samples_all, k_nearest_neighbours)
            The nearest neighbours of each sample in nn_data.

        n_samples : int
            The number of samples to generate.

        step_size : float, optional (default=1.)
            The step size to create samples.

        Returns
        -------
        X_new : ndarray, shape (n_samples_new, n_features)
            Synthetically generated samples.

        y_new : ndarray, shape (n_samples_new, )
            Target values for synthetic samples.

        """

        # Check the consistency of X
        X = check_array(X)

        # A matrix to store the synthetic samples
        X_new = np.zeros((n_samples, X.shape[1]))

        # Set seeds
        np.random.seed(self.random_state)
        seeds = np.random.randint(low=0,
                                  high=100*len(nn_num.flatten()),
                                  size=n_samples)

        # Randomly pick samples to construct neighbours from
        np.random.seed(self.random_state)
        samples = np.random.randint(low=0,
                                    high=len(nn_num.flatten()),
                                    size=n_samples)

        # Loop over the NN matrix and create new samples
        for i, n in enumerate(samples):
            # Rows of the NN matrix relate to the original samples,
            # columns to their nearest neighbours
            row, col = divmod(n, nn_num.shape[1])

            # Take a step of random size (0,1) in the direction of the
            # n nearest neighbours
            if self.random_state is None:
                np.random.seed(seeds[i])
            else:
                np.random.seed(self.random_state)
            step = step_size * np.random.uniform()

            # Construct synthetic sample
            X_new[i] = X[row] - step * (X[row] -
                                        nn_data[nn_num[row, col]])

        # The returned target vector is simply a repetition of the
        # minority label
        y_new = np.array([y_type] * len(X_new))

        if self.verbose:
            print("Generated {} new samples ...".format(len(X_new)))

        return X_new, y_new

    def sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        """
        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        # Call the parent function
        super(SMOTE, self).sample(X, y)

        # Define the number of samples to create.
        # We handle only the two-class problem for the moment.
        if self.ratio == 'auto':
            num_samples = (self.stats_c_[self.maj_c_] -
                           self.stats_c_[self.min_c_])
        else:
            num_samples = int((self.ratio * self.stats_c_[self.maj_c_]) -
                              self.stats_c_[self.min_c_])

        # Start by separating minority class features and target values.
        X_min = X[y == self.min_c_]

        # If regular SMOTE is to be performed
        if self.kind == 'regular':

            # Print if verbose is true
            if self.verbose:
                print('Finding the {} nearest neighbours...'.format(self.k))

            # Look for the k nearest neighbours, excluding, of course, the
            # point itself.
            self.nearest_neighbour.fit(X_min)

            # Matrix with the k nearest neighbour indices for each minority
            # element.
            nns = self.nearest_neighbour.kneighbors(
                X_min,
                return_distance=False)[:, 1:]

            # Print status if verbose is true
            if self.verbose:
                print("done!")
                print("Creating synthetic samples...", end="")

            # --- Generating synthetic samples
            # Use the helper method _make_samples to generate minority samples
            X_new, y_new = self._make_samples(X_min,
                                              self.min_c_,
                                              X_min,
                                              nns,
                                              num_samples,
                                              1.0)

            if self.verbose:
                print("done!")

            # Concatenate the newly generated samples to the original data set
            X_resampled = np.concatenate((X, X_new), axis=0)
            y_resampled = np.concatenate((y, y_new), axis=0)

            return X_resampled, y_resampled

        if self.kind == 'borderline1' or self.kind == 'borderline2':

            if self.verbose:
                print("Finding the {} nearest neighbours...".format(self.m))

            # Find the NNs for all samples in the data set.
            self.nearest_neighbour.fit(X)

            if self.verbose:
                print("done!")

            # Boolean array with True for minority samples in danger
            danger_index = self._in_danger_noise(X_min, y, kind='danger')

            # If all minority samples are safe, return the original data set.
            if not any(danger_index):
                if self.verbose:
                    print('There are no samples in danger. No borderline '
                          'synthetic samples created.')

                # All are safe, nothing to be done here.
                return X, y

            # If we got here, it is because some samples are in danger; we
            # need to find the NNs among the minority class to create the new
            # synthetic samples.
            #
            # We start by changing the number of NNs to consider from m + 1
            # to k + 1
            self.nearest_neighbour.set_params(**{'n_neighbors': self.k + 1})
            self.nearest_neighbour.fit(X_min)

            # Find the k NNs of each minority sample that is in danger
            nns = self.nearest_neighbour.kneighbors(
                X_min[danger_index],
                return_distance=False)[:, 1:]

            # B1 and B2 types diverge here!!!
            if self.kind == 'borderline1':
                # Create synthetic samples for borderline points.
                X_new, y_new = self._make_samples(X_min[danger_index],
                                                  self.min_c_,
                                                  X_min,
                                                  nns,
                                                  num_samples)

                # Concatenate the newly generated samples to the original
                # dataset
                X_resampled = np.concatenate((X, X_new), axis=0)
                y_resampled = np.concatenate((y, y_new), axis=0)

                # Reset the k-neighbours to m+1 neighbours
                self.nearest_neighbour.set_params(**{'n_neighbors': self.m+1})

                return X_resampled, y_resampled

            else:
                # Split the number of synthetic samples between only minority
                # (type 1), or minority and majority (with reduced step size)
                # (type 2).
                np.random.seed(self.random_state)
                # The fraction is sampled from a beta distribution centered
                # around 0.5 with variance ~0.01
                fractions = beta(10, 10)

                # Only minority
                X_new_1, y_new_1 = self._make_samples(X_min[danger_index],
                                                      self.min_c_,
                                                      X_min,
                                                      nns,
                                                      int(fractions *
                                                          (num_samples + 1)),
                                                      step_size=1.)

                # Only majority with smaller step size
                X_new_2, y_new_2 = self._make_samples(X_min[danger_index],
                                                      self.min_c_,
                                                      X[y != self.min_c_],
                                                      nns,
                                                      int((1 - fractions) *
                                                          num_samples),
                                                      step_size=0.5)

                # Concatenate the newly generated samples to the original
                # data set
                X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0)
                y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0)

                # Reset the k-neighbours to m+1 neighbours
                self.nearest_neighbour.set_params(**{'n_neighbors': self.m+1})

                return X_resampled, y_resampled

        if self.kind == 'svm':
            # The SVM smote model fits a support vector machine
            # classifier to the data and uses the support vectors to
            # provide a notion of boundary, unlike regular smote, where
            # that notion relies on the proportion of nearest neighbours
            # belonging to each class.

            # Fit SVM to the full data
            self.svm.fit(X, y)

            # Find the support vectors and their corresponding indexes
            support_index = self.svm.support_[y[self.svm.support_] ==
                                              self.min_c_]
            support_vector = X[support_index]

            # First, find the nn of all the samples to identify samples
            # in danger and noisy ones
            if self.verbose:
                print("Finding the {} nearest neighbours...".format(self.m))

            # As usual, fit a nearest neighbour model to the data
            self.nearest_neighbour.fit(X)

            if self.verbose:
                print("done!")

            # Now, get rid of noisy support vectors

            noise_bool = self._in_danger_noise(support_vector, y, kind='noise')

            # Remove noisy support vectors
            support_vector = support_vector[np.logical_not(noise_bool)]
            danger_bool = self._in_danger_noise(support_vector, y,
                                                kind='danger')
            safety_bool = np.logical_not(danger_bool)

            if self.verbose:
                print("Out of {0} support vectors, {1} are noisy, "
                      "{2} are in danger "
                      "and {3} are safe.".format(support_vector.shape[0],
                                                 noise_bool.sum().astype(int),
                                                 danger_bool.sum().astype(int),
                                                 safety_bool.sum().astype(int)
                                                 ))

                # Proceed to find support vectors NNs among the minority class
                print("Finding the {} nearest neighbours...".format(self.k))

            self.nearest_neighbour.set_params(**{'n_neighbors': self.k + 1})
            self.nearest_neighbour.fit(X_min)

            if self.verbose:
                print("done!")
                print("Creating synthetic samples...", end="")

            # Split the number of synthetic samples between interpolation and
            # extrapolation

            # The fraction is sampled from a beta distribution with mean
            # 0.5 and variance ~0.01
            np.random.seed(self.random_state)
            fractions = beta(10, 10)

            # Interpolate samples in danger
            if np.count_nonzero(danger_bool) > 0:
                nns = self.nearest_neighbour.kneighbors(
                    support_vector[danger_bool],
                    return_distance=False)[:, 1:]

                X_new_1, y_new_1 = self._make_samples(
                    support_vector[danger_bool],
                    self.min_c_,
                    X_min,
                    nns,
                    int(fractions * (num_samples + 1)),
                    step_size=1.)

            # Extrapolate safe samples
            if np.count_nonzero(safety_bool) > 0:
                nns = self.nearest_neighbour.kneighbors(
                    support_vector[safety_bool],
                    return_distance=False)[:, 1:]

                X_new_2, y_new_2 = self._make_samples(
                    support_vector[safety_bool],
                    self.min_c_,
                    X_min,
                    nns,
                    int((1 - fractions) * num_samples),
                    step_size=-self.out_step)

            if self.verbose:
                print("done!")

            # Concatenate the newly generated samples to the original data set
            if (np.count_nonzero(danger_bool) > 0 and
                    np.count_nonzero(safety_bool) > 0):
                X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0)
                y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0)
            # No support vectors in danger
            elif np.count_nonzero(danger_bool) == 0:
                X_resampled = np.concatenate((X, X_new_2), axis=0)
                y_resampled = np.concatenate((y, y_new_2), axis=0)
            # All support vectors are in danger
            elif np.count_nonzero(safety_bool) == 0:
                X_resampled = np.concatenate((X, X_new_1), axis=0)
                y_resampled = np.concatenate((y, y_new_1), axis=0)

            # Reset the k-neighbours to m+1 neighbours
            self.nearest_neighbour.set_params(**{'n_neighbors': self.m+1})

            return X_resampled, y_resampled
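A minimal usage sketch for the SMOTE class above, assuming its OverSampler base class, check_X_y and the other helpers it imports are available from the same package; the data is a toy imbalanced two-class set:

import numpy as np

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(100, 2), rng.randn(10, 2) + 3.0])   # 100 majority, 10 minority
y = np.hstack([np.zeros(100, dtype=int), np.ones(10, dtype=int)])

sm = SMOTE(ratio='auto', random_state=1, verbose=False, kind='regular', k=5)
sm.fit(X, y)
X_resampled, y_resampled = sm.sample(X, y)
print(X_resampled.shape, np.bincount(y_resampled))   # minority class oversampled to balance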
Example #23
class KNNRegressor:

    def __init__(self, k, strategy='my_own', metric='euclidean', mode='uniform'):
        if not isinstance(k, int) or k < 1:
            raise AttributeError('Incorrect "k" parameter')
        if not isinstance(mode, str) or mode not in ('uniform', 'distance'):
            raise AttributeError('Mode parameter can be uniform or distance only')

        self.mode = mode
        self.k = k
        self.strategy = strategy
        self.metric = metric
        self.training_labels = None
        if strategy == 'my_own':
            self.training_data = None
        else:
            self.nn = NearestNeighbors(n_neighbors=k, algorithm=strategy, leaf_size=30, metric=metric)

    def fit(self, x, y):
        if x.shape[0] != y.shape[0]:
            raise AttributeError('Mismatch between training set and its labels')

        self.training_labels = y
        if self.strategy == 'my_own':
            self.training_data = x
        else:
            self.nn.fit(x)

    def find_kneighbors(self, x, return_distance=True):
        if not isinstance(return_distance, bool):
            raise AttributeError('Incorrect "return_distance" parameter')

        if self.strategy == 'my_own':
            if self.metric == 'euclidean':
                dist_matrix = distances.euclidean_distance(x, self.training_data)
            elif self.metric == 'cosine':
                dist_matrix = distances.cosine_distance(x, self.training_data)
            else:
                dist_matrix = self.metric(self.training_data, x).astype(np.float64).T
            if not return_distance:
                res_index = np.empty(dist_matrix.shape[0], dtype=np.int64)
                tmp_index = np.empty(dist_matrix.shape[0], dtype=np.int64)
                np.argmin(dist_matrix, axis=1, out=res_index)
                dist_matrix[np.arange(dist_matrix.shape[0]), res_index] = np.inf
                res_index = res_index.reshape((-1, 1))
                for i in range(self.k - 1):
                    np.argmin(dist_matrix, axis=1, out=tmp_index)
                    dist_matrix[np.arange(dist_matrix.shape[0]), tmp_index] = np.inf
                    res_index = np.hstack((res_index, tmp_index[:, np.newaxis]))
                return res_index
            else:
                res_index = np.empty(dist_matrix.shape[0], dtype=np.int64)
                tmp_index = np.empty(dist_matrix.shape[0], dtype=np.int64)
                np.argmin(dist_matrix, axis=1, out=res_index)
                res_dist = dist_matrix[np.arange(dist_matrix.shape[0]), res_index]
                dist_matrix[np.arange(dist_matrix.shape[0]), res_index] = np.inf
                res_index = res_index.reshape((-1, 1))
                res_dist = res_dist.reshape((-1, 1))
                for i in range(self.k - 1):
                    np.argmin(dist_matrix, axis=1, out=tmp_index)
                    res_dist = np.hstack((res_dist,
                                          dist_matrix[np.arange(dist_matrix.shape[0]), tmp_index][:, np.newaxis]))
                    dist_matrix[np.arange(dist_matrix.shape[0]), tmp_index] = np.inf
                    res_index = np.hstack((res_index, tmp_index[:, np.newaxis]))
                return res_dist, res_index
        else:
            return self.nn.kneighbors(x, return_distance=return_distance)

    def predict(self, x, k=None):
        if k is not None:
            if not isinstance(k, int) or k < 1:
                raise AttributeError('Incorrect "k" parameter')
            else:
                if self.strategy == 'my_own':
                    self.k = k
                else:
                    params = self.nn.get_params()
                    params['n_neighbors'] = k
                    self.nn = self.nn.set_params(**params)

        if self.mode == 'uniform':
            nn_index = self.training_labels[self.find_kneighbors(x, return_distance=False)]
            return np.mean(nn_index, axis=1)
        else:
            vec_weight = np.vectorize(lambda z: 1 / (z + 0.00001))
            nn_dist, nn_index = self.find_kneighbors(x)
            nn_index = self.training_labels[nn_index]
            nn_dist = vec_weight(nn_dist)
            return np.sum(nn_index * nn_dist, axis=1) / np.sum(nn_dist, axis=1)
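A minimal usage sketch for KNNRegressor above, using strategy='brute' so neighbour search is delegated to sklearn's NearestNeighbors (the 'my_own' path depends on a custom distances module not shown here):

import numpy as np

rng = np.random.RandomState(0)
X_train = rng.rand(50, 3)
y_train = X_train.sum(axis=1)          # toy regression target
X_test = rng.rand(5, 3)

reg = KNNRegressor(k=5, strategy='brute', metric='euclidean', mode='distance')
reg.fit(X_train, y_train)
print(reg.predict(X_test))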
Example #24
class KnnRecommender:
    """
    This is an item-based collaborative filtering recommender with
    KNN implemented with sklearn
    """
    def __init__(self, train_file, test_file, do_five_fold_cs):

        self.train_file = train_file
        self.test_file = test_file
        self.do_five_fold_cs = do_five_fold_cs
        self.model = NearestNeighbors()

    def set_model_params(self, n_neighbors, algorithm, metric, n_jobs=None):

        if n_jobs and (n_jobs > 1 or n_jobs == -1):
            os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
        self.model.set_params(
            **{
                'n_neighbors': n_neighbors,
                'algorithm': algorithm,
                'metric': metric,
                'n_jobs': n_jobs
            })

    def make_recommendations(self, n_recommendations):

        # get data
        data_train, data_test, hashmap, _ = du.read_train_and_val_data_to_index(
            self.train_file, self.test_file)
        train_data_one_hot = du.to_one_hot_train(data_train, len(hashmap))
        self.model.fit(train_data_one_hot)

        test_one_hot_gen = du.to_one_hot_with_gt_generator(
            data_test, len(hashmap), False)
        correct = 0
        for i in range(len(data_test)):
            test_case, gt = next(test_one_hot_gen)
            test_case = test_case.reshape((1, -1))
            distances, indices = self.model.kneighbors(test_case,
                                                       n_neighbors=100)
            distances = distances.flatten()
            indices = indices.flatten()
            test_case = test_case.flatten().astype(float)
            icd_pred = []
            case_pred = np.zeros_like(test_case)
            for j, idx in enumerate(indices):
                case_pred += (
                    (train_data_one_hot[idx, :].astype(float) - test_case) /
                    distances[j])

            pred_idx = case_pred.argsort()[-5:][::-1]
            for idx in pred_idx:
                icd_pred.append(hashmap[idx])
            gt_idx = np.array(np.where(gt == 1)).item(0)
            gt_icd = hashmap[gt_idx]
            c = False
            if gt_icd in icd_pred:
                c = True
                correct += 1
            print(
                str(i) + " Predicted: " + str(icd_pred) + "   GT: " + gt_icd +
                "   " + str(c) + "   " + str(float(correct) / float(i + 1)))

        print("Top 5 Acc: " + str(float(correct) / float(len(data_test))))
Example #25
class KnnClass:
    def __init__(self, movies_path, ratings_path):

        # Path for movie csv containing movies data
        self.movies_path = movies_path
        # Path for ratings csv containing ratings data
        self.ratings_path = ratings_path
        # Movie rating (0-5)
        self.movie_rating_thres = 0
        # Number of user ratings for a movie
        self.user_rating_thres = 0
        self.model = NearestNeighbors()
        # Create t0 to calculate estimated finish time
        self.t0 = 0

    def SetFilterParams(self, movie_rating_thres, user_rating_thres):

        # Set movie and user rating frequency threshold
        self.movie_rating_thres = movie_rating_thres
        self.user_rating_thres = user_rating_thres

        # Start the timer
        self.t0 = time.time()

    def SetModelParams(self, n_neighbors, algorithm, metric, jobs=None):

        # Setting up the model parameters for the sklearn NearestNeighbors
        if jobs and (jobs > 1 or jobs == -1):
            os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
        self.model.set_params(
            **{
                'n_neighbors': n_neighbors,
                'algorithm': algorithm,
                'metric': metric,
                'n_jobs': jobs
            })

    def PrepareData(self):

        ### Prepare the data for the recommender

        # Read the data from movies csv
        movies = pd.read_csv(os.path.join(self.movies_path),
                             usecols=['movieId', 'title'],
                             dtype={
                                 'movieId': 'int32',
                                 'title': 'str'
                             })
        ratings = pd.read_csv(os.path.join(self.ratings_path),
                              usecols=['userId', 'movieId', 'rating'],
                              dtype={
                                  'userId': 'int32',
                                  'movieId': 'int32',
                                  'rating': 'float32'
                              })
        # Filter the data
        movies_count = pd.DataFrame(ratings.groupby('movieId').size(),
                                    columns=['count'])
        popular_movies = list(
            set(movies_count.query('count >= @self.movie_rating_thres').index))
        movies_filter = ratings.movieId.isin(popular_movies).values

        users_count = pd.DataFrame(ratings.groupby('userId').size(),
                                   columns=['count'])
        active_users = list(
            set(users_count.query('count >= @self.user_rating_thres').index))
        users_filter = ratings.userId.isin(active_users).values

        ratings_filtered = ratings[movies_filter & users_filter]

        # Pivot and create the movie-user matrix
        movie_user_mat = ratings_filtered.pivot(index='movieId',
                                                columns='userId',
                                                values='rating').fillna(0)
        # Create mapper from movie title to index
        hashmap = {
            movie: i
            for i, movie in enumerate(
                list(
                    movies.set_index('movieId').loc[
                        movie_user_mat.index].title))
        }
        # Transform matrix to scipy sparse matrix
        movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

        # Clean up memory
        del movies, movies_count, users_count
        del ratings, ratings_filtered, movie_user_mat
        gc.collect()
        return movie_user_mat_sparse, hashmap

    def FindMovieMatch(self, hashmap, user_move_input):

        ### Transform the movie name inputted by the user to lower case
        ### Map movie title name to index of the movie in data
        ### And use the fuzz library ratio function to find a match

        match = []
        # get match
        for move_title, index in hashmap.items():
            ratio = fuzz.ratio(move_title.lower(), user_move_input.lower())
            if ratio >= 60:
                match.append((move_title, index, ratio))
        # sort
        match = sorted(match, key=lambda x: x[2])[::-1]
        if not match:
            print('No match is found')
        else:
            print('Found matches in our database: '
                  '{0}\n'.format([x[0] for x in match]))
            return match[0][1]

    def FindData(self, model, data, hashmap, movie_chosen, n_recommendations):

        ### Return top movies that are similar to the user's movie input

        # Fit the data to our model
        model.fit(data)

        # Get movie index
        print('You have input movie:', movie_chosen)
        index = self.FindMovieMatch(hashmap, movie_chosen)
        # FindData
        print('Finding movies..')
        print('......\n')

        distances, indices = model.kneighbors(data[index],
                                              n_neighbors=n_recommendations +
                                              1)
        # Get list of raw index of recommendations
        raw_recommends = \
            sorted(
                list(
                    zip(
                        indices.squeeze().tolist(),
                        distances.squeeze().tolist()
                    )
                ),
                key=lambda x: x[1]
            )[:0:-1]
        self.timeNeeded = 'It took {:.2f}s to finish \n\
              '.format(time.time() - self.t0)

        # return recommendation (movieId, distance)
        return raw_recommends

    def Recommend(self, movie_chosen, recommendations_count):

        # Prepare the data, load the sparse matrix and the hashmap
        movieUserMatSparse, hashmap = self.PrepareData()
        # Find recommendations
        rawRecommends = self.FindData(self.model, movieUserMatSparse, hashmap,
                                      movie_chosen, recommendations_count)

        # Create the return string of the recommended movies
        recommended_movies = ""

        # Print the results
        reversed_hashmap = {v: k for k, v in hashmap.items()}
        print('Recommendations for {}:'.format(movie_chosen))
        for i, (index, dist) in enumerate(rawRecommends):
            print('{0}: {1}, with distance '
                  'of {2}'.format(recommendations_count - i,
                                  reversed_hashmap[index], dist))

            recommended_movies = '{0}: {1}'.format(
                recommendations_count - i,
                reversed_hashmap[index]) + "\n" + recommended_movies

        return recommended_movies + '\n\n' + str(self.timeNeeded)
Example #26
class KnnRecommender:
    def __init__(self, path_movies, path_ratings):
        self.path_movies = path_movies
        self.path_ratings = path_ratings
        self.movie_rating_thres = 0
        self.user_rating_thres = 0
        self.model = NearestNeighbors()

    def _prep_data(self):
        df_movies = pd.read_csv('movies.csv',
                                usecols=['movieId', 'title'],
                                dtype={
                                    'movieId': 'int32',
                                    'title': 'str'
                                })

        df_ratings = pd.read_csv('ratings.csv',
                                 usecols=['userId', 'movieId', 'rating'],
                                 dtype={
                                     'userId': 'int32',
                                     'movieId': 'int32',
                                     'rating': 'float32'
                                 })

        # filter data
        # count the number of ratings for each movie
        df_movies_cnt = pd.DataFrame(df_ratings.groupby('movieId').size(),
                                     columns=['count'])

        popular_movies = list(
            set(
                df_movies_cnt.query(
                    'count >= @self.movie_rating_thres').index))  # noqa
        movies_filter = df_ratings.movieId.isin(popular_movies).values

        df_users_cnt = pd.DataFrame(df_ratings.groupby('userId').size(),
                                    columns=['count'])
        active_users = list(
            set(df_users_cnt.query(
                'count >= @self.user_rating_thres').index))  # noqa
        users_filter = df_ratings.userId.isin(active_users).values

        df_ratings_filtered = df_ratings[movies_filter & users_filter]

        # pivot and create movie-user matrix
        movie_user_mat = df_ratings_filtered.pivot(index='movieId',
                                                   columns='userId',
                                                   values='rating').fillna(0)

        # create mapper from movie title to index
        hashmap = {
            movie: i
            for i, movie in enumerate(
                list(
                    df_movies.set_index('movieId').loc[
                        movie_user_mat.index].title))  # noqa
        }

        # transform matrix to scipy sparse matrix
        movie_user_mat_sparse = csr_matrix(movie_user_mat.values)
        # clean up
        del df_movies, df_movies_cnt, df_users_cnt
        del df_ratings, df_ratings_filtered, movie_user_mat
        gc.collect()

        return movie_user_mat_sparse, hashmap

    def set_filter_params(self, movie_rating_thres, user_rating_thres):
        """
        defina o limite de frequência de classificação para filtrar filmes menos conhecidos e usuários menos ativos

        Parameters
        ----------
        movie_rating_thres: int, número mínimo de classificações recebidas pelos usuários
        user_rating_thres: int, número mínimo de classificações que um usuário fornece
        """
        self.movie_rating_thres = movie_rating_thres
        self.user_rating_thres = user_rating_thres

    def set_model_params(self, n_neighbors, algorithm, metric, n_jobs=None):

        if n_jobs and (n_jobs > 1 or n_jobs == -1):
            os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'

        self.model.set_params(
            **{
                'n_neighbors': n_neighbors,
                'algorithm': algorithm,
                'metric': metric,
                'n_jobs': n_jobs
            })

    def _fuzzy_matching(self, hashmap, fav_movie):
        # print("\nHASHMAP")
        # print(hashmap)

        match_tuple = []
        # get match
        for title, idx in hashmap.items():
            ratio = fuzz.ratio(title.lower(), fav_movie.lower())
            if ratio >= 60:
                match_tuple.append((title, idx, ratio))
        # sort
        match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
        if not match_tuple:
            print('Oops! No match is found')
        else:
            print(
                'Found possible matches in our database: '
                '{0}\n'.format([x[0] for x in match_tuple]))
            return match_tuple[0][1]

    def _inference(self, model, data, hashmap, fav_movie, n_recommendations):
        # fit
        # print("\nAQUI É DATA")
        # print(data)
        # print("\n")
        model.fit(data)
        # get input movie index
        print('You have input movie:', fav_movie)
        idx = self._fuzzy_matching(hashmap, fav_movie)
        print("\nIDX")
        print(idx)
        # inference
        print('Recommendation system starts to make inference')
        print('......\n')
        t0 = time.time()
        distances, indices = model.kneighbors(data[idx],
                                              n_neighbors=n_recommendations +
                                              1)
        # print("\nINDICES")
        # print(indices)
        # get list of raw idx of recommendations
        raw_recommends = \
            sorted(
                list(
                    zip(
                        indices.squeeze().tolist(),
                        distances.squeeze().tolist()
                    )
                ),
                key=lambda x: x[1]
            )[:0:-1]

        print('It took my system {:.2f}s to make inference \n\
              '.format(time.time() - t0))
        # print('\nRAW')
        # print(raw_recommends)
        # return recommendation (movieId, distance)
        return raw_recommends

    def make_recommendations(self, fav_movie, n_recommendations):
        filmesRecomendados = []
        # get data
        movie_user_mat_sparse, hashmap = self._prep_data()

        # get recommendations
        raw_recommends = self._inference(self.model, movie_user_mat_sparse,
                                         hashmap, fav_movie, n_recommendations)
        # print results

        # print(hashmap)
        reverse_hashmap = {v: k for k, v in hashmap.items()}
        # print('Recommendations for {}:'.format(fav_movie))
        for i, (idx, dist) in enumerate(raw_recommends):
            # print('{0}: {1}, with distance '
            #   'of {2}'.format(i+1, reverse_hashmap[idx], dist))
            filmesRecomendados.append(reverse_hashmap[idx])
        # print("\nREVERSE")
        # print(reverse_hashmap)
        return filmesRecomendados
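The sorted(...)[:0:-1] slice used in _inference above is easy to misread: the (index, distance) pairs are sorted by ascending distance, and [:0:-1] then reverses the list while dropping element 0, which is the query movie itself (distance 0 to itself). Below is a minimal driver sketch; it assumes the class above is the same KnnRecommender(path_movies, path_ratings) shown in the later examples, and the CSV paths are placeholders.
# Hypothetical driver for the recommender above (class name and constructor
# arguments are assumed from the later KnnRecommender examples).
movies_csv = 'ml-latest-small/movies.csv'    # placeholder path
ratings_csv = 'ml-latest-small/ratings.csv'  # placeholder path

recommender = KnnRecommender(movies_csv, ratings_csv)
recommender.set_filter_params(movie_rating_thres=50, user_rating_thres=50)
recommender.set_model_params(n_neighbors=20, algorithm='brute', metric='cosine', n_jobs=-1)
print(recommender.make_recommendations('Toy Story', n_recommendations=10))

# The [:0:-1] idiom on a toy list: the pairs are sorted by distance, element 0
# (the query itself, distance 0.0) is dropped, and the rest comes back farthest-first.
pairs = sorted([(3, 0.0), (7, 0.4), (1, 0.2)], key=lambda x: x[1])
print(pairs[:0:-1])  # [(7, 0.4), (1, 0.2)]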
Example #27
0
class RecommendationSystem:
    """
    item-based collaborative filtering for movie-lens dataset using sklearn's nearest neighbors
    
    """
    def __init__(self,movie_th,user_th,movie_path,ratings_path,users_path):
        
        """
        Description:
            1. Initialization of threshold values
            2. Initialization of file paths
            3. Initialization of models
        
        """
        self.movie_th = movie_th
        self.user_th = user_th
        self.movies_path = movie_path
        self.ratings_path = ratings_path
        self.path = pathlib.Path().absolute() / 'ml-1m'
        self.users_path = users_path
        self.model = NearestNeighbors()
        self.data_matrix = None 
        self.dic_movie_name = None
        self.dic_movie_id = None
    def define_model_parameters(self,n_neighbors,algorithm,metric,jobs=1):
        
        #print(n_neighbors,algorithm,metric,jobs)
        """
        Args:
            n_neighbors : number of neighbors used for calculating similarity
            algorithm : type of algorithm used for building the model, i.e. brute force, KDTree, BallTree
            metric : how similarity is measured, i.e. cosine, l1, l2 norm
            jobs : number of processors to be used in parallel
        """
        self.model.set_params(**{
            'n_neighbors':n_neighbors,
            'algorithm': algorithm,
            'metric': metric,
            'n_jobs' : jobs
        })
            
        
        
    def read_clean_prepare(self):
        
        
        
        self.ratings = pd.read_csv(
            self.path/self.ratings_path,
            sep='::',
            engine='python',  # the C parser does not support multi-character separators
            names=['userId','movieId','rating','timestamp'],
            usecols = ['userId','movieId','rating'],
            dtype= {'userId':np.int32,'movieId':np.int32,'rating':np.float32}
        )
        
        self.movies = pd.read_csv(
            self.path/self.movies_path,
            sep='::',
            engine='python',  # the C parser does not support multi-character separators
            names=['movieId','title','genres'],
            usecols = ['movieId','title'],
            dtype= {'movieId':np.int32,'title':str}
        )
        
        self.movies.title = self.movies.title.str.lower()
        self.movies['name'] = self.movies.title.str.replace(r'\(\d{4}\)', "", regex=True)
        self.movies.name = self.movies.name.str.rstrip()
        self.dic_movie_name = dict([(row.movieId,row.title) for row in self.movies.itertuples()])
        self.dic_movie_id = dict([(row.name,row.movieId) for row in self.movies.itertuples()])
        temp = self.ratings.groupby('movieId').agg({'userId':len})
        self.pop_movies = temp[temp['userId'] >= self.movie_th].index
        
        
        self.data_matrix = self.ratings.pivot(index = 'movieId',columns = 'userId', values = 'rating').fillna(0)
        # Keep the movieId index so sparse-matrix row positions can be mapped back to movie ids
        self.movie_index = self.data_matrix.index
        self.data_matrix = csr_matrix(self.data_matrix)
        
        
        time.sleep(3)
        self.define_model_parameters(20,'brute','cosine')
        
        
        self.model.fit(self.data_matrix)
        
    def plot_rating_freq(self):
        temp = self.ratings.groupby('movieId').agg({'userId':len})

        #temp = temp.reset_index()
        temp = temp.sort_values(by=['userId'],ascending=False)
        temp.index = pd.RangeIndex(0,temp.shape[0])
        plt.plot(temp.index,temp['userId'])
        plt.title('movie rating frequency')
        plt.xlabel("movies")
        plt.ylabel("number of ratings")
        
        
    def recommend(self,fav,how_many):
        
        
        """
        Args:
        
            fav: name of favorite movie
            how_many : number of similar movies to be recommended
            
        Returns:
            
            Exits if the movie not in database else returns the name of the movies.
        
        """
        if fav not in self.dic_movie_id:
            print("Movie not found in the database\n")
            exit() 
        
        fav_id = self.dic_movie_id[fav]
        # Map the movieId to its row position in the sparse matrix
        fav_pos = self.movie_index.get_loc(fav_id)
        _,recommendations = self.model.kneighbors(self.data_matrix[fav_pos],n_neighbors=how_many+1)
        
        # Map row positions back to movie ids, then to titles (skip the query movie itself)
        recommendations = [self.dic_movie_name[self.movie_index[x]] for x in recommendations[0][1:]]
        #print(recommendations)
        print("Top {} recommendations are:\n".format(how_many))
        
        for i in range(how_many):
            
            print("{}. {}".format(i+1,recommendations[i]))
        return 
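A minimal driver sketch for RecommendationSystem, assuming the MovieLens 1M .dat files sit under ./ml-1m (the directory hard-coded in __init__) and that pandas, numpy, csr_matrix and NearestNeighbors are imported as in the snippet; the file names follow the standard ml-1m layout and are assumptions.
# Sketch only; the .dat file names follow the MovieLens 1M layout.
recommender = RecommendationSystem(movie_th=50, user_th=50,
                                   movie_path='movies.dat',
                                   ratings_path='ratings.dat',
                                   users_path='users.dat')
recommender.read_clean_prepare()   # loads the data, builds the sparse matrix, fits the model
recommender.plot_rating_freq()     # optional: long-tail plot of ratings per movie
# recommend() looks the title up in dic_movie_id, which stores lowercase names without the year
recommender.recommend('toy story', how_many=10)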
Example #28
0
class KnnRecommender:
    """
    This is an item-based collaborative filtering recommender with
    KNN implemented by sklearn
    """

    # Define the __init__ method; parameters such as path_movies and path_ratings are passed in
    def __init__(self, path_movies, path_ratings):

        self.path_movies = path_movies
        self.path_ratings = path_ratings
        self.movie_rating_thres = 0
        self.user_rating_thres = 0
        # This step makes model a NearestNeighbors instance; the model used below is already of that type
        self.model = NearestNeighbors()

        self.train_data, self.test_data = self._prep_data()

    def set_filter_params(self, movie_rating_thres, user_rating_thres):
        """
        movie_rating_thres: int, minimum number of ratings received by users
        user_rating_thres: int, minimum number of ratings a user gives
        """
        # Set rating-frequency thresholds to filter out little-known movies (roughly the top 25% of movies) and inactive users (roughly the top 40% of users)
        self.movie_rating_thres = movie_rating_thres
        self.user_rating_thres = user_rating_thres

    def set_model_params(self, n_neighbors, algorithm, metric, n_jobs=None):
        """
        What this step does:
        Initialize the NearestNeighbors instance as model_knn and fit the sparse matrix to it.
        By specifying metric = cosine, the model measures the similarity between artist vectors using cosine similarity.
        Set model params for sklearn.neighbors.NearestNeighbors
        Parameters
        ----------
        n_neighbors: int, optional (default = 5)
        algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        metric: string or callable, default 'minkowski', or one of
            ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
        n_jobs: int or None, optional (default=None)
        """
        if n_jobs and (n_jobs > 1 or n_jobs == -1):
            os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
        self.model.set_params(
            **{
                'n_neighbors': n_neighbors,
                'algorithm': algorithm,
                'metric': metric,
                'n_jobs': n_jobs
            })

    # Load the data
    def _prep_data(self):
        """
        prepare data for recommender
        1. movie-user scipy sparse matrix
        2. hashmap of movie to row index in movie-user scipy sparse matrix
        """
        # Read in the data; join and process the tables
        df_movies = pd.read_csv(os.path.join(self.path_movies),
                                usecols=['movieId', 'title'],
                                dtype={
                                    'movieId': 'int32',
                                    'title': 'str'
                                })
        df_ratings = pd.read_csv(os.path.join(self.path_ratings),
                                 usecols=['userId', 'movieId', 'rating'],
                                 dtype={
                                     'userId': 'int32',
                                     'movieId': 'int32',
                                     'rating': 'float32'
                                 })
        # filter data
        df_movies_cnt = pd.DataFrame(df_ratings.groupby('movieId').size(),
                                     columns=['count'])
        popular_movies = list(
            set(
                df_movies_cnt.query(
                    'count >= @self.movie_rating_thres').index))  # noqa
        movies_filter = df_ratings.movieId.isin(popular_movies).values

        df_users_cnt = pd.DataFrame(df_ratings.groupby('userId').size(),
                                    columns=['count'])
        active_users = list(
            set(df_users_cnt.query(
                'count >= @self.user_rating_thres').index))  # noqa
        users_filter = df_ratings.userId.isin(active_users).values

        df_ratings_filtered = df_ratings[movies_filter & users_filter]

        # pivot and create movie-user matrix
        movie_user_mat = df_ratings_filtered.pivot(index='movieId',
                                                   columns='userId',
                                                   values='rating').fillna(0)
        # create mapper from movie title to index
        '''
        hashmap = {
            movie: i for i, movie in
            enumerate(list(df_movies.set_index('movieId').loc[movie_user_mat.index].title)) # noqa
        }
        # transform matrix to scipy sparse matrix
        '''
        # From here on, convert the data above into a sparse matrix, since linear-algebra operations will be performed on it
        movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

        # clean up
        del df_movies, df_movies_cnt, df_users_cnt
        del df_ratings, df_ratings_filtered, movie_user_mat
        gc.collect()

        # Build the train and test datasets: split the data 70% train / 30% test at random
        # (scipy sparse matrices have no randomSplit method, so sklearn's train_test_split is used here to split the rows)
        train_data, test_data = train_test_split(movie_user_mat_sparse, train_size=0.7, random_state=1)
        # Note: movie_user_mat_sparse above is already a sparse matrix, so train and test here are matrices as well
        return train_data, test_data

    # Added code
    # Define cross-validation
    def Cross_validation(self):
        # Use the 70% training data and run cross-validation
        # To find the optimal n_neighbors (the K value) directly, use GridSearchCV for hyperparameter tuning
        # GridSearchCV works by training the model repeatedly over the parameter range we specify, so the model
        # can be tested with each parameter value to find the one that gives the best accuracy

        #create new a knn model
        knn2 = KNeighborsClassifier()
        #create a dictionary of all values we want to test for n_neighbors
        param_grid = {'n_neighbors': np.arange(1, 25)}
        #use gridsearch to test all values for n_neighbors
        # The grid-search model takes the new k-NN classifier, the param_grid, and a cross-validation value of 5
        # in order to find the best value of 'n_neighbors'
        knn_gscv = GridSearchCV(knn2, param_grid, cv=5)

        # fit model to data
        # X and Y here are two row subsets of train_data, obtained with scikit-learn's train_test_split
        # (stratify is dropped because the rating matrix carries no class labels to stratify on)
        X, Y = train_test_split(self.train_data,
                                test_size=0.2,
                                random_state=1)
        # fit model to data
        knn_gscv.fit(X, Y)

        # check which of the values for 'n_neighbors' that we tested performed the best
        # knn_gscv.best_params_ returns a dict such as {'n_neighbors': 14};
        # the best K is the value stored under that key
        k = knn_gscv.best_params_['n_neighbors']
        return k

    # Add two more methods
    def accuracy(self):

        knn2 = KNeighborsClassifier()
        param_grid = {'n_neighbors': np.arange(1, 25)}
        knn_gscv = GridSearchCV(knn2, param_grid, cv=5)
        knn_gscv.fit(self.train_data, self.test_data)
        # The step below checks the mean score for the top performing value of n_neighbors;
        # best_score_ reports the mean accuracy obtained through cross-validation
        accuracy = knn_gscv.best_score_
        return accuracy

    # Movie recommendation starts below; the same structure maps directly onto the current project (furniture recommendation), with the relevant conditions and parameters still to be adjusted
    def _fuzzy_matching(self, hashmap, fav_movie):
        """
        return the closest match via fuzzy ratio.
        If no match found, return None
        Parameters
        ----------
        hashmap: dict, map movie title name to index of the movie in data
        fav_movie: str, name of user input movie
        Return
        ------
        index of the closest match
        """
        match_tuple = []
        # get match
        for title, idx in hashmap.items():
            ratio = fuzz.ratio(title.lower(), fav_movie.lower())
            if ratio >= 60:
                match_tuple.append((title, idx, ratio))
        # sort
        match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
        if not match_tuple:
            print('Oops! No match is found')
        else:
            print('Found possible matches in our database: '
                  '{0}\n'.format([x[0] for x in match_tuple]))
            return match_tuple[0][1]

    def _inference(self, model, data, hashmap, fav_movie, n_recommendations):
        """
        return top n similar movie recommendations based on user's input movie
        Parameters
        ----------
        model: sklearn model, knn model
        data: movie-user matrix
        hashmap: dict, map movie title name to index of the movie in data
        fav_movie: str, name of user input movie
        n_recommendations: int, top n recommendations
        Return
        ------
        list of top n similar movie recommendations
        """
        # fit
        model.fit(data)
        # get input movie index
        print('You have input movie:', fav_movie)
        idx = self._fuzzy_matching(hashmap, fav_movie)
        # inference
        print('Recommendation system start to make inference')
        print('......\n')
        t0 = time.time()
        distances, indices = model.kneighbors(data[idx],
                                              n_neighbors=n_recommendations +
                                              1)
        # get list of raw idx of recommendations
        raw_recommends = \
            sorted(
                list(
                    zip(
                        indices.squeeze().tolist(),
                        distances.squeeze().tolist()
                    )
                ),
                key=lambda x: x[1]
            )[:0:-1]
        print('It took my system {:.2f}s to make inference \n\
              '.format(time.time() - t0))
        # return recommendation (movieId, distance)
        return raw_recommends

    def make_recommendations(self, fav_movie, n_recommendations):
        """
        make top n movie recommendations
        Parameters
        ----------
        fav_movie: str, name of user input movie
        n_recommendations: int, top n recommendations
        """
        # get data
        movie_user_mat_sparse, hashmap = self._prep_data()
        # get recommendations
        raw_recommends = self._inference(self.model, movie_user_mat_sparse,
                                         hashmap, fav_movie, n_recommendations)
        # print results
        reverse_hashmap = {v: k for k, v in hashmap.items()}
        print('Recommendations for {}:'.format(fav_movie))
        for i, (idx, dist) in enumerate(raw_recommends):
            print('{0}: {1}, with distance '
                  'of {2}'.format(i + 1, reverse_hashmap[idx], dist))
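The Cross_validation and accuracy methods above sketch a GridSearchCV tuning loop, but GridSearchCV needs labelled (X, y) pairs, which a movie-user rating matrix does not directly provide. The following self-contained sketch shows the intended pattern on a toy labelled dataset (iris is just a stand-in):
# Self-contained sketch of the GridSearchCV pattern the Cross_validation method aims for.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1, stratify=y)

param_grid = {'n_neighbors': np.arange(1, 25)}
knn_gscv = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
knn_gscv.fit(X_train, y_train)

best_k = knn_gscv.best_params_['n_neighbors']   # best_params_ is a dict
mean_cv_accuracy = knn_gscv.best_score_         # mean cross-validated accuracy for best_k
print(best_k, mean_cv_accuracy)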
Example #29
0
class SMOTE(UnbalancedDataset):
    """
    This object is an implementation of SMOTE - Synthetic Minority
    Over-sampling Technique, and the variations Borderline SMOTE 1, 2 and
    SVM-SMOTE.

    See the original papers: [1], [2], [3] for more details.

    * It does not support multiple classes automatically, but can be called
    multiple times
    """

    def __init__(self,
                 k=5,
                 m=10,
                 out_step=0.5,
                 ratio=1,
                 random_state=None,
                 kind='regular',
                 verbose=False,
                 **kwargs):
        """
        SMOTE over sampling algorithm and variations. Choose one of the
        following options: 'regular', 'borderline1', 'borderline2', 'svm'

        :param k: Number of nearest neighbours used to construct synthetic
                  samples.

        :param m: The number of nearest neighbours to use to determine if a
                  minority sample is in danger.

        :param out_step: Step size when extrapolating

        :param ratio: Fraction of the number of minority samples to
                      synthetically generate.

        :param random_state: Seed for random number generation

        :param kind: The type of smote algorithm to use one of the following
                     options: 'regular', 'borderline1', 'borderline2', 'svm'

        :param verbose: Whether or not to print status information

        :param kwargs: Additional arguments passed to sklearn SVC object
        """

        # Parent class methods
        UnbalancedDataset.__init__(self,
                                   ratio=ratio,
                                   random_state=random_state)

        # --- The type of smote
        # This object can perform regular smote over-sampling, borderline 1,
        # borderline 2 and svm smote. Since the algorithms are fairly simple
        # they share most methods.#
        self.kind = kind

        # --- Verbose
        # Control whether or not status and progress information should be printed#
        self.verbose = verbose

        # --- Nearest Neighbours for synthetic samples
        # The smote algorithm uses the k-th nearest neighbours of a minority
        # sample to generate new synthetic samples.#
        self.k = k

        # --- NN object
        # Import the NN object from scikit-learn library. Since in the smote
        # variations we must first find samples that are in danger, we
        # initialize the NN object differently depending on the method chosen#
        from sklearn.neighbors import NearestNeighbors

        if kind == 'regular':
            # Regular smote does not look for samples in danger, instead it
            # creates synthetic samples directly from the k-th nearest
            # neighbours without any filtering#
            self.nearest_neighbour_ = NearestNeighbors(n_neighbors=k + 1)
        else:
            # Borderline1, 2 and SVM variations of smote must first look for
            # samples that could be considered noise and samples that live
            # near the boundary between the classes. Therefore, before
            # creating synthetic samples from the k-th nns, it first looks
            # for m nearest neighbors to decide whether or not a sample is
            # noise or near the boundary.#
            self.nearest_neighbour_ = NearestNeighbors(n_neighbors=m + 1)

            # --- Nearest Neighbours for noise and boundary (in danger)
            # Before creating synthetic samples we must first decide if
            # a given entry is noise or in danger. We use m nns in this step#
            self.m = m

        # --- SVM smote
        # Unlike the borderline variations, the SVM variation uses the support
        # vectors to decide which samples are in danger (near the boundary).
        # Additionally it also introduces extrapolation for samples that are
        # considered safe (far from boundary) and interpolation for samples
        # in danger (near the boundary). The level of extrapolation is
        # controlled by the out_step.#
        if kind == 'svm':
            # As usual, use scikit-learn object#
            from sklearn.svm import SVC

            # Store extrapolation size#
            self.out_step = out_step

            # Store SVM object with any parameters#
            self.svm_ = SVC(**kwargs)

    def resample(self):
        """
        Main method of all children classes.

        :return: Over-sampled data set.
        """

        # Start by separating minority class features and target values.
        minx = self.x[self.y == self.minc]
        miny = self.y[self.y == self.minc]

        # If regular SMOTE is to be performed#
        if self.kind == 'regular':
            # Print if verbose is true#
            if self.verbose:
                print("Finding the %i nearest neighbours..." % self.k, end="")

            # Look for k-th nearest neighbours, excluding, of course, the
            # point itself.#
            self.nearest_neighbour_.fit(minx)

            # Matrix with k-th nearest neighbours indexes for each minority
            # element.#
            nns = self.nearest_neighbour_.kneighbors(minx,
                                                     return_distance=False)[:, 1:]

            # Print status if verbose is true#
            if self.verbose:
                ##
                print("done!")

                # Creating synthetic samples #
                print("Creating synthetic samples...", end="")

            # --- Generating synthetic samples
            # Use static method make_samples to generate minority samples
            # TODO: clean this up#
            sx, sy = self.make_samples(x=minx,
                                       nn_data=minx,
                                       y_type=self.minc,
                                       nn_num=nns,
                                       n_samples=int(self.ratio * len(miny)),
                                       step_size=1.0,
                                       random_state=self.rs,
                                       verbose=self.verbose)

            if self.verbose:
                print("done!")

            # Concatenate the newly generated samples to the original data set
            ret_x = concatenate((self.x, sx), axis=0)
            ret_y = concatenate((self.y, sy), axis=0)

            return ret_x, ret_y

        if (self.kind == 'borderline1') or (self.kind == 'borderline2'):

            if self.verbose:
                print("Finding the %i nearest neighbours..." % self.m, end="")

            # Find the NNs for all samples in the data set.
            self.nearest_neighbour_.fit(self.x)

            if self.verbose:
                print("done!")

            # Boolean array with True for minority samples in danger
            danger_index = [self.in_danger(x, self.y, self.m, miny[0],
                            self.nearest_neighbour_) for x in minx]

            # Turn into numpy array#
            danger_index = asarray(danger_index)

            # If all minority samples are safe, return the original data set.
            if not any(danger_index):
                ##
                if self.verbose:
                    print('There are no samples in danger. No borderline '
                          'synthetic samples created.')

                # All are safe, nothing to be done here.#
                return self.x, self.y

            # If we got here is because some samples are in danger, we need to
            # find the NNs among the minority class to create the new synthetic
            # samples.
            #
            # We start by changing the number of NNs to consider from m + 1
            # to k + 1
            self.nearest_neighbour_.set_params(**{'n_neighbors': self.k + 1})
            self.nearest_neighbour_.fit(minx)

            # nns...#
            nns = self.nearest_neighbour_.kneighbors(minx[danger_index],
                                                     return_distance=False)[:, 1:]

            # B1 and B2 types diverge here!!!
            if self.kind == 'borderline1':
                # Create synthetic samples for borderline points.
                sx, sy = self.make_samples(minx[danger_index],
                                           minx,
                                           miny[0],
                                           nns,
                                           int(self.ratio * len(miny)),
                                           random_state=self.rs,
                                           verbose=self.verbose)

                # Concatenate the newly generated samples to the original data set
                ret_x = concatenate((self.x, sx), axis=0)
                ret_y = concatenate((self.y, sy), axis=0)

                return ret_x, ret_y

            else:
                # Split the number of synthetic samples between only minority
                # (type 1), or minority and majority (with reduced step size)
                # (type 2).
                np.random.seed(self.rs)

                # The fraction is sampled from a beta distribution centered
                # around 0.5 with variance ~0.01#
                fractions = betavariate(alpha=10, beta=10)

                # Only minority
                sx1, sy1 = self.make_samples(minx[danger_index],
                                             minx,
                                             self.minc,
                                             nns,
                                             fractions * (int(self.ratio * len(miny)) + 1),
                                             step_size=1,
                                             random_state=self.rs,
                                             verbose=self.verbose)

                # Only majority with smaller step size
                sx2, sy2 = self.make_samples(minx[danger_index],
                                             self.x[self.y != self.minc],
                                             self.minc, nns,
                                             (1 - fractions) * int(self.ratio * len(miny)),
                                             step_size=0.5,
                                             random_state=self.rs,
                                             verbose=self.verbose)

                # Concatenate the newly generated samples to the original data set
                ret_x = np.concatenate((self.x, sx1, sx2), axis=0)
                ret_y = np.concatenate((self.y, sy1, sy2), axis=0)

                return ret_x, ret_y

        if self.kind == 'svm':
            # The SVM smote model fits a support vector machine
            # classifier to the data and uses the support vector to
            # provide a notion of boundary. Unlike regular smote, where
            # such notion relies on proportion of nearest neighbours
            # belonging to each class.#

            # Fit SVM to the full data#
            self.svm_.fit(self.x, self.y)

            # Find the support vectors and their corresponding indexes
            support_index = self.svm_.support_[self.y[self.svm_.support_] == self.minc]
            support_vector = self.x[support_index]

            # First, find the nn of all the samples to identify samples in danger
            # and noisy ones
            if self.verbose:
                print("Finding the %i nearest neighbours..." % self.m, end="")

            # As usual, fit a nearest neighbour model to the data
            self.nearest_neighbour_.fit(self.x)

            if self.verbose:
                print("done!")

            # Now, get rid of noisy support vectors

            # Boolean array with True for noisy support vectors
            noise_bool = []
            for x in support_vector:
                noise_bool.append(self.is_noise(x, self.y, self.minc,
                                                self.nearest_neighbour_))

            # Turn into array#
            noise_bool = asarray(noise_bool)

            # Remove noisy support vectors
            support_vector = support_vector[np.logical_not(noise_bool)]

            # Find support_vectors there are in danger (interpolation) or not
            # (extrapolation)
            danger_bool = [self.in_danger(x,
                                          self.y,
                                          self.m,
                                          self.minc,
                                          self.nearest_neighbour_)
                           for x in support_vector]

            # Turn into array#
            danger_bool = asarray(danger_bool)

            # Support vectors not in danger are considered safe#
            safety_bool = np.logical_not(danger_bool)

            if self.verbose:
                print("Out of {0} support vectors, {1} are noisy, "
                      "{2} are in danger "
                      "and {3} are safe.".format(support_vector.shape[0],
                                                 noise_bool.sum().astype(int),
                                                 danger_bool.sum().astype(int),
                                                 safety_bool.sum().astype(int)
                                                 )
                      )

                # Proceed to find support vectors NNs among the minority class
                print("Finding the %i nearest neighbours..." % self.k, end="")

            self.nearest_neighbour_.set_params(**{'n_neighbors': self.k + 1})
            self.nearest_neighbour_.fit(minx)

            if self.verbose:
                print("done!")
                print("Creating synthetic samples...", end="")

            # Split the number of synthetic samples between interpolation and
            # extrapolation

            # The fraction are sampled from a beta distribution with mean
            # 0.5 and variance 0.01#
            np.random.seed(self.rs)
            fractions = betavariate(alpha=10, beta=10)

            # Interpolate samples in danger
            nns = self.nearest_neighbour_.kneighbors(support_vector[danger_bool],
                                                     return_distance=False)[:, 1:]

            sx1, sy1 = self.make_samples(support_vector[danger_bool],
                                         minx,
                                         self.minc, nns,
                                         fractions * (int(self.ratio * len(minx)) + 1),
                                         step_size=1,
                                         random_state=self.rs,
                                         verbose=self.verbose)

            # Extrapolate safe samples
            nns = self.nearest_neighbour_.kneighbors(support_vector[safety_bool],
                                                     return_distance=False)[:, 1:]

            sx2, sy2 = self.make_samples(support_vector[safety_bool],
                                         minx,
                                         self.minc, nns,
                                         (1 - fractions) * int(self.ratio * len(minx)),
                                         step_size=-self.out_step,
                                         random_state=self.rs,
                                         verbose=self.verbose)

            if self.verbose:
                print("done!")

            # Concatenate the newly generated samples to the original data set
            ret_x = concatenate((self.x, sx1, sx2), axis=0)
            ret_y = concatenate((self.y, sy1, sy2), axis=0)

            return ret_x, ret_y
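make_samples itself is inherited from UnbalancedDataset and is not shown here. The sketch below illustrates the interpolation step that all SMOTE variants above build on, under the usual formulation new = x + gap * (neighbour - x); it is a toy stand-in, not the library's implementation. With a negative step (as with the SVM variant's out_step), the same formula pushes the synthetic point away from the neighbour instead of toward it.
# Sketch of the core SMOTE step (not the library's make_samples itself):
# a synthetic point is placed on the segment between a minority sample and
# one of its k nearest minority neighbours.
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
minx = rng.randn(20, 2)                                    # toy minority-class samples

k = 5
nn = NearestNeighbors(n_neighbors=k + 1).fit(minx)
nns = nn.kneighbors(minx, return_distance=False)[:, 1:]    # drop the point itself

def smote_samples(x, nn_index, n_samples, step_size=1.0, random_state=None):
    rng = np.random.RandomState(random_state)
    rows = rng.randint(0, x.shape[0], n_samples)           # which minority sample
    cols = rng.randint(0, nn_index.shape[1], n_samples)    # which of its neighbours
    gaps = step_size * rng.uniform(size=(n_samples, 1))    # position along the segment
    base = x[rows]
    neigh = x[nn_index[rows, cols]]
    return base + gaps * (neigh - base)

synthetic = smote_samples(minx, nns, n_samples=30, random_state=42)
print(synthetic.shape)   # (30, 2)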
Example #30
0
    def resample(self):
        from sklearn.svm import SVC
        from sklearn.neighbors import NearestNeighbors

        svc = SVC()
        svc.set_params(**self.svm_args)

        # Fit SVM and find the support vectors
        svc.fit(self.x, self.y)
        support_index = svc.support_[self.y[svc.support_] == self.minc]
        support_vetor = self.x[support_index]

        # Start with the minority class
        minx = self.x[self.y == self.minc]

        # First, find the NN of all the samples to identify samples in danger and noisy ones
        print("Finding the %i nearest neighbours..." % self.m, end = "")
        NN = NearestNeighbors(n_neighbors = self.m + 1)
        NN.fit(self.x)
        print("done!")

        # Now, get rid of noisy support vectors

        # Boolean array with True for noisy support vectors
        noise_bool = asarray([is_noise(x, self.y, self.m, self.minc, NN) for x in support_vetor])

        # Remove noisy support vectors
        support_vetor = support_vetor[logical_not(noise_bool)]

        # Find support_vectors there are in danger (interpolation) or not (extrapolation)
        danger_bool = asarray([in_danger(x, self.y, self.m, self.minc, NN) for x in support_vetor])
        safety_bool = logical_not(danger_bool)


        print_stats = (len(support_vetor), nsum(noise_bool), nsum(danger_bool), nsum(safety_bool))
        print("Out of %i support vectors, %i are noisy, %i are in danger and %i are safe." % print_stats)

        # Proceed to find support vectors NNs among the minority class
        print("Finding the %i nearest neighbours..." % self.k, end = "")
        NN.set_params(**{'n_neighbors' : self.k + 1})
        NN.fit(minx)
        print("done!")


        print("Creating synthetic samples...", end = "")
        # Split the number of synthetic samples between interpolation and extrapolation
        Pyseed(self.rs)
        fractions = min(max(gauss(0.5, 0.1), 0), 1)

        # Interpolate samples in danger
        nns = NN.kneighbors(support_vetor[danger_bool], return_distance=False)[:, 1:]

        sx1, sy1 = make_samples(support_vetor[danger_bool], minx, self.minc, nns,\
                                fractions * (int(self.ratio * len(minx)) + 1),\
                                step_size=1,\
                                random_state=self.rs)

        # Extrapolate safe samples
        nns = NN.kneighbors(support_vetor[safety_bool], return_distance=False)[:, 1:]

        sx2, sy2 = make_samples(support_vetor[safety_bool], minx, self.minc, nns,\
                                (1 - fractions) * int(self.ratio * len(minx)),\
                                step_size=-self.out_step,\
                                random_state=self.rs)

        print("done!")

        # Concatenate the newly generated samples to the original data set
        ret_x = concatenate((self.x, sx1, sx2), axis=0)
        ret_y = concatenate((self.y, sy1, sy2), axis=0)

        return ret_x, ret_y
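This resample snippet comes from an early standalone UnbalancedDataset implementation; its maintained successor is the imbalanced-learn package, which exposes the SVM variant roughly as below. The parameter names are recalled from the imbalanced-learn API and should be treated as assumptions to check against the installed version.
# Rough modern equivalent using imbalanced-learn (assumption: the package is installed).
from collections import Counter

from sklearn.datasets import make_classification
from imblearn.over_sampling import SVMSMOTE

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
sampler = SVMSMOTE(k_neighbors=5, m_neighbors=10, out_step=0.5, random_state=0)
X_res, y_res = sampler.fit_resample(X, y)
print(Counter(y), Counter(y_res))   # class counts before and after over-sampling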
Example #31
0
class KnnRecommender:
    def __init__(self, path_movies, path_ratings):
        self.path_movies = path_movies
        self.path_ratings = path_ratings
        self.movie_rating_thres = 0
        self.user_rating_thres = 0
        self.model = NearestNeighbors()

    def set_filter_params(self, movie_rating_thres, user_rating_thres):
        self.movie_rating_thres = movie_rating_thres
        self.user_rating_thres = user_rating_thres

    def set_model_params(self, n_neighbors, algorithm, metric, n_jobs=None):
        if n_jobs and (n_jobs > 1 or n_jobs == -1):
            os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
        self.model.set_params(
            **{
                'n_neighbors': n_neighbors,
                'algorithm': algorithm,
                'metric': metric,
                'n_jobs': n_jobs
            })

    def _prep_data(self):
        df_movies = pd.read_csv(os.path.join(self.path_movies),
                                usecols=['movieId', 'title'],
                                dtype={
                                    'movieId': 'int32',
                                    'title': 'str'
                                })
        df_ratings = pd.read_csv(os.path.join(self.path_ratings),
                                 usecols=['userId', 'movieId', 'rating'],
                                 dtype={
                                     'userId': 'int32',
                                     'movieId': 'int32',
                                     'rating': 'float32'
                                 })
        df_movies_cnt = pd.DataFrame(df_ratings.groupby('movieId').size(),
                                     columns=['count'])
        popular_movies = list(
            set(
                df_movies_cnt.query(
                    'count >= @self.movie_rating_thres').index))  # noqa
        movies_filter = df_ratings.movieId.isin(popular_movies).values

        df_users_cnt = pd.DataFrame(df_ratings.groupby('userId').size(),
                                    columns=['count'])
        active_users = list(
            set(df_users_cnt.query(
                'count >= @self.user_rating_thres').index))  # noqa
        users_filter = df_ratings.userId.isin(active_users).values
        df_ratings_filtered = df_ratings[movies_filter & users_filter]
        movie_user_mat = df_ratings_filtered.pivot(index='movieId',
                                                   columns='userId',
                                                   values='rating').fillna(0)
        print(movie_user_mat)
        hashmap = {
            movie: i
            for i, movie in enumerate(
                list(
                    df_movies.set_index('movieId').loc[
                        movie_user_mat.index].title))  # noqa
        }
        print("hash->", hashmap)
        movie_user_mat_sparse = csr_matrix(movie_user_mat.values)
        del df_movies, df_movies_cnt, df_users_cnt
        del df_ratings, df_ratings_filtered, movie_user_mat
        gc.collect()
        return movie_user_mat_sparse, hashmap

    def _fuzzy_matching(self, hashmap, fav_movie):
        match_tuple = []
        # get match
        for title, idx in hashmap.items():
            ratio = fuzz.ratio(title.lower(), fav_movie.lower())
            if ratio >= 60:
                match_tuple.append((title, idx, ratio))
        # sort
        match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
        if not match_tuple:
            print('Oops! No match is found')
        else:
            print('Found possible matches in our database: '
                  '{0}\n'.format([x[0] for x in match_tuple]))
            return match_tuple[0][1]

    def _inference(self, model, data, hashmap, fav_movie, n_recommendations):
        # fit
        model.fit(data)
        # get input movie index
        print('You have input movie:', fav_movie)
        idx = self._fuzzy_matching(hashmap, fav_movie)
        print("-------->", idx, fav_movie)
        # inference
        print('Recommendation system start to make inference')
        print('......\n')
        t0 = time.time()
        distances, indices = model.kneighbors(data[idx],
                                              n_neighbors=n_recommendations +
                                              1)
        # get list of raw idx of recommendations
        raw_recommends = \
            sorted(
                list(
                    zip(
                        indices.squeeze().tolist(),
                        distances.squeeze().tolist()
                    )
                ),
                key=lambda x: x[1]
            )[:0:-1]
        print('It took my system {:.2f}s to make inference \n\
              '.format(time.time() - t0))
        # return recommendation (movieId, distance)
        return raw_recommends

    def make_recommendations(self, fav_movie, n_recommendations):
        # get data
        movie_user_mat_sparse, hashmap = self._prep_data()
        # get recommendations
        raw_recommends = self._inference(self.model, movie_user_mat_sparse,
                                         hashmap, fav_movie, n_recommendations)
        # print results
        reverse_hashmap = {v: k for k, v in hashmap.items()}
        print('Recommendations for {}:'.format(fav_movie))
        for i, (idx, dist) in enumerate(raw_recommends):
            print('{0}: {1}, with distance '
                  'of {2}'.format(i + 1, reverse_hashmap[idx], dist))
class kNN:
    def __init__(self,
                 dataObj,
                 kNeighbors,
                 distMetric='minkowski',
                 p=2,
                 metric_params=None,
                 n_jobs=None):
        # Data object with training/testdata
        self.data = dataObj

        if kNeighbors == -1:
            self.useAllNeighbors = True
        else:
            self.useAllNeighbors = False

        # Create kNN classifier
        self.nn = NearestNeighbors(n_neighbors=kNeighbors,
                                   metric=distMetric,
                                   p=p,
                                   metric_params=metric_params,
                                   n_jobs=n_jobs)

    def fit(self,
            featType,
            nSplits=3,
            randState=1010101,
            applyTransform=False,
            customData=[]):
        if featType == 'train':
            if customData == []:
                train, validation = self.data.getData('train')
                trainData = train[0]
            else:
                trainData = customData
            self.nn.fit(trainData)
            self.krank = self.nn.kneighbors(trainData)[1]
            return

        elif featType == 'test':
            query, gallery = self.data.getData('test')
        elif featType == 'validation':
            query, gallery = self.data.getData(featType)

        if self.useAllNeighbors:
            self.nn.set_params(n_neighbors=len(gallery[0]))

        if applyTransform:
            query[0] = matmul(query[0], self.U.T)
            gallery[0] = matmul(gallery[0], self.U.T)

        self.nn.fit(gallery[0])
        self.krank = self.nn.kneighbors(query[0])[1]
        # for query, gallery in self.data.getData('test'):
        #     self.nn.fit(gallery[0])
        #     # Save indices of k nearest neighbors
        #     self.krank.append(self.nn.kneighbors(array(query[0]).reshape(1, -1))[1])
        k = len(self.krank[0])
        for i in range(len(self.krank)):
            qLab = query[1][i]
            qCam = query[2][i]
            labs = gallery[1][self.krank[i]]
            cams = gallery[2][self.krank[i]]
            correctPos = where(
                logical_not(logical_and(labs == qLab, cams == qCam)))
            self.krank[i] = append(self.krank[i][correctPos],
                                   array([-1] * (k - len(correctPos[0]))))
        maxEmpty = max(sum(self.krank == -1, axis=1))
        self.krank = self.krank[:, 0:k - maxEmpty]

    # def fitModel(self, features, labels):
    #     # Fit model. Features consists of rows of features
    #     self.nn.fit(features, labels)

    def modParams(self,
                  kNeighbors,
                  distMetric='minkowski',
                  p=2,
                  metric_params=None,
                  n_jobs=None):
        self.nn.set_params(n_neighbors=kNeighbors,
                           metric=distMetric,
                           p=p,
                           metric_params=metric_params,
                           n_jobs=n_jobs)

    def setTransform(self, A, isPickle=False):
        if isPickle:
            with open(A, 'rb') as f:
                A = pickle.load(f)

        self.U = cholesky(A, lower=False)

    def setTransMat(self, A, isPickle=False):
        if isPickle:
            with open(A, 'rb') as f:
                A = pickle.load(f)
        self.transMat = A

    def calcScore(self, rank, plot=False):
        self.rankAccs = zeros(rank)

        for i in range(len(self.krank)):
            for j in range(rank):
                matches = self.data.labelsGallery[
                    self.krank[i][0:j + 1]] == self.data.labelsQuery[i]
                positiveMatches = sum(matches)
                if positiveMatches > 0:
                    self.rankAccs[j:rank] += 1
                    break

        self.rankAccs = self.rankAccs / len(self.krank)

        if plot:
            self.plotAccs(self.rankAccs)

        return self.rankAccs

    def calcMAP(self):
        self.rankMAp = 0
        for i in range(len(self.krank)):
            self.rankMAp += self.calcAP(self.data.labelsQuery[i],
                                        self.data.labelsGallery[self.krank[i]])
        self.rankMAp = self.rankMAp / len(self.krank)
        return self.rankMAp

    def plotAccs(self, rank):
        fig, axs = pyplot.subplots()
        axs.plot(rank)
        pyplot.show()

    def calcAP(self, trueLabel, neighbors):
        nNeighbors = len(neighbors)
        nMatches = sum(trueLabel == neighbors)

        precision = zeros(nNeighbors)
        recall = zeros(nNeighbors)

        if nMatches == 0:
            return 0

        recallInc = 1 / nMatches

        #trueLabelArr = array([trueLabel]*nNeighbors)

        nPoints = 11
        interp = zeros(nPoints)

        for i in range(len(precision)):
            precision[i] = mean(trueLabel == neighbors[0:i + 1])

            if i == 0:
                recall[i] = recallInc * (trueLabel == neighbors[i])
            else:
                recall[i] = recall[i - 1]
                if trueLabel == neighbors[i]:
                    recall[i] = recall[i] + recallInc

        #recall = matmul((trueLabel==neighbors)*recallInc, tril(ones((len(neighbors), len(neighbors)))))

        recall = around(recall, 10)
        for i in range(nPoints):
            idx = min(where(i * 0.1 <= recall)[0])

            interp[i] = max(precision[idx:len(precision)])

        return mean(interp)

    def kernelFit(self, featType, kernel, applyTransform=False):
        self.krank = []
        if featType == 'train':
            pass
        elif featType == 'test':
            query, gallery = self.data.getData('test')
            if applyTransform:
                query[0] = matmul(query[0], self.U.T)
                gallery[0] = matmul(gallery[0], self.U.T)
            for i in range(len(query[0])):

                dist = kernel.transform(query[0][i].reshape(1, -1), gallery[0])
                idx = argsort(dist, axis=0)
                #idx = argsort(dist, axis=0)[::-1]

                qLab = query[1][i]
                qCam = query[2][i]
                labs = gallery[1][idx]
                cams = gallery[2][idx]
                correctPos = where(
                    logical_not(logical_and(labs == qLab,
                                            cams == qCam)).flatten())
                self.krank.append(
                    append(
                        idx[correctPos],
                        array([-1] * (len(gallery[0]) - len(correctPos[0])))))

            self.krank = array(self.krank)
            maxEmpty = max(sum(self.krank == -1, axis=1))
            self.krank = self.krank[:, 0:len(gallery[0]) - maxEmpty]
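calcAP above computes an 11-point interpolated average precision for a single query, and calcMAP averages it over all queries. A standalone sketch of the same quantity, handy for checking the method against a toy ranking:
# Standalone sketch of 11-point interpolated average precision, the quantity
# calcAP computes for one query given its ranked gallery labels.
import numpy as np

def interpolated_ap(true_label, neighbors, n_points=11):
    neighbors = np.asarray(neighbors)
    matches = (neighbors == true_label).astype(float)
    if matches.sum() == 0:
        return 0.0
    ranks = np.arange(1, len(neighbors) + 1)
    precision = np.cumsum(matches) / ranks          # precision@k
    recall = np.cumsum(matches) / matches.sum()     # recall@k
    # interpolate precision at recall levels 0.0, 0.1, ..., 1.0
    interp = [precision[recall >= r].max() for r in np.linspace(0, 1, n_points)]
    return float(np.mean(interp))

# query label 1, gallery ranked best-first
print(interpolated_ap(1, [1, 0, 1, 0, 0, 1]))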
Example #33
0
class KnnRecommender:
    """
    This is an item-based collaborative filtering recommender with
    KNN implemented by sklearn
    """
    def __init__(self, sparse_matrix_file, mapper_file):
        """
        Recommender requires paths to prepared data: a saved sparse matrix and a name-to-index mapper
        Parameters
        ----------
        sparse_matrix_file: str, path to the saved game-user sparse matrix (.npz)
        mapper_file: str, path to the pickled name-to-index mapper
        """
        self.sparse_matrix_file = sparse_matrix_file
        self.mapper_file = mapper_file
        self.model = NearestNeighbors()

    def set_model_params(self, n_neighbors, algorithm, metric, n_jobs=None):
        """
        set model params for sklearn.neighbors.NearestNeighbors
        Parameters
        ----------
        n_neighbors: int, optional (default = 5)
        algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        metric: string or callable, default 'minkowski', or one of
            ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
        n_jobs: int or None, optional (default=None)
        """
        if n_jobs and (n_jobs > 1 or n_jobs == -1):
            os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
        self.model.set_params(**{
            'n_neighbors': n_neighbors,
            'algorithm': algorithm,
            'metric': metric,
            'n_jobs': n_jobs})

    def _prep_data(self):     

        game_user_mat_sparse = scipy.sparse.load_npz(os.path.join(self.sparse_matrix_file))
        with open(os.path.join(self.mapper_file), 'rb') as fp:
            hashmap = pickle.load(fp)

        return game_user_mat_sparse, hashmap

    def _fuzzy_matching(self, hashmap, fav_movie):
        """
        return the closest match via fuzzy ratio.
        If no match found, return None
        Parameters
        ----------
        hashmap: dict, map movie title name to index of the movie in data
        fav_movie: str, name of user input movie
        Return
        ------
        index of the closest match
        """
        match_tuple = []
        # get match
        for name, idx in hashmap.items():
            ratio = fuzz.ratio(name.lower(), fav_movie.lower())
            if ratio >= 60:
                match_tuple.append((name, idx, ratio))
        # sort
        match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
        if not match_tuple:
            print('Oops! No match is found')
        else:
            print('Found possible matches in our database: '
                  '{0}\n'.format([x[0] for x in match_tuple]))
            return match_tuple[0][1]

    def _inference(self, model, data, hashmap,
                   game_name, n_recommendations):
        """
        return top n similar movie recommendations based on user's input movie
        Parameters
        ----------
        model: sklearn model, knn model
        data: movie-user matrix
        hashmap: dict, map movie title name to index of the movie in data
        game_name: str, name of user input game
        n_recommendations: int, top n recommendations
        Return
        ------
        list of top n similar movie recommendations
        """
        # fit
        model.fit(data)
        # get input movie index
        print('You have input game:', game_name)
        idx = self._fuzzy_matching(hashmap, game_name)
        # inference
        print('Recommendation system start to make inference')
        print('......\n')
        t0 = time.time()
        distances, indices = model.kneighbors(
            data[idx],
            n_neighbors=n_recommendations+1)
        # get list of raw idx of recommendations
        raw_recommends = \
            sorted(
                list(
                    zip(
                        indices.squeeze().tolist(),
                        distances.squeeze().tolist()
                    )
                ),
                key=lambda x: x[1]
            )[:0:-1]
        print('It took my system {:.2f}s to make inference \n\
              '.format(time.time() - t0))
        # return recommendation (movieId, distance)
        return raw_recommends

    def make_recommendations(self, game_name, n_recommendations):
        """
        make top n movie recommendations
        Parameters
        ----------
        game_name: str, name of user input game
        n_recommendations: int, top n recommendations
        """
        # get data
        game_user_mat_sparse, hashmap = self._prep_data()
        # get recommendations
        raw_recommends = self._inference(
            self.model, game_user_mat_sparse, hashmap,
            game_name, n_recommendations)
        # print results
        reverse_hashmap = {v: k for k, v in hashmap.items()}
        print('Recommendations for {}:'.format(game_name))
        for i, (idx, dist) in enumerate(raw_recommends):
            print('{0}: {1}'.format(i+1, reverse_hashmap[idx]))
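This variant expects its inputs to be prepared offline: a game-user matrix saved with scipy.sparse.save_npz and a pickled name-to-index mapper. A small sketch of producing those two files (file names and the toy matrix are placeholders):
# Sketch of how the inputs this recommender expects could be produced
# (file names are placeholders; the real preprocessing lives elsewhere).
import pickle

import numpy as np
from scipy.sparse import csr_matrix, save_npz

game_user_mat = np.array([[5.0, 0.0, 3.0],
                          [0.0, 4.0, 0.0]], dtype=np.float32)   # toy game-user matrix
hashmap = {'half-life': 0, 'portal': 1}                          # game name -> row index

save_npz('game_user_mat.npz', csr_matrix(game_user_mat))
with open('game_user_mapper.pkl', 'wb') as fp:
    pickle.dump(hashmap, fp)

# These two files would then be passed to KnnRecommender(sparse_matrix_file, mapper_file).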
Example #34
0
class recommender_knn:
    # Item-based CF recommender class with KNN implemented by sklearn
    def __init__(self,
                 path_test,
                 path_rat,
                 mov_rat_thres,
                 use_rat_thres,
                 n_nei,
                 alg,
                 met,
                 job=None):
        self.path_test = path_test  # path to movies data
        self.path_rat = path_rat  # path to ratings data
        self.mov_rat_thres = mov_rat_thres
        self.use_rat_thres = use_rat_thres
        self.k_nn = n_nei
        # get the NearestNeighbors model
        self.model = NearestNeighbors()
        # n_neighbors: int, # algorithm: brute, # metric: (default 'minkowski') = 'cosine', # n_jobs: int or None
        if job and (job > 1 or job == -1):
            os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
        self.model.set_params(
            **{
                'n_neighbors': self.k_nn,
                'algorithm': alg,
                'metric': met,
                'n_jobs': job
            })

    def preprocess_data(self):
        # - set up movie-user matrix
        # read data from a .dat file and save as DataFrame - pandas
        df_test = pd.read_table(os.path.join(self.path_test),
                                delimiter=' ',
                                usecols=['userID', 'movieID'],
                                dtype={
                                    'userID': 'int32',
                                    'movieID': 'int32'
                                })

        df_rat = pd.read_table(os.path.join(self.path_rat),
                               delimiter=' ',
                               usecols=['userID', 'movieID', 'rating'],
                               dtype={
                                   'userID': 'int32',
                                   'movieID': 'int32',
                                   'rating': 'float32'
                               })

        # filter data
        df_mov_coun = pd.DataFrame(df_rat.groupby('movieID').size(),
                                   columns=['count'])
        # get a list of movies which received enough rating
        pop_mov = list(
            set(df_mov_coun.query(
                'count >= @self.mov_rat_thres').index))  # noqa
        mov_fil = df_rat.movieID.isin(pop_mov).values
        # print out all elements of a pandas DataFrame
        pd.set_option('display.max_columns', None)
        # print out all element of an array
        np.set_printoptions(threshold=np.inf)

        # just consider the users who rated more than the film number threshold
        df_use_coun = pd.DataFrame(df_rat.groupby('userID').size(),
                                   columns=['count'])
        act_use = list(
            set(df_use_coun.query('count >= @self.use_rat_thres').index))

        use_fil = df_rat.userID.isin(act_use).values
        # erase the unactive user and unpopular movies
        df_rat_fil = df_rat[mov_fil & use_fil]

        # create movie-user matrix by pivot function
        mov_use_mat = df_rat_fil.pivot(index='movieID',
                                       columns='userID',
                                       values='rating').fillna(0)

        # transform to a sparse matrix
        mov_use_spa_mat = csr_matrix(mov_use_mat.values)

        # clean up
        del df_rat, df_rat_fil
        del df_mov_coun, df_use_coun
        return mov_use_spa_mat, mov_use_mat, df_test

    def _rating(self, model, data, sparse_data, rating_movies, n_recom):
        # return top n similar movie recommendations based on user's input movie
        # model: sklearn model, knn model,  data: movie-user matrix, sparse_data = sparse matrix
        # guess_movies: list of movies need to be rated
        # n_recom: top n recommendations

        # Return: # list of top n similar movie recommendations
        # fit
        model.fit(sparse_data)
        # get input movie index
        print('You have a list of movies needed to be rated')
        t0 = time.time()
        print("The rating score for movies and user:")
        # run for all data in the test file
        score_list = []
        for i in range(len(rating_movies)):
            print(i)
            movieID = rating_movies.loc[i][1]
            userID = rating_movies.loc[i][0]
            # get the row ordered number as knowing the index
            idx = next(iter(np.where(data.index == movieID)[0]), 'not matched')
            if (idx == 'not matched'):
                score = 0.0
                score_list.append(score)
            else:
                # the first item is the point itself
                distances, indices = model.kneighbors(sparse_data[idx],
                                                      n_neighbors=self.k_nn +
                                                      1)

                # calculate the score for the item:
                count = 0
                mean_list = np.empty(
                    self.k_nn)  # rating mean of all neighbor items
                mean_rate_item = 0.0  # rating mean of the current rating item
                user_item_rate = np.empty(
                    self.k_nn)  # the user's rating for each neighbor item
                nomin = np.empty(self.k_nn)  # numerator terms: similarity x (rating - neighbor item mean)
                denom = np.empty(self.k_nn)  # denominator terms: similarity weights

                # for loop to calculate the elements for the scoring formula
                for index in indices[0]:
                    if count > 0:
                        mean_list[count - 1] = sparse_data.mean(
                            axis=1)[index].squeeze().squeeze()
                        user_item_rate[count - 1] = data.loc[
                            data.iloc[[index]].index[0], userID]
                        # calculate the denominator
                        denom[count - 1] = distances.squeeze()[count]
                        # calculate the numerator term
                        nomin[count - 1] = distances.squeeze()[count] * (
                            user_item_rate[count - 1] - mean_list[count - 1])
                    else:
                        mean_rate_item = sparse_data.mean(
                            axis=1)[index].squeeze().squeeze()
                    count += 1

                # calculate the score
                score1 = nomin.sum() / denom.sum() + mean_rate_item[0, 0]
                if score1 < 0:
                    score_list.append(0)
                elif score1 > 5:
                    # clamp the prediction to the top of the 0-5 rating scale
                    score_list.append(5)
                else:
                    score_list.append(score1)

        print('It took {:.2f}s to make inference \n\
          '.format(time.time() - t0))
        return score_list

    def make_predictions(self):  # make n movie recommendations
        # guess_movie: list of movies which need to be rated, n_recom: n recommendations
        # get data
        mov_use_spa_mat, data, movies_list_rating = self.preprocess_data()
        # get recommendations
        score_list = self._rating(self.model, data, mov_use_spa_mat,
                                  movies_list_rating, self.k_nn)
        return score_list
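The _rating loop above assembles a mean-centred, weighted neighbourhood prediction: predicted = item_mean + sum(w_j * (r_uj - mean_j)) / sum(w_j), with the raw kNN distances standing in for the weights w_j. A standalone sketch of that formula on toy numbers:
# Standalone sketch of the mean-centred, weighted neighbourhood prediction that
# _rating assembles (the snippet uses the raw kNN distances as the weights w_j).
import numpy as np

def predict_rating(item_mean, neighbor_ratings, neighbor_means, weights):
    """predicted = item_mean + sum(w_j * (r_uj - mean_j)) / sum(w_j)"""
    weights = np.asarray(weights, dtype=float)
    deviations = np.asarray(neighbor_ratings, dtype=float) - np.asarray(neighbor_means, dtype=float)
    score = item_mean + np.dot(weights, deviations) / weights.sum()
    return float(np.clip(score, 0.0, 5.0))   # keep the prediction on the 0-5 rating scale

# toy example: three neighbour items the user has rated
print(predict_rating(item_mean=3.2,
                     neighbor_ratings=[4.0, 2.5, 5.0],
                     neighbor_means=[3.5, 3.0, 4.2],
                     weights=[0.9, 0.4, 0.7]))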