Example 1
    def testImplicit(self):
        regularization = 1e-9
        tolerance = 0.001

        counts = csr_matrix([[1, 1, 0, 1, 0, 0],
                             [0, 1, 1, 1, 0, 0],
                             [1, 0, 1, 0, 0, 0],
                             [1, 1, 0, 0, 0, 0],
                             [0, 0, 1, 1, 0, 0],
                             [0, 1, 0, 0, 0, 1],
                             [0, 0, 0, 0, 1, 0]], dtype=np.float64)

        def check_solution(rows, cols, counts):
            reconstructed = rows.dot(cols.T)
            for i in range(counts.shape[0]):
                for j in range(counts.shape[1]):
                    self.assertTrue(abs(counts[i, j] - reconstructed[i, j]) <
                                    tolerance)

        # check cython version
        rows, cols = implicit.alternating_least_squares(counts * 2, 7,
                                                        regularization,
                                                        use_native=True)
        check_solution(rows, cols, counts.todense())

        # try out pure python version
        rows, cols = implicit.alternating_least_squares(counts, 7,
                                                        regularization,
                                                        use_native=False)
        check_solution(rows, cols, counts.todense())
Example 2
    def testImplicit(self):
        regularization = 1e-9
        tolerance = 0.001

        counts = csr_matrix(
            [[1, 1, 0, 1, 0, 0], [0, 1, 1, 1, 0, 0], [1, 0, 1, 0, 0, 0],
             [1, 1, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0], [0, 1, 0, 0, 0, 1],
             [0, 0, 0, 0, 1, 0]],
            dtype=np.float64)

        def check_solution(rows, cols, counts):
            reconstructed = rows.dot(cols.T)
            for i in range(counts.shape[0]):
                for j in range(counts.shape[1]):
                    self.assertTrue(
                        abs(counts[i, j] - reconstructed[i, j]) < tolerance)

        # try all 8 variants of native/python, cg/cholesky, and
        # 64 vs 32 bit factors
        for dtype in (np.float32, np.float64):
            for use_cg in (True, False):
                for use_native in (True, False):
                    rows, cols = implicit.alternating_least_squares(
                        counts * 2,
                        7,
                        regularization,
                        use_native=use_native,
                        use_cg=use_cg,
                        dtype=dtype)
                    check_solution(rows, cols, counts.todense())
Example 3
 def alsRec(self):
     score = self.preProcessData()
     data, living = self.matrixData(score)
     weighted = self.bm25_weight(living)
     print(weighted.shape)
     user1_factors, user2_factors = implicit.alternating_least_squares(weighted, factors=5)
     print("save to redis")
     self.saveToRedis(data, user1_factors, user2_factors)
Example 4
 def fit(self, X, y=None):
     M = self.construct_sparse_matrix(X).tocoo()
     items = np.int32(M.col)
     self.popularity_ = self.count_popularity(items)
     M = M.toarray()
     M = M * self.alpha
     M = csr_matrix(M).astype('double')
     self.U, self.I = implicit.alternating_least_squares(M, factors=self.n_factors, regularization=self.regularization, iterations=self.n_epochs)
     return self
Example 5
 def recommend_boats(self):
     countrys_arr = np.array(self.countrys)
     boats_arr = np.array(self.boats)
     countrys_vecs, boats_vecs = implicit.alternating_least_squares(
         (self.boat_train * self.alpha).astype('double'),
         factors=20,
         regularization=0.1,
         iterations=50)
     return countrys_vecs, boats_vecs, boats_arr, countrys_arr
Example 6
 def recommend_destination(self):
     countrys_arr = np.array(
         self.countrys)  # Array of country IDs from the ratings matrix
     distinations_arr = np.array(self.distinations)
     user_vecs, item_vecs = implicit.alternating_least_squares(
         (self.distination_train * self.alpha).astype('double'),
         factors=20,
         regularization=0.1,
         iterations=50)
     return user_vecs, item_vecs, distinations_arr, countrys_arr
Example 7
def get_aucs_vs_factors_als():
    factors = [8, 16, 32, 64, 128]

    aucs = []

    for factor in factors:
        subreddit_factors, user_factors = alternating_least_squares(
            bm25_weight(comments), factor)
        aucs.append(
            auc(test_set[:20000], user_factors, subreddit_factors, subreddits,
                users))

    return aucs
Example 8
 def fit(self, X, y=None):
     M = self.construct_sparse_matrix(X).tocoo()
     items = np.int32(M.col)
     self.popularity_ = self.count_popularity(items)
     M = M.toarray()
     M = M * self.alpha
     M = csr_matrix(M).astype('double')
     self.U, self.I = implicit.alternating_least_squares(
         M,
         factors=self.n_factors,
         regularization=self.regularization,
         iterations=self.n_epochs)
     return self
Example 9
def calculate_similar_artists(input_filename, output_filename,
                              factors=50, regularization=0.01,
                              iterations=15,
                              exact=False, trees=20,
                              use_native=True,
                              dtype=numpy.float64):

    print("Calculating similar artists. This might take a while")
    print("reading data from %s", input_filename)
    start = time.time()
    df, transfers = read_data(input_filename)
    print("read data file in %s", time.time() - start)

    print("weighting matrix by bm25")
    weighted = bm25_weight(transfers)

    print("calculating factors")
    start = time.time()
    artist_factors, user_factors = alternating_least_squares(weighted,
                                                             factors=factors,
                                                             regularization=regularization,
                                                             iterations=iterations,
                                                             use_native=use_native,
                                                             dtype=dtype)
    print("calculated factors in %s", time.time() - start)

    # write out artists by popularity
    print("calculating top artists")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    if exact:
        calc = TopRelated(artist_factors)
    else:
        calc = ApproximateTopRelated(artist_factors, trees)

    print("writing top related to %s", output_filename)
    with open(output_filename, "w") as o:
        for artistid in to_generate:
            artist = artists[artistid]
            for other, score in calc.get_related(artistid):
                o.write("%s\t%s\t%s\n" % (artist, artists[other], score))
                
                recommendedClients = RecommendedClients()
                recommendedClients.empresa = Empresa.objects.get(fiscal_id=artist)
                recommendedClients.clientes_recomendados = Empresa.objects.get(fiscal_id=artists[other])
                recommendedClients.similarity = score
                recommendedClients.save()
Example 10
def fit(model, train_target, out_dir):
    """
    Factorize the training matrix of playlist-song pairs. Return nothing.

    Parameters
    ----------
    model: model file
        Model specification.
    train_target: numpy array, shape (num playlists, num songs)
        Matrix of playlist-song co-occurrences at the train split.
    out_dir: string
        Path to the results directory
    """

    print('\nSetting up fit...')

    # identify dimensions
    num_playlists, num_songs = train_target.shape

    # initialize weights
    playlists = np.random.rand(num_playlists, model.num_factors) * 0.01
    songs = np.random.rand(num_songs, model.num_factors) * 0.01

    print('\nFitting...')

    for epoch in range(1, model.max_epochs + 1):

        # keep track of time
        start_time = time.time()

        playlists, songs = implicit.alternating_least_squares(
            Cui=model.positive_weight * train_target,
            factors=model.num_factors,
            X=playlists,
            Y=songs,
            iterations=1,
            regularization=model.L2_weight,
            use_cg=False)

        print('\tEpoch {} of {} took {:.3f}s'.format(epoch, model.max_epochs,
                                                     time.time() - start_time))

    # save the fit model
    print('\nSaving model weights...')

    params = (playlists, songs)
    params_file = '{}_params.pkl'.format(model.name)
    # Python 3: pickle replaces cPickle, and the file must be opened in binary mode
    with open(os.path.join(out_dir, params_file), 'wb') as f:
        pickle.dump(params, f)
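The fit() above warm-starts implicit on every epoch by feeding the current playlists and songs matrices back in as X and Y and running a single iteration at a time, which is what makes the per-epoch timing meaningful. A minimal driver sketch, assuming the legacy top-level implicit.alternating_least_squares API used throughout these examples; the model namespace and every attribute value in it are hypothetical:

import numpy as np
import scipy.sparse as sparse
from types import SimpleNamespace

# hypothetical model specification mirroring the attributes fit() reads
model = SimpleNamespace(name='wmf_baseline', num_factors=64, max_epochs=10,
                        positive_weight=40.0, L2_weight=0.01)

# toy playlist-song co-occurrence matrix (100 playlists x 500 songs)
train_target = sparse.random(100, 500, density=0.05, format='csr',
                             random_state=np.random.RandomState(0))

fit(model, train_target, out_dir='.')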
Example 11
def als(trainSparse):
    """Train the ALS algorithm.
    Args:
        trainSparse: train sparse matrix

    Returns:
        user_vecs_arr: user matrix (users x latent_factors)
        item_vecs_arr: item matrix (items x latent_factors)
    
    """
    print("-> Training ALS algorithm ...")
    k = 130
    user_vecs_arr, item_vecs_arr = implicit.alternating_least_squares(
        trainSparse, factors=k, regularization=0.01, iterations=30)
    return (user_vecs_arr, item_vecs_arr)
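A usage sketch for als(), assuming trainSparse is a users x items scipy sparse matrix as the docstring describes; with the legacy API the row (user) factors come back first:

import scipy.sparse as sparse

# toy train sparse matrix: 200 users x 150 items
trainSparse = sparse.random(200, 150, density=0.05, format='csr')

user_vecs_arr, item_vecs_arr = als(trainSparse)
print(user_vecs_arr.shape)  # (200, 130): users x latent_factors
print(item_vecs_arr.shape)  # (150, 130): items x latent_factors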
Example 12
 def run_user(self, pid, k):
     """
     Return a set of recommendations for the user
     """
     user_vecs, item_vecs = implicit.alternating_least_squares(
         (self.inter_df * self.alpha).astype('double'),
         factors=64,
         regularization=0.1,
         iterations=50)
     rec_list = self.rec_items(pid,
                               self.inter_df,
                               user_vecs,
                               item_vecs,
                               num_items=k)
     return set(rec_list)
Example 13
    def train(self, training_set, alpha, factors, regularization, iterations):
        """
        Use the implicit library to recommend packages for a user
        :param training_set:    training data
        :param alpha:           linear scaling factor alpha
        :param factors:         number of latent factors for each user and item
        :param regularization:  parameter that guards against overfitting
        :param iterations:      number of iterations of the algorithm
        :return:                initializes instance variables training_set, user_vecs and item_vecs
        """

        # Train with the implicit library and initialize the instance variables user_vecs and item_vecs
        self.user_vecs, self.item_vecs = \
            implicit.alternating_least_squares((training_set*alpha).astype('double'), factors=factors,
                                               regularization=regularization, iterations=iterations)
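A self-contained sketch of how this train() method might be hosted and called; the PackageRecommender class name and the toy counts matrix are assumptions, and (training_set * alpha).astype('double') is the linear confidence scaling the docstring's alpha parameter refers to:

import implicit
import numpy as np
from scipy.sparse import csr_matrix


class PackageRecommender:
    """Hypothetical host class for the train() method shown above."""

    def train(self, training_set, alpha, factors, regularization, iterations):
        self.user_vecs, self.item_vecs = implicit.alternating_least_squares(
            (training_set * alpha).astype('double'), factors=factors,
            regularization=regularization, iterations=iterations)


# toy user-package interaction counts (rows: users, columns: packages)
counts = csr_matrix(np.array([[3, 0, 1],
                              [0, 2, 0],
                              [1, 0, 4]], dtype=np.float64))

rec = PackageRecommender()
rec.train(counts, alpha=15, factors=2, regularization=0.1, iterations=10)

# predicted preference scores: dot product of the learned factor matrices
print(rec.user_vecs.dot(rec.item_vecs.T))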
Example 14
def calculate_similar_artists(input_filename,
                              output_filename,
                              factors=50,
                              regularization=0.01,
                              iterations=15,
                              exact=False,
                              trees=20,
                              use_native=True,
                              dtype=numpy.float64,
                              cg=False):
    logging.debug("Calculating similar artists. This might take a while")
    logging.debug("reading data from %s", input_filename)
    start = time.time()
    df, plays = read_data(input_filename)
    logging.debug("read data file in %s", time.time() - start)

    logging.debug("weighting matrix by bm25")
    weighted = bm25_weight(plays)

    logging.debug("calculating factors")
    start = time.time()
    artist_factors, user_factors = alternating_least_squares(
        weighted,
        factors=factors,
        regularization=regularization,
        iterations=iterations,
        use_native=use_native,
        dtype=dtype,
        use_cg=cg)
    logging.debug("calculated factors in %s", time.time() - start)

    # write out artists by popularity
    logging.debug("calculating top artists")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    if exact:
        calc = TopRelated(artist_factors)
    else:
        calc = ApproximateTopRelated(artist_factors, trees)

    logging.debug("writing top related to %s", output_filename)
    with open(output_filename, "w") as o:
        for artistid in to_generate:
            artist = artists[artistid]
            for other, score in calc.get_related(artistid):
                o.write("%s\t%s\t%s\n" % (artist, artists[other], score))
Example 15
def train_model(input_filename, output_filename,
                factors=50, regularization=0.01,
                iterations=15, use_native=True,
                cg=True):
    logging.debug("Reading data from %s", input_filename)
    start = time.time()
    df, plays = load_data(input_filename)
    logging.debug("Read data file in %s", time.time() - start)

    logging.debug("Weighting matrix by bm25")
    weighted, params = bm25_weight(plays)
    params["regularization"] = regularization

    logging.debug("Calculating factors")
    start = time.time()
    subr_factors, user_factors = alternating_least_squares(weighted,
                                                           factors=factors,
                                                           regularization=regularization,
                                                           iterations=iterations,
                                                           use_native=use_native,
                                                           dtype=np.float64,
                                                           use_cg=cg)
    logging.debug("Calculated factors in %s", time.time() - start)

    logging.debug("Writing model to disk")
    with open("params.pickle", "wb") as b:
        pickle.dump(params, b)

    subreddits = dict(enumerate(df['subreddit'].cat.categories))

    with open("dict.pickle", "wb") as d:
        pickle.dump(subreddits, d)

    with open("factors.pickle", "wb") as f:
        pickle.dump(subr_factors, f)

    model = TopRelated(subr_factors)
    # Print 10 most similar subreddits for each subreddit to evaluate the model
    with open(output_filename, "w") as out:
        for i, name in subreddits.items():
            related = model.get_related(i)
            for other, score in related:
                out.write("{}\t{}\t{}\n".format(name, subreddits[other], score))

    logging.debug("Training complete")
Example 16
def main():
    data = preprocess_data_rec_engine(status=False)

    item_table = data[0]
    p_sparse = data[1]
    customers = data[2]
    products = data[3]
    quantity = data[4]

    tti = split_data_mask(p_sparse, pct_test = 0.2)
    
    product_training_set = tti[0]
    product_test_set = tti[1]
    product_user_altered = tti[2]

    alpha = 15
    
    vecs = implicit.alternating_least_squares(
        (product_training_set * alpha).astype('double'),
        factors=20,
        regularization=0.1,
        iterations=50)
    user_vecs = vecs[0]
    item_vecs = vecs[1]

    customers_arr  = np.array(customers) # Array of customer IDs from the ratings matrix
    products_arr = np.array(products) # Array of product IDs from the ratings matrix

    rf = rec_items(12346, product_training_set, user_vecs, item_vecs, customers_arr, products_arr, item_table, num_items = 10)

    print(get_items_purchased(12346, product_training_set, customers_arr, products_arr, item_table))
    print(rf)
    
    l = list_rec(rf)
    print(l)

    print(lookup_customer_id(4338))
    
    # df = pd.read_pickle('../data/final/df_final.pkl')
    # table_pickle_file = open('../data/final/df_customer_table.pkl', "rb")
    # customer_table = pickle.load(table_pickle_file)
    # table_pickle_file.close() 
    # search_customer(3, df, customer_table)

    print("Done")
Example 17
def calculate_similar_artists(input_filename, output_filename,
                              factors=50, regularization=0.01,
                              iterations=15,
                              exact=False, trees=20,
                              use_native=True):
    logging.debug("Calculating similar artists. This might take a while")
    logging.debug("reading data from %s", input_filename)
    start = time.time()
    df, plays = read_data(input_filename)
    logging.debug("read data file in %s", time.time() - start)

    logging.debug("weighting matrix by bm25")
    weighted = bm25_weight(plays)

    logging.debug("calculating factors")
    start = time.time()
    artist_factors, user_factors = alternating_least_squares(weighted,
                                                             factors=factors,
                                                             regularization=regularization,
                                                             iterations=iterations,
                                                             use_native=use_native)
    logging.debug("calculated factors in %s", time.time() - start)

    # write out artists by popularity
    logging.debug("calculating top artists")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    if exact:
        calc = TopRelated(artist_factors)
    else:
        calc = ApproximateTopRelated(artist_factors, trees)

    logging.debug("writing top related to %s", output_filename)
    with open(output_filename, "w") as o:
        for artistid in to_generate:
            artist = artists[artistid]
            for other, score in calc.get_related(artistid):
                o.write("%s\t%s\t%s\n" % (artist, artists[other], score))
Example 18
    def testALS(self):
        counts = csr_matrix(
            [[1, 1, 0, 1, 0, 0], [0, 1, 1, 1, 0, 0], [1, 0, 1, 0, 0, 0],
             [1, 1, 0, 0, 0, 0], [0, 0, 1, 1, 0, 1], [0, 1, 0, 0, 0, 1],
             [0, 0, 0, 0, 1, 1]],
            dtype=np.float64)

        # try all 8 variants of native/python, cg/cholesky, and
        # 64 vs 32 bit factors
        for dtype in (np.float32, np.float64):
            for use_cg in (False, True):
                for use_native in (True, False):
                    try:
                        np.random.seed(23)

                        rows, cols = implicit.alternating_least_squares(
                            counts * 2,
                            6,
                            regularization=1e-10,
                            use_native=use_native,
                            use_cg=use_cg,
                            dtype=dtype)
                    except Exception as e:
                        self.fail(msg="failed to factorize matrix. Error=%s"
                                  " dtype=%s, cg=%s, native=%s" %
                                  (e, dtype, use_cg, use_native))

                    reconstructed = rows.dot(cols.T)
                    for i in range(counts.shape[0]):
                        for j in range(counts.shape[1]):
                            self.assertAlmostEqual(
                                counts[i, j],
                                reconstructed[i, j],
                                delta=0.0001,
                                msg="failed to reconstruct row=%s, col=%s,"
                                " value=%.5f, dtype=%s, cg=%s, native=%s" %
                                (i, j, reconstructed[i, j], dtype, use_cg,
                                 use_native))
Example 19
def calculate_similar_artists(input_filename, output_filename,
                              factors=50, regularization=0.01,
                              iterations=15,
                              exact=False, trees=20,
                              use_native=True,
                              dtype=numpy.float64):

    # Computing recommended clients ---

    print("Calculating similar clients. This might take a while")
    print("reading data from %s", input_filename)
    start = time.time()
    df, transfers = read_data(input_filename, inv=False)
    print("read data file in %s", time.time() - start)

    print("weighting matrix by bm25")
    weighted = bm25_weight(transfers)

    print("calculating factors")
    start = time.time()
    artist_factors, user_factors = alternating_least_squares(weighted,
                                                             factors=factors,
                                                             regularization=regularization,
                                                             iterations=iterations,
                                                             use_native=use_native,
                                                             dtype=dtype)
    print("calculated factors in %s", time.time() - start)

    # write out artists by popularity
    print("calculating top clients")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    if exact:
        calc = TopRelated(artist_factors)
    else:
        calc = ApproximateTopRelated(artist_factors, trees)
    list_of_recommended_clients = []
    print("writing top related to %s", output_filename)
    for i, artistid in enumerate(to_generate):
        print(i)
        artist = artists[artistid]
        for other, score in calc.get_related(artistid):
            if artist != artists[other]:
                recommendedClients = RecommendedClients()
                recommendedClients.empresa = Empresa.objects.get(fiscal_id=artist)
                recommendedClients.clientes_recomendados = Empresa.objects.get(fiscal_id=artists[other])
                recommendedClients.similarity = score
                list_of_recommended_clients.append(recommendedClients)
    
    print('All client recommendations have been stored in a list, saving them to DB')
    RecommendedClients.objects.bulk_create(list_of_recommended_clients, batch_size=20000)

    # Computing recommended providers ---

    print("Calculating similar providers. This might take a while")
    print("reading data from %s", input_filename)
    start = time.time()
    df, transfers = read_data(input_filename, inv=True)
    print("read data file in %s", time.time() - start)

    print("weighting matrix by bm25")
    weighted = bm25_weight(transfers)

    print("calculating factors")
    start = time.time()
    artist_factors, user_factors = alternating_least_squares(weighted,
                                                             factors=factors,
                                                             regularization=regularization,
                                                             iterations=iterations,
                                                             use_native=use_native,
                                                             dtype=dtype)
    print("calculated factors in %s", time.time() - start)

    # write out artists by popularity
    print("calculating top providers")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    if exact:
        calc = TopRelated(artist_factors)
    else:
        calc = ApproximateTopRelated(artist_factors, trees)

    list_of_recommended_providers = []
    print("writing top related to %s", output_filename)
    for i, artistid in enumerate(to_generate):
        print(i)
        artist = artists[artistid]
        for other, score in calc.get_related(artistid):
            if artist != artists[other]:
                # NOTE: the original snippet appends `recommendedProviders`
                # without ever constructing it; a construction mirroring the
                # clients block above is assumed here.
                recommendedProviders = RecommendedProviders()
                recommendedProviders.empresa = Empresa.objects.get(fiscal_id=artist)
                recommendedProviders.similarity = score
                list_of_recommended_providers.append(recommendedProviders)
    
    print('All provider recommendations have been stored in a list, saving them to DB')
    RecommendedProviders.objects.bulk_create(list_of_recommended_providers, batch_size=20000)
Example 20
def main():
    purchase_input = sys.argv[1]
    cold_start_input = sys.argv[2]
    K = int(sys.argv[3])
    if K > 10:
        print("ERROR: Please recommend <= 10 products")
        exit(1)

    # read guest start, item start
    cold_start = pd.read_csv(cold_start_input)

    # read purchase data
    df = pd.read_csv(purchase_input)
    df.columns = ['qty', 'item_id', 'guest_id', 'purchase_date']
    df = df[['guest_id', 'item_id', 'qty']]

    # drop missing value and negative qty
    df = df.dropna()
    df = df[df.qty > 0]
    df = df[df.guest_id.map(lambda x: x.isdigit())
            & df.item_id.map(lambda x: x.isdigit())]

    # merge the two dataframes
    df['guest_id'] = df['guest_id'].astype(int)
    df['item_id'] = df['item_id'].astype(int)
    # renamed from `all` to avoid shadowing the builtin
    all_data = pd.concat([df, cold_start])

    # construct utility matrix
    guests = list(np.sort(all_data.guest_id.unique()))
    items = list(np.sort(all_data.item_id.unique()))
    quantity = list(all_data.qty)
    guest_dtype = pd.CategoricalDtype(categories=guests)
    item_dtype = pd.CategoricalDtype(categories=items)
    rows = all_data.guest_id.astype(guest_dtype).cat.codes
    cols = all_data.item_id.astype(item_dtype).cat.codes
    ori_rows = df.guest_id.astype(guest_dtype).cat.codes
    ori_cols = df.item_id.astype(item_dtype).cat.codes
    utility_mat = sparse.csr_matrix((quantity, (rows, cols)),
                                    shape=(len(guests), len(items)))

    # check sparsity
    sparsity = 100 * (1 - 1.0 * len(all_data) /
                      (utility_mat.shape[0] * utility_mat.shape[1]))
    print(
        "Sparsity after content based initialization is: {}".format(sparsity))

    # split training and testing data
    train_set, test_index = train_test_split(utility_mat, 0.1)

    # run ALS for implicit feedback to generate hidden features
    alpha = 40
    guest_feature, item_feature = implicit.alternating_least_squares(
        (train_set * alpha).astype('double'),
        factors=10,
        regularization=0.1,
        iterations=50)

    # collect predicted values
    predict_matrix = guest_feature.dot(item_feature.T)

    # evaluate performance using average rank
    hidden_rank, all_rank = average_rank(predict_matrix, test_index, rows,
                                         cols)
    print('Expected percentile ranking on testing set:{}\n'
          'Expected percentile ranking on total set: {}'.format(
              hidden_rank, all_rank))

    # recommend top K items for each guest and output
    guest_array = np.array(guests)
    items_array = np.array(items)
    rcd_df = top_rcmd(predict_matrix,
                      guest_array,
                      items_array,
                      ori_rows,
                      ori_cols,
                      k=K)
    rcd_df.to_csv('recommendations.csv', index=False)
Example 21
def benchmark_implicit(matrix, factors, reg, iterations):
    start = time.time()
    alternating_least_squares(matrix, factors, reg, iterations)
    return time.time() - start
Example 22
def benchmark_implicit(matrix, factors, reg, iterations):
    start = time.time()
    alternating_least_squares(matrix, factors, reg, iterations)
    return time.time() - start
Example 23
import heapq

import implicit
import pandas as pd
from scipy.sparse import coo_matrix

dataFile = ".\\data\\ml-100k\\u.data"
data = pd.read_csv(dataFile,
                   sep="\t",
                   header=None,
                   usecols=[0, 1, 2],
                   names=["userId", "itemId", "rating"])

data["userId"] = data["userId"].astype("category")
data["itemId"] = data["itemId"].astype("category")
rating_matrix = coo_matrix(
    (data["rating"].astype(float), (data["itemId"].cat.codes.copy(),
                                    data["userId"].cat.codes.copy())))

# rating_matrix rows are items, so the first factor matrix returned is
# per-item and the second is per-user
item_factors, user_factors = implicit.alternating_least_squares(
    rating_matrix, factors=10, regularization=0.01)

print(user_factors[196])

# score every item for user 196 and keep the three best
user196 = item_factors.dot(user_factors[196])

recommendations = heapq.nlargest(3, range(len(user196)), user196.take)

print(recommendations)
Example 24
comments = coo_matrix((data['comments'].astype(float),
                       (data['subreddit'].cat.codes, data['user'].cat.codes)))

#%% [markdown]
# ### Latent Semantic Analysis

#%%
# toggle this variable if you want to recalculate the als factors
read_als_factors_from_file = True

#%%
if read_als_factors_from_file:
    subreddit_factors = np.load('subreddit_factors_als.npy')
    user_factors = np.load('user_factors_als.npy')
else:
    subreddit_factors, user_factors = alternating_least_squares(
        bm25_weight(comments), 20)

#%%
class TopRelated(object):
    def __init__(self, subreddit_factors):
        norms = np.linalg.norm(subreddit_factors, axis=-1)
        self.factors = subreddit_factors / norms[:, np.newaxis]
        self.subreddits = data['subreddit'].cat.categories.array.to_numpy()

    def get_related(self, subreddit, N=10):
        subredditid = np.where(self.subreddits == subreddit)[0][0]
Example 25
             indices=array.indices,
             indptr=array.indptr,
             shape=array.shape)
    return


# ALS: Alternating Least Squares

alpha = 40
factors = 300
regularization = 0.01
iterations = 20

matr = sps.lil_matrix((len(users), urm.shape[1]))
min_max = MinMaxScaler()
user_vecs, item_vecs = impl.alternating_least_squares(
    (urm * alpha).astype('double'), factors, regularization, iterations)

l = len(users)
for u in range(l):
    # dot product of user vector with all item vectors
    rec_vector = user_vecs[u, :].dot(item_vecs.T)
    rec_vector[items_nact] = 0
    # scale recommendation vector rec_vector between 0 and 1
    rec = min_max.fit_transform(rec_vector.reshape(-1, 1))[:, 0]
    cols = np.argsort(rec)[::-1][:1000]
    matr[u, cols] = rec[cols]
    print(u)

save_sparse_csr('ALS4k', matr.tocsr())
print("done")
Example 26
    # Get the associated row indices
    cols = data['movieId'].astype(
        pd.CategoricalDtype(categories=products, ordered=True)).cat.codes

    train, test = train_test_split(rows.values, cols.values, quantity)
    del quantity, rows, cols
    train_sparse = sparse.csr_matrix((train[2], (train[0], train[1])),
                                     shape=(len(customers), len(products)))

print("IO done in %f" % io_time.interval)

alpha = 15
with Timer() as cython_als_t:
    user_vecs, item_vecs = implicit.alternating_least_squares(
        (train_sparse * alpha).astype('double'),
        factors=64,
        regularization=0.1,
        iterations=10,
        use_gpu=False)
print(f"Time spent in implicit: {cython_als_t.interval}")

evaluator = Evaluator(test[0], test[1], test[2], threshold=3.0)
baseline_model = BaselinePredictor(train[1], train[2])
baseline_fpr, baseline_tpr, baseline_roc = evaluator.roc(
    lambda user, item: baseline_model.pred(item))

fpr, tpr, roc = evaluator.roc(
    lambda user, item: np.sum(user_vecs[user, :] * item_vecs[item, :]))
print("AUC: %f" % roc)

plt.clf()
plt.plot(baseline_fpr, baseline_tpr, label='baseline')
Example 27
    return training_set, test_set, list(set(user_inds)) # Output the unique list of user rows that were altered  

places_train1, places_test1, places_users_altered1 = make_train(visits_sparse1, pct_test = 0.2)
places_train2, places_test2, places_users_altered2 = make_train(visits_sparse2, pct_test = 0.2)
places_train3, places_test3, places_users_altered3 = make_train(visits_sparse3, pct_test = 0.2)
places_train4, places_test4, places_users_altered4 = make_train(visits_sparse4, pct_test = 0.2)





######################################

alpha = 40
user_vecs1, place_vecs1 = implicit.alternating_least_squares(
    (places_train1 * alpha).astype('double'),
    factors=100,
    regularization=0.1,
    iterations=80)

user_vecs2, place_vecs2 = implicit.alternating_least_squares(
    (places_train2 * alpha).astype('double'),
    factors=100,
    regularization=0.1,
    iterations=80)

user_vecs3, place_vecs3 = implicit.alternating_least_squares(
    (places_train3 * alpha).astype('double'),
    factors=100,
    regularization=0.1,
    iterations=80)

Example 28
# NOTE: alpha_list is consumed in the loop below but its definition was cut
# off in this snippet; a plausible confidence-weight grid is assumed here
alpha_list = [1, 15, 40]
reg_list = [.01, .1, 1, 10]
factor_list = [64, 128, 256]

# store outcomes
out_file = open("als_hyperparameters.txt", "a+")
out_file.write("alpha\treg\tfactors\trec_auc\tpop_auc\n")

# train test split
u_to_a_train, u_to_a_test, altered_users = mflib.make_train(a_u_matrix.T.tocsr(), pct_test=0.2)

for alpha_idx in range(len(alpha_list)):
    for reg_idx in range(len(reg_list)):
        for factor_idx in range(len(factor_list)):
            print(alpha_idx, reg_idx, factor_idx)

            # split original matrix into user matrix and artist matrix through ALS
            user_vecs, artists_vecs = implicit.alternating_least_squares(
                (u_to_a_train * alpha_list[alpha_idx]).astype('double'),
                factors=factor_list[factor_idx],
                regularization=reg_list[reg_idx],
                iterations=50,
                use_gpu=True)

            rec_auc, pop_auc = mflib.calc_mean_auc(u_to_a_train, altered_users,
                                             [sparse.csr_matrix(user_vecs), sparse.csr_matrix(artists_vecs.T)],
                                             u_to_a_test)

            out_file.write(str(alpha_list[alpha_idx]) + "\t" + str(reg_list[reg_idx]) + "\t" + str(factor_list[factor_idx]) + "\t" + str(rec_auc) + "\t" + str(pop_auc) + "\n")

out_file.close()
Example 29
	def get_alternate_least_squares(self):
		alpha = 15
		return implicit.alternating_least_squares(
			(self.product_train * alpha).astype('double'),
			factors=20,
			regularization=0.1,
			iterations=50)
Example 30
with open('Y_training_pid_trackid_new_rating_csr.pkl', 'rb') as f:
    Y_training_pid_track_id_rating_sparse_csr = pickle.load(f)

with open('Y_challenge_1_5_10_25_100track_pidnew_trackid_rating_csr.pkl',
          'rb') as f:
    Y_challenge_track_pidnew_rating_sparse_csr = pickle.load(f)

Y_training_pid_track_id_rating_sparse_csr = Y_training_pid_track_id_rating_sparse_csr.T
Y_training = sparse.vstack([
    Y_challenge_track_pidnew_rating_sparse_csr,
    Y_training_pid_track_id_rating_sparse_csr
], 'csr')

W, X = implicit.alternating_least_squares((Y_training * 50).astype('double'),
                                          factors=400,
                                          regularization=0.01,
                                          iterations=50,
                                          use_gpu=False)


def rec_items(pid):
    # build a mask that is 1 for unseen tracks and 0 for tracks already
    # in the playlist
    pref_vec = Y_training[pid].toarray()
    pref_vec = pref_vec.reshape(-1) + 1
    pref_vec[pref_vec > 1] = 0
    # score every track for this playlist and scale the scores to [0, 1]
    rec_vector = W[pid].dot(X.T)
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1, 1))[:, 0]
    # zero out already-seen tracks and keep the 500 best
    recommend_vector = pref_vec * rec_vector_scaled
    idx = np.argsort(recommend_vector)[::-1][:500]
    recommendation = pd.DataFrame(idx)
    return recommendation.T
Example 31
def calculate_similar_artists(input_filename,
                              output_filename,
                              model="als",
                              factors=50,
                              regularization=0.01,
                              iterations=15,
                              exact=False,
                              trees=20,
                              use_native=True,
                              dtype=numpy.float64,
                              cg=False):
    logging.debug("Calculating similar artists. This might take a while")
    logging.debug("reading data from %s", input_filename)
    start = time.time()
    df, plays = read_data(input_filename)
    logging.debug("read data file in %s", time.time() - start)

    # write out artists by popularity
    logging.debug("calculating top artists")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    start = time.time()

    if model == "als":
        logging.debug("weighting matrix by bm25")
        weighted = bm25_weight(plays, K1=100, B=0.8)

        logging.debug("calculating factors")
        artist_factors, user_factors = alternating_least_squares(
            weighted,
            factors=factors,
            regularization=regularization,
            iterations=iterations,
            use_native=use_native,
            dtype=dtype,
            use_cg=cg)
        logging.debug("calculated factors in %s", time.time() - start)

        if exact:
            calc = TopRelated(artist_factors)
        else:
            calc = ApproximateTopRelated(artist_factors, trees)
        logging.debug("writing top related to %s", output_filename)
        with open(output_filename, "w") as o:
            for artistid in to_generate:
                artist = artists[artistid]
                for other, score in calc.get_related(artistid):
                    o.write("%s\t%s\t%s\n" % (artist, artists[other], score))

    elif model in ("bm25", "tfidf", "cosine", "smoothed_cosine", "ochiai",
                   "overlap"):
        if model == "bm25":
            scorer = BM25Recommender(K1=100, B=0.5)

        elif model == "tfidf":
            scorer = TFIDFRecommender()

        elif model == "cosine":
            scorer = CosineRecommender()

        else:
            raise NotImplementedError("TODO: model %s" % model)

        logging.debug("calculating similar items")
        start = time.time()
        scorer.fit(plays, K=11)
        logging.debug("calculated all_pairs_knn in %s", time.time() - start)

        with open(output_filename, "w") as o:
            for artistid in to_generate:
                artist = artists[artistid]
                for other, score in scorer.similar_items(artistid):
                    o.write("%s\t%s\t%s\n" % (artist, artists[other], score))
Example 32
            Y[i] = spsolve(xTx + xTCiIX + lambda_eye, xTCiPi)
            # solve Yi = (xTx + xT(Ci - I)X + lambda*I)^-1 * xTCiPi,
            # equation 5 from the paper
    # End iterations
    return X, Y.T
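For reference, the truncated loop above implements the closed-form updates from Hu, Koren and Volinsky, "Collaborative Filtering for Implicit Feedback Datasets" (equations 4 and 5); in the snippet's notation, Ci is the confidence diagonal for item i and Pi its binarized preference column:

x_u = (Y^T Y + Y^T (C^u - I) Y + \lambda I)^{-1} Y^T C^u p(u)
y_i = (X^T X + X^T (C^i - I) X + \lambda I)^{-1} X^T C^i p(i)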


user_vecs, item_vecs = implicit_weighted_ALS(product_train,
                                             lambda_val=0.1,
                                             alpha=15,
                                             iterations=1,
                                             rank_size=20)
user_vecs[0, :].dot(item_vecs).toarray()[0, :5]
alpha = 15
user_vecs, item_vecs = implicit.alternating_least_squares(
    (product_train * alpha).astype('double'),
    factors=20,
    regularization=0.1,
    iterations=50)


def auc_score(predictions, test):
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr, tpr)


def calc_mean_auc(training_set, altered_users, predictions, test_set):
    store_auc = [
    ]  # An empty list to store the AUC for each user that had an item removed from the training set
    popularity_auc = []  # To store popular AUC scores
    pop_items = np.array(test_set.sum(axis=0)).reshape(
        -1)  # Get sum of item iteractions to find most popular
Example 33
dataset = pd.read_csv('./data/dataset_users_match.csv', sep=',', index_col=0)
interact = pd.read_csv('./data/inter_matr.csv', sep=',', index_col=0)
colnames = list(interact.columns.values)
# for i in range(len(colnames)):
#     colnames[i] = colnames[i]+str('_chosen')
# interact.columns = colnames

interact_sparse = sparse.csr_matrix(interact)

users_chosen_train, users_chosen_test, users_users_altered = make_train(
    interact_sparse, pct_test=0.2)

alpha = 15
user_vecs, users_chosen_vecs = implicit.alternating_least_squares(
    (users_chosen_train * alpha).astype('double'),
    factors=20,
    regularization=0.1,
    iterations=200)

user_vecs = pd.DataFrame(user_vecs)
user_vecs.index = list(interact.index.values)
users_chosen_vecs = pd.DataFrame(users_chosen_vecs)
users_chosen_vecs.index = list(interact.columns.values)
"""Make recomendations"""
user_id = 141
position = list(interact.index.values).index(user_id)
num_items = 10
pref_vec = users_chosen_train[position, :].toarray(
)  # Get the ratings from the training set ratings matrix
pref_vec = pref_vec.reshape(-1)
Example 34
def recommender(customer_id, status):
    # Start time
    start = time.time()
    if status:
        printGreen('✔ RetailBox started..\t\t{0:.1f}s'.format(time.time() -
                                                              start))
    start = time.time()

    # Validate User Input
    validate_customer_id(customer_id)

    # Load Dataframe and create item_table, purchase matrix, etc.
    data = preprocess_data_rec_engine(status=True)

    item_table = data[0]
    purchase_sparse_matrix = data[1]
    customers = data[2]
    products = data[3]
    quantity = data[4]

    if status:
        printGreen('✔ Processed Data..\t\t{0:.1f}s'.format(time.time() -
                                                           start))
    start = time.time()

    # Split Data (Training/Test Split)
    training_test_split_data = split_data_mask(purchase_sparse_matrix,
                                               pct_test=0.2)

    product_training_set = training_test_split_data[0]
    product_test_set = training_test_split_data[1]
    product_user_altered = training_test_split_data[2]

    if status:
        printGreen(
            '✔ Split Data into Training and Test Sets..\t\t{0:.1f}s'.format(
                time.time() - start))
    start = time.time()

    # Train Recommendation Engine on given algorithm
    alpha = 15
    recommender_vecs = implicit.alternating_least_squares(
        (product_training_set * alpha).astype('double'),
        factors=20,
        regularization=0.1,
        iterations=50)

    user_vecs = recommender_vecs[0]
    item_vecs = recommender_vecs[1]

    customers_arr = np.array(customers)
    products_arr = np.array(products)

    if status:
        printGreen('✔ Recommender System Training Done..\t\t{0:.1f}s'.format(
            time.time() - start))
    start = time.time()

    # Lookup customer id
    cid = lookup_customer_id(customer_id)

    # Generate Recommendations for Customer
    rec_output = rec_items(cid, product_training_set, user_vecs, item_vecs,
                           customers_arr, products_arr, item_table)

    # Display Customer
    df = pd.read_pickle('../data/final/df_final.pkl')
    table_pickle_file = open('../data/final/df_customer_table.pkl', "rb")
    customer_table = pickle.load(table_pickle_file)
    table_pickle_file.close()
    search_customer(customer_id, df, customer_table)

    # Display Item Recommendations
    recommended_items_list = list_rec(rec_output)
    display_recommender_items(recommended_items_list)