Example #1
File: ibmf.py  Project: yw4509/lkpy
    def fit(self, ratings, **kwargs):
        timer = util.Stopwatch()

        users = pd.Index(np.unique(ratings['user']))
        items = pd.Index(np.unique(ratings['item']))

        u_no = users.get_indexer(ratings['user'])
        i_no = items.get_indexer(ratings['item'])
        mean = np.mean(ratings['rating'].values,
                       dtype='f4')  # TensorFlow uses 32-bit floats

        model = self._build_model(len(users), len(items), mean)

        _log.info('[%s] training model', timer)
        model.fit([u_no, i_no],
                  ratings['rating'],
                  epochs=self.epochs,
                  batch_size=self.batch_size)

        _log.info('[%s] model finished', timer)

        self.user_index_ = users
        self.item_index_ = items
        self.model = model

        return self
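The two `get_indexer` calls are what turn arbitrary user/item ids into the contiguous 0-based positions the embedding model needs. A minimal standalone sketch of that mapping, with illustrative data:

import numpy as np
import pandas as pd

# Illustrative ratings with non-contiguous, unordered ids.
ratings = pd.DataFrame({
    'user': [10, 10, 42, 7],
    'item': ['a', 'b', 'a', 'c'],
    'rating': [4.0, 3.0, 5.0, 2.0],
})

users = pd.Index(np.unique(ratings['user']))  # Index([7, 10, 42])
items = pd.Index(np.unique(ratings['item']))  # Index(['a', 'b', 'c'])

u_no = users.get_indexer(ratings['user'])  # array([1, 1, 2, 0])
i_no = items.get_indexer(ratings['item'])  # array([0, 1, 0, 2])

The positional arrays stay aligned row-for-row with the original ratings, so they can feed straight into `model.fit`.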
Example #2
    def fit(self, ratings, **kwargs):
        """
        Train a model.

        The model-training process depends on ``save_nbrs`` and ``min_sim``, but *not* on other
        algorithm parameters.

        Args:
            ratings(pandas.DataFrame):
                (user,item,rating) data for computing item similarities.
        """
        util.check_env()
        # Training proceeds in 2 steps:
        # 1. Normalize item vectors to be mean-centered and unit-normalized
        # 2. Compute similarities with pairwise dot products
        self._timer = util.Stopwatch()

        _logger.debug('[%s] beginning fit, memory use %s', self._timer,
                      util.max_memory())
        _logger.debug('[%s] using CSR kernel %s', self._timer, csrk.name)

        init_rmat, users, items = sparse_ratings(ratings)
        n_items = len(items)
        _logger.info(
            '[%s] made sparse matrix for %d items (%d ratings from %d users)',
            self._timer, len(items), init_rmat.nnz, len(users))
        _logger.debug('[%s] made matrix, memory use %s', self._timer,
                      util.max_memory())

        rmat, item_means = self._mean_center(ratings, init_rmat, items)
        _logger.debug('[%s] centered, memory use %s', self._timer,
                      util.max_memory())

        rmat = self._normalize(rmat)
        _logger.debug('[%s] normalized, memory use %s', self._timer,
                      util.max_memory())

        _logger.info('[%s] computing similarity matrix', self._timer)
        smat = self._compute_similarities(rmat)
        _logger.debug('[%s] computed, memory use %s', self._timer,
                      util.max_memory())

        _logger.info('[%s] got neighborhoods for %d of %d items', self._timer,
                     np.sum(np.diff(smat.rowptrs) > 0), n_items)

        _logger.info('[%s] computed %d neighbor pairs', self._timer, smat.nnz)

        self.item_index_ = items
        self.item_means_ = item_means
        self.item_counts_ = np.diff(smat.rowptrs)
        self.sim_matrix_ = smat
        self.user_index_ = users
        self.rating_matrix_ = init_rmat
        # create an inverted similarity matrix for efficient scanning
        self._sim_inv_ = smat.transpose()
        _logger.info('[%s] transposed matrix for optimization', self._timer)
        _logger.debug('[%s] done, memory use %s', self._timer,
                      util.max_memory())

        return self
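The two steps named in the comments amount to computing adjusted-cosine similarities between item columns. A dense NumPy sketch of the same math (the project itself works on sparse CSR matrices):

import numpy as np

# Toy user x item rating matrix; 0 marks a missing rating.
R = np.array([[4., 0., 3.],
              [5., 2., 0.],
              [3., 4., 1.]])
mask = R > 0

# Step 1: mean-center each item's observed ratings, then unit-normalize.
means = R.sum(axis=0) / mask.sum(axis=0)
C = np.where(mask, R - means, 0.0)
norms = np.linalg.norm(C, axis=0)
norms[norms == 0] = 1.0  # guard items with zero variance
V = C / norms

# Step 2: pairwise dot products give cosine similarities between items.
S = V.T @ V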
Example #3
def main(args):
    mod_name = args.get('-m')
    input_path = args.get('--splits')
    output = args.get('-o')
    n_recs = int(args.get('-n'))
    model = args.get('ALGO')

    _log.info(f'importing from module {mod_name}')
    algorithms = importlib.import_module(mod_name)

    algo = getattr(algorithms, model)
    algo = Recommender.adapt(algo)

    path = Path(input_path)
    dest = Path(output)
    dest.mkdir(exist_ok=True, parents=True)

    ds_def = getattr(datasets, path.name, None)

    for file in path.glob("test-*"):
        test = pd.read_csv(file, sep=',')
        suffix = file.name[5:]
        train_file = path / f'train-{suffix}'
        timer = util.Stopwatch()

        if 'index' in test.columns:
            _log.info('setting test index')
            test = test.set_index('index')
        else:
            _log.warning('no index column found in %s', file.name)

        if train_file.exists():
            _log.info('[%s] loading training data from %s', timer, train_file)
            train = pd.read_csv(train_file, sep=',')
        elif ds_def is not None:
            _log.info('[%s] extracting training data from data set %s', timer,
                      path.name)
            train = datasets.ds_diff(ds_def.ratings, test)
            train.reset_index(drop=True, inplace=True)
        else:
            _log.error('could not find training data for %s', file.name)
            continue

        _log.info('[%s] Fitting the model', timer)
        # We train isolated to manage resource use
        model = batch.train_isolated(algo, train)
        try:
            _log.info('[%s] generating recommendations for unique users',
                      timer)
            users = test.user.unique()
            recs = batch.recommend(model, users, n_recs)
            _log.info('[%s] writing recommendations to %s', timer, dest)
            recs.to_csv(dest / f'recs-{suffix}', index=False)

            if isinstance(algo, Predictor) and not args['--no-predict']:
                _log.info('[%s] generating predictions for user-item', timer)
                preds = batch.predict(model, test)
                preds.to_csv(dest / f'pred-{suffix}', index=False)
        finally:
            model.close()
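Test and train splits are paired purely by file-name suffix. A minimal sketch of that convention, using a hypothetical split directory:

from pathlib import Path

path = Path('splits/ml-100k')  # hypothetical split directory
for test_file in path.glob('test-*'):
    suffix = test_file.name[len('test-'):]  # e.g. 'fold1.csv'
    train_file = path / f'train-{suffix}'
    print(test_file.name, '->', train_file.name)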
Example #4
def test_stopwatch_minutes():
    w = lku.Stopwatch()
    w.stop()
    w.start_time = w.stop_time - 62
    s = str(w)
    p = re.compile(r'1m2\.\d\ds')
    assert p.match(s)
Example #5
def test_stopwatch_hours():
    w = lku.Stopwatch()
    w.stop()
    w.start_time = w.stop_time - 3663
    s = str(w)
    p = re.compile(r'1h1m3\.\d\ds')
    assert p.match(s)
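Both tests constrain how `Stopwatch.__str__` renders durations. A hypothetical formatter that satisfies these patterns (LensKit's actual implementation may differ in detail):

def fmt_elapsed(seconds):
    # Hypothetical formatter matching the patterns the tests accept.
    if seconds < 1:
        return '{:.0f}ms'.format(seconds * 1000)
    if seconds < 60:
        return '{:.2f}s'.format(seconds)
    if seconds < 3600:
        m, s = divmod(seconds, 60)
        return '{:.0f}m{:.2f}s'.format(m, s)
    h, rem = divmod(seconds, 3600)
    m, s = divmod(rem, 60)
    return '{:.0f}h{:.0f}m{:.2f}s'.format(h, m, s)

assert fmt_elapsed(62) == '1m2.00s'      # matches r'1m2\.\d\ds'
assert fmt_elapsed(3663) == '1h1m3.00s'  # matches r'1h1m3\.\d\ds'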
Example #6
    def fit(self, data):
        """
        Run the optimization problem to learn W.

        Args:
            data (DataFrame): a data frame of ratings. Must have at least `user`,
                              `item`, and `rating` columns.

        Returns:
            SLIM: the fitted SLIM algorithm object.
        """
        self._timer = util.Stopwatch()
        selector_data = data.copy(deep=True)
        self.selector.fit(selector_data)

        if self.binary:
            data = data.copy(deep=True)
            data['rating'] = 1

        rmat, uidx, iidx = sparse_ratings(data)

        # Optimize each item independently on different threads using joblib
        item_coeff_array_tuples = Parallel(n_jobs=self.nprocs)(
            delayed(self._fs_train_item)(item, iidx, data)
            for item in iidx.values)

        _logger.info('[%s] completed calculating coefficients for %s items',
                     self._timer, rmat.ncols)

        coeff_row = np.array([], dtype=np.int32)
        coeff_col = np.array([], dtype=np.int32)
        coeff_values = np.array([], dtype=np.float64)

        for itemid, i_pos, ncoeff_row, ncoeff_col, ncoeff_val in item_coeff_array_tuples:

            # Add coefficients with proper indexes for sparse matrix
            coeff_row = np.append(coeff_row, ncoeff_row)
            coeff_col = np.append(coeff_col, ncoeff_col)
            coeff_values = np.append(coeff_values, ncoeff_val)

        _logger.info('[%s] completed unpacking %s coefficients for %s items',
                     self._timer, len(coeff_values), rmat.ncols)
        coeff_row = np.require(coeff_row, dtype=np.int32)
        coeff_col = np.require(coeff_col, dtype=np.int32)
        coeff_values = np.require(coeff_values, dtype=np.float64)

        # Create sparse coefficient matrix
        self.coefficients_ = CSR.from_coo(coeff_row, coeff_col, coeff_values,
                                          (len(iidx), len(iidx))).to_scipy()

        self.user_index_ = uidx
        self.item_index_ = iidx
        self.ratings_matrix_ = rmat

        return self
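The final step packs the per-item COO triplets into one sparse coefficient matrix. The SciPy equivalent of that `from_coo(...).to_scipy()` round trip, with illustrative triplets:

import numpy as np
import scipy.sparse as sps

# Illustrative COO triplets: coefficient w[row, col] = value.
rows = np.array([0, 0, 2], dtype=np.int32)
cols = np.array([1, 2, 0], dtype=np.int32)
vals = np.array([0.5, 0.25, 0.75])

n = 3  # number of items
coeffs = sps.csr_matrix((vals, (rows, cols)), shape=(n, n))
# Scoring a user is then a sparse product: scores = r_u @ coeffs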
Example #7
    def fit(self, pruned_data):

        self.timer = util.Stopwatch()
        self.user_index = pruned_data.set_index('user')['item']
        self.user_data['count'] = pruned_data.groupby('user')['SO'].count()
        self.user_data.reset_index(inplace=True)
        self.item_data['count'] = pruned_data.groupby('item')['SO'].count()
        self.item_data.reset_index(inplace=True)
        user_prof = self.get_userfeature(pruned_data)
        item_prof = self.get_itemfeature(pruned_data)
        self.similarity_matrix = self.cosine_sim(user_prof, item_prof)
        #_logger.info('[%s] fitting LDA model', self.timer)

        return self
Example #8
    def fit(self, pruned_data):

        self.timer = util.Stopwatch()
        self.review_data = pruned_data
        only_rev = pruned_data.dropna()

        item_rev = pd.DataFrame({'review': only_rev.groupby(['item']).review.apply(lambda x: ' '.join(x))})
        item_rev.reset_index(inplace=True)

        item_rev['processed_reviews'] = item_rev['review'].apply(lambda row: self.process(row))
        self.item_data = item_rev

        tf_idf_mat = self.tf_idf(self.item_data, 'processed_reviews')
        self.similarity_matrix = self.cosine_sim(tf_idf_mat)

        _logger.info('[%s] fitting tfidf model', self.timer)

        return self
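The helpers `self.tf_idf` and `self.cosine_sim` are not shown; a plausible scikit-learn sketch of the same pipeline, with made-up review text:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = [
    'great phone with a great camera',  # item 0's joined reviews
    'battery life could be better',     # item 1's joined reviews
]

vec = TfidfVectorizer()
tfidf = vec.fit_transform(docs)  # items x terms sparse matrix
sim = cosine_similarity(tfidf)   # items x items similarity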
Example #9
    def fit(self, ratings):
        """
        Train a model.

        The model-training process depends on ``save_nbrs`` and ``min_sim``, but *not* on other
        algorithm parameters.

        Args:
            ratings(pandas.DataFrame):
                (user,item,rating) data for computing item similarities.
        """
        # Training proceeds in 2 steps:
        # 1. Normalize item vectors to be mean-centered and unit-normalized
        # 2. Compute similarities with pairwise dot products
        self._timer = util.Stopwatch()

        init_rmat, users, items = matrix.sparse_ratings(ratings)
        n_items = len(items)
        _logger.info(
            '[%s] made sparse matrix for %d items (%d ratings from %d users)',
            self._timer, len(items), init_rmat.nnz, len(users))

        rmat, item_means = self._mean_center(ratings, init_rmat, items)

        rmat = self._normalize(rmat)

        _logger.info('[%s] computing similarity matrix', self._timer)
        smat = self._compute_similarities(rmat)

        _logger.info('[%s] got neighborhoods for %d of %d items', self._timer,
                     np.sum(np.diff(smat.rowptrs) > 0), n_items)

        _logger.info('[%s] computed %d neighbor pairs', self._timer, smat.nnz)

        self.item_index_ = items
        self.item_means_ = item_means
        self.item_counts_ = np.diff(smat.rowptrs)
        self.sim_matrix_ = smat
        self.user_index_ = users
        self.rating_matrix_ = init_rmat

        return self
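`np.diff(smat.rowptrs)` counts the stored neighbors per item because `rowptrs` plays the role of SciPy's `indptr`: consecutive differences give each row's nonzero count. A SciPy sketch of the same trick:

import numpy as np
import scipy.sparse as sps

S = sps.csr_matrix(np.array([[0., 0.3, 0.],
                             [0., 0.,  0.],
                             [0.1, 0., 0.2]]))

counts = np.diff(S.indptr)        # neighbors stored per item row: [1, 0, 2]
n_with_nbrs = np.sum(counts > 0)  # items that got any neighborhood: 2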
Example #10
    def fit(self, pruned_data):

        self.timer = util.Stopwatch()
        self.review_data = pruned_data
        only_rev = pruned_data.dropna()

        item_rev = pd.DataFrame({
            'review':
            only_rev.groupby(['item']).review.apply(lambda x: ' '.join(x))
        })
        item_rev.reset_index(inplace=True)

        #item_rev['processed_reviews'] = item_rev['review'].apply(lambda row: self.process(row))
        self.item_data = item_rev
        self.LDA_matrix = self.LDA(self.item_data, 'review')
        #self.LDA_matrix = self.LDA(self.item_data, 'processed_reviews')
        self.user_index = self.review_data.set_index('user')['item']
        self.item2index = pd.Index(self.item_data.item.unique(), name='item')
        _logger.info('[%s] fitting LDA model', self.timer)

        return self
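`self.LDA` is not shown; a plausible scikit-learn sketch of fitting a topic matrix over joined item reviews (illustrative text, assumed helper behavior):

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

docs = [
    'great phone great camera',    # item 0's joined reviews
    'battery life battery drain',  # item 1's joined reviews
]

counts = CountVectorizer().fit_transform(docs)
lda = LatentDirichletAllocation(n_components=2, random_state=0)
topic_matrix = lda.fit_transform(counts)  # items x topics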
Example #11
    def fit(self, ratings, **kwargs):
        timer = util.Stopwatch()
        normed = self.bias.fit_transform(ratings, indexes=True)
        model = self._build_model(len(self.bias.user_offsets_),
                                  len(self.bias.item_offsets_))

        _log.info('[%s] training model', timer)
        model.fit([normed['uidx'], normed['iidx']], normed['rating'],
                  epochs=self.epochs, batch_size=self.batch_size)

        _log.info('[%s] model finished, extracting weights', timer)
        self.user_features_ = model.get_layer('user-embed').get_weights()[0]
        self.item_features_ = model.get_layer('item-embed').get_weights()[0]

        self.global_bias_ = self.bias.mean_
        self.user_bias_ = self.bias.user_offsets_.values
        self.item_bias_ = self.bias.item_offsets_.values
        self.user_index_ = self.bias.user_index
        self.item_index_ = self.bias.item_index

        return self
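The weight extraction relies on named embedding layers. A hedged Keras sketch of the dot-product model that `_build_model` plausibly creates, using the layer names the example reads back (dimensions illustrative):

import numpy as np
from tensorflow import keras

n_users, n_items, n_dims = 100, 50, 8

uin = keras.Input(shape=(1,))
iin = keras.Input(shape=(1,))
uvec = keras.layers.Embedding(n_users, n_dims, name='user-embed')(uin)
ivec = keras.layers.Embedding(n_items, n_dims, name='item-embed')(iin)
out = keras.layers.Dot(axes=-1)([uvec, ivec])
model = keras.Model([uin, iin], out)
model.compile('adam', 'mse')

# After model.fit(...), the learned factors are the embedding weights:
user_features = model.get_layer('user-embed').get_weights()[0]  # (100, 8)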
Example #12
File: bpr.py  Project: yw4509/lkpy
    def fit(self, ratings, **kwargs):
        timer = util.Stopwatch()
        rng = util.rng(self.rng_spec)

        matrix, users, items = sparse_ratings(ratings[['user', 'item']])

        _log.info('[%s] setting up model', timer)
        train, model = self._build_model(len(users), len(items))

        _log.info('[%s] preparing training dataset', timer)
        train_data = BprInputs(matrix, self.batch_size, self.neg_count, rng)

        _log.info('[%s] training model', timer)
        train.fit(train_data, epochs=self.epochs)

        _log.info('[%s] model finished', timer)

        self.user_index_ = users
        self.item_index_ = items
        self.model = model

        return self
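`BprInputs` is project code that presumably streams (user, positive item, negative item) training triples. A NumPy sketch of that sampling step, under that assumption:

import numpy as np

rng = np.random.default_rng(42)
n_items = 100
# Items each user has interacted with (illustrative).
user_pos = {0: {3, 17, 42}, 1: {5, 99}}

def sample_triple(user):
    pos = rng.choice(list(user_pos[user]))
    neg = rng.integers(n_items)
    while neg in user_pos[user]:  # resample until truly negative
        neg = rng.integers(n_items)
    return user, pos, neg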
Example #13
    def fit(self, ratings):
        """
        Train a model.

        The model-training process depends on ``save_nbrs`` and ``min_sim``, but *not* on other
        algorithm parameters.

        Args:
            ratings(pandas.DataFrame):
                (user,item,rating) data for computing item similarities.
        """
        # Training proceeds in 2 steps:
        # 1. Normalize item vectors to be mean-centered and unit-normalized
        # 2. Compute similarities with pairwise dot products
        self._timer = util.Stopwatch()

        init_rmat, users, items = matrix.sparse_ratings(ratings)
        '''
        # Find User Rating to remove for experimenting with Unlearn Algorithm
        # Try to Find non trivial rating items to remove 
        for index, row in ratings.iterrows():
            if items.get_loc(row['item']) in [17,138,22,83,76,31,92]:
                #print(row['user'],row['item'],index,users.get_loc(row['user']),items.get_loc(row['item']))
                pass
        '''
        n_items = len(items)
        _logger.info(
            '[%s] made sparse matrix for %d items (%d ratings from %d users)',
            self._timer, len(items), init_rmat.nnz, len(users))

        start = time.time()
        rmat_scipy = init_rmat.to_scipy()

        self._compute_similarities_unlearn_min_centering_sparse_vectorize(
            rmat_scipy, items, users)
        end = time.time()
        learn_unlearn_time = end - start
        print("Unlearn Supported Learning: {}".format(end - start))

        rows, cols, vals = self.smat_unlearn_sparse_csr
        self.smat_unlearn_sparse = sps.csr_matrix((vals, (rows, cols)),
                                                  shape=(self.M, self.M))
        # Print OUT Similarity Matrix to Verify Completeness
        #print(self.smat_unlearn_sparse)

        start = time.time()
        self._unlearn_min_centering_sparse(54, 17, rmat_scipy,
                                           self.smat_unlearn_sparse)
        end = time.time()
        unlearn_time = end - start
        print("Unlearn: {}".format(end - start))

        start = time.time()
        rmat, item_means = self._mean_center(ratings, init_rmat, items, users)

        rmat = self._normalize(rmat)
        _logger.info('[%s] computing similarity matrix', self._timer)
        smat = self._compute_similarities(rmat, items, users)

        end = time.time()
        native_learn_time = end - start
        # Print OUT Similarity Matrix to Verify Completeness
        #print(smat.to_scipy())
        print("Native Learning: {}".format(end - start))

        _logger.info('[%s] got neighborhoods for %d of %d items', self._timer,
                     np.sum(np.diff(smat.rowptrs) > 0), n_items)

        _logger.info('[%s] computed %d neighbor pairs', self._timer, smat.nnz)

        self.item_index_ = items
        self.item_means_ = item_means
        self.item_counts_ = np.diff(smat.rowptrs)
        self.sim_matrix_ = smat
        self.user_index_ = users
        self.rating_matrix_ = init_rmat

        # Save the Time Cost evaluation result
        #f = open("output_matrix.csv","a+")
        #f.write("{},{},{},{}\n".format(init_rmat.nnz ,native_learn_time,learn_unlearn_time,unlearn_time))
        #f.close()
        return self
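The unlearning helpers are not shown. For the plain (uncentered) case where S = RᵀR, removing one rating only touches one row and column of S, which is what makes unlearning cheaper than refitting; a dense sketch of that update (the mean-centering this class uses adds further correction terms):

import numpy as np

R = np.array([[4., 0., 3.],
              [5., 2., 0.]])
S = R.T @ R  # item-item dot products

# Unlearn rating R[u, i] without refitting from scratch.
u, i = 0, 2
r = R[u].copy()
S[i, :] -= R[u, i] * r   # remove its cross terms in row i
S[:, i] -= R[u, i] * r   # ... and column i
S[i, i] += R[u, i] ** 2  # diagonal was subtracted twice
R[u, i] = 0.0
assert np.allclose(S, R.T @ R)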
Example #14
def test_stopwatch_instant():
    w = lku.Stopwatch()
    assert w.elapsed() > 0
Example #15
def test_stopwatch_stop():
    w = lku.Stopwatch()
    time.sleep(0.5)
    w.stop()
    time.sleep(0.5)
    assert w.elapsed() >= 0.45
Example #16
def test_stopwatch_str():
    w = lku.Stopwatch()
    time.sleep(0.5)
    s = str(w)
    assert s.endswith('ms')
Example #17
def test_stopwatch_long_str():
    w = lku.Stopwatch()
    time.sleep(1.2)
    s = str(w)
    assert s.endswith('s')
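Taken together, these tests pin down the Stopwatch contract: timing starts on construction, `stop()` freezes `elapsed()`, and `str()` appends a unit suffix. A hypothetical class consistent with that contract (not lkpy's actual code):

import time

class Stopwatch:
    # Hypothetical sketch matching the tests' usage of start_time/stop_time.
    def __init__(self):
        self.start_time = time.perf_counter()  # starts on construction
        self.stop_time = None

    def stop(self):
        self.stop_time = time.perf_counter()

    def elapsed(self):
        # While running, measure against the current clock.
        stop = self.stop_time if self.stop_time is not None else time.perf_counter()
        return stop - self.start_time

    def __str__(self):
        e = self.elapsed()
        if e < 1:
            return '{:.0f}ms'.format(e * 1000)
        # Minute/hour branches would follow the fmt_elapsed sketch after Example #5.
        return '{:.2f}s'.format(e)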