def als_model(self, dataset):
    return WALSModel(
        dataset.n_students,
        dataset.n_courses,
        self.num_factors,
        regularization=self.regularization,
        unobserved_weight=0)
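None of the snippets on this page include their imports. A minimal shared preamble that makes them runnable, assuming TensorFlow 1.x (WALSModel lives in tf.contrib.factorization, which was removed in TensorFlow 2.x), would be:

# Shared preamble assumed by the examples below (TensorFlow 1.x only).
import numpy as np
import tensorflow as tf
from tensorflow.contrib.factorization import WALSModel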
Example 2
def run_wals(data,
             dim,
             reg,
             unobs,
             weights=False,
             wt_type=LINEAR_RATINGS,
             feature_wt_exp=None,
             obs_wt=LINEAR_OBS_W):
    """Create the WALSModel and input, row and col factor tensors.

  Inputs:
    data:           scipy coo_matrix of item ratings
    dim:            number of latent factors
    reg:            regularization constant
    unobs:          unobserved item weight
    weights:        True: set obs weights, False: obs weights = unobs weights
    wt_type:        feature weight type: linear (0) or log (1)
    feature_wt_exp: feature weight exponent constant
    obs_wt:         feature weight linear factor constant

  Outputs:
    input_tensor:   tensor holding the input ratings matrix
    row_factor:     tensor for row_factor
    col_factor:     tensor for col_factor
    model:          WALSModel instance
  """
    row_wts = None
    col_wts = None

    num_rows = data.shape[0]
    num_cols = data.shape[1]

    if weights:
        assert feature_wt_exp is not None
        row_wts = np.ones(num_rows)
        col_wts = make_wts(data, wt_type, obs_wt, feature_wt_exp, 0)

    row_factor = None
    col_factor = None

    with tf.Graph().as_default():

        # materialize the index pairs: tf.SparseTensor needs a sequence, and
        # zip() returns a lazy iterator in Python 3
        input_tensor = tf.SparseTensor(indices=list(zip(data.row, data.col)),
                                       values=(data.data).astype(np.float32),
                                       dense_shape=data.shape)

        model = WALSModel(num_rows,
                          num_cols,
                          dim,
                          unobserved_weight=unobs,
                          regularization=reg,
                          row_weights=row_wts,
                          col_weights=col_wts)

        # retrieve the row and column factors
        row_factor = model.row_factors[0]
        col_factor = model.col_factors[0]

    return input_tensor, row_factor, col_factor, model
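The make_wts helper called above is never defined on this page (Example 4 calls it as self._make_wts, and make_weights in Example 5 plays the same role). The sketch below is modeled on Google's wals_ml_engine sample, which these snippets appear to derive from, and is consistent with the call sites here; the 0/1 constants follow the wt_type convention in the docstring above, and everything else is an assumption, not the original helper.

LINEAR_RATINGS, LOG_RATINGS = 0, 1

def make_wts(data, wt_type, obs_wt, feature_wt_exp, axis):
    # Reciprocal of the number of observed entries along `axis`
    # (axis=0 gives per-column weights, axis=1 per-row weights).
    frac = np.array(1.0 / (data > 0.0).sum(axis))

    # Zero out rows/columns with no ratings (division by zero above).
    frac[np.ma.masked_invalid(frac).mask] = 0.0

    # Scale linearly by obs_wt, or raise to feature_wt_exp for log ratings.
    if wt_type == LOG_RATINGS:
        wts = np.array(np.power(frac, feature_wt_exp)).flatten()
    else:
        wts = np.array(obs_wt * frac).flatten()

    assert np.isfinite(wts).sum() == wts.shape[0]
    return wts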
Example 3
def get_model(data, ncomponents=10, unobserved_weight=0, regularization=0.05):
    nrows, ncols = data.shape
    r_weight = np.ones(nrows)
    c_weight = np.ones(ncols)

    with tf.Graph().as_default():
        tensor = tf.SparseTensor(np.column_stack((data.row, data.col)),
                                 (data.data).astype(np.float32), data.shape)
        model = WALSModel(nrows,
                          ncols,
                          ncomponents,
                          unobserved_weight,
                          regularization,
                          row_weights=r_weight,
                          col_weights=c_weight)
    return tensor, model.row_factors[0], model.col_factors[0], model
Example 4
    def _build_model(self):
        """
        构建wALS算法计算图
        :return:
        """

        num_rows = self.data.shape[0]
        num_cols = self.data.shape[1]

        # Weight-matrix initialization modes:
        # 1. User orientation: average over the missing values of each user
        # 2. Item orientation: average over the missing values of each item
        # Default to None so the attributes exist even when weighting is
        # disabled.
        self.row_wts = None
        self.col_wts = None
        if self.weights:
            if self.weight_type == 'user':
                self.row_wts = np.ones(num_rows)
                self.col_wts = self._make_wts(self.data, self.wt_type,
                                              self.obs_wt, self.feature_wt_exp,
                                              0)
            elif self.weight_type == 'item':
                self.col_wts = np.ones(num_cols)
                self.row_wts = self._make_wts(self.data, self.wt_type,
                                              self.obs_wt, self.feature_wt_exp,
                                              1)

        with tf.Graph().as_default():
            self.input_tensor = tf.SparseTensor(
                indices=list(zip(self.data.row, self.data.col)),
                values=(self.data.data).astype(np.float32),
                dense_shape=self.data.shape)
            self.model = WALSModel(num_rows,
                                   num_cols,
                                   self.dim,
                                   unobserved_weight=self.unobs,
                                   regularization=self.reg,
                                   row_weights=self.row_wts,
                                   col_weights=self.col_wts)

            self.row_factor = self.model.row_factors[0]
            self.col_factor = self.model.col_factors[0]
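_build_model reads several attributes that the snippet never defines. A hypothetical constructor matching those accesses is sketched below; every name and default value is inferred from the method body, not taken from the original class.

class WALSRecommender:
    # Hypothetical owner of _build_model above; all defaults are assumptions.
    def __init__(self, data, dim=10, unobs=0.01, reg=0.05,
                 weights=False, weight_type='user',
                 wt_type=0, obs_wt=100.0, feature_wt_exp=0.08):
        self.data = data                      # scipy coo_matrix of ratings
        self.dim = dim                        # number of latent factors
        self.unobs = unobs                    # weight on unobserved entries
        self.reg = reg                        # regularization constant
        self.weights = weights                # enable observation weighting
        self.weight_type = weight_type        # 'user' or 'item' orientation
        self.wt_type = wt_type                # 0 = linear, 1 = log
        self.obs_wt = obs_wt                  # linear weight factor
        self.feature_wt_exp = feature_wt_exp  # log-weight exponent
        self.row_wts = None
        self.col_wts = None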
Example 5
def train_model(train_sparse,
                test_sparse,
                num_users,
                num_movies,
                args,
                verbose=False):
    tf.logging.info('Train Start: {:%Y-%m-%d %H:%M:%S}'.format(
        datetime.datetime.now()))

    with tf.Graph().as_default(), tf.Session() as sess:

        row_weights = np.ones(num_users)
        col_weights = np.ones(num_movies)

        if args.col_weight_bool:
            col_weights = make_weights(train_sparse,
                                       args.col_weight_factor,
                                       axis=0)

        if args.row_weight_bool:
            row_weights = make_weights(train_sparse,
                                       args.row_weight_factor,
                                       axis=1)

        # create model
        model = WALSModel(num_users,
                          num_movies,
                          args.num_factors,
                          regularization=args.regularization,
                          unobserved_weight=args.unobserved_weight,
                          row_weights=row_weights,
                          col_weights=col_weights)

        # create sparse tensor

        # materialize the index pairs; zip() is a lazy iterator in Python 3
        input_tensor = tf.SparseTensor(
            indices=list(zip(train_sparse.row, train_sparse.col)),
            values=(train_sparse.data).astype(np.float32),
            dense_shape=train_sparse.shape)

        test_tensor = tf.SparseTensor(
            indices=list(zip(test_sparse.row, test_sparse.col)),
            values=(test_sparse.data).astype(np.float32),
            dense_shape=test_sparse.shape)

        # train model

        rmse_op = rmse(model, input_tensor) if verbose else None
        rmse_test_op = rmse(model, test_tensor)

        row_update_op = model.update_row_factors(sp_input=input_tensor)[1]
        col_update_op = model.update_col_factors(sp_input=input_tensor)[1]

        model.initialize_op.run()
        model.worker_init.run()
        for _ in range(args.epochs):
            # Update Users
            model.row_update_prep_gramian_op.run()
            model.initialize_row_update_op.run()
            row_update_op.run()
            # Update Items
            model.col_update_prep_gramian_op.run()
            model.initialize_col_update_op.run()
            col_update_op.run()

            if verbose:
                train_metric = rmse_op.eval()
                test_metric = rmse_test_op.eval()
                tf.logging.info('RMSE Train: {:,.3f}'.format(train_metric))
                tf.logging.info('RMSE Test:  {:,.3f}'.format(test_metric))
                # TODO Collect these in variable for graphing later

        row_factor = model.row_factors[0].eval()
        col_factor = model.col_factors[0].eval()

    tf.logging.info('Train Finish: {:%Y-%m-%d %H:%M:%S}'.format(
        datetime.datetime.now()))

    return row_factor, col_factor
Example 6
n_rows = len(users_from_idx)
n_cols = len(items_from_idx)
shape = (n_rows, n_cols)

P = tf.SparseTensor(indices, values, shape)

print(P)
print('Total values: {:,}'.format(n_rows * n_cols))

from tensorflow.contrib.factorization import WALSModel

k = 10
n = 10
reg = 1e-1

model = WALSModel(n_rows, n_cols, k, regularization=reg, unobserved_weight=0)

row_factors = tf.nn.embedding_lookup(params=model.row_factors,
                                     ids=tf.range(model._input_rows),
                                     partition_strategy="div")
col_factors = tf.nn.embedding_lookup(params=model.col_factors,
                                     ids=tf.range(model._input_cols),
                                     partition_strategy="div")

row_indices, col_indices = tf.split(P.indices, axis=1, num_or_size_splits=2)
gathered_row_factors = tf.gather(row_factors, row_indices)
gathered_col_factors = tf.gather(col_factors, col_indices)
approx_vals = tf.squeeze(
    tf.matmul(gathered_row_factors, gathered_col_factors, adjoint_b=True))
P_approx = tf.SparseTensor(indices=P.indices,
                           values=approx_vals,
                           dense_shape=P.dense_shape)
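The snippet breaks off after constructing P_approx. Because P_approx reuses P.indices, the reconstruction error at the observed entries can be read straight off the two value vectors; the sketch below is not part of the original and serves the same purpose as the undefined rmse() helper in Example 5.

# RMSE over observed entries only: P and P_approx share P.indices, so
# comparing value vectors suffices. The cast guards against the unknown
# dtype of the original `values`.
err = tf.cast(P.values, tf.float32) - approx_vals
rmse_op = tf.sqrt(tf.reduce_mean(tf.square(err)))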
Example 7
    with open(data_path, 'r') as f:
        data = json.load(f)

    indices = []
    values = []

    for idx, elem in enumerate(data):
        indices += zip([idx] * len(elem), elem)
        values += [1.0] * len(elem)
    with tf.Graph().as_default() as graph1:
        sp_mat = tf.SparseTensor(indices, values, [num_rows, num_cols])

        model = WALSModel(num_rows,
                          num_cols,
                          dimension,
                          0.5,
                          2.0,
                          row_weights=None,
                          col_weights=None)

        row_factors = model.row_factors[0]
        col_factors = model.col_factors[0]

        sess = tf.Session(graph=graph1)

        writer = tf.summary.FileWriter('walsmodels', graph1)

        row_update_op = model.update_row_factors(sp_mat)[1]
        col_update_op = model.update_col_factors(sp_mat)[1]

        sess.run(model.initialize_op)
        sess.run(model.worker_init)  # per-worker init, required before the update ops
Example 8
def als_model(self, dataset):
    return WALSModel(len(dataset["visitorid"].unique()),
                     len(dataset["itemid"].unique()),
                     self.num_factors,
                     regularization=self.regularization,
                     unobserved_weight=0)
Example 9
def reco(sess, inp, code, label, epsilon, train_dataset, dev_dataset, lr,
         weights_path):

    # Initialize hyperparameters
    # TODO: Proper tuning_threshold strategy, or is there a better stopping condition?
    # TODO: Grid search for reg_l2 tuning?  Currently only tune factor_dim
    factor_dim = 0
    reg_l2 = 0.1
    factor_loss_thresh = 1e-6
    tuning_thresh = 1e-6

    # Ratings matrix dimensions
    n_items = _train_utils.dataset_iter_len(
        sess,
        train_dataset.make_one_shot_iterator().get_next())
    n_users_train = 877
    n_users_dev = 110
    n_users_test = 110
    '''Placeholder labels
    label = np.random.randn(n_users_train + n_users_dev + n_users_test, 1)
    label = tf.convert_to_tensor(label, dtype=tf.float32)
    '''

    label_train = label[1:n_users_train + 1, -1]
    label_dev = label[n_users_train + 1:n_users_train + 1 + n_users_dev + 1,
                      -1]
    label_test = label[n_users_train + 1 + n_users_dev + 1:-1, -1]

    # Rating matrix
    # TODO: Random placeholder data for now.  Rating matrix must include all train/dev/test
    #       data.  Each row represents a user, and each column represents a feature. The label
    #       is to be included in the last feature column, with dev/test set labels removed.
    rating_matrix = np.random.randn(n_users_train + n_users_dev + n_users_test,
                                    n_items)

    input_tensor = tf.convert_to_tensor(rating_matrix, dtype=tf.float32)
    input_tensor = tf.contrib.layers.dense_to_sparse(input_tensor)

    # Tune model using increasing latent factor matrix dimension
    losscrit = np.inf
    while losscrit > tuning_thresh:

        factor_dim += 1

        # Weighted alternating least squares model (causes deprecation warning)
        model = WALSModel(n_users_train + n_users_dev + n_users_test,
                          n_items,
                          factor_dim,
                          regularization=reg_l2,
                          row_weights=None,
                          col_weights=None)

        # Retrieve row and column factors
        users_factor = model.row_factors[0]
        items_factor = model.col_factors[0]

        # Initialize training
        row_update_op = model.update_row_factors(sp_input=input_tensor)[1]
        col_update_op = model.update_col_factors(sp_input=input_tensor)[1]
        sess.run(model.initialize_op)
        sess.run(model.worker_init)

        # Update latent factor matrices via Alternating Least Squares until matrix decomposition converges
        u_factor_old = users_factor.eval(session=sess)
        i_factor_old = items_factor.eval(session=sess)
        factor_loss = np.inf
        while factor_loss > factor_loss_thresh:
            sess.run(model.row_update_prep_gramian_op)
            sess.run(model.initialize_row_update_op)
            sess.run(row_update_op)
            sess.run(model.col_update_prep_gramian_op)
            sess.run(model.initialize_col_update_op)
            sess.run(col_update_op)

            u_factor_new = users_factor.eval(session=sess)
            i_factor_new = items_factor.eval(session=sess)
            factor_loss = max(np.linalg.norm(u_factor_new - u_factor_old),
                              np.linalg.norm(i_factor_new - i_factor_old))

            u_factor_old = u_factor_new
            i_factor_old = i_factor_new

        # Predictions
        pred_fun = tf.matmul(users_factor, items_factor, transpose_b=True)
        pred = sess.run(pred_fun)
        pred_train = pred[1:n_users_train + 1, -1]
        pred_dev = pred[n_users_train + 1:n_users_train + 1 + n_users_dev + 1,
                        -1]
        pred_test = pred[n_users_train + 1 + n_users_dev + 1:-1, -1]

        # Performance
        loss_fun = tf.math.reduce_sum(tf.math.square(
            tf.abs(pred - label))) + tf.nn.l2_loss(
                users_factor) + tf.nn.l2_loss(items_factor)
        losscrit = sess.run(loss_fun)
        train_loss = sess.run(tf.reduce_mean(tf.abs(pred_train - label_train)))
        dev_loss = sess.run(tf.reduce_mean(tf.abs(pred_dev - label_dev)))
        test_loss = sess.run(tf.reduce_mean(tf.abs(pred_test - label_test)))
Example 10
def wals(id,
         from_date,
         to_date,
         predict_moment,
         dimension=30,
         weight=0.5,
         coef=2.0,
         n_iter=30):

    data_path = 'wp_' + from_date + '_' + to_date + '_sparse.json'

    deal_dict = np.load('dict_' + from_date + '_' + to_date +
                        '_for_sparse.npy')
    user_dict = np.load('user_' + from_date + '_' + to_date + '.npy')

    if id not in user_dict:
        return -1
    else:
        user_index = np.where(user_dict == id)[0][0]

    num_rows = len(user_dict)
    num_cols = len(deal_dict)

    connect('wprec', host='mongodb://10.102.61.251:27017')

    deals = WepickDeal.objects(pk__gte=predict_moment + ' 20',
                               pk__lte=predict_moment + ' 99')
    deal_slots = []
    deal_ids = []
    predict_input = []
    for elem in deals:
        dealid = elem['deal'].id
        if dealid in deal_dict:
            deal_slots.append(int(elem.id[-2:]))
            deal_ids.append(elem['deal'].id)

    deal_finder = dict(zip(deal_dict, range(num_cols)))

    with open(data_path, 'r') as f:
        data = json.load(f)

    indices = []
    values = []

    for idx, elem in enumerate(data):
        indices += zip([idx] * len(elem), elem)
        values += [1.0] * len(elem)
    with tf.Graph().as_default() as graph1:
        sp_mat = tf.SparseTensor(indices, values, [num_rows, num_cols])

        model = WALSModel(num_rows,
                          num_cols,
                          dimension,
                          weight,
                          coef,
                          row_weights=None,
                          col_weights=None)

        row_factors = model.row_factors[0]
        col_factors = model.col_factors[0]

        sess = tf.Session(graph=graph1)

        row_update_op = model.update_row_factors(sp_mat)[1]
        col_update_op = model.update_col_factors(sp_mat)[1]

        sess.run(model.initialize_op)
        sess.run(model.worker_init)  # per-worker init, required before the update ops
        for _ in range(n_iter):
            sess.run(model.row_update_prep_gramian_op)
            sess.run(model.initialize_row_update_op)
            sess.run(row_update_op)
            sess.run(model.col_update_prep_gramian_op)
            sess.run(model.initialize_col_update_op)
            sess.run(col_update_op)

    output_row = row_factors.eval(sess)
    output_col = col_factors.eval(sess)

    sess.close()

    results = []

    for i in range(len(deal_ids)):
        deal_index = deal_finder[deal_ids[i]]
        results.append({
            'id': deal_ids[i],
            'slot': deal_slots[i],
            'score': sum(output_row[user_index][:] * output_col[deal_index])
        })
    return results
Example 11
def wals_cate(from_date,
              to_date,
              dimension=10,
              weight=0.5,
              coef=2.0,
              n_iter=30):
    data_path = 'wp_' + from_date + '_' + to_date + '_cate.json'
    cate_dict = np.load('cate_dict.npy')
    user_dict = np.load('user_' + from_date + '_' + to_date + '_for_cate.npy')

    num_rows = len(user_dict)
    num_cols = len(cate_dict)

    with open(data_path, 'r') as f:
        data = json.load(f)

    indices = []
    values = []

    for idx, elem in enumerate(data):
        indices += zip([idx] * len(elem), elem)
        values += [1.0] * len(elem)

    with tf.Graph().as_default() as graph1:
        sp_mat = tf.SparseTensor(indices, values, [num_rows, num_cols])

        model = WALSModel(num_rows,
                          num_cols,
                          dimension,
                          weight,
                          coef,
                          row_weights=None,
                          col_weights=None)

        row_factors = model.row_factors[0]
        col_factors = model.col_factors[0]

        sess = tf.Session(graph=graph1)

        row_update_op = model.update_row_factors(sp_mat)[1]
        col_update_op = model.update_col_factors(sp_mat)[1]

        sess.run(model.initialize_op)
        sess.run(model.worker_init)  # per-worker init, required before the update ops
        for _ in range(n_iter):
            sess.run(model.row_update_prep_gramian_op)
            sess.run(model.initialize_row_update_op)
            sess.run(row_update_op)
            sess.run(model.col_update_prep_gramian_op)
            sess.run(model.initialize_col_update_op)
            sess.run(col_update_op)

    output_row = row_factors.eval(sess).tolist()
    output_col = col_factors.eval(sess).tolist()

    sess.close()

    # temporary naming scheme for the generated matrices
    random.seed()
    temp_num = str(random.randrange(100))

    user_temp_name = 'temp_user' + temp_num
    item_temp_name = 'temp_item' + temp_num

    with open('../' + user_temp_name + '.json', 'w') as f:
        json.dump(output_row, f)
    with open('../' + item_temp_name + '.json', 'w') as f:
        json.dump(output_col, f)

    print('files saved')

    return dimension, user_temp_name, item_temp_name