Example #1
def fit(self,
        X,
        y,
        order=2,
        rank=10,
        lr=0.001,
        n_epochs=1,
        batch_size=100,
        std=0.001,
        lda=1e-6,
        log_dir='/tmp/jprior/logs',
        verbosity=0):
    self._clf = TFFMRegressor(**self._sub_params)
    '''
        seed=0,
        order=order,
        rank=rank,
        optimizer=tf.train.FtrlOptimizer(learning_rate=lr),
        n_epochs=n_epochs,
        batch_size=batch_size,
        # smaller init_std -> lower contribution of higher order terms
        init_std=std,
        reg=lda,
        #input_type='sparse',
        log_dir=log_dir,
        verbose=verbosity
    )
    '''
    # tffm doesn't deal with DataFrames correctly (although it tries...)
    self._clf.fit(X[self.inputs].values, y.values, show_progress=True)
    return
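
The **self._sub_params expansion above assumes the surrounding base class has already collected the constructor arguments. A minimal sketch of how that dict might be assembled, mirroring the commented-out block (hypothetical; the real ModelBase may build it differently):

import tensorflow as tf

# Hypothetical helper only: maps the fit() keyword arguments onto
# TFFMRegressor's constructor arguments, as the commented-out block suggests.
def build_sub_params(order, rank, lr, n_epochs, batch_size, std, lda,
                     log_dir, verbosity):
    return dict(
        seed=0,
        order=order,
        rank=rank,
        optimizer=tf.train.FtrlOptimizer(learning_rate=lr),
        n_epochs=n_epochs,
        batch_size=batch_size,
        init_std=std,  # smaller init_std -> lower contribution of higher order terms
        reg=lda,
        log_dir=log_dir,
        verbose=verbosity,
    )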
Example #2
    def __init__(self, wfm_data, m_name, order, k, bs, lr, init, reg):
        self.data = wfm_data

        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
        #os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        #os.environ["CUDA_VISIBLE_DEVICES"] = "1"
        #session_config = tf.ConfigProto(log_device_placement=False, device_count={'GPU': 1})

        self.lr = lr
        self.num_cand = 1000
        self.m_name = m_name

        self.model = TFFMRegressor(
            order=order,
            rank=k,
            optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
            session_config=tf.ConfigProto(log_device_placement=False, device_count={'GPU': 1}),
            n_epochs=1,
            batch_size=bs,
            init_std=init,
            reg=reg,
            input_type='sparse',
            seed=42,
        )

        if m_name == 'bpr':
            self.model.core.mf = True

        if m_name == 'bpr' or m_name == 'fmp' or m_name == 'wfmp':
            self.model.core.loss_function = ut.loss_bpr

        if m_name == 'wfm' or m_name == 'wfmp':
            self.model.core.G = list(self.data.gr_train.values())
            self.model.core.gamma_init = np.array(self.data.dr.weights).astype(np.float32)
            if self.data.w_init == 'all-one' or self.data.w_init == 'all-diff':
                self.model.core.M = np.repeat(True, len(self.data.dr.weights))
            else:
                self.model.core.M = np.append([False, False], np.repeat(True, len(self.data.dr.weights) - 2))

        fit_methods = {'bpr': 'fit_bpr', 'fm': 'fit', 'fmp': 'fit_bpr', 'wfm': 'fit', 'wfmp': 'fit_bpr', 'wmf': 'fit'}

        self.fit_method = fit_methods[m_name]
        self.c = None
        if m_name == 'wmf':
            self.c = self.data.dr.c
            self.model.core.has_conf = True

        print('preparing test matrix...')
        if self.data.dataset == 'frappe':
            self.X_test = self.get_test_matrix_opr()
        else:
            self.X_test, self.rel_c = self.get_test_matrix_ub()
Example #3
class ArtistResponse(object):
    def __init__(self):
        self.model = TFFMRegressor(
            optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
            n_epochs=1000,
            input_type='sparse')

    def generate_feature_matrix(self, info, artist_names, favored_artists,
                                column_map):
        """
        Generate the feature matrix
        :param info: User information
        :param artist_names: Artists we want to predict for
        :param column_map: Feature vector column mapping
        :return: sparse feature matrix
        """
        # We create a matrix of feature vectors for each potential artist
        X = np.zeros((len(artist_names), len(column_map)))

        # Feature matrix will have the same values for the user information fields
        X[:, 0] = info["age"]
        X[:, column_map[f"country_{info['country']['name']}"]] = 1

        if info["gender"] is not None:
            X[:, column_map[f"gender_{info['gender']}"]] = 1

        # Set the proper one-hot vector for artist
        for i, name in enumerate(favored_artists):
            X[i, column_map[f"artistName_{name}"]] = 1

        return sparse.csr_matrix(X)

    def get_top_predicted_artists(self, info, column_map, top_artists, n=10):
        """
        Get the top predicted artists for a user
        :param info: User information
        :param column_map: Feature vector column mapping
        :param top_artists: Artists we want to predict for
        :param n: How many artists we want to return
        :return: list of the top predicted artists in descending order
        """
        X = self.generate_feature_matrix(info, top_artists,
                                         info["artists_names"], column_map)
        self.model.core.set_num_features(X.shape[1])
        self.model.load_state("tffm_model/")
        predictions = self.model.predict(X)
        predicted_artists = list(
            map(lambda artist: top_artists[artist],
                np.argsort(predictions)[::-1]))
        return predicted_artists[:n]
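
A hedged usage sketch for ArtistResponse: every value below is invented for illustration, the real column_map comes from the training pipeline, and a trained state is assumed to exist under tffm_model/.

# Hypothetical usage -- all names and indices here are made up.
responder = ArtistResponse()
info = {
    "age": 27,
    "country": {"name": "Iceland"},
    "gender": "f",
    "artists_names": ["radiohead", "bjork"],
}
column_map = {
    "age": 0,  # column 0 carries the raw age value
    "country_Iceland": 1,
    "gender_f": 2,
    "artistName_radiohead": 3,
    "artistName_bjork": 4,
}
top = responder.get_top_predicted_artists(info, column_map,
                                          ["radiohead", "bjork"], n=2)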
Example #4
    def tffm(self):
        show_progress = not self.onlyResults
        X_train = self.X_train.todense()
        y_train = np.transpose(self.y_train).flatten()
        X_test = self.X_test.todense()
        y_test = np.transpose(self.y_test).flatten()
        if self.onlyResults:
            environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

        model = TFFMRegressor(
            order=2,
            rank=4,
            optimizer=tf.train.FtrlOptimizer(learning_rate=0.1),
            n_epochs=100,
            batch_size=-1,
            init_std=0.001,
            input_type='dense')

        model.fit(X_train, y_train, show_progress=show_progress)
        predictions = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        if not self.onlyResults:
            print('RMSE: {:.6f}'.format(rmse))
        model.destroy()
        if self.onlyResults:
            print("Completed tffm evaluation.")
        return rmse
Example #5
    def tffm(self):
        # show_progress = True if not self.onlyResults else False
        X_train = self.X_train.todense()
        y_train = np.transpose(self.y_train).flatten()
        X_test = self.X_test.todense()
        y_test = np.transpose(self.y_test).flatten()
        # if self.onlyResults: environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

        model = TFFMRegressor(
            order=2,
            rank=4,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
            n_epochs=100,
            batch_size=-1,
            init_std=0.001,
            input_type='dense'
        )

        model.fit(X_train, y_train, show_progress=True)
        predictions = model.predict(X_test)

        prec = precision_score(y_test, predictions.round(), average='weighted')
        rec = recall_score(y_test, predictions.round(), average='weighted') 
        fmeasure = 2*((prec*rec)/(prec+rec))
        auc = roc_auc_score(y_test, predictions, average='weighted')
        rmse = np.sqrt(mean_squared_error(y_test, predictions))

        model.destroy()
        print("Completed tffm evaluation.")
        return (auc, rmse)
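
Note that this second variant scores the regressor as a binary classifier by rounding its continuous predictions before computing precision and recall; if the targets really are 0/1 labels, tffm's TFFMClassifier (imported in a later example on this page) would be the more direct choice.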
Example #6
class FM(ModelBase):
    def get_param_dist(self, X):
        num_rows = X.shape[0]
        num_features = X[self.inputs].shape[1]
        param_dist = {
            'rank': sp_randint(1, num_features),
            'batch_size': sp_randint(1, num_rows),
            'lr': sp_uniform(loc=0.001, scale=0.01),
        }
        return param_dist

    def fit(self,
            X,
            y,
            order=2,
            rank=10,
            lr=0.001,
            n_epochs=1,
            batch_size=100,
            std=0.001,
            lda=1e-6,
            log_dir='/tmp/jprior/logs',
            verbosity=0):
        self._clf = TFFMRegressor(**self._sub_params)
        '''
            seed=0,
            order=order,
            rank=rank,
            optimizer=tf.train.FtrlOptimizer(learning_rate=lr),
            n_epochs=n_epochs,
            batch_size=batch_size,
            # smaller init_std -> lower contribution of higher order terms
            init_std=std,
            reg=lda,
            #input_type='sparse',
            log_dir=log_dir,
            verbose=verbosity
        )
        '''
        # tffm doesn't deal with DataFrames correctly (although it tries...)
        self._clf.fit(X[self.inputs].values, y.values, show_progress=True)
        return
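
get_param_dist returns scipy.stats distributions keyed by parameter name, which is the format sklearn's RandomizedSearchCV expects. A sketch of that presumed use, assuming FM inherits sklearn-compatible get_params/set_params from ModelBase (an assumption, not shown in this snippet):

from scipy.stats import randint as sp_randint, uniform as sp_uniform
from sklearn.model_selection import RandomizedSearchCV

# Hypothetical: FM() construction and its sklearn compatibility are assumed.
fm = FM()
search = RandomizedSearchCV(fm,
                            param_distributions=fm.get_param_dist(X),
                            n_iter=20, cv=3)
search.fit(X, y)
print(search.best_params_)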
Example #7
def train(export_path, data, version, args):
    x_train, y_train, x_test, y_test = data
    y_train = y_train * 2 - 1
    print("train date shape is {}".format(x_train.shape))

    if args.log:
        log_dir = './log'
    else:
        log_dir = None

    model = TFFMRegressor(
        order=2,
        rank=args.rank,
        optimizer=tf.train.AdamOptimizer(learning_rate=0.0001),
        n_epochs=1,
        batch_size=128,
        log_dir=log_dir,
        init_std=0.01,
        reg=0.5,
        input_type='sparse')

    base_path = 'ckpt/{}'.format(version)
    path_create(base_path)
    model_path = os.path.join(base_path, 'state.tf')
    print('model path is {}'.format(model_path))
    model.core.set_num_features(x_train.shape[1])
    model.fit(x_train, y_train, show_progress=True)
    print('train the model successfully')
    model.save_state(model_path)
    print('checkpoint save successfully')

    if args.save:
        save = Save(model, export_path)
        save.save()
    return model
Example #8
def get_model(cls):
    """Get the model object for this instance, loading it if it's not already loaded."""
    if cls.model is None:
        cls.model = TFFMRegressor(
            order=3,
            rank=7,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
            n_epochs=50,
            batch_size=-1,
            init_std=0.001,
            input_type='sparse'
        )
        cls.model.core.set_num_features(cls.get_num_features())
        cls.model.load_state(os.path.join(model_path, 'tffm_state.tf'))

    return cls.model
Example #9
def runTFFM(train_X, train_y, test_X, test_y, test_X2, params):
    model = TFFMRegressor(**params)
    print_step('Fit TFFM')
    for i in range(rounds):
        model.fit(train_X, train_y.values, n_epochs=iters)
        pred_test_y = model.predict(test_X)
        print_step('Iteration {}/{} -- RMSE: {}'.format(
            i + 1, rounds, rmse(pred_test_y, test_y)))
    print_step('TFFM Predict 2/2')
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2
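
runTFFM leans on module-level helpers and globals (print_step, rmse, rounds, iters) that the excerpt doesn't show. Minimal stand-ins so the snippet can run (hypothetical; the project's own versions may differ):

import numpy as np

rounds, iters = 5, 3  # hypothetical: outer passes and epochs per pass

def print_step(msg):
    # Stand-in for the project's logging helper.
    print(msg)

def rmse(pred, actual):
    return np.sqrt(np.mean((np.asarray(pred) - np.asarray(actual)) ** 2))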
Example #10
class WfmModel:
    def __init__(self, wfm_data, m_name, order, k, bs, lr, init, reg):
        self.data = wfm_data

        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
        #os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        #os.environ["CUDA_VISIBLE_DEVICES"] = "1"
        #session_config = tf.ConfigProto(log_device_placement=False, device_count={'GPU': 1})

        self.lr = lr
        self.num_cand = 1000
        self.m_name = m_name

        self.model = TFFMRegressor(
            order=order,
            rank=k,
            optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
            session_config=tf.ConfigProto(log_device_placement=False, device_count={'GPU': 1}),
            n_epochs=1,
            batch_size=bs,
            init_std=init,
            reg=reg,
            input_type='sparse',
            seed=42,
        )

        if m_name == 'bpr':
            self.model.core.mf = True

        if m_name == 'bpr' or m_name == 'fmp' or m_name == 'wfmp':
            self.model.core.loss_function = ut.loss_bpr

        if m_name == 'wfm' or m_name == 'wfmp':
            self.model.core.G = list(self.data.gr_train.values())
            self.model.core.gamma_init = np.array(self.data.dr.weights).astype(np.float32)
            if self.data.w_init == 'all-one' or self.data.w_init == 'all-diff':
                self.model.core.M = np.repeat(True, len(self.data.dr.weights))
            else:
                self.model.core.M = np.append([False, False], np.repeat(True, len(self.data.dr.weights) - 2))

        fit_methods = {'bpr': 'fit_bpr', 'fm': 'fit', 'fmp': 'fit_bpr', 'wfm': 'fit', 'wfmp': 'fit_bpr', 'wmf': 'fit'}

        self.fit_method = fit_methods[m_name]
        self.c = None
        if m_name == 'wmf':
            self.c = self.data.dr.c
            self.model.core.has_conf = True

        print('preparing test matrix...')
        if self.data.dataset == 'frappe':
            self.X_test = self.get_test_matrix_opr()
        else:
            self.X_test, self.rel_c = self.get_test_matrix_ub()


    def get_test_matrix_opr(self):
        items = self.data.items
        relevant = self.data.relevant
        c_cols = self.data.cols[2:]
        nc = self.num_cand
        n = relevant.apply(lambda x: len(x) * (nc + 1)).sum()
        ix = self.data.ix

        i_ix = np.zeros(n, dtype=np.int32)
        u_ix = np.zeros(n, dtype=np.int32)
        c_ix = {}
        for c in c_cols:
            c_ix[c] = np.zeros(n, dtype=np.int32)

        l = 0
        for kk in relevant.keys():
            cands = np.random.choice(np.setdiff1d(items, relevant[kk]), nc)
            for i in relevant[kk]:
                u_ix[l:l + nc + 1] = np.repeat(ix[str(kk[0] if len(c_cols) > 0 else kk) + 'UserId'], nc + 1)
                i_ix[l:l + nc] = [ix[str(ii) + 'ItemId'] for ii in cands]
                i_ix[l + nc] = ix[str(i) + 'ItemId']
                for ii, c in enumerate(c_cols):
                    c_ix[c][l:l + nc + 1] = np.repeat(ix[str(kk[ii + 1]) + c], nc + 1)
        
                l += nc + 1
        
        g = len(c_cols) + 2

        data_m = np.ones(n * g, dtype=bool)
        row_ix = np.repeat(np.arange(0, n, dtype=np.int32), g)
        col_ix = np.zeros(n*g, dtype=np.int32)

        col_ix[0::g] = u_ix
        col_ix[1::g] = i_ix

        for ii, c in enumerate(c_cols):
            col_ix[ii+2::g] = c_ix[c]

        p = self.data.p
        X = csr.csr_matrix((data_m, (row_ix, col_ix)), shape=(n, p))

        return X

    def get_test_matrix_ub(self):
        items = self.data.items
        users = self.data.users
        relevant = self.data.relevant
        c_cols = self.data.cols[2:]
        nc = self.num_cand
        n = relevant.apply(lambda x: len(x) + nc).sum()
        ix = self.data.ix

        item_attr = {}
        if self.data.item_attr is not None:
            item_attr = dict(self.data.item_attr)
            c_ix_ = [ix[str(item_attr[i]) + c_cols[0]] for i in items]
            c_ix = np.zeros(n, dtype=np.int32)
        
        i_ix_ = [ix[str(i) + 'ItemId'] for i in items]

        i_ix = np.zeros(n, dtype=np.int32)
        u_ix = np.zeros(n, dtype=np.int32)

        rel_c = []
        l = 0
        for u in users:
            r = np.size(relevant[u])
            u_ix[l:l + nc + r] = np.repeat(ix[str(u) + 'UserId'], nc + r)
            i_ix[l:l + nc] = i_ix_
            i_ix[l + nc: l + nc + r] = [ix[str(i) + 'ItemId'] for i in relevant[u]] 
            if self.data.item_attr is not None:
                c_ix[l:l + nc] = c_ix_
                c_ix[l + nc:l + nc + r] = [ix[str(item_attr[i]) + c_cols[0]] for i in relevant[u]]
            l += nc + r
            rel_c.append(nc + r)

        g = len(c_cols) + 2

        data_m = np.ones(n * g, dtype=bool)
        row_ix = np.repeat(np.arange(0, n, dtype=np.int32), g)
        col_ix = np.zeros(n*g, dtype=np.int32)

        col_ix[0::g] = u_ix
        col_ix[1::g] = i_ix
        if self.data.item_attr is not None:
            col_ix[2::g] = c_ix

        p = self.data.p
        X = csr.csr_matrix((data_m, (row_ix, col_ix)), shape=(n, p))

        return X, rel_c

    def calc_metrics_opr(self, pred, k):
        relevant = self.data.relevant
        nc = self.num_cand
        hit_counts = []
        rrs = []
        l = 0
        for kk in relevant.keys():
            for i in relevant[kk]:
                top_ix = np.argpartition(pred[l:l+nc + 1], -k)[-k:]
                hit_count = len(np.where(top_ix >= nc)[0])  
                hit_counts.append(hit_count)

                top_val = pred[l + top_ix]
                top_ix = map(lambda x: x[0], sorted(zip(top_ix, top_val), key=lambda x: x[1], reverse=True))

                rr = 0
                for j, item_ix in enumerate(top_ix):
                    if item_ix >= nc:  # if item is relevant
                        rr = 1 / (j + 1)
                        break
                rrs.append(rr)
                l += nc + 1
    
        recall = np.sum(hit_counts) / np.size(hit_counts)
        mrr = np.mean(rrs)

        return recall, mrr, recall / k

    def calc_metrics_ub(self, pred, k, rel_c):
        nc = self.num_cand
        hit_counts = []
        recalls = []
        rrs = []
        l = 0
        for c in rel_c:
            top_ix = np.argpartition(pred[l:l+c], -k)[-k:]
            hit_count = len(np.where(top_ix >= nc)[0]) 
            hit_counts.append(hit_count)
            recalls.append(hit_count / (c - nc) if c > nc else 0)

            top_val = pred[l + top_ix]
            top_ix = map(lambda x: x[0], sorted(zip(top_ix, top_val), key=lambda x: x[1], reverse=True))
    
            rr = 0
            for j, item_ix in enumerate(top_ix):
                if item_ix >= nc:  # if item is relevant
                    rr = 1 / (j + 1)
                    break
            rrs.append(rr)
            l += c
    
        prc = np.sum(hit_counts) / (k * np.size(hit_counts))
        recall = np.mean(recalls)
        mrr = np.mean(rrs)

        return recall, mrr, prc

    def eval_model(self):
        if self.data.dataset == 'frappe':
            pred = self.model.predict(self.X_test, pred_batch_size=100000)
            r5, mrr5, prc5 = self.calc_metrics_opr(pred, 5)
            r10, mrr10, prc10 = self.calc_metrics_opr(pred, 10)
            r20, mrr20, prc20 = self.calc_metrics_opr(pred, 20)
        else:
            pred = self.model.predict(self.X_test, pred_batch_size=1000000)
            r5, mrr5, prc5 = self.calc_metrics_ub(pred, 5, self.rel_c)
            r10, mrr10, prc10 = self.calc_metrics_ub(pred, 10, self.rel_c)
            r20, mrr20, prc20 = self.calc_metrics_ub(pred, 20, self.rel_c)
        
        return r5, r10, r20, mrr5, mrr10, mrr20, prc5, prc10, prc20

    def train_model(self, epochs, eval_freq, eval_file=None):
        writer = None
        if eval_file is not None:
            writer = open(eval_file, 'w')
            writer.write('Method,WeightInit,Context,Epoch,Order,K,BatchSize,LearnRate,InitStd,Reg,Recall@5,Recall@10,Recall@20,MRR@5,MRR@10,MRR@20,Precision@5,Precision@10,Precision@20,EpochTime,EvalTime,Weights,NewEval,Optimizer,MsdContext,NormalizeAlpha\n')

        def eval_epoch(ep_, epoch_time_):
            start_time = time.time()
            r5, r10, r20, mrr5, mrr10, mrr20, prc5, prc10, prc20 = self.eval_model()
            eval_time = time.time() - start_time
            if self.model.core.G is not None:
                ws = reduce(lambda x, y: str(x) + ' ' + str(y), self.model.session.run(self.model.core.alpha))
            else:
                ws = 'NA'

            writer.write('{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10},{11},{12},{13},{14},{15},{16},{17},{18},{19},{20},{21},{22},{23},{24},{25}\n'.format(
                         self.m_name, self.data.w_init, self.data.dr.context, ep_, self.model.core.order, self.model.core.rank, self.model.batch_size,
                         self.lr, self.model.core.init_std, self.model.core.reg,
                         r5, r10, r20, mrr5, mrr10, mrr20, prc5, prc10, prc20, epoch_time_, eval_time, ws,'True2','GD','Genre',self.model.core.norm_alpha))
            writer.flush()

        total_time = 0
        for ep in tqdm(range(epochs), unit='epoch'):
            start_time = time.time()
            if self.fit_method == 'fit':
                self.model.fit(self.data.X_train, self.data.y_train, c_=self.c)
            else:
                self.model.fit_bpr(self.data.X_train, self.data.X_train_neg)

            epoch_time = time.time() - start_time
            total_time += epoch_time

            if (ep + 1) % eval_freq == 0:
                eval_epoch(ep + 1, epoch_time)

        if writer is not None:
            writer.close()
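
A hedged driver for WfmModel: the wfm_data object and its fields (X_train, y_train, gr_train, dr, relevant, ix, p, and so on) are assumptions inferred from the attribute accesses above.

# Hypothetical usage -- wfm_data must expose exactly the attributes that
# __init__ and the test-matrix builders read above.
wfm = WfmModel(wfm_data, m_name='wfm', order=2, k=32, bs=4096,
               lr=0.05, init=0.01, reg=0.001)
wfm.train_model(epochs=50, eval_freq=10, eval_file='wfm_eval.csv')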
Example #11
def main():

    seed = 123  # Random seed
    data_dir = "../validation_data_train/"

    n_epochs = 200  # Number of epochs
    learning_rate = 0.001  # Learning rate of the optimizer
    batch_size = 1024  # Batch size
    init_std = 0.01  # Initial standard deviation
    input_type = 'sparse'  # Input type: 'sparse' or 'dense'
    reg = 10**4  # Regularization parameter
    rank = 100  # Rank of the factorization
    order = 5  # comboFM order

    print('GPU available:')
    print(tf.test.is_gpu_available())

    ### Training data for the validation experiment

    # Features in position 1: Drug A - Drug B
    features_tensor_1 = ("drug1_concentration__one-hot_encoding.csv",
                         "drug2_concentration__one-hot_encoding.csv",
                         "drug1__one-hot_encoding.csv",
                         "drug2__one-hot_encoding.csv",
                         "cell_lines__one-hot_encoding.csv")
    features_auxiliary_1 = ("drug1_drug2_concentration__values.csv",
                            "drug1__estate_fingerprints.csv",
                            "drug2__estate_fingerprints.csv",
                            "cell_lines__gene_expression.csv")
    X_tensor_1 = concatenate_features(data_dir, features_tensor_1)
    X_auxiliary_1 = concatenate_features(data_dir, features_auxiliary_1)
    X_1 = np.concatenate((X_tensor_1, X_auxiliary_1), axis=1)

    # Features in position 2: Drug B - Drug A
    features_tensor_2 = ("drug2_concentration__one-hot_encoding.csv",
                         "drug1_concentration__one-hot_encoding.csv",
                         "drug2__one-hot_encoding.csv",
                         "drug1__one-hot_encoding.csv",
                         "cell_lines__one-hot_encoding.csv")
    features_auxiliary_2 = ("drug2_drug1_concentration__values.csv",
                            "drug2__estate_fingerprints.csv",
                            "drug1__estate_fingerprints.csv",
                            "cell_lines__gene_expression.csv")
    X_tensor_2 = concatenate_features(data_dir, features_tensor_2)
    X_auxiliary_2 = concatenate_features(data_dir, features_auxiliary_2)
    X_2 = np.concatenate((X_tensor_2, X_auxiliary_2), axis=1)

    # Concatenate the features from both positions vertically
    X_tr = np.concatenate((X_1, X_2), axis=0)
    print('Dataset shape: {}'.format(X_tr.shape))
    print('Non-zeros rate: {:.05f}'.format(np.mean(X_tr != 0)))
    print('Number of one-hot encoding features: {}'.format(
        X_tensor_1.shape[1]))
    print('Number of auxiliary features: {}'.format(X_auxiliary_1.shape[1]))
    i_aux = X_tensor_1.shape[1]
    del X_tensor_1, X_auxiliary_1, X_tensor_2, X_auxiliary_2, X_1, X_2

    # Read responses
    y_tr = np.loadtxt("../validation_data_train/responses.csv",
                      delimiter=",",
                      skiprows=1)
    y_tr = np.concatenate((y_tr, y_tr), axis=0)

    ### Validation data

    # Validation set features
    data_dir = "../validation_data/"
    X_tensor_1 = concatenate_features(data_dir, features_tensor_1)
    X_auxiliary_1 = concatenate_features(data_dir, features_auxiliary_1)
    X_val = np.concatenate((X_tensor_1, X_auxiliary_1), axis=1)

    print('Validation dataset shape: {}'.format(X_val.shape))
    print('Non-zeros rate: {:.05f}'.format(np.mean(X_val != 0)))
    print('Number of one-hot encoding features: {}'.format(
        X_tensor_1.shape[1]))
    print('Number of auxiliary features: {}'.format(X_auxiliary_1.shape[1]))

    i_aux = X_tensor_1.shape[1]
    del X_tensor_1, X_auxiliary_1

    X_tr, X_val = standardize(X_tr, X_val, i_aux)

    if input_type == 'sparse':
        X_tr = sp.csr_matrix(X_tr)
        X_val = sp.csr_matrix(X_val)

    model = TFFMRegressor(
        order=order,
        rank=rank,
        n_epochs=n_epochs,
        optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate),
        batch_size=batch_size,
        init_std=init_std,
        reg=reg,
        input_type=input_type,
        seed=seed)

    # Train the model
    model.fit(X_tr, y_tr, show_progress=True)

    # Predict
    y_pred_val = model.predict(X_val)

    np.savetxt("results/validation_set_predictions.txt", y_pred_val)
Example #12
        # 						  )
        # 	model.fit(train_queue[0], train_queue[1], show_progress=True)
        # 	inferences = model.predict(test_queue[0])
        # 	mse = mean_squared_error(test_queue[1], inferences)
        # 	rmse = np.sqrt(mse)
        # 	logging.info('rmse: %.4f[%.4f]' % (rmse, time()-start))

        from tffm import TFFMRegressor
        import tensorflow as tf
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
        model = TFFMRegressor(
            order=dim,
            rank=args.embedding_dim,
            optimizer=tf.train.AdagradOptimizer(learning_rate=args.lr),
            n_epochs=args.train_epochs,
            # batch_size=1076946,
            batch_size=4096,
            init_std=0.001,
            reg=args.weight_decay,
            input_type='sparse',
            log_dir=os.path.join(args.save, save_name),
        )
        model.fit(train_queue[0], train_queue[1], show_progress=True)
        inferences = model.predict(test_queue[0])
        mse = mean_squared_error(test_queue[1], inferences)
        rmse = np.sqrt(mse)
        logging.info('rmse: %.4f[%.4f]' % (rmse, time() - start))

    elif args.mode == 'autoneural':
        start = time()
        if dim == 2:
            model = AutoNeural(num_users, num_items, args.embedding_dim,
Example #13
def main():
    torch.set_default_tensor_type(torch.FloatTensor)
    torch.set_num_threads(3)
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    np.random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    data_start = time.time()
    if args.dataset == 'ml-100k':
        num_users = 943
        num_items = 1682
        dim = 2
    elif args.dataset == 'ml-1m':
        num_users = 6040
        num_items = 3952
        dim = 2
    elif args.dataset == 'ml-10m':
        num_users = 71567
        num_items = 65133
        dim = 2
    elif args.dataset == 'youtube-small':
        num_ps = 600
        num_qs = 14340
        num_rs = 5
        dim = 3

    train_queue, valid_queue, test_queue = utils.get_data_queue(args)
    logging.info('prepare data finish! [%f]' % (time.time() - data_start))

    if args.mode == 'libfm':
        start = time.time()
        from tffm import TFFMRegressor
        import tensorflow as tf
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
        model = TFFMRegressor(
            order=dim,
            rank=args.embedding_dim,
            optimizer=tf.train.AdagradOptimizer(learning_rate=args.lr),
            n_epochs=args.train_epochs,
            batch_size=args.batch_size,
            init_std=0.001,
            reg=args.weight_decay,
            input_type='sparse',
            log_dir=os.path.join(save_name, 'libfm-log'))
        model.fit(train_queue[0], train_queue[1], show_progress=True)
        inferences = model.predict(test_queue[0])
        mse = mean_squared_error(test_queue[1], inferences)
        rmse = np.sqrt(mse)
        logging.info('rmse: %.4f[%.4f]' % (rmse, time.time() - start))

    else:
        start = time.time()
        if args.mode == 'ncf':
            if dim == 2:
                model = NCF(num_users, num_items, args.embedding_dim,
                            args.weight_decay).cuda()
            elif dim == 3:
                model = NCF_Triple(num_ps, num_qs, num_rs, args.embedding_dim,
                                   args.weight_decay).cuda()
        elif args.mode == 'deepwide':
            if dim == 2:
                model = DeepWide(num_users, num_items, args.embedding_dim,
                                 args.weight_decay).cuda()
            elif dim == 3:
                model = DeepWide_Triple(num_ps, num_qs, num_rs,
                                        args.embedding_dim,
                                        args.weight_decay).cuda()
        elif args.mode == 'altgrad':
            model = AltGrad(num_users, num_items, args.embedding_dim,
                            args.weight_decay).cuda()
        elif args.mode == 'convncf':
            model = ConvNCF(num_users, num_items, args.embedding_dim,
                            args.weight_decay).cuda()
        elif args.mode == 'outer':
            model = Outer(num_users, num_items, args.embedding_dim,
                          args.weight_decay).cuda()
        elif args.mode == 'conv':
            model = Conv(num_users, num_items, args.embedding_dim,
                         args.weight_decay).cuda()
        elif args.mode == 'plus':
            model = Plus(num_users, num_items, args.embedding_dim,
                         args.weight_decay).cuda()
        elif args.mode == 'max':
            model = Max(num_users, num_items, args.embedding_dim,
                        args.weight_decay).cuda()
        elif args.mode == 'min':
            model = Min(num_users, num_items, args.embedding_dim,
                        args.weight_decay).cuda()
        elif args.mode == 'cp':
            model = CP(num_ps, num_qs, num_rs, args.embedding_dim,
                       args.weight_decay).cuda()
        elif args.mode == 'tucker':
            model = TuckER(num_ps, num_qs, num_rs, args.embedding_dim,
                           args.weight_decay).cuda()
        elif args.mode == 'sif':
            if dim == 2:
                arch = utils.load_arch(num_users, num_items, args)
                print(next(arch['mlp']['p'].parameters()))
                model = Network(num_users, num_items, args.embedding_dim, arch,
                                args.weight_decay).cuda()
            elif dim == 3:
                arch = utils.load_arch_triple(num_ps, num_qs, num_rs, args)
                model = Network_Triple(num_ps, num_qs, num_rs,
                                       args.embedding_dim, arch,
                                       args.weight_decay).cuda()
        logging.info('build model finish! [%f]' % (time.time() - start))

        optimizer = torch.optim.Adagrad(model.parameters(), args.lr)
        if dim == 2:
            train(model, train_queue, test_queue, optimizer, args)
            rmse = evaluate(model, test_queue)
        elif dim == 3:
            train_triple(model, train_queue, test_queue, optimizer, args)
            rmse = evaluate_triple(model, test_queue)
        logging.info('rmse: %.4f' % rmse)
Example #14
            use_info = True,
            path = path,
            )
    learner.fit(train)
    '''

    # TEST tffm.
    # https://github.com/geffy/tffm
    from tffm import TFFMRegressor
    import tensorflow as tf
    learner = simple_fm.SimpleFMLearner(
        external_fm=TFFMRegressor(
            order=2,
            rank=12,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
            n_epochs=300,
            batch_size=128,
            init_std=0.001,
            reg=0.001,
            input_type='sparse'),
        use_info=True,
        path=path,
    )
    learner.fit(train)

    # calculate inverse count.
    order = []
    for uid, iid, rating in test:
        y_ = learner.predict(uid, iid)
        #print >> sys.stderr, '%d\t%d\t%.4f\t%d' % (uid, iid, y_, rating)
        order.append((y_, rating))
Example #15
y = np.reshape(y, (y.shape[0],))

X = sparse.csr_matrix(data_train_FM.drop(columns=['FREQUENCY','CUST_ID','ARTICLE_ID']).to_numpy())

del data_train_FM

rank = 20
l_r = 0.05
reg = 0.001
epoch = 200

model_tf = TFFMRegressor(
    order=2,
    rank=rank,
    optimizer=tf.train.AdamOptimizer(learning_rate=l_r),
    reg=reg,
    n_epochs=epoch,
    init_std=0.001,
    input_type='sparse'
)


data_reco_baselines_score = pd.read_csv(d+'/data_reco_baselines_score.csv')

data_reco_baselines_score = data_reco_baselines_score[['NB_PURCH_TEST', 'NB_ARTICLE_PURCH_TEST' ,'ARM_PRECISION', 'SVD_PURE_PRECISION','NMF_PRECISION','K100_PRECISION',\
 'VAES_PRECISION','SPEC_PRECISION','CUST_ID','ARTICLE_ID']]

protocol = pd.read_csv(d+'/test_protocol.csv')
protocol = protocol.drop_duplicates()

data_reco_baselines = pd.read_csv(d+'/data_reco_baselines.csv')
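
The excerpt builds model_tf and the sparse design matrix X but stops before training. Following the pattern used in the other sparse-input examples on this page, the next step would presumably be:

# Sketch only: y is the reshaped FREQUENCY target built above.
model_tf.fit(X, y, show_progress=True)
scores = model_tf.predict(X)  # a held-out matrix would be built the same way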
Example #16
def main(argv):

    seed = 123  # Random seed
    data_dir = "../data/"

    n_epochs_inner = 100  # Number of epochs in the inner loop
    n_epochs_outer = 200  # Number of epochs in the outer loop
    learning_rate = 0.001  # Learning rate of the optimizer
    batch_size = 1024  # Batch size
    init_std = 0.01  # Initial standard deviation
    input_type = 'sparse'  # Input type: 'sparse' or 'dense'
    order = 5  # Order of the factorization machine (comboFM)
    nfolds_outer = 10  # Number of folds in the outer loop
    nfolds_inner = 5  # Number of folds in the inner loop

    regparams = [10**2, 10**3, 10**4,
                 10**5]  # Regularization parameter: to be optimized
    ranks = [25, 50, 75, 100]  # Rank of the factorization: to be optimized

    # Experiment: 1) new_dose-response_matrix_entries, 2) new_dose-response_matrices, 3) new_drug_combinations
    experiment = argv[2]

    id_in = int(argv[1])
    print("\nJob ID: %d" % id_in)

    print('GPU available:')
    print(tf.test.is_gpu_available())

    # Features in position 1: Drug A - Drug B
    features_tensor_1 = ("drug1_concentration__one-hot_encoding.csv",
                         "drug2_concentration__one-hot_encoding.csv",
                         "drug1__one-hot_encoding.csv",
                         "drug2__one-hot_encoding.csv",
                         "cell_lines__one-hot_encoding.csv")
    features_auxiliary_1 = ("drug1_drug2_concentration__values.csv",
                            "drug1__estate_fingerprints.csv",
                            "drug2__estate_fingerprints.csv",
                            "cell_lines__gene_expression.csv")
    X_tensor_1 = concatenate_features(data_dir, features_tensor_1)
    X_auxiliary_1 = concatenate_features(data_dir, features_auxiliary_1)
    X_1 = np.concatenate((X_tensor_1, X_auxiliary_1), axis=1)

    # Features in position 2: Drug B - Drug A
    features_tensor_2 = ("drug2_concentration__one-hot_encoding.csv",
                         "drug1_concentration__one-hot_encoding.csv",
                         "drug2__one-hot_encoding.csv",
                         "drug1__one-hot_encoding.csv",
                         "cell_lines__one-hot_encoding.csv")
    features_auxiliary_2 = ("drug2_drug1_concentration__values.csv",
                            "drug2__estate_fingerprints.csv",
                            "drug1__estate_fingerprints.csv",
                            "cell_lines__gene_expression.csv")
    X_tensor_2 = concatenate_features(data_dir, features_tensor_2)
    X_auxiliary_2 = concatenate_features(data_dir, features_auxiliary_2)
    X_2 = np.concatenate((X_tensor_2, X_auxiliary_2), axis=1)

    # Concatenate the features from both positions vertically
    X = np.concatenate((X_1, X_2), axis=0)
    print('Dataset shape: {}'.format(X.shape))
    print('Non-zeros rate: {:.05f}'.format(np.mean(X != 0)))
    print('Number of one-hot encoding features: {}'.format(
        X_tensor_1.shape[1]))
    print('Number of auxiliary features: {}'.format(X_auxiliary_1.shape[1]))
    i_aux = X_tensor_1.shape[1]
    del X_tensor_1, X_auxiliary_1, X_tensor_2, X_auxiliary_2, X_1, X_2

    # Read responses
    y = np.loadtxt("../data/responses.csv", delimiter=",", skiprows=1)
    y = np.concatenate((y, y), axis=0)

    inner_folds = list(range(1, nfolds_inner + 1))
    outer_folds = list(range(1, nfolds_outer + 1))

    outer_fold = outer_folds[id_in]
    te_idx = np.loadtxt(
        '../cross-validation_folds/%s/test_idx_outer_fold-%d.txt' %
        (experiment, outer_fold)).astype(int)
    tr_idx = np.loadtxt(
        '../cross-validation_folds/%s/train_idx_outer_fold-%d.txt' %
        (experiment, outer_fold)).astype(int)

    X_tr, X_te, y_tr, y_te = X[tr_idx, :], X[te_idx, :], y[tr_idx], y[te_idx]

    print('Training set shape: {}'.format(X_tr.shape))
    print('Test set shape: {}'.format(X_te.shape))

    CV_RMSE_reg = np.zeros([len(regparams), nfolds_inner])
    CV_RPearson_reg = np.zeros([len(regparams), nfolds_inner])
    CV_RSpearman_reg = np.zeros([len(regparams), nfolds_inner])

    rank = 50  # Fix rank first to 50 while optimizing regularization

    for reg_i in range(len(regparams)):

        reg = regparams[reg_i]

        for inner_fold in inner_folds:
            print("INNER FOLD: %d" % inner_fold)
            print("Rank: %d" % rank)
            print("Regularization: %d" % reg)

            te_idx_CV = np.loadtxt(
                '../cross-validation_folds/%s/test_idx_outer_fold-%d_inner_fold-%d.txt'
                % (experiment, outer_fold, inner_fold)).astype(int)
            tr_idx_CV = np.loadtxt(
                '../cross-validation_folds/%s/train_idx_outer_fold-%d_inner_fold-%d.txt'
                % (experiment, outer_fold, inner_fold)).astype(int)
            X_tr_CV, X_te_CV, y_tr_CV, y_te_CV = X[tr_idx_CV, :], X[
                te_idx_CV, :], y[tr_idx_CV], y[te_idx_CV]
            X_tr_CV, X_te_CV = standardize(
                X_tr_CV, X_te_CV, i_aux
            )  # i_aux: length of one-hot encoding, not to be standardized

            if input_type == 'sparse':
                X_tr_CV = sp.csr_matrix(X_tr_CV)
                X_te_CV = sp.csr_matrix(X_te_CV)

            model = TFFMRegressor(
                order=order,
                rank=rank,
                n_epochs=n_epochs_inner,
                optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate),
                batch_size=batch_size,
                init_std=init_std,
                reg=reg,
                input_type=input_type,
                seed=seed)

            # Train the model
            model.fit(X_tr_CV, y_tr_CV, show_progress=True)

            # Predict
            y_pred_te_CV = model.predict(X_te_CV)

            # Evaluate performance
            RMSE = np.sqrt(mean_squared_error(y_te_CV, y_pred_te_CV))
            CV_RMSE_reg[reg_i, inner_fold - 1] = RMSE
            RPearson = np.corrcoef(y_te_CV, y_pred_te_CV)[0, 1]
            CV_RPearson_reg[reg_i, inner_fold - 1] = RPearson
            RSpearman, _ = spearmanr(y_te_CV, y_pred_te_CV)
            CV_RSpearman_reg[reg_i, inner_fold - 1] = RSpearman

            model.destroy()

            print("RMSE: %f\nR_pearson: %f\nR_spearman: %f" %
                  (RMSE, RPearson, RSpearman))

    CV_avg_reg = np.mean(CV_RPearson_reg, axis=1)
    reg_i = np.where(CV_avg_reg == np.max(CV_avg_reg))[0]
    reg = regparams[int(reg_i)]
    np.savetxt(
        'results/%s/outer_fold-%d_reg_CV_avg_RPearson.txt' %
        (experiment, outer_fold), CV_avg_reg)

    CV_RMSE_rank = np.zeros([len(ranks), nfolds_inner])
    CV_RPearson_rank = np.zeros([len(ranks), nfolds_inner])
    CV_RSpearman_rank = np.zeros([len(ranks), nfolds_inner])

    for rank_i in range(len(ranks)):
        rank = ranks[rank_i]
        for inner_fold in inner_folds:

            print("INNER FOLD: %d" % inner_fold)
            print("Rank: %d" % rank)
            print("Regularization: %d" % reg)

            te_idx_CV = np.loadtxt(
                '../cross-validation_folds/%s/test_idx_outer_fold-%d_inner_fold-%d.txt'
                % (experiment, outer_fold, inner_fold)).astype(int)
            tr_idx_CV = np.loadtxt(
                '../cross-validation_folds/%s/train_idx_outer_fold-%d_inner_fold-%d.txt'
                % (experiment, outer_fold, inner_fold)).astype(int)

            X_tr_CV, X_te_CV, y_tr_CV, y_te_CV = X[tr_idx_CV, :], X[
                te_idx_CV, :], y[tr_idx_CV], y[te_idx_CV]
            X_tr_CV, X_te_CV = standardize(X_tr_CV, X_te_CV, i_aux)

            if input_type == 'sparse':
                X_tr_CV = sp.csr_matrix(X_tr_CV)
                X_te_CV = sp.csr_matrix(X_te_CV)

            model = TFFMRegressor(
                order=order,
                rank=rank,
                n_epochs=n_epochs_inner,
                optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate),
                batch_size=batch_size,
                init_std=init_std,
                reg=reg,
                input_type=input_type,
                seed=seed)

            # Train the model
            model.fit(X_tr_CV, y_tr_CV, show_progress=True)

            # Predict
            y_pred_te_CV = model.predict(X_te_CV)

            #  Evaluate performance
            RMSE = np.sqrt(mean_squared_error(y_te_CV, y_pred_te_CV))
            CV_RMSE_rank[rank_i, inner_fold - 1] = RMSE
            RPearson = np.corrcoef(y_te_CV, y_pred_te_CV)[0, 1]
            CV_RPearson_rank[rank_i, inner_fold - 1] = RPearson
            RSpearman, _ = spearmanr(y_te_CV, y_pred_te_CV)
            CV_RSpearman_rank[rank_i, inner_fold - 1] = RSpearman

            model.destroy()

            print("RMSE: %f\nR_pearson: %f\nR_spearman: %f" %
                  (RMSE, RPearson, RSpearman))

    CV_avg_rank = np.mean(CV_RPearson_rank, axis=1)
    rank_i = np.where(CV_avg_rank == np.max(CV_avg_rank))[0]
    rank = ranks[int(rank_i)]

    np.savetxt(
        'results/%s/outer_fold-%d_rank_CV_avg_RPearson.txt' %
        (experiment, outer_fold), CV_avg_rank)

    X_tr, X_te = standardize(X_tr, X_te, i_aux)

    if input_type == 'sparse':
        X_tr = sp.csr_matrix(X_tr)
        X_te = sp.csr_matrix(X_te)

    model = TFFMRegressor(
        order=order,
        rank=rank,
        n_epochs=n_epochs_outer,
        optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate),
        batch_size=batch_size,
        init_std=init_std,
        reg=reg,
        input_type=input_type,
        seed=seed)

    # Train the model
    model.fit(X_tr, y_tr, show_progress=True)

    # Predict
    y_pred_te = model.predict(X_te)

    # Recompute the metrics for the final model (the originals here were
    # stale leftovers from the inner CV loop)
    RMSE = np.sqrt(mean_squared_error(y_te, y_pred_te))
    RPearson = np.corrcoef(y_te, y_pred_te)[0, 1]
    RSpearman, _ = spearmanr(y_te, y_pred_te)

    print("RMSE: %f\nR_pearson: %f\nR_spearman: %f" %
          (RMSE, RPearson, RSpearman))

    np.savetxt(
        "results/%s/outer-fold-%d_y_test_order-%d_rank-%d_reg-%d_%s.txt" %
        (experiment, outer_fold, order, rank, reg, experiment), y_te)
    np.savetxt(
        "results/%s/outer-fold-%d_y_pred_order-%d_rank-%d_reg-%d_%s.txt" %
        (experiment, outer_fold, order, rank, reg, experiment), y_pred_te)

    # Save model weights
    weights = model.weights
    for i in range(order):
        np.savetxt(
            'results/%s/outer-fold-%d_P_order%d_rank-%d_reg-%.1e.txt' %
            (experiment, outer_fold, i + 1, rank, reg), weights[i])
Example #17
submission = pd.DataFrame({
    "ID": test.index, 
    "item_cnt_month": Y_test
})
submission.to_csv('../data/xgb_submission_2020-06-14_03.csv', index=False)

plot_features(model, (10,14))

# tffm sandbox
from tffm import TFFMClassifier, TFFMRegressor
model = TFFMRegressor(
    order=2,
    rank=10,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
    n_epochs=100,
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    input_type='dense'
)

X_train.fillna(0.0, inplace=True)
for c in X_train.columns:
    if X_train[c].isna().any():
        print(c)
    if np.isinf(X_train[c]).any():
        print(c)

model.fit(X_train.values.astype('float32'), Y_train.values.astype('float32'), show_progress=True)

from sklearn.metrics import mean_squared_error
Example #18
merged1 = pd.merge(transformed_buys,
                   historical_buy_data,
                   left_index=True,
                   right_index=True)
merged2 = pd.merge(merged1,
                   historical_click_data,
                   left_index=True,
                   right_index=True)

# Create the MF model, you can play around with the parameters

model = TFFMRegressor(order=2,
                      rank=7,
                      optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
                      n_epochs=100,
                      batch_size=-1,
                      init_std=0.001,
                      input_type='dense')

merged2.drop(
    ['Item ID', '_Session ID', 'click history:Item ID', 'buy history:Item ID'],
    axis=1,
    inplace=True)
y = np.array(merged2['Quantity'])  # take the target before building the feature matrix
X = np.array(merged2.drop('Quantity', axis=1))  # drop the target so it doesn't leak into X
X = np.nan_to_num(X)

# Split data into train, test

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2)
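
The excerpt ends at the split; fitting and scoring would follow the same dense-input pattern as the neighboring examples (a sketch, not part of the original):

from sklearn.metrics import mean_squared_error

model.fit(X_tr, y_tr, show_progress=True)
predictions = model.predict(X_te)
print('MSE: {}'.format(mean_squared_error(y_te, predictions)))
model.destroy()  # release the TensorFlow graph, as the other examples do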
Example #19
# ** create np.array from X_train_withoutUsers **

# In[29]:

X_train_withoutUsersArray = np.array(X_train_withoutUsers)
X_test_withoutUsersArray = np.array(X_test_withoutUsers)

# ** Run model **

# In[30]:

model = TFFMRegressor(order=2,
                      rank=7,
                      optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
                      n_epochs=30,
                      batch_size=-1,
                      init_std=0.001,
                      input_type='dense')

# In[31]:

model.fit(X_train_withoutUsersArray, y_tr, show_progress=True)
predictions = model.predict(X_test_withoutUsersArray)
print('MSE: {}'.format(mean_squared_error(y_te, predictions)))

# ## Make predictions:
#
# (this is the messy part - very difficult to predict new movies using such a sparse array)

# Checking out how many unique users there are:
Example #20
def __init__(self):
    self.model = TFFMRegressor(
        optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
        n_epochs=1000,
        input_type='sparse')
Example #21
#valid_examples_pos = reader.valid_examples_pos
words = reader.words
meta_vector = reader.meta_vector
X = reader.X
X_ids = reader.X_ids
X_weights = reader.X_weights
Y = reader.Y

model = TFFMRegressor(
    num_unique_meta=len_unique_meta,
    meta_vector=meta_vector,
    num_features=vocab_size,
    order=2, 
    rank=dimensions, 
    # optimizer=tf.train.AdamOptimizer(learning_rate=lr),   # lr = 0.001
    optimizer=tf.train.AdagradOptimizer(learning_rate=lr),  # lr = 0.05
    n_epochs=iterations, 
    batch_size=batch_size,
    init_std=0.01,
    reg=0.02,
    reweight_reg=False,
    count_max=count_max,
    input_type='sparse',
    log_dir=log_path,
    valid_examples=valid_examples_words,
    words=words,
    write_embedding_every=10,
    session_config=tf.ConfigProto(log_device_placement=False), 
    verbose=2
)
model.fit(X, X_ids, X_weights, Y, show_progress=True)
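
Note that this constructor takes arguments (num_unique_meta, meta_vector, num_features, reweight_reg, count_max, valid_examples, words, write_embedding_every) that the stock tffm TFFMRegressor does not accept; this example evidently runs against a modified fork of the library rather than the upstream package.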