Example #1
0
    # Free the intermediate description feature matrices; their contents were
    # already merged into train_X / valid_X upstream (not visible here).
    del (X_description1)
    del (X_description2)
    del (X_description3)
    print(train_X.shape)
    print(valid_X.shape)
    print('[{}] addition feature completed.'.format(time.time() - start_time))

    # Factorization machine trained with FTRL-proximal updates.
    # iters=1 because the loop below drives the extra passes manually so the
    # state can be adjusted between passes.
    model = FM_FTRL(alpha=0.03, beta=0.01, L1=0.001, L2=0.1, D=sparse_merge.shape[1], alpha_fm=0.07, L2_fm=0.001,
                    init_fm=0.01,
                    D_fm=400, e_noise=0.0001, iters=1, inv_link="identity", threads=4, weight_fm=1.0)
    # Four manual epochs; after each fit the model state is rebuilt with slot 4
    # scaled by 0.8 — presumably a learning-rate-style decay of one of the FTRL
    # parameters (exact meaning of __getstate__() slot 4 depends on the
    # wordbatch FM_FTRL internals — TODO confirm against its source).
    for i in range(4):
        model.fit(train_X, train_y)
        if debug:
            preds = model.predict(X=valid_X)
            # Predictions/targets are inverted with expm1 before computing
            # RMSLE, so valid_y is presumably log1p-transformed — verify upstream.
            print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
        param = model.__getstate__()
        model.__setstate__((param[0], param[1], param[2], param[3], param[4] * 0.8, param[5], param[6], param[7],
                            param[8], param[9], param[10], param[11], param[12], param[13], param[14], param[15],
                            param[16], param[17]
                            , param[18], param[19]))
    if debug:
        # Keep the last validation predictions (used elsewhere, e.g. blending).
        resdefm = preds
    resf = model.predict(X=X_test)
    res2.extend(resf)
    # Release the large intermediates before the next pipeline stage.
    del (param)
    del (model)
    del (sparse_merge)
    del (train_X)
    del (train_y)
    del (valid_X)
    del (valid_y)
Example #2
0
class WBFmFtrlModel(object):
    """FM_FTRL click model over WordBatch feature hashing.

    Relies on module-level names defined elsewhere in this file:
    ``D`` (hash feature width), ``batchsize``, ``predictors`` (input column
    names), ``logger``, ``config``, ``Ver``, the helpers ``df2csr``,
    ``fit_batch``, ``predict_batch``, ``evaluate_batch``, and the classes
    ``DataPiper`` and ``ThreadWithReturnValue``.
    """

    # Shared text-hashing extractor: word unigrams, no lowercasing or norm,
    # binary counts hashed into D columns; frozen (no dictionary updates).
    wb = wordbatch.WordBatch(None, extractor=(WordHash, {"ngram_range": (1, 1), "analyzer": "word",
                                                         "lowercase": False, "n_features": D,
                                                         "norm": None, "binary": True}),
                             minibatch_size=batchsize // 80, procs=8, freeze=True, timeout=1800, verbose=0)

    def __init__(self, pretrain_files, train_file, test_file):
        self.pretrain_files = pretrain_files  # files used for warm-start passes
        self.train_file = train_file
        self.test_file = test_file
        self.clf = None  # created lazily by create_clf()
        self.pretrain_model_fn = "wb_fmftrl_v26_pretrain.model"

    def create_clf(self):
        """(Re)create the FM_FTRL classifier, discarding any previous one."""
        if self.clf is not None:
            del self.clf
            gc.collect()
        self.clf = FM_FTRL(alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D, alpha_fm=0.02, L2_fm=0.0, init_fm=0.01, weight_fm=1.0,
                           D_fm=16, e_noise=0.0, iters=5, inv_link="sigmoid", e_clip=1.0, threads=4, use_avx=1, verbose=0)

    def get_data(self, loader, fold=-1, chunk_size=10000000, file_size=40000000):
        """Yield hashed feature chunks from *loader*.

        Yields ``(idx, fold_num, X, labels, weights)`` where ``idx`` is the
        chunk's starting row offset, ``fold_num`` the CV fold the chunk falls
        into (-1 when fold == -1), ``X`` the hashed sparse features, and
        ``labels``/``weights`` the 'click_id' and 'weight' columns.
        """
        if fold > 0:
            # Row span per fold; chunks are assigned to folds by row offset.
            size_per_fold = int(file_size / fold)
        else:
            size_per_fold = chunk_size

        for (idx, df) in loader.get_chunk_data():
            data = df[predictors].values
            labels = df['click_id'].values
            weights = df['weight'].values
            if fold == -1:
                fold_num = -1
            else:
                fold_num = int(idx / size_per_fold)
            del df  # needed columns are copied out above; free the frame early
            gc.collect()

            str_array = df2csr(data)
            X = self.wb.transform(str_array)
            del str_array
            del data
            gc.collect()
            yield (idx, fold_num, X, labels, weights)

    def do_thread_execute(self, target, clf, X, labels=None, weights=None, do_free=True):
        """Run ``target(clf, X[, labels, weights])`` on a worker thread and
        return its result; optionally drop the local data references after."""
        if labels is not None:
            args = (clf, X, labels, weights)
        else:
            args = (clf, X)
        p = ThreadWithReturnValue(target=target, args=args)
        p.start()
        ret = p.join()
        if do_free:
            # NOTE: only removes this frame's references; the caller's own
            # bindings keep the arrays alive until it drops them too.
            del X
            if labels is not None:
                del labels
            if weights is not None:
                del weights
        gc.collect()

        return ret

    def predict(self, predict_file):
        """Score *predict_file*; return ``(click_ids, test_preds)``.

        In the prediction file the 'click_id' column carries row ids, so the
        ``labels`` slot yielded by get_data() is used as the id list here.
        """
        test_preds = []
        click_ids = []
        test_loader = DataPiper(predict_file, logger)
        for (idx, fold_num, X, labels, weights) in self.get_data(test_loader):
            click_ids += labels.tolist()
            test_preds += list(self.do_thread_execute(predict_batch, self.clf, X))

        return click_ids, test_preds

    def predict_data(self, X, labels, weights):
        """Synchronous prediction helper (labels and weights are ignored)."""
        return predict_batch(self.clf, X)

    def pretrain(self):
        """Warm-start ``self.clf`` from the pretrain files.

        The fitted parameters are cached on disk under
        ``self.pretrain_model_fn`` so later runs (and later CV folds) can
        restore them instead of repeating the pretraining passes.
        """
        self.create_clf()

        if not os.path.exists(self.pretrain_model_fn):
            print("Pretrain the model")
            for pretrain_file in self.pretrain_files:
                print("Pretrain using file:{}".format(pretrain_file))
                loader = DataPiper(pretrain_file, logger)
                for (idx, fold_num, X, labels, weights) in self.get_data(loader):
                    self.do_thread_execute(fit_batch, self.clf, X, labels, weights)

            with open(self.pretrain_model_fn, "wb") as f:
                pkl.dump(self.clf.__getstate__(), f)
        else:
            with open(self.pretrain_model_fn, "rb") as f:
                self.clf.__setstate__(pkl.load(f))

    def train_all(self):
        """Pretrain, then train on the full training file, evaluating every
        second chunk before fitting on it."""
        start_time = time.time()

        print("Pretrain the model")
        self.pretrain()  # (re)creates self.clf itself

        print("Train with file={}".format(self.train_file))
        rcount = 0
        loader = DataPiper(self.train_file, logger)
        loops = 0
        for (idx, fold_num, X, labels, weights) in self.get_data(loader):
            if loops % 2 == 0:
                # Progress check on the not-yet-fitted chunk; keep the data
                # alive (do_free=False) so it can still be fitted below.
                self.do_thread_execute(evaluate_batch, self.clf, X, labels, weights, do_free=False)
            loops += 1
            rcount += len(labels)

            print("Training", rcount, time.time() - start_time)
            self.do_thread_execute(fit_batch, self.clf, X, labels, weights)

    def train_cv(self):
        """4-fold CV: per fold, pretrain a fresh model, fit on the other
        folds, score the held-out fold, and save the out-of-fold predictions."""
        start_time = time.time()

        nfold = 4
        file_size = 40000000
        train_preds = []
        auc_cv = [0.0 for _ in range(nfold)]
        # Out-of-fold prediction buffer covering the whole training file.
        # BUG FIX: this used to be re-allocated inside the fold loop and never
        # written, so the saved CSV contained only zeros.
        all_cv_preds = np.zeros(shape=(file_size,), dtype=np.float32)
        for fold in range(nfold):
            print("Pretrain models")
            self.pretrain()  # recreates self.clf for this fold
            print("Train with file={}".format(self.train_file))
            loader = DataPiper(self.train_file, logger)
            valid_datas = []
            loops = 0
            rcount = 0
            for (idx, fold_num, X, labels, weights) in self.get_data(loader, fold=nfold, file_size=file_size):
                print("fold_num={},fold={},nfold={}".format(fold_num, fold, nfold))
                if fold_num == fold:
                    # Held-out chunk: stash it for validation after training.
                    valid_datas.append((idx, fold_num, X, labels, weights))
                    print("Add valid_datas:len={}".format(len(valid_datas)))
                    continue

                loops += 1
                rcount += len(labels)
                if loops % 2 == 0:
                    self.do_thread_execute(evaluate_batch, self.clf, X, labels, weights, do_free=False)

                print("Training", rcount, time.time() - start_time)
                self.do_thread_execute(fit_batch, self.clf, X, labels, weights)

            print("Predict for the validation data")
            print("Valid_datas:len={}".format(len(valid_datas)))
            valid_start_idx = valid_datas[0][0]
            valid_labels = [d[3] for d in valid_datas]
            valid_weights = [d[4] for d in valid_datas]
            valid_ds = [d[2] for d in valid_datas]
            if len(valid_labels) > 1:
                valid_weights = np.concatenate(valid_weights, axis=0)
                valid_labels = np.concatenate(valid_labels, axis=0)
                # BUG FIX: the chunks are row-blocks of the same feature space,
                # so they must be stacked vertically; scipy's hstack also has
                # no ``axis`` keyword, which made the old call raise.
                from scipy.sparse import vstack
                valid_ds = vstack(valid_ds)
            else:
                valid_labels = valid_labels[0]
                valid_weights = valid_weights[0]
                valid_ds = valid_ds[0]
            y_pred = self.do_thread_execute(predict_batch, self.clf, valid_ds)
            num = len(valid_labels)
            y_pred = np.reshape(y_pred, (num,))
            print("y_pred.shape={}".format(y_pred.shape))
            print("valid_labels.shape={}".format(valid_labels.shape))
            valid_labels = np.reshape(valid_labels, (num,))
            train_preds.append((valid_start_idx, num, y_pred))
            # BUG FIX: actually record this fold's out-of-fold predictions
            # (assumes ``idx`` is a global row offset into the training file —
            # TODO confirm against DataPiper.get_chunk_data()).
            all_cv_preds[valid_start_idx:valid_start_idx + num] = y_pred
            auc_cv[fold] = dist_utils._auc(valid_labels, y_pred)
            logger.info("      {:>3}    {:>8}    {} x {}".format(
                fold + 1, np.round(auc_cv[fold], 6), valid_ds.shape[0], valid_ds.shape[1]))

            # clean up
            del valid_datas
            del valid_ds
            del valid_labels
            del valid_weights
            gc.collect()

        # Save cv result data
        fname = "%s/cv_pred_%s_%s.csv" % (config.OUTPUT_DIR, "fmftrl", Ver)
        print("Save cv predictions:{}".format(fname))
        df = pd.DataFrame({"predicted": all_cv_preds})
        df.to_csv(fname, index=False, columns=["predicted"])
Example #3
0
    # Free the intermediate description feature matrices; their contents were
    # already merged into train_X / valid_X upstream (not visible here).
    #del(X_description)
    del(X_description1)
    del(X_description2)
    del(X_description3)
    print(train_X.shape)
    print(valid_X.shape)
    print('[{}] addition feature completed.'.format(time.time() - start_time))

    # Factorization machine trained with FTRL-proximal updates.
    # iters=1 because the loop below drives the extra passes manually so the
    # state can be adjusted between passes.
    model = FM_FTRL(alpha=0.03, beta=0.01, L1=0.001, L2=0.1, D=sparse_merge.shape[1], alpha_fm=0.07, L2_fm=0.001, init_fm=0.01,
                        D_fm=400, e_noise=0.0001, iters=1, inv_link="identity", threads=4,weight_fm = 1.0)
    # Four manual epochs; after each fit the model state is rebuilt with slot 4
    # scaled by 0.8 — presumably a learning-rate-style decay of one of the FTRL
    # parameters (exact meaning of __getstate__() slot 4 depends on the
    # wordbatch FM_FTRL internals — TODO confirm against its source).
    for i in range(4):
        model.fit(train_X, train_y)
        if debug:
            preds = model.predict(X=valid_X)
            # Predictions/targets are inverted with expm1 before computing
            # RMSLE, so valid_y is presumably log1p-transformed — verify upstream.
            print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
        param = model.__getstate__()
        model.__setstate__((param[0],param[1],param[2],param[3],param[4] * 0.8,param[5],param[6],param[7],
            param[8],param[9],param[10],param[11],param[12],param[13],param[14],param[15],param[16],param[17]
            , param[18],param[19]))
    if debug:
        # Keep the last validation predictions (used elsewhere, e.g. blending).
        resdefm = preds
    resf = model.predict(X=X_test)
    res2.extend(resf)
    # Release the large intermediates before the next pipeline stage.
    del(param)
    del(model)
    del(sparse_merge)
    del(train_X)
    del(train_y)
    del(valid_X)
    del(valid_y)
    gc.collect()