# Free the description feature block now that it is merged into the design matrix.
del X_description3

print(train_X.shape)
print(valid_X.shape)
print('[{}] addition feature completed.'.format(time.time() - start_time))

# FM_FTRL regressor over the sparse merged features (identity link: raw log-price target).
model = FM_FTRL(alpha=0.03, beta=0.01, L1=0.001, L2=0.1, D=sparse_merge.shape[1],
                alpha_fm=0.07, L2_fm=0.001, init_fm=0.01, D_fm=400, e_noise=0.0001,
                iters=1, inv_link="identity", threads=4, weight_fm=1.0)

# Four passes over the training data; after each pass decay the FTRL learning
# rate (state slot 4) by 0.8 so later epochs take smaller steps.
for epoch in range(4):
    model.fit(train_X, train_y)
    if debug:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
    param = model.__getstate__()
    decayed = tuple(param[j] * 0.8 if j == 4 else param[j] for j in range(20))
    model.__setstate__(decayed)

if debug:
    resdefm = preds

resf = model.predict(X=X_test)
res2.extend(resf)

# Release the large training artifacts before the next pipeline stage.
del param
del model
del sparse_merge
del train_X
del train_y
del valid_X
del valid_y
gc.collect()

resfm = np.array(res2)
class WBFmFtrlModel(object):
    """FM_FTRL classifier trained on WordBatch-hashed features.

    Workflow: optionally pretrain on a list of files (parameters cached on
    disk via pickle), then either train on the full training file
    (`train_all`) or run 4-fold cross-validation (`train_cv`).

    NOTE(review): depends on module-level names defined elsewhere in this
    file (`D`, `batchsize`, `predictors`, `df2csr`, `fit_batch`,
    `predict_batch`, `evaluate_batch`, `DataPiper`,
    `ThreadWithReturnValue`, `logger`, `config`, `Ver`, `dist_utils`,
    `pkl`, `wordbatch`, `WordHash`, `FM_FTRL`).
    """

    # Shared feature hasher: word 1-grams hashed into D binary features.
    wb = wordbatch.WordBatch(None,
                             extractor=(WordHash, {"ngram_range": (1, 1), "analyzer": "word",
                                                   "lowercase": False, "n_features": D,
                                                   "norm": None, "binary": True}),
                             minibatch_size=batchsize // 80, procs=8, freeze=True,
                             timeout=1800, verbose=0)

    def __init__(self, pretrain_files, train_file, test_file):
        """Record input file paths; the classifier is built lazily by `create_clf`."""
        self.pretrain_files = pretrain_files
        self.train_file = train_file
        self.test_file = test_file
        self.clf = None
        self.pretrain_model_fn = "wb_fmftrl_v26_pretrain.model"

    def create_clf(self):
        """(Re)create a fresh FM_FTRL model, freeing any previous instance."""
        if self.clf is not None:
            del self.clf
            gc.collect()
        self.clf = FM_FTRL(alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D,
                           alpha_fm=0.02, L2_fm=0.0, init_fm=0.01, weight_fm=1.0,
                           D_fm=16, e_noise=0.0, iters=5, inv_link="sigmoid",
                           e_clip=1.0, threads=4, use_avx=1, verbose=0)

    def get_data(self, loader, fold=-1, chunk_size=10000000, file_size=40000000):
        """Yield (row_offset, fold_num, X, labels, weights) per loader chunk.

        `fold_num` assigns each chunk to a CV fold by row offset when
        `fold > 0`; it is -1 when no folding is requested.
        NOTE(review): assumes `loader.get_chunk_data()` yields
        (starting_row_index, DataFrame) — confirm against DataPiper.
        """
        if fold > 0:
            size_per_fold = int(file_size / fold)
        else:
            size_per_fold = chunk_size
        for (idx, df) in loader.get_chunk_data():
            data = df[predictors].values
            labels = df['click_id'].values
            weights = df['weight'].values
            if fold == -1:
                fold_num = -1
            else:
                fold_num = int(idx / size_per_fold)
            # Free the DataFrame before the (memory-hungry) hashing step.
            del df
            gc.collect()
            str_array = df2csr(data)
            X = self.wb.transform(str_array)
            del str_array
            del data
            gc.collect()
            yield (idx, fold_num, X, labels, weights)

    def do_thread_execute(self, target, clf, X, labels=None, weights=None, do_free=True):
        """Run `target(clf, X[, labels, weights])` on a worker thread and return its result.

        When `do_free` is true the batch arrays are deleted afterwards to
        keep peak memory down.
        """
        if labels is not None:
            args = (clf, X, labels, weights)
        else:
            args = (clf, X)
        p = ThreadWithReturnValue(target=target, args=args)
        p.start()
        ret = p.join()
        if do_free:
            del X
            if labels is not None:
                del labels
            if weights is not None:
                del weights
            gc.collect()
        return ret

    def predict(self, predict_file):
        """Predict over `predict_file`; return (click_ids, predictions).

        NOTE(review): the file's 'click_id' column is used as the id list
        here (it carries labels during training) — confirm the test file
        schema.
        """
        test_preds = []
        click_ids = []
        test_loader = DataPiper(predict_file, logger)
        for (idx, fold_num, X, labels, weights) in self.get_data(test_loader):
            click_ids += labels.tolist()
            test_preds += list(self.do_thread_execute(predict_batch, self.clf, X))
        return click_ids, test_preds

    def predict_data(self, X, labels, weights):
        """Synchronous single-batch prediction (labels/weights unused)."""
        return predict_batch(self.clf, X)

    def pretrain(self):
        """Pretrain the classifier, caching its parameters on disk.

        First run: fit on every pretrain file and pickle
        `clf.__getstate__()`. Later runs: load and restore the pickle.
        """
        self.create_clf()
        if not os.path.exists(self.pretrain_model_fn):
            print("Pretrain the model")
            for pretrain_file in self.pretrain_files:
                print("Pretrain using file:{}".format(pretrain_file))
                loader = DataPiper(pretrain_file, logger)
                for (idx, fold_num, X, labels, weights) in self.get_data(loader):
                    self.do_thread_execute(fit_batch, self.clf, X, labels, weights)
            with open(self.pretrain_model_fn, "wb") as f:
                params = self.clf.__getstate__()
                pkl.dump(params, f)
        else:
            with open(self.pretrain_model_fn, "rb") as f:
                params = pkl.load(f)
                self.clf.__setstate__(params)

    def train_all(self):
        """Pretrain, then fit on the full training file chunk by chunk.

        Every other chunk is first evaluated (before fitting) as a rough
        progress metric.
        """
        start_time = time.time()
        self.create_clf()
        print("Pretrain the model")
        self.pretrain()
        print("Train with file={}".format(self.train_file))
        rcount = 0
        loader = DataPiper(self.train_file, logger)
        loops = 0
        for (idx, fold_num, X, labels, weights) in self.get_data(loader):
            # Evaluate before fitting on every second chunk; keep the batch
            # alive (do_free=False) because fit_batch still needs it.
            if loops % 2 == 0:
                self.do_thread_execute(evaluate_batch, self.clf, X, labels, weights,
                                       do_free=False)
            loops += 1
            rcount += len(labels)
            print("Training", rcount, time.time() - start_time)
            self.do_thread_execute(fit_batch, self.clf, X, labels, weights)

    def train_cv(self):
        """4-fold CV: per fold, pretrain + fit on out-of-fold chunks, then
        predict the held-out fold, record its AUC, and save all
        out-of-fold predictions to CSV.

        Fixes vs. previous revision: `all_cv_preds` is now allocated once
        and actually filled with each fold's predictions (it used to be
        re-created per fold and written out as all zeros), and held-out
        chunks are stacked with `scipy.sparse.vstack` (`hstack` has no
        `axis` argument and row-batches must be stacked vertically).
        """
        from scipy.sparse import vstack

        start_time = time.time()
        nfold = 4
        train_preds = []
        auc_cv = [0.0 for _ in range(nfold)]
        file_size = 40000000
        # Out-of-fold prediction buffer, indexed by row offset in the file.
        all_cv_preds = np.zeros(shape=(file_size,), dtype=np.float32)
        for fold in range(nfold):
            self.create_clf()
            print("Pretrain models")
            self.pretrain()
            print("Train with file={}".format(self.train_file))
            loader = DataPiper(self.train_file, logger)
            valid_datas = []
            loops = 0
            rcount = 0
            for (idx, fold_num, X, labels, weights) in self.get_data(loader, fold=nfold,
                                                                     file_size=file_size):
                print("fold_num={},fold={},nfold={}".format(fold_num, fold, nfold))
                if fold_num == fold:
                    # Held-out fold: stash for prediction after training.
                    valid_datas.append((idx, fold_num, X, labels, weights))
                    print("Add valid_datas:len={}".format(len(valid_datas)))
                    continue
                loops += 1
                rcount += len(labels)
                if loops % 2 == 0:
                    self.do_thread_execute(evaluate_batch, self.clf, X, labels, weights,
                                           do_free=False)
                print("Training", rcount, time.time() - start_time)
                self.do_thread_execute(fit_batch, self.clf, X, labels, weights)
            print("Predict for the validation data")
            print("Valid_datas:len={}".format(len(valid_datas)))
            valid_start_idx = valid_datas[0][0]
            valid_labels = []
            valid_weights = []
            valid_ds = []
            for d in valid_datas:
                valid_labels.append(d[3])
                valid_weights.append(d[4])
                valid_ds.append(d[2])
            num = len(valid_labels)
            if num > 1:
                valid_weights = np.concatenate(valid_weights, axis=0)
                valid_labels = np.concatenate(valid_labels, axis=0)
                # Stack row-batches of the held-out fold vertically.
                valid_ds = vstack(valid_ds)
            else:
                valid_labels = valid_labels[0]
                valid_weights = valid_weights[0]
                valid_ds = valid_ds[0]
            y_pred = self.do_thread_execute(predict_batch, self.clf, valid_ds)
            num = len(valid_labels)
            y_pred = np.reshape(y_pred, (num,))
            print("y_pred.shape={}".format(y_pred.shape))
            print("valid_labels.shape={}".format(valid_labels.shape))
            valid_labels = np.reshape(valid_labels, (num,))
            train_preds.append((valid_start_idx, num, y_pred))
            # Write this fold's predictions into the global out-of-fold buffer.
            all_cv_preds[valid_start_idx:valid_start_idx + num] = y_pred
            auc_cv[fold] = dist_utils._auc(valid_labels, y_pred)
            logger.info(" {:>3} {:>8} {} x {}".format(
                fold + 1, np.round(auc_cv[fold], 6), valid_ds.shape[0], valid_ds.shape[1]))
            # Clean up before the next fold.
            del valid_datas
            del valid_ds
            del valid_labels
            del valid_weights
            gc.collect()
        # Save cv result data
        fname = "%s/cv_pred_%s_%s.csv" % (config.OUTPUT_DIR, "fmftrl", Ver)
        print("Save cv predictions:{}".format(fname))
        df = pd.DataFrame({"predicted": all_cv_preds})
        df.to_csv(fname, index=False, columns=["predicted"])
# Drop the description feature blocks now that they are merged in.
del X_description2
del X_description3

print(train_X.shape)
print(valid_X.shape)
print('[{}] addition feature completed.'.format(time.time() - start_time))

# FM_FTRL regressor over the sparse merged features (identity link).
model = FM_FTRL(alpha=0.03, beta=0.01, L1=0.001, L2=0.1, D=sparse_merge.shape[1],
                alpha_fm=0.07, L2_fm=0.001, init_fm=0.01, D_fm=400, e_noise=0.0001,
                iters=1, inv_link="identity", threads=4, weight_fm=1.0)

# Four training passes; after each one, decay the FTRL learning rate
# (state slot 4) by a factor of 0.8.
for epoch in range(4):
    model.fit(train_X, train_y)
    if debug:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
    param = model.__getstate__()
    adjusted = [param[k] for k in range(20)]
    adjusted[4] = param[4] * 0.8
    model.__setstate__(tuple(adjusted))

if debug:
    resdefm = preds

resf = model.predict(X=X_test)
res2.extend(resf)

# Release large objects before the next stage.
del param
del model
del sparse_merge
del train_X
del train_y
del valid_X
del valid_y
gc.collect()