class vanila_FM_FTRL_Regressor:
    def __init__(self, param_dict, D):
        alpha = param_dict['alpha']
        beta = param_dict['beta']
        L1 = param_dict['L1']
        L2 = param_dict['L2']
        alpha_fm = param_dict['alpha_fm']
        init_fm = param_dict['init_fm']
        D_fm = param_dict['D_fm']
        e_noise = param_dict['e_noise']
        iters = param_dict['iters']
        self.model = FM_FTRL(alpha=alpha, beta=beta, L1=L1, L2=L2, D=D,
                             alpha_fm=alpha_fm, L2_fm=0.0, init_fm=init_fm,
                             D_fm=D_fm, e_noise=e_noise, iters=iters,
                             inv_link="identity", threads=THREAD)

    def fit(self, X_train, y_train):
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        return self.model.predict(X_test)
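# Hedged usage sketch for the wrapper above; the param_dict values, the THREAD
# constant and the toy data are illustrative assumptions, not from the source.
import multiprocessing
import numpy as np
from scipy.sparse import random as sparse_random
from wordbatch.models import FM_FTRL  # import path in recent wordbatch releases

THREAD = multiprocessing.cpu_count()
param_dict = {'alpha': 0.01, 'beta': 0.01, 'L1': 1e-5, 'L2': 0.1,
              'alpha_fm': 0.01, 'init_fm': 0.01, 'D_fm': 20,
              'e_noise': 1e-4, 'iters': 3}
X = sparse_random(1000, 50, density=0.1, format='csr')  # toy sparse features
y = np.random.rand(1000)                                # toy regression target
reg = vanila_FM_FTRL_Regressor(param_dict, D=X.shape[1])
reg.fit(X, y)
preds = reg.predict(X)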
def train(self, tag):
    print("FM_FTRL training")
    train_data, train_labels, dev_data, dev_labels = self.prepare_train_data(
        tag, 'FM_FTRL')
    train_data = preprocessing.scale(train_data, with_mean=False)
    dev_data = preprocessing.scale(dev_data, with_mean=False)
    self.clf = FM_FTRL(
        alpha=0.001,     # FTRL hyperparameter alpha for w0 and w
        beta=0.01,       # FTRL hyperparameter beta for w0 and w
        L1=0.00001,      # L1 regularization for w0 and w
        L2=0.1,          # L2 regularization for w0 and w
        D=train_data.shape[1],
        alpha_fm=0.001,  # FTRL hyperparameter alpha for the factor weights v
        L2_fm=0.0,       # L2 regularization for the factor weights v
        init_fm=0.01,
        D_fm=200,
        e_noise=0.0001,
        # iters=5,
        inv_link="identity",
        threads=7,
    )
    self.clf.fit(train_data, train_labels)
    y_train = self.clf.predict(train_data)
    y_val = self.clf.predict(dev_data)
    print('train_logloss: ' + str(log_loss(train_labels, y_train)))
    print("val_logloss: " + str(log_loss(dev_labels, y_val)))
    print("train_auc: " + str(roc_auc_score(train_labels, y_train)))
    print("val_auc: " + str(roc_auc_score(dev_labels, y_val)))
def test(self, name):
    print("FM_FTRL testing...")
    train_data, train_labels, test_data = self.prepare_test_data(
        name, 'FM_FTRL')
    self.clf = FM_FTRL(
        alpha=0.01,      # FTRL hyperparameter alpha for w0 and w
        beta=0.01,       # FTRL hyperparameter beta for w0 and w
        L1=0,            # L1 regularization for w0 and w
        L2=0,            # L2 regularization for w0 and w
        D=train_data.shape[1],
        alpha_fm=0.005,  # FTRL hyperparameter alpha for the factor weights v
        L2_fm=0.01,      # L2 regularization for the factor weights v
        init_fm=0.01,
        D_fm=2,
        e_noise=0.0001,
        iters=3,
        inv_link="sigmoid",
        threads=7,
    )
    self.clf.fit(train_data, train_labels)
    submit = self.clf.predict(test_data)
    with open(
            config.output_prefix_path + 'FM_FTRL_' + name + '-summit.txt',
            'w') as fr:
        for sub in submit:
            fr.write(str(sub) + '\n')
def trainFMFTRL():
    # Load Data
    dftrain, X_train, y_train, moddict = prepSparseTrain({}, trn_file, trnidx)
    dfvalid, X_valid = prepSparseTest(moddict, trn_file, validx)

    # Train the model
    modelfm = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                      D=X_train.shape[1], alpha_fm=0.01, L2_fm=0.0,
                      init_fm=0.01, D_fm=200, e_noise=0.0001, iters=1,
                      inv_link="identity", threads=threads)  # iters=15
    baseline = 1.
    threshold = .0002
    for i in range(15):
        modelfm.fit(X_train, dftrain.target.values, verbose=1)
        predsfm = modelfm.predict(X=X_valid)
        score_ = rmsle(np.expm1(dfvalid.target.values), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ + threshold < baseline:
            baseline = score_
        else:
            break
    # 0.42919 with zeros in val
    # 0.42160 removing zeros in val
    # X_train.shape = (1333501, 1902850)
    # ('FM_FTRL dev RMSLE:', 0.42850571762280409)

    # Reduce the number of columns
    keep_cols = ['train_id', 'name', 'item_condition_id', 'category_name',
                 'brand_name', 'price', 'shipping', 'item_description',
                 'target', 'general_cat', 'subcat_1', 'subcat_2']
    dftrain, dfvalid = dftrain[keep_cols], dfvalid[keep_cols]
    return dftrain, dfvalid, y_train, moddict, modelfm, predsfm
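# Several snippets here score with rmsle() on expm1-transformed values without
# defining it; a minimal sketch of the assumed helper:
import numpy as np

def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error on raw (un-logged) values."""
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))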
def fit(self, X_train=None, X_val=None, y_train=None, y_val=None):
    if X_val is not None:
        raise NotImplementedError('No validation is possible in fit for now.')
    models = []
    for class_ix, class_name in enumerate(self.config.LIST_CLASSES):
        y_train_i = y_train[:, class_ix]
        model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=30.0,
                        D=X_train.shape[1], alpha_fm=0.1, L2_fm=0.5,
                        init_fm=0.01, D_fm=200, e_noise=0.0,
                        iters=self.config.ITERATIONS, inv_link="identity",
                        threads=30)
        model.fit(X_train, y_train_i)
        models.append(model)
    return models
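# The fit() above trains one FM_FTRL per class (one-vs-rest). A hedged sketch
# of consuming the returned list; predict_all and the column stacking are
# assumptions, not part of the original class.
import numpy as np

def predict_all(models, X_test):
    # One column of raw scores per class, in LIST_CLASSES order.
    return np.column_stack([m.predict(X_test) for m in models])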
def __init__(self, pickle_model="", datadir=None, batcher=None):
    self.wb = WordBatch(normalize_text,
                        tokenizer=Tokenizer(stemmer=stemmer),
                        extractor=WordHash(decode_error='ignore',
                                           n_features=2**25,
                                           ngram_range=(1, 2), norm='l2'),
                        batcher=batcher)
    self.clf = FM_FTRL(D=2**25, D_fm=4, iters=1, inv_link="identity",
                       threads=multiprocessing.cpu_count() // 2)
    if datadir is None:
        (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
    else:
        self.train(datadir, pickle_model)
def make_fmftrl_predictions(X_train, X_test, y):
    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                    D=X_train.shape[1], alpha_fm=0.01, L2_fm=0.0,
                    init_fm=0.01, D_fm=200, e_noise=0.0001, iters=17,
                    inv_link='identity', threads=4)
    model.fit(X_train, y, verbose=1)
    y_pred = model.predict(X_test)
    return y_pred
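# Hedged usage sketch for make_fmftrl_predictions with toy data; the shapes
# and the split point are illustrative assumptions.
import numpy as np
from scipy.sparse import random as sparse_random

X_all = sparse_random(2000, 100, density=0.05, format='csr')
y_all = np.random.rand(2000)
preds = make_fmftrl_predictions(X_all[:1500], X_all[1500:], y_all[:1500])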
def __init__(self, pickle_model="", datadir=None):
    self.wb = wordbatch.WordBatch(normalize_text,
                                  stemmer=stemmer,
                                  extractor=(WordHash, {
                                      "decode_error": 'ignore',
                                      "n_features": 2**25,
                                      "non_negative": False,
                                      "ngram_range": (1, 2),
                                      "norm": 'l2'
                                  }))
    self.clf = FM_FTRL(D=2**25, D_fm=4, iters=1, inv_link="identity",
                       threads=multiprocessing.cpu_count() // 2)
    if datadir is None:
        (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
    else:
        self.train(datadir, pickle_model)
def _build(self):
    D_fm = self.config['D_fm']
    iters = self.config['iters']
    e_clip = self.config['e_clip']
    alpha_fm = self.config['alpha_fm']
    weight_fm = self.config['weight_fm']
    threads = 8
    # D is the (module-level) hashing-space dimensionality.
    clf = FM_FTRL(alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D,
                  alpha_fm=alpha_fm, L2_fm=0.0, init_fm=0.01,
                  weight_fm=weight_fm, D_fm=D_fm, e_noise=0.0, iters=iters,
                  inv_link="sigmoid", e_clip=e_clip, threads=threads,
                  use_avx=1, verbose=0)
    self.model = clf
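# Illustrative config for _build(); the keys are exactly what the method reads,
# the values are assumptions.
config = {
    'D_fm': 16,        # FM latent dimension
    'iters': 3,        # passes over the data
    'e_clip': 1.0,     # error clipping
    'alpha_fm': 0.02,  # learning rate of the FM part
    'weight_fm': 1.0,  # weight of the FM part relative to the linear part
}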
def runFM(train_X, train_y, test_X, test_y, test_X2, label, dev_index, val_index):
    class_weights = {
        'toxic': 1.0,
        'severe_toxic': 0.2,
        'obscene': 1.0,
        'threat': 0.1,
        'insult': 0.8,
        'identity_hate': 0.2
    }
    model = FM_FTRL(alpha=0.02, beta=0.01, L1=0.00001, L2=30.0,
                    D=train_X.shape[1], alpha_fm=0.1, L2_fm=0.5, init_fm=0.01,
                    weight_fm=50.0, D_fm=200, e_noise=0.0, iters=3,
                    inv_link="identity", e_clip=1.0, threads=4, use_avx=1,
                    verbose=1)
    train_weight = np.array(
        [1.0 if x == 1 else class_weights[label] for x in train_y])
    model.fit(train_X, train_y, train_weight, reset=False)
    pred_test_y = sigmoid(model.predict(test_X))
    pred_test_y2 = sigmoid(model.predict(test_X2))
    return pred_test_y, pred_test_y2
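# runFM trains with inv_link="identity" and squashes the raw scores afterwards;
# sigmoid() is used but not defined in the snippet. A minimal sketch:
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))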
def runFM(train_X, train_y, test_X, test_y, test_X2, params):
    params['D'] = train_X.shape[1]
    rounds = params.pop('rounds')
    model = FM_FTRL(**params)
    print_step('Fit FM')
    for i in range(rounds):
        model.fit(train_X, train_y, reset=False)
        pred_test_y = model.predict(test_X)
        print_step('Iteration {}/{} -- RMSE: {}'.format(
            i + 1, rounds, rmse(pred_test_y, test_y)))
    print_step('FM Predict 2/2')
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2
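# Illustrative params dict for the generic runFM wrapper above; 'rounds' is
# popped off before the rest is forwarded to FM_FTRL. The values are
# assumptions.
fm_params = {
    'rounds': 5,
    'alpha': 0.01, 'beta': 0.01, 'L1': 0.00001, 'L2': 0.1,
    'alpha_fm': 0.01, 'L2_fm': 0.0, 'init_fm': 0.01, 'D_fm': 200,
    'e_noise': 0.0001, 'iters': 1, 'inv_link': 'identity', 'threads': 4,
}
# pred_val, pred_test = runFM(train_X, train_y, val_X, val_y, test_X, fm_params)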
"binary": True }), minibatch_size=batchsize // 80, procs=8, freeze=True, timeout=1800, verbose=0) clf = FM_FTRL(alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D, alpha_fm=0.02, L2_fm=0.0, init_fm=0.01, weight_fm=1.0, D_fm=8, e_noise=0.0, iters=2, inv_link="sigmoid", e_clip=1.0, threads=4, use_avx=1, verbose=0) dtypes = { 'ip': 'uint32', 'app': 'uint16', 'device': 'uint16', 'os': 'uint16', 'channel': 'uint16',
class WBFmFtrlModel(object):
    wb = wordbatch.WordBatch(None,
                             extractor=(WordHash, {"ngram_range": (1, 1),
                                                   "analyzer": "word",
                                                   "lowercase": False,
                                                   "n_features": D,
                                                   "norm": None,
                                                   "binary": True}),
                             minibatch_size=batchsize // 80, procs=8,
                             freeze=True, timeout=1800, verbose=0)
    # clf = FM_FTRL(alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D, alpha_fm=0.02, L2_fm=0.0, init_fm=0.01, weight_fm=1.0,
    #               D_fm=8, e_noise=0.0, iters=3, inv_link="sigmoid", e_clip=1.0, threads=4, use_avx=1, verbose=0)

    def __init__(self, pretrain_files, train_file, test_file):
        self.pretrain_files = pretrain_files
        self.train_file = train_file
        self.test_file = test_file
        self.clf = None
        self.pretrain_model_fn = "wb_fmftrl_v26_pretrain.model"

    def create_clf(self):
        if self.clf is not None:
            del self.clf
            gc.collect()
        self.clf = FM_FTRL(alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D,
                           alpha_fm=0.02, L2_fm=0.0, init_fm=0.01,
                           weight_fm=1.0, D_fm=16, e_noise=0.0, iters=5,
                           inv_link="sigmoid", e_clip=1.0, threads=4,
                           use_avx=1, verbose=0)

    def get_data(self, loader, fold=-1, chunk_size=10000000, file_size=40000000):
        if fold > 0:
            size_per_fold = int(file_size / fold)
        else:
            size_per_fold = chunk_size
        for (idx, df) in loader.get_chunk_data():
            data = df[predictors].values
            labels = df['click_id'].values
            weights = df['weight'].values
            if fold == -1:
                fold_num = -1
            else:
                fold_num = int(idx / size_per_fold)
            del df
            gc.collect()
            str_array = df2csr(data)
            X = self.wb.transform(str_array)
            del str_array
            del data
            gc.collect()
            yield (idx, fold_num, X, labels, weights)

    def do_thread_execute(self, target, clf, X, labels=None, weights=None,
                          do_free=True):
        # str_array = df2csr(data)
        # gc.collect()
        # X = self.wb.transform(str_array)
        if labels is not None:
            args = (clf, X, labels, weights)
        else:
            args = (clf, X)
        p = ThreadWithReturnValue(target=target, args=args)
        p.start()
        ret = p.join()
        if do_free:
            del X
            if labels is not None:
                del labels
            if weights is not None:
                del weights
            gc.collect()
        return ret

    def predict(self, predict_file):
        test_preds = []
        click_ids = []
        test_loader = DataPiper(predict_file, logger)
        for (idx, fold_num, X, labels, weights) in self.get_data(test_loader):
            click_ids += labels.tolist()
            test_preds += list(self.do_thread_execute(predict_batch, self.clf, X))
        return click_ids, test_preds

    def predict_data(self, X, labels, weights):
        return predict_batch(self.clf, X)

    def pretrain(self):
        p = None
        X = None
        rcount = 0
        start_time = time.time()
        self.create_clf()
        if not os.path.exists(self.pretrain_model_fn):
            print("Pretrain the model")
            for pretrain_file in self.pretrain_files:
                print("Pretrain using file:{}".format(pretrain_file))
                loader = DataPiper(pretrain_file, logger)
                for (idx, fold_num, X, labels, weights) in self.get_data(loader):
                    self.do_thread_execute(fit_batch, self.clf, X, labels, weights)
            with open(self.pretrain_model_fn, "wb") as f:
                params = self.clf.__getstate__()
                # self.create_clf()
                pkl.dump(params, f)
            # self.clf.pickle_model(self.pretrain_model_fn)
        else:
            with open(self.pretrain_model_fn, "rb") as f:
                params = pkl.load(f)
                self.clf.__setstate__(params)
            # self.clf.unpickle_model(self.pretrain_model_fn)

    def train_all(self):
        p = None
        X = None
        rcount = 0
        start_time = time.time()
        self.create_clf()
        print("Pretrain the model")
        self.pretrain()
        """
        for pretrain_file in self.pretrain_files:
            print("Pretrain using file:{}".format(pretrain_file))
            loader = DataPiper(pretrain_file, logger)
            for (idx, fold_num, X, labels, weights) in self.get_data(loader):
                self.do_thread_execute(fit_batch, self.clf, X, labels, weights)
        """
        print("Train with file={}".format(self.train_file))
        rcount = 0
        loader = DataPiper(self.train_file, logger)
        loops = 0
        for (idx, fold_num, X, labels, weights) in self.get_data(loader):
            if loops % 2 == 0:
                self.do_thread_execute(evaluate_batch, self.clf, X, labels,
                                       weights, do_free=False)
            loops += 1
            rcount += len(labels)
            print("Training", rcount, time.time() - start_time)
            self.do_thread_execute(fit_batch, self.clf, X, labels, weights)

    def train_cv(self):
        start_time = time.time()
        nfold = 4
        train_preds = []
        auc_cv = [0.0 for _ in range(nfold)]
        for fold in range(nfold):
            self.create_clf()
            print("Pretrain models")
            self.pretrain()
            """
            for pretrain_file in self.pretrain_files:
                print("Pretrain using file:{}".format(pretrain_file))
                loader = DataPiper(pretrain_file, logger)
                for (idx, fold_num, X, labels, weights) in self.get_data(loader):
                    self.do_thread_execute(fit_batch, self.clf, X, labels, weights)
            """
            print("Train with file={}".format(self.train_file))
            file_size = 40000000
            all_cv_preds = np.zeros(shape=(file_size,), dtype=np.float32)
            loader = DataPiper(self.train_file, logger)
            valid_datas = []
            loops = 0
            rcount = 0
            for (idx, fold_num, X, labels, weights) in self.get_data(
                    loader, fold=nfold, file_size=file_size):
                print("fold_num={},fold={},nfold={}".format(fold_num, fold, nfold))
                if fold_num == fold:
                    valid_datas.append((idx, fold_num, X, labels, weights))
                    print("Add valid_datas:len={}".format(len(valid_datas)))
                    continue
                loops += 1
                rcount += len(labels)
                if loops % 2 == 0:
                    self.do_thread_execute(evaluate_batch, self.clf, X, labels,
                                           weights, do_free=False)
                print("Training", rcount, time.time() - start_time)
                self.do_thread_execute(fit_batch, self.clf, X, labels, weights)
            print("Predict for the validation data")
            print("Valid_datas:len={}".format(len(valid_datas)))
            valid_start_idx = valid_datas[0][0]
            valid_labels = []
            valid_weights = []
            valid_ds = []
            for d in valid_datas:
                valid_labels.append(d[3])
                valid_weights.append(d[4])
                valid_ds.append(d[2])
                # print("Valid_ds:d.len={},valid_ds.len={}".format(len(d[2]), len(valid_ds)))
            num = len(valid_labels)
            if num > 1:
                valid_weights = np.concatenate(valid_weights, axis=0)
                valid_labels = np.concatenate(valid_labels, axis=0)
                from scipy.sparse import vstack
                # Stack the validation mini-batches row-wise; the original
                # called hstack(valid_ds, axis=0), which is not a valid
                # scipy.sparse call.
                # valid_ds = np.concatenate(valid_ds, axis=0)
                valid_ds = vstack(valid_ds)
            else:
                valid_labels = valid_labels[0]
                valid_weights = valid_weights[0]
                valid_ds = valid_ds[0]
            y_pred = self.do_thread_execute(predict_batch, self.clf, valid_ds)
            num = len(valid_labels)
            y_pred = np.reshape(y_pred, (num,))
            print("y_pred.shape={}".format(y_pred.shape))
            print("valid_labels.shape={}".format(valid_labels.shape))
            valid_labels = np.reshape(valid_labels, (num,))
            train_preds.append((valid_start_idx, num, y_pred))
            auc_cv[fold] = dist_utils._auc(valid_labels, y_pred)
            logger.info(" {:>3} {:>8} {} x {}".format(
                fold + 1, np.round(auc_cv[fold], 6), valid_ds.shape[0],
                valid_ds.shape[1]))
            # clean up
            del valid_datas
            del valid_ds
            del valid_labels
            del valid_weights
            gc.collect()
        # Assumed assembly step: write each fold's out-of-fold predictions into
        # the full-length vector before saving (the original snippet saved
        # all_cv_preds without filling it from train_preds).
        for (start_idx, num_, preds_) in train_preds:
            all_cv_preds[start_idx:start_idx + num_] = preds_
        # Save cv result data
        fname = "%s/cv_pred_%s_%s.csv" % (config.OUTPUT_DIR, "fmftrl", Ver)
        print("Save cv predictions:{}".format(fname))
        df = pd.DataFrame({"predicted": all_cv_preds})
        df.to_csv(fname, index=False, columns=["predicted"])
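# fit_batch, evaluate_batch and predict_batch are used as thread targets above
# but never defined in this snippet. A minimal sketch of the assumed helpers,
# with signatures inferred from the do_thread_execute call sites; reset=False
# keeps the FTRL state across mini-batches:
from sklearn.metrics import roc_auc_score

def fit_batch(clf, X, labels, weights):
    clf.fit(X, labels, weights, reset=False)

def evaluate_batch(clf, X, labels, weights):
    print("batch AUC:", roc_auc_score(labels, clf.predict(X)))

def predict_batch(clf, X):
    return clf.predict(X)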
def trainFMFTRL(moddict):
    merge = pd.read_csv(trn_file, sep='\t', encoding='utf-8')
    mergetst = pd.read_csv(tst_file, sep='\t', encoding='utf-8')
    # test = pd.read_csv(tst_file, sep='\t', encoding='utf-8')
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', merge.shape)
    dftt = merge[(merge.price < 1.0)]
    merge = merge.drop(merge[(merge.price < 1.0)].index)
    del dftt['price']
    nrow_train = merge.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(merge["price"])
    merge = pd.concat([merge, dftt])
    merge['target'] = np.log1p(merge["price"])

    # EXTRACT DEVELOPMENT TEST
    trnidx, validx = train_test_split(range(merge[:nrow_train].shape[0]),
                                      random_state=233, train_size=0.90)
    gc.collect()
    cpuStats()
    merge = prepFMFeatures(merge)
    mergetst = prepFMFeatures(mergetst)
    cpuStats()

    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    moddict['cross_cols'] = {}
    for i in range(0, len(cross_nm)):
        moddict['cross_cols'][cross_nm[i]] = LabelBinarizer(sparse_output=True)
        moddict['cross_cols'][cross_nm[i]].fit(merge[cross_nm[i]])
        if i == 0:
            x_col = moddict['cross_cols'][cross_nm[i]].transform(
                merge[cross_nm[i]])
        else:
            # Reuse the binarizer fitted above (the original refitted via
            # fit_transform here).
            x_col = hstack(
                (x_col, moddict['cross_cols'][cross_nm[i]].transform(
                    merge[cross_nm[i]])))
        del merge[cross_nm[i]]
    gc.collect()
    cpuStats()

    '''
    Test Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    for i in range(0, len(cross_nm)):
        if i == 0:
            x_coltst = moddict['cross_cols'][cross_nm[i]].transform(
                mergetst[cross_nm[i]])
        else:
            # Transform the test rows with the train-fitted binarizers; the
            # original stacked onto x_col and refitted on test by mistake.
            x_coltst = hstack(
                (x_coltst, moddict['cross_cols'][cross_nm[i]].transform(
                    mergetst[cross_nm[i]])))
        del mergetst[cross_nm[i]]
    gc.collect()
    cpuStats()

    '''
    Hash name
    '''
    moddict['wb_name'] = wordbatch.WordBatch(normalize_text,
                                             extractor=(WordBag, {
                                                 "hash_ngrams": 2,
                                                 "hash_ngrams_weights": [1.5, 1.0],
                                                 "hash_size": 2**29,
                                                 "norm": None,
                                                 "tf": 'binary',
                                                 "idf": None,
                                                 'verbose': 1,
                                             }),
                                             procs=8)
    moddict['wb_name'].dictionary_freeze = True
    X_name = moddict['wb_name'].fit_transform(merge['name'])
    moddict['wb_name_mask'] = np.where(X_name[:nrow_train].getnnz(axis=0) > 0)[0]
    X_name = X_name[:, moddict['wb_name_mask']]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    '''
    test Hash name
    '''
    # Test-side name features, kept separate (the original overwrote the
    # train X_name here).
    X_name_tst = moddict['wb_name'].transform(mergetst['name'])
    X_name_tst = X_name_tst[:, moddict['wb_name_mask']]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    '''
    Hash category #2
    '''
    moddict['wb_cat'] = wordbatch.WordBatch(normalize_text,
                                            extractor=(WordBag, {
                                                "hash_ngrams": 2,
                                                "hash_ngrams_weights": [1.0, 1.0],
                                                "hash_size": 2**20,
                                                "norm": None,
                                                "tf": 'binary',
                                                "idf": None,
                                            }),
                                            procs=4)
    moddict['wb_cat'].dictionary_freeze = True
    ### This must be the full dataset
    # cats = merge["category_name"].str.replace('/', ' ').unique()
    moddict['wb_cat'].fit(categories)
    X_cat_tmp = moddict['wb_cat'].transform(categories)
    moddict['wb_cat_dict'] = dict([
        (c, X_cat_tmp.getrow(row))
        for (c, row) in zip(categories.tolist(), range(len(categories)))
    ])
    X_cat = vstack(([
        moddict['wb_cat_dict'][c]
        for c in merge["category_name"].str.replace('/', ' ')
    ]))
    # moddict['wb_cat_mask'] = np.array(np.clip(X_cat[:nrow_train].getnnz(axis=0) - 1, 0, 1), dtype=bool)
    moddict['wb_cat_mask'] = np.where(X_cat[:nrow_train].getnnz(axis=0) > 0)[0]
    X_cat = X_cat[:, moddict['wb_cat_mask']]
    print('[{}] Vectorize `category` completed.'.format(time.time() - start_time))

    '''
    Count category
    '''
    moddict['wb_cat_ctgc'] = CountVectorizer()
    moddict['wb_cat_ctgc'].fit(merge['general_cat'])
    X_category1 = moddict['wb_cat_ctgc'].transform(merge['general_cat'])
    moddict['wb_cat_ctsc1'] = CountVectorizer()
    moddict['wb_cat_ctsc1'].fit(merge['subcat_1'])
    X_category2 = moddict['wb_cat_ctsc1'].transform(merge['subcat_1'])
    moddict['wb_cat_ctsc2'] = CountVectorizer()
    moddict['wb_cat_ctsc2'].fit(merge['subcat_2'])
    X_category3 = moddict['wb_cat_ctsc2'].transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    moddict['wb_dscr'] = wordbatch.WordBatch(normalize_text,
                                             extractor=(WordBag, {
                                                 "hash_ngrams": 2,
                                                 "hash_ngrams_weights": [1.0, 0.6],
                                                 "hash_size": 2**28,
                                                 "norm": None,
                                                 "tf": 'binary',
                                                 "idf": None
                                             }),
                                             procs=8)
    moddict['wb_dscr'].dictionary_freeze = True
    X_description = moddict['wb_dscr'].fit_transform(merge['name'] + ' ' +
                                                     merge['item_description'])
    # moddict['wb_dscr_mask'] = np.array(np.clip(X_description[:nrow_train].getnnz(axis=0) - 1, 0, 1), dtype=bool)
    moddict['wb_dscr_mask'] = np.where(
        X_description[:nrow_train].getnnz(axis=0) > 1)[0]
    X_description = X_description[:, moddict['wb_dscr_mask']]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    moddict['wb_brandname'] = LabelBinarizer(sparse_output=True)
    moddict['wb_brandname'].fit(merge['brand_name'][:nrow_train])
    X_brand = moddict['wb_brandname'].transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    moddict['wb_itemcond'] = LabelBinarizer(sparse_output=True)
    moddict['wb_itemcond'].fit(merge['item_condition_id'][:nrow_train])
    X_itemcond = moddict['wb_itemcond'].transform(merge['item_condition_id'])
    print('[{}] Label binarize `item_condition_id` completed.'.format(
        time.time() - start_time))

    moddict['wb_shipping'] = LabelBinarizer(sparse_output=True)
    moddict['wb_shipping'].fit(merge['shipping'][:nrow_train])
    X_shipping = moddict['wb_shipping'].transform(merge['shipping'])
    print('[{}] Label binarize `shipping` completed.'.format(time.time() -
                                                             start_time))

    print(
        X_itemcond.shape,
        X_shipping.shape,
        # X_dummies.shape,
        X_description.shape,
        X_brand.shape,
        X_category1.shape,
        X_category2.shape,
        X_category3.shape,
        X_name.shape,
        X_cat.shape,
        x_col.shape)
    sparse_merge = hstack((
        X_itemcond,
        X_shipping,
        # X_dummies,
        X_description,
        X_brand,
        X_category1,
        X_category2,
        X_category3,
        X_name,
        X_cat,
        x_col)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))
    print(50 * '-')
    cpuStats()
    print(50 * '-')

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    gc.collect()
    sparse_merge, y = sparse_merge[:nrow_train], y[:nrow_train]
    if develop:
        train_X, valid_X, train_y, valid_y = sparse_merge[trnidx], \
            sparse_merge[validx], y.values[trnidx], y.values[validx]
    del sparse_merge
    gc.collect()
    print(50 * '*')
    cpuStats()
    print(50 * '*')
    print(train_X.shape[1])
    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                    D=train_X.shape[1], alpha_fm=0.01, L2_fm=0.0, init_fm=0.01,
                    D_fm=200, e_noise=0.0001, iters=1, inv_link="identity",
                    threads=4)  # iters=15
    print(50 * '|')
    cpuStats()
    print(50 * '|')
    baseline = 1.
    for i in range(15):
        print(50 * '-')
        cpuStats()
        print(50 * '-')
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline - 0.0004:
            baseline = score_
        else:
            break
    moddict['FMmodel'] = model
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        predsfm = moddict['FMmodel'].predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(predsfm)))
    gc.collect()
    return merge, moddict, trnidx, validx, nrow_train, predsfm
test_features = hstack(
    [test_char_features, test_word_features, tst_str_array]).tocsr()
del test_word_features, tst_str_array, test_char_features
gc.collect()
print("Fin Stack test")

with timer("Scoring FM FTRL"):
    clf = FM_FTRL(
        alpha=0.02,
        beta=0.01,
        L1=0.00001,
        L2=30.0,
        D=train_features.shape[1],
        alpha_fm=0.1,
        L2_fm=0.5,
        init_fm=0.01,
        weight_fm=50.0,
        D_fm=200,
        e_noise=0.0,
        iters=5,
        inv_link="identity",
        e_clip=1.0,
        threads=1,
        use_avx=1,
        verbose=1
    )
    clf.fit(train_features, labels)
    train_pred = clf.predict(train_features)
    pred = clf.predict(test_features)
    score = sqrt(mean_squared_error(labels, train_pred))
    print("FINAL RMSE {}".format(score))
    sub = pd.read_csv('../input/sample_submission.csv')
    sub['deal_probability'] = pred
    sub['deal_probability'].clip(0.0, 1.0, inplace=True)
    print("Output Prediction CSV")
    sub.to_csv('subm/wordbatch_fmtrl_submission.csv', index=False)
# This snippet begins mid-call; the assignment and the leading alpha/beta
# arguments of the (presumed) FTRL constructor are missing in the source.
FTRL_model = FTRL(  # ... leading arguments elided ...
    L1=0.00001, L2=1.0, D=X.shape[1], iters=50, inv_link="identity", threads=1)
FTRL_model.fit(X, y)
print("[{}] Train FTRL completed".format(time.time() - start_time))

FM_FTRL_model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                        D=X.shape[1], alpha_fm=0.01, L2_fm=0.0, init_fm=0.01,
                        D_fm=200, e_noise=0.0001, iters=17,
                        inv_link="identity", threads=4)
FM_FTRL_model.fit(X, y)
print("[{}] Train FM FTRL completed".format(time.time() - start_time))

# Remove features with document frequency <=100
print("Before removing features with document frequency <=100:", X.shape)
mask = np.array(np.clip(X.getnnz(axis=0) - 100, 0, 1), dtype=bool)
X = X[:, mask]
print("After removing features with document frequency <=100:", X.shape)
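# How the document-frequency mask above works: getnnz(axis=0) counts non-zero
# entries per column, so np.clip(count - k, 0, 1) is 1 exactly for columns
# appearing in more than k rows. A tiny self-contained demo (k=2):
import numpy as np
from scipy.sparse import csr_matrix

M = csr_matrix(np.array([[1, 0, 1],
                         [1, 0, 0],
                         [1, 1, 0]]))
mask = np.array(np.clip(M.getnnz(axis=0) - 2, 0, 1), dtype=bool)
print(mask)              # [ True False False] -> only the first column survives
print(M[:, mask].shape)  # (3, 1)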
gc.collect()
train_X, train_y = X, y
if develop:
    train_X, valid_X, train_y, valid_y = train_test_split(X, y,
                                                          train_size=0.90,
                                                          random_state=233)
model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0,
                init_fm=0.01, D_fm=200, e_noise=0.0001, iters=15,
                inv_link="identity", threads=4)
model.fit(train_X, train_y)
print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
if develop:
    predsfm = model.predict(X=valid_X)
    print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(predsfm)))
    # 0.44532
# Full data 0.424681
def getFMFTRL():
    # os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')
    glove_file = '../feat/glove.6B.50d.txt'
    threads = 8
    save_dir = '../feat'
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    ix = (merge['brand_name'] == merge['brand_name']) & (
        ~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(
            merge['name'].str.lower()))
    merge['name'][ix] = merge['brand_name'][ix] + ' ' + merge['name'][ix]

    # EXTRACT DEVELOPMENT TEST
    trnidx, validx = train_test_split(range(train.shape[0]),
                                      random_state=233, train_size=0.90)
    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    # merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))
    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))
    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))

    '''
    Regex characteristics - carat, gb/tb, cpu
    '''
    def count_rgx(regexls, idx_, filter_=None):
        colvals = merge['name'][idx_] + ' ' + merge['item_description'][idx_]
        vals = pd.Series(np.zeros(len(colvals)))
        for rgx_ in regexls:
            valsls = colvals.str.findall(rgx_, re.IGNORECASE)
            vals[vals == 0] += pd.Series([
                int(v[0]) if len(set(v)) == 1 else 0 for v in valsls
            ])[vals == 0]
        if filter_:
            vals[~vals.isin(filter_)] = 0.
        return vals

    def count_rgx_name(regexls, idx_, filter_=None):
        colvals = merge['name'][idx_]
        vals = pd.Series(np.zeros(len(colvals)))
        for rgx_ in regexls:
            valsls = colvals.str.findall(rgx_, re.IGNORECASE)
            vals[vals == 0] += pd.Series(
                [int(v[0]) if len(v) != 0 else 0 for v in valsls])[vals == 0]
        if filter_:
            vals[~vals.isin(filter_)] = 0.
        return vals

    # gold
    measures = np.zeros((merge.shape[0], 4))
    ix_chk = ((merge.name.str.contains('gold', case=False)) |
              (merge.item_description.str.contains('gold', case=False))) & \
             (merge['subcat_1'] == 'Jewelry')
    rgxls = [
        r"(\d+)k ", r"(\d+)kt ", r"(\d+)k.", r"(\d+)kt.", r"(\d+)k,",
        r"(\d+)kt,", r"(\d+) k ", r"(\d+) kt", r"(\d+) k.", r"(\d+) kt.",
        r"(\d+) k,", r"(\d+) kt,"
    ]
    measures[ix_chk, 0] = count_rgx(
        rgxls, ix_chk, filter_=[10, 12, 14, 16, 18, 20, 21, 22, 23, 24])

    # phone memory
    ix_chk = (merge['subcat_2'] == 'Cell Phones & Smartphones')
    rgxls = [
        r"(\d+)gb ", r"(\d+) gb", r"(\d+)gb.", r"(\d+) gb.", r"(\d+)gb,",
        r"(\d+) gb,"
    ]
    measures[ix_chk, 1] = count_rgx(rgxls, ix_chk)

    # console memory
    ix_chk = (merge['subcat_2'] == 'Consoles')
    rgxls = [
        r"(\d+)gb ", r"(\d+) gb", r"(\d+)gb.", r"(\d+) gb.", r"(\d+)gb,",
        r"(\d+) gb,"
    ]
    measures[ix_chk, 2] = count_rgx(rgxls, ix_chk)

    # computer memory
    ix_chk = (merge['category_name'] == 'Electronics/Computers & Tablets/Laptops & Netbooks') | \
             (merge['category_name'] == 'Electronics/Computers & Tablets/Desktops & All-In-Ones')
    rgxls = [
        r"(\d+)gb ", r"(\d+) gb", r"(\d+)gb.", r"(\d+) gb.", r"(\d+)gb,",
        r"(\d+) gb,"
    ]
    measures[ix_chk, 3] = count_rgx(rgxls, ix_chk)
    # cpu
    # oz
    # diamond
    # r"(\d+) karat ", r"(\d+) carat "

    # Assumed mapping of `measures` onto the columns binarized below; the
    # original snippet references merge['measure_gold'] and
    # merge['measure_memory'] without showing this step.
    merge['measure_gold'] = measures[:, 0]
    merge['measure_memory'] = measures[:, 1:].max(axis=1)

    '''
    Crossed columns
    '''
    # my understanding on how to replicate what layers.crossed_column does. One
    # can read here: https://www.tensorflow.org/tutorials/linear.
    def cross_columns(x_cols):
        """simple helper to build the crossed columns in a pandas dataframe"""
        crossed_columns = dict()
        colnames = ['_'.join(x_c) for x_c in x_cols]
        for cname, x_c in zip(colnames, x_cols):
            crossed_columns[cname] = x_c
        return crossed_columns

    merge['item_condition_id_str'] = merge['item_condition_id'].astype(str)
    merge['shipping_str'] = merge['shipping'].astype(str)
    x_cols = (
        ['brand_name', 'item_condition_id_str'],
        ['brand_name', 'subcat_1'],
        ['brand_name', 'subcat_2'],
        ['brand_name', 'general_cat'],
        # ['brand_name', 'subcat_1', 'item_condition_id_str'],
        # ['brand_name', 'subcat_2', 'item_condition_id_str'],
        # ['brand_name', 'general_cat', 'item_condition_id_str'],
        ['brand_name', 'shipping_str'],
        ['shipping_str', 'item_condition_id_str'],
        ['shipping_str', 'subcat_2'],
        ['item_condition_id_str', 'subcat_2'])
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(merge.select_dtypes(include=['object']).columns)

    D = 2**30
    for k, v in crossed_columns_d.items():
        print('Crossed column ', k)
        outls_ = []
        indicator = 0
        for col in v:
            outls_.append((np.array(merge[col].apply(hash))) % D + indicator)
            indicator += 10**6
        merge[k] = sum(outls_).tolist()

    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    lb = LabelBinarizer(sparse_output=True)
    x_col = lb.fit_transform(merge[cross_nm[0]])
    for i in range(1, len(cross_nm)):
        x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]])))
    del lb

    '''
    Encode Original Strings
    '''
    '''
    for col in ['item_description', 'name']:
        lb = LabelBinarizer(sparse_output=True)
        if 'X_orig' not in locals():
            X_orig = lb.fit_transform(merge[col].apply(hash))
        else:
            X_orig = hstack((X_orig, lb.fit_transform(merge[col].apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['item_description'] + merge['name']).apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['brand_name'] + merge['name']).apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['subcat_2'] + merge['name']).apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['brand_name'] + merge['name'] + merge['item_description']).apply(hash))))
    X_orig = X_orig.tocsr()
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 2, 0, 1), dtype=bool)]
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 5000, 1, 0), dtype=bool)]
    print('Shape of original hash', X_orig.shape)
    X_orig = X_orig.tocoo()
    '''
    gc.collect()
    cpuStats()

    '''
    Hash name
    '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del wb
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1),
                                dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    '''
    Hash category
    '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**20,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    cat = merge["category_name"].str.replace('/', ' ')
    X_cat = wb.fit_transform(cat)
    del wb
    X_cat = X_cat[:, np.array(np.clip(X_cat.getnnz(axis=0) - 1, 0, 1),
                              dtype=bool)]
    print('[{}] Vectorize `category` completed.'.format(time.time() - start_time))

    '''
    Count category
    '''
    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del wb
    X_description = X_description[:, np.array(np.clip(
        X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    X_memory = lb.fit_transform(merge['measure_memory'])
    # Keep only columns appearing in fewer than 10**6 rows; the original used
    # np.clip with reversed bounds here, which always produces an empty mask.
    mask = np.array(X_memory.getnnz(axis=0) < 10**6, dtype=bool)
    X_memory = X_memory[:, mask]
    X_gold = lb.fit_transform(merge['measure_gold'])
    mask = np.array(X_gold.getnnz(axis=0) < 10**6, dtype=bool)
    X_gold = X_gold[:, mask]
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))

    '''
    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape, X_orig.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name, X_cat, x_col,
                           X_orig)).tocsr()
    '''
    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape, X_memory.shape, X_gold.shape)
    sparse_merge = hstack(
        (X_dummies, X_description, X_brand, X_category1, X_category2,
         X_category3, X_name, X_cat, x_col, X_memory, X_gold)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    gc.collect()
    if develop:
        # train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], \
            y.values[trnidx], y.values[validx]
    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                    D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0,
                    init_fm=0.01, D_fm=200, e_noise=0.0001, iters=1,
                    inv_link="identity", threads=threads)  # iters=15
    baseline = 1.
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline:
            baseline = score_
        else:
            break
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        predsfm = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(predsfm)))
        # 0.44532
        # Full data 0.424681
        # 0.419741
    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
# This snippet begins mid-expression; the hstack head is reconstructed from
# the `.tocsr()` tail and the variables being stacked.
X_train = hstack(
    (X_train_dummies, X_train_description, X_train_brand, X_train_category1,
     X_train_category2, X_train_category3, X_train_name)).tocsr()
mask3 = np.where(X_train.getnnz(axis=0) > 1)[0]
X_train = X_train[:, mask3]
d = X_train.shape[1]
del X_train_dummies, X_train_description, X_train_brand, X_train_category1, \
    X_train_category2, X_train_category3, X_train_name
gc.collect()

FM_model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=d,
                   alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, D_fm=200,
                   e_noise=0.0001, iters=15, inv_link="identity", threads=4)
FM_model.fit(X_train, Y_train)
print("FM_FTRL training completed!")
del d, X_train, Y_train
gc.collect()

def prediction_process(test, le1=le1,
                       # (remainder of the signature truncated in the source)
gc.collect()
print("Fin Stack test")
with timer("Scoring FM FTRL"):
    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    model = FM_FTRL(
        alpha=0.02,
        beta=0.01,
        L1=0.00001,
        L2=30.0,
        D=train_features.shape[1],
        alpha_fm=0.1,
        L2_fm=0.5,
        init_fm=0.01,
        weight_fm=50.0,
        D_fm=200,
        e_noise=0.0,
        iters=7,  # 5
        inv_link="identity", e_clip=1.0, threads=6, use_avx=1, verbose=1
        # inv_link="identity", e_clip=1.0, threads=1, use_avx=1, verbose=1
    )
    fold_id = -1
    X = train_features
    y = labels
    x_test = test_features
class WBFmFtrlModel(object):
    wb = wordbatch.WordBatch(None,
                             extractor=(WordHash, {"ngram_range": (1, 1),
                                                   "analyzer": "word",
                                                   "lowercase": False,
                                                   "n_features": D,
                                                   "norm": None,
                                                   "binary": True}),
                             minibatch_size=batchsize // 80, procs=8,
                             freeze=True, timeout=1800, verbose=0)
    clf = FM_FTRL(alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D, alpha_fm=0.02,
                  L2_fm=0.0, init_fm=0.01, weight_fm=1.0, D_fm=8, e_noise=0.0,
                  iters=3, inv_link="sigmoid", e_clip=1.0, threads=4,
                  use_avx=1, verbose=0)

    def __init__(self, train_files):
        self.train_files = train_files

    def predict(self, predict_file):
        p = None
        test_preds = []
        click_ids = []
        X = None
        for df_c in pd.read_csv(predict_file, engine='c',
                                chunksize=batchsize, sep=","):
            str_array, labels, weights = df2csr(self.wb, df_c)
            click_ids += df_c['click_id'].tolist()
            del df_c
            if p is not None:
                test_preds += list(p.join())
            if X is not None:
                del X
                X = None
            gc.collect()
            X = self.wb.transform(str_array)
            del str_array
            p = ThreadWithReturnValue(target=predict_batch, args=(self.clf, X))
            p.start()
        if p is not None:
            test_preds += list(p.join())
        del X
        return click_ids, test_preds

    def train(self):
        p = None
        X = None
        rcount = 0
        for train_file in self.train_files:
            print("Train using file:{}".format(train_file))
            for df_c in pd.read_csv(train_file, engine='c', chunksize=batchsize,
                                    # for df_c in pd.read_csv('../input/train.csv', engine='c', chunksize=batchsize,
                                    sep=",", dtype=dtypes):
                rcount += len(df_c)
                # cpuStats()
                str_array, labels, weights = df2csr(
                    self.wb, df_c, pick_hours={4, 5, 10, 13, 14})
                del df_c
                if p is not None:
                    p.join()
                if X is not None:
                    del X
                    X = None
                gc.collect()
                X = self.wb.transform(str_array)
                del str_array
                if rcount % (2 * batchsize) == 0:
                    if p is not None:
                        p.join()
                    p = threading.Thread(target=evaluate_batch,
                                         args=(self.clf, X, labels, rcount))
                    p.start()
                print("Training", rcount, time.time() - start_time)
                cpuStats()
                if p is not None:
                    p.join()
                p = threading.Thread(target=fit_batch,
                                     args=(self.clf, X, labels, weights))
                p.start()
        if p is not None:
            p.join()
        del X
        X = None
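# df2csr is called above but not defined in this snippet. A heavily hedged
# sketch of the assumed helper, matching the (wb, df) call in this class: it
# renders each row as a space-separated token string for WordBatch hashing and
# returns (strings, labels, weights). The exact tokens, label column and
# weighting scheme are assumptions.
import numpy as np

def df2csr(wb, df, pick_hours=None):
    str_array = ("I" + df['ip'].astype(str) +
                 " A" + df['app'].astype(str) +
                 " D" + df['device'].astype(str) +
                 " O" + df['os'].astype(str) +
                 " C" + df['channel'].astype(str)).values
    labels = df['is_attributed'].values if 'is_attributed' in df else None
    weights = np.ones(len(df))  # pick_hours-based reweighting omitted
    return str_array, labels, weights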
def getFMFTRL():
    # os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')
    glove_file = '../feat/glove.6B.50d.txt'
    threads = 4
    save_dir = '../feat'
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    '''
    ix = (merge['brand_name'] == merge['brand_name']) & \
         (~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(merge['name'].str.lower()))
    merge['name'][ix] = merge['brand_name'][ix] + ' ' + merge['name'][ix]
    '''

    # EXTRACT DEVELOPMENT TEST
    trnidx, validx = train_test_split(range(train.shape[0]),
                                      random_state=233, train_size=0.90)
    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    # merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))
    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))
    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))

    '''
    Encode Original Strings
    '''
    '''
    for col in ['item_description', 'name']:
        wb = CountVectorizer()
        if 'X_orig' not in locals():
            X_orig = wb.fit_transform(merge[col])
        else:
            X_orig = hstack((X_orig, wb.fit_transform(merge[col])))
    print('Shape of original hash', X_orig.shape)
    X_orig = X_orig.tocsr()
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 3, 0, 1), dtype=bool)]
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 100, 1, 0), dtype=bool)]
    print('Shape of original hash', X_orig.shape)
    X_orig = X_orig.tocoo()
    '''

    '''
    Stemmer
    '''
    # https://github.com/skbly7/usefulness/blob/ed11cd55080d553cf62873999a5e00b154057fbc/textpreprocess.py
    # (original Python 2 string-module calls rewritten as str methods)
    from nltk.tokenize import WordPunctTokenizer  # This is better for sentences containing unicode, like: u"N\u00faria Espert"
    word_tokenize = WordPunctTokenizer().tokenize
    import Stemmer

    ps = Stemmer.Stemmer("english")
    _wsre = re.compile(r"\s+")
    _alphanumre = re.compile(r"[\w\-\' ]", re.UNICODE)

    def _removestopwords(txtwords):
        global stoplist
        # stoplist = stopwords.words("english")
        if stoplist is None:
            stoplist = frozenset(
                [l.strip() for l in open(STOPFILE).readlines()])
        return [[w for w in t if w not in stoplist] for t in txtwords]

    def _stem(txtwords):
        return [stemmer.stemWords(t) for t in txtwords]

    def _removenonalphanumericchars(txtwords):
        return [["".join([c for c in w if _alphanumre.search(c) is not None])
                 for w in t] for t in txtwords]

    def _stripallwhitespace(txts):
        return [_wsre.sub("", txt) for txt in txts]

    stemmer = Stemmer.Stemmer("english")

    def textpreprocess(txt, sentencetokenize=False, replacehyphenbyspace=True,
                       wordtokenize=False, lowercase=True, stem=True,
                       removenonalphanumericchars=True,
                       stripallwhitespace=True):
        """
        Note: For html2text, one could also use NCleaner (common.html2text.batch_nclean)
        Note: One could improve the sentence tokenization, by using the
        original HTML formatting in the tokenization.
        Note: We use the Porter stemmer. (Optimization: Shouldn't rebuild the
        PorterStemmer object each time this function is called.)
        """
        if sentencetokenize:
            txts = nltk.word_tokenize(txt)
            # txts = tokenizer.tokenize(txt.split())
        else:
            txts = txt.split()
        txt = None
        if replacehyphenbyspace:
            txts = [t.replace("-", " ") for t in txts]
        if wordtokenize:
            txtwords = [word_tokenize(t) for t in txts]
        else:
            txtwords = [t.split() for t in txts]
        txts = None
        if lowercase:
            txtwords = [[w.lower() for w in t] for t in txtwords]
        if stem:
            txtwords = _stem(txtwords)
        # TODO: Maybe remove Unicode accents? http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
        if removenonalphanumericchars:
            txtwords = _removenonalphanumericchars(txtwords)
        txtwords = [[w for w in t if w != ""] for t in txtwords]
        txts = [" ".join(words) for words in txtwords]
        if stripallwhitespace:
            for _ in range(2):
                txts = _stripallwhitespace(txts)
        return " ".join(txts)

    print('[{}] Start stemming'.format(time.time() - start_time))
    merge['stem_name'] = [textpreprocess(s) for s in merge["name"].values]
    print('[{}] Stemming completed'.format(time.time() - start_time))

    '''
    Crossed columns
    '''
    # my understanding on how to replicate what layers.crossed_column does. One
    # can read here: https://www.tensorflow.org/tutorials/linear.
    def cross_columns(x_cols):
        """simple helper to build the crossed columns in a pandas dataframe"""
        crossed_columns = dict()
        colnames = ['_'.join(x_c) for x_c in x_cols]
        for cname, x_c in zip(colnames, x_cols):
            crossed_columns[cname] = x_c
        return crossed_columns

    merge['item_condition_id_str'] = merge['item_condition_id'].astype(str)
    merge['shipping_str'] = merge['shipping'].astype(str)
    x_cols = (
        ['brand_name', 'item_condition_id_str'],
        ['brand_name', 'subcat_1'],
        ['brand_name', 'subcat_2'],
        ['brand_name', 'general_cat'],
        # ['brand_name', 'subcat_1', 'item_condition_id_str'],
        # ['brand_name', 'subcat_2', 'item_condition_id_str'],
        # ['brand_name', 'general_cat', 'item_condition_id_str'],
        ['brand_name', 'shipping_str'],
        ['shipping_str', 'item_condition_id_str'],
        ['shipping_str', 'subcat_2'],
        ['item_condition_id_str', 'subcat_2'])
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(merge.select_dtypes(include=['object']).columns)

    D = 2**30
    for k, v in crossed_columns_d.items():
        print('Crossed column ', k)
        outls_ = []
        indicator = 0
        for col in v:
            outls_.append((np.array(merge[col].apply(hash))) % D + indicator)
            indicator += 10**6
        merge[k] = sum(outls_).tolist()

    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    lb = LabelBinarizer(sparse_output=True)
    x_col = lb.fit_transform(merge[cross_nm[0]])
    for i in range(1, len(cross_nm)):
        x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]])))
    del lb

    '''
    Hash name
    '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del wb
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1),
                                dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    '''
    Hash stemmed name
    '''
    # Assumed reconstruction: the hstack below uses X_stem_name, but the
    # original snippet never built it; this mirrors the `name` hashing above.
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_stem_name = wb.fit_transform(merge['stem_name'])
    del wb
    X_stem_name = X_stem_name[:, np.array(np.clip(
        X_stem_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]

    '''
    Hash category
    '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**20,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    cat = merge["category_name"].str.replace('/', ' ')
    X_cat = wb.fit_transform(cat)
    del wb
    X_cat = X_cat[:, np.array(np.clip(X_cat.getnnz(axis=0) - 1, 0, 1),
                              dtype=bool)]
    print('[{}] Vectorize `category` completed.'.format(time.time() - start_time))

    '''
    Count category
    '''
    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del wb
    X_description = X_description[:, np.array(np.clip(
        X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))

    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape, X_stem_name.shape)
    sparse_merge = hstack(
        (X_dummies, X_description, X_brand, X_category1, X_category2,
         X_category3, X_name, X_cat, x_col, X_stem_name)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    gc.collect()
    if develop:
        # train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], \
            y.values[trnidx], y.values[validx]
    model = FM_FTRL(alpha=0.005, beta=0.005, L1=0.00001, L2=0.1,
                    D=sparse_merge.shape[1], alpha_fm=0.005, L2_fm=0.0,
                    init_fm=0.01, D_fm=200, e_noise=0.0001, iters=1,
                    inv_link="identity", threads=threads)  # iters=15
    baseline = 1.
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline:
            baseline = score_
        else:
            break
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        predsfm = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(predsfm)))
        # 0.44532
        # Full data 0.424681
    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
submission[class_name] = 0.0
cv_scores = []
for n_fold in ifold:
    ind_val = np.where(val_flag[n_fold])[0].tolist()
    ind_trn = np.where(~val_flag[n_fold])[0].tolist()
    np.random.shuffle(ind_trn)
    clf = FM_FTRL(alpha=0.02, beta=0.01, L1=0.00001, L2=30.0,
                  D=train_features.shape[1], alpha_fm=0.1, L2_fm=0.5,
                  init_fm=0.01, weight_fm=50.0, D_fm=200, e_noise=0.0,
                  iters=3, inv_link="identity", e_clip=1.0, threads=4,
                  use_avx=1, verbose=1)
    clf.fit(train_features[ind_trn], train_target[ind_trn],
            train_weight[ind_trn], reset=False)
    class_pred[ind_val] = sigmoid(clf.predict(train_features[ind_val]))
    score = roc_auc_score(train_target[ind_val], class_pred[ind_val])
def getFMFTRL():
    # os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')
    glove_file = '../feat/glove.6B.50d.txt'
    threads = 4
    save_dir = '../feat'
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    ix = (merge['brand_name'] == merge['brand_name']) & (
        ~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(
            merge['name'].str.lower()))
    merge['name'][ix] = merge['brand_name'][ix] + ' ' + merge['name'][ix]

    # EXTRACT DEVELOPMENT TEST
    trnidx, validx = train_test_split(range(train.shape[0]),
                                      random_state=233, train_size=0.90)
    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    # merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))
    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))
    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))

    '''
    Crossed columns
    '''
    # my understanding on how to replicate what layers.crossed_column does. One
    # can read here: https://www.tensorflow.org/tutorials/linear.
    def cross_columns(x_cols):
        """simple helper to build the crossed columns in a pandas dataframe"""
        crossed_columns = dict()
        colnames = ['_'.join(x_c) for x_c in x_cols]
        for cname, x_c in zip(colnames, x_cols):
            crossed_columns[cname] = x_c
        return crossed_columns

    merge['item_condition_id_str'] = merge['item_condition_id'].astype(str)
    merge['shipping_str'] = merge['shipping'].astype(str)
    x_cols = (
        ['brand_name', 'item_condition_id_str'],
        ['brand_name', 'subcat_1'],
        ['brand_name', 'subcat_2'],
        ['brand_name', 'general_cat'],
        # ['brand_name', 'subcat_1', 'item_condition_id_str'],
        # ['brand_name', 'subcat_2', 'item_condition_id_str'],
        # ['brand_name', 'general_cat', 'item_condition_id_str'],
        ['brand_name', 'shipping_str'],
        ['shipping_str', 'item_condition_id_str'],
        ['shipping_str', 'subcat_2'],
        ['item_condition_id_str', 'subcat_2'])
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(merge.select_dtypes(include=['object']).columns)

    D = 2**30
    for k, v in crossed_columns_d.items():
        print('Crossed column ', k)
        outls_ = []
        indicator = 0
        for col in v:
            outls_.append((np.array(merge[col].apply(hash))) % D + indicator)
            indicator += 10**6
        merge[k] = sum(outls_).tolist()

    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    lb = LabelBinarizer(sparse_output=True)
    x_col = lb.fit_transform(merge[cross_nm[0]])
    for i in range(1, len(cross_nm)):
        x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]])))
    del lb

    '''
    Hash name
    '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del wb
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1),
                                dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    '''
    Hash category
    '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**20,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    cat = merge["category_name"].str.replace('/', ' ')
    X_cat = wb.fit_transform(cat)
    del wb
    X_cat = X_cat[:, np.array(np.clip(X_cat.getnnz(axis=0) - 1, 0, 1),
                              dtype=bool)]
    print('[{}] Vectorize `category` completed.'.format(time.time() - start_time))

    '''
    Count category
    '''
    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**29,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del wb
    X_description = X_description[:, np.array(np.clip(
        X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))

    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape)
    sparse_merge = hstack(
        (X_dummies, X_description, X_brand, X_category1, X_category2,
         X_category3, X_name, X_cat, x_col)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    print(sparse_merge.shape)
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    mask = np.array(np.clip(X.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    X = X[:, mask]
    X_test = X_test[:, mask]
    print(X.shape)
    gc.collect()
    if develop:
        # train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], \
            y.values[trnidx], y.values[validx]
    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                    D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0,
                    init_fm=0.01, D_fm=200, e_noise=0.0001, iters=1,
                    inv_link="identity", threads=threads)  # iters=15
    baseline = 1.
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline:
            baseline = score_
        else:
            break
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        predsfm = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(predsfm)))
        # 0.44532
        # Full data 0.424681
    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
del X_category3
del X_name
del merge
gc.collect()
del X_dummies
# del X_description
del X_description1
del X_description2
del X_description3
print(train_X.shape)
print(valid_X.shape)
print('[{}] addition feature completed.'.format(time.time() - start_time))

model = FM_FTRL(alpha=0.03, beta=0.01, L1=0.001, L2=0.1,
                D=sparse_merge.shape[1], alpha_fm=0.07, L2_fm=0.001,
                init_fm=0.01, D_fm=400, e_noise=0.0001, iters=1,
                inv_link="identity", threads=4, weight_fm=1.0)
for i in range(4):
    model.fit(train_X, train_y)
    if debug:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
    # Decay one entry of the serialized model state (presumably a learning
    # rate) by 0.8 between passes, by round-tripping the pickle state tuple.
    param = model.__getstate__()
    model.__setstate__((param[0], param[1], param[2], param[3],
                        param[4] * 0.8, param[5], param[6], param[7],
                        param[8], param[9], param[10], param[11], param[12],
                        param[13], param[14], param[15], param[16], param[17],
                        param[18], param[19]))
if debug:
    resdefm = preds
resf = model.predict(X=X_test)
res2.extend(resf)
folds = KFold(n_splits=4, shuffle=True, random_state=2)
losses = []
losses_per_folds = np.zeros(folds.n_splits)
submission = pd.DataFrame.from_dict({'id': test['id']})
for i_c, class_name in enumerate(class_names):
    class_pred = np.zeros(len(train))
    train_target = train[class_name].values
    train_weight = np.array([1.0 if x == 1 else class_weights[class_name]
                             for x in train_target])
    submission[class_name] = 0.0
    cv_scores = []
    for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train_features)):
        clf = FM_FTRL(alpha=0.02,
                      beta=0.01,
                      L1=0.00001,
                      L2=30.0,
                      D=train_features.shape[1],
                      alpha_fm=0.1,
                      L2_fm=0.5,
                      init_fm=0.01,
                      weight_fm=50.0,
                      D_fm=200,
                      e_noise=0.0,
                      iters=3,
                      inv_link="identity",
                      e_clip=1.0,
                      threads=4,
                      use_avx=1,
                      verbose=1)
        clf.fit(train_features[trn_idx], train_target[trn_idx],
                train_weight[trn_idx], reset=False)
        class_pred[val_idx] = sigmoid(clf.predict(train_features[val_idx]))
        score = roc_auc_score(train_target[val_idx], class_pred[val_idx])
        cv_scores.append(score)
        losses_per_folds[n_fold] += score / len(class_names)
        submission[class_name] += sigmoid(clf.predict(test_features)) / folds.n_splits

    # Classifier chain: append this class's binarized out-of-fold
    # predictions as an extra feature for the next class.
    # Order of classes not optimized.
    train_features = csr_matrix(hstack([train_features, np.reshape(np.array(
        [0 if x < 0.5 else 1 for x in class_pred]), (train.shape[0], 1))]))
    test_features = csr_matrix(hstack([test_features, np.reshape(np.array(
        [0 if x < 0.5 else 1 for x in submission[class_name]]), (test.shape[0], 1))]))
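The fold loop trains with inv_link="identity" and squashes the raw margins itself through a sigmoid helper that this excerpt does not define; a minimal sketch:

import numpy as np

def sigmoid(x):
    # Map raw regression outputs into (0, 1) so they can be read as
    # probabilities and averaged across folds.
    return 1.0 / (1.0 + np.exp(-x))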
def main():
    start_time = time.time()
    from time import gmtime, strftime
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    # if 1 == 1:
    ### train = pd.read_table('../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    ### test = pd.read_table('../input/mercari-price-suggestion-challenge/test.tsv', engine='c')
    train = pd.read_table('../input/train.tsv', engine='c')
    test = pd.read_table('../input/test.tsv', engine='c')
    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)

    nrow_test = train.shape[0]  # -dftt.shape[0]
    # Rows priced below 1.0 are removed from the training target but kept
    # in `merge` so the vectorizers still see their text.
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    submission = test[['test_id']]

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2 ** 29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del wb
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1),
                                dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

    # wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2 ** 28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del wb
    X_description = X_description[:, np.array(np.clip(
        X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(time.time() - start_time))

    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))

    # pd.to_pickle((sparse_merge, y), "xy.pkl")
    # else:
    #     nrow_train, nrow_test = 1481661, 1482535
    #     sparse_merge, y = pd.read_pickle("xy.pkl")

    # Remove features with document frequency <= 1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    gc.collect()

    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(
            X, y, test_size=0.05, random_state=100)

    model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0,
                 D=sparse_merge.shape[1], iters=50,
                 inv_link="identity", threads=1)
    model.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsF = model.predict(X_test)
    print('[{}] Predict FTRL completed'.format(time.time() - start_time))

    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                    D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0,
                    init_fm=0.01, D_fm=200, e_noise=0.0001, iters=15,
                    inv_link="identity", threads=4)
    model.fit(train_X, train_y)
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    params = {
        'learning_rate': 0.6,
        'application': 'regression',
        'max_depth': 4,
        'num_leaves': 31,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
        'bagging_fraction': 0.6,
        'bagging_freq': 5,
        'feature_fraction': 0.6,
        'nthread': 4,
        'min_data_in_leaf': 100,
        'max_bin': 31
    }

    # Remove features with document frequency <= 100 for LightGBM
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 100, 0, 1),
                    dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(
            X, y, test_size=0.05, random_state=100)

    d_train = lgb.Dataset(train_X, label=train_y)
    watchlist = [d_train]
    if develop:
        d_valid = lgb.Dataset(valid_X, label=valid_y)
        watchlist = [d_train, d_valid]

    model = lgb.train(params, train_set=d_train, num_boost_round=6000,
                      valid_sets=watchlist, early_stopping_rounds=1000,
                      verbose_eval=1000)
    if develop:
        preds = model.predict(valid_X)
        print("LGB dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsL = model.predict(X_test)
    print('[{}] Predict LGB completed.'.format(time.time() - start_time))

    # Weighted blend of the FTRL, LGB and FM_FTRL predictions.
    preds = (predsF * 0.2 + predsL * 0.3 + predsFM * 0.5)
    submission['price'] = np.expm1(preds)
    submission.to_csv("submission_wordbatch_ftrl_fm_lgb.csv", index=False)
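main() calls several preprocessing helpers (split_cat, handle_missing_inplace, cutting, to_categorical) that are defined elsewhere in the kernel. Sketches of plausible definitions for the first two, as assumptions about what the omitted code does:

def split_cat(text):
    # 'Men/Tops/T-shirts' -> ['Men', 'Tops', 'T-shirts'];
    # missing categories get a three-way placeholder.
    try:
        return text.split("/")
    except AttributeError:
        return ("No Label", "No Label", "No Label")


def handle_missing_inplace(dataset):
    # Fill the text-like columns so the vectorizers never see NaN.
    for col in ('brand_name', 'item_description',
                'general_cat', 'subcat_1', 'subcat_2'):
        dataset[col].fillna(value='missing', inplace=True)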
def wordbatch_algo():
    import time
    # print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    train = pd.read_table('../input/train.tsv', engine='c')

    # Drop rows where price = 0
    train = train[train.price != 0].reset_index(drop=True)
    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)

    y = np.log1p(train["price"])
    nrow_train = train.shape[0]

    # Training
    train['general_cat'], train['subcat_1'], train['subcat_2'] = \
        zip(*train['category_name'].apply(lambda x: split_cat(x)))
    train.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(train)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(train)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(train)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))

    # Add some new features (`.as_matrix()` has been removed from pandas;
    # `.to_numpy()` is the replacement):
    X_len_desc = train['item_description'].apply(
        lambda x: len(x)).to_numpy().reshape(-1, 1)
    X_len_name = train['name'].apply(
        lambda x: len(x)).to_numpy().reshape(-1, 1)
    print('[{}] Length of text completed.'.format(time.time() - start_time))

    # Name
    wb_name = wordbatch.WordBatch(normalize_text,
                                  extractor=(WordBag, {
                                      "hash_ngrams": 2,
                                      "hash_ngrams_weights": [1.5, 1.0],
                                      "hash_size": 2 ** 29,
                                      "norm": None,
                                      "tf": 'binary',
                                      "idf": None,
                                  }),
                                  procs=8)
    wb_name.dictionary_freeze = True
    wb_name.fit(train['name'])
    X_name = wb_name.transform(train['name'])
    # X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb_cat1 = CountVectorizer()
    wb_cat2 = CountVectorizer()
    wb_cat3 = CountVectorizer()
    wb_cat1.fit(train['general_cat'])
    wb_cat2.fit(train['subcat_1'])
    wb_cat3.fit(train['subcat_2'])
    X_category1 = wb_cat1.transform(train['general_cat'])
    X_category2 = wb_cat2.transform(train['subcat_1'])
    X_category3 = wb_cat3.transform(train['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

    # wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb_desc = wordbatch.WordBatch(normalize_text,
                                  extractor=(WordBag, {
                                      "hash_ngrams": 2,
                                      "hash_ngrams_weights": [1.0, 1.0],
                                      "hash_size": 2 ** 28,
                                      "norm": "l2",
                                      "tf": 1.0,
                                      "idf": None
                                  }),
                                  procs=8)
    wb_desc.dictionary_freeze = True
    wb_desc.fit(train['item_description'])
    X_description = wb_desc.transform(train['item_description'])
    # X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    lb.fit(train['brand_name'])
    X_brand = lb.transform(train['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

    X_cond, d_cond = fit_dummy(train['item_condition_id'].tolist())
    X_ship, d_ship = fit_dummy(train['shipping'].tolist())
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(time.time() - start_time))

    del train
    gc.collect()

    print(X_cond.shape, X_ship.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)
    sparse_merge = hstack((X_cond, X_ship, X_description, X_brand,
                           X_category1, X_category2, X_category3,
                           X_name)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))
    del X_description, X_brand, X_category1, X_category2, X_category3, X_name
    gc.collect()

    # Remove features with document frequency <= 1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    print(sparse_merge.shape)
    X = sparse_merge

    # ---------------------------------------
    # FM model fit
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=SPLIT_SEED)

    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                    D=train_X.shape[1], alpha_fm=0.01, L2_fm=0.0,
                    init_fm=0.01, D_fm=200, e_noise=0.0001, iters=FM_iter,
                    inv_link="identity", threads=4)
    model.fit(train_X, train_y)
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    print('-' * 20)
    if develop:
        preds = model.predict(X=valid_X)
        print("->>>> FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    # ---------------------------------------
    # FTRL model fit
    model2 = FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=1.0,
                  D=train_X.shape[1], iters=FTRL_iter,
                  inv_link="identity", threads=1)
    # del X; gc.collect()
    model2.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model2.predict(X=valid_X)
        print("->>>> FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    # Clear variables:
    del X, train_X, train_y, sparse_merge
    gc.collect()

    # ---------------------------------------
    # Testing by chunk: transform each test chunk with the fitted
    # vectorizers and predict, so the full test matrix never has to
    # sit in memory at once.
    print(' FM/FTRL: ...reading the test data...')
    predsFM = []
    predsF = []
    for test in load_test():
        test['general_cat'], test['subcat_1'], test['subcat_2'] = \
            zip(*test['category_name'].apply(lambda x: split_cat(x)))
        test.drop('category_name', axis=1, inplace=True)
        handle_missing_inplace(test)
        # print('[{}] Handle missing completed.'.format(time.time() - start_time))
        cutting(test)
        # print('[{}] Cut completed.'.format(time.time() - start_time))
        to_categorical(test)
        # print('[{}] Convert categorical completed'.format(time.time() - start_time))

        # Add some new features:
        X_len_desc_test = test['item_description'].apply(
            lambda x: len(x)).to_numpy().reshape(-1, 1)
        X_len_name_test = test['name'].apply(
            lambda x: len(x)).to_numpy().reshape(-1, 1)

        X_name_test = wb_name.transform(test['name'])
        # X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
        X_category1_test = wb_cat1.transform(test['general_cat'])
        X_category2_test = wb_cat2.transform(test['subcat_1'])
        X_category3_test = wb_cat3.transform(test['subcat_2'])
        X_description_test = wb_desc.transform(test['item_description'])
        # X_description_test = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
        X_brand_test = lb.transform(test['brand_name'])
        X_cond_test = transform_dummy(test['item_condition_id'].tolist(), d_cond)
        X_ship_test = transform_dummy(test['shipping'].tolist(), d_ship)

        X_test = hstack((X_cond_test, X_ship_test, X_description_test,
                         X_brand_test, X_category1_test, X_category2_test,
                         X_category3_test, X_name_test)).tocsr()
        # Apply the same document-frequency mask as in training.
        X_test = X_test[:, mask]

        # Clear variables:
        del X_cond_test, X_ship_test, X_description_test, X_brand_test, \
            X_category1_test, X_category2_test, X_category3_test, X_name_test
        del test
        gc.collect()

        predsFM_batch = model.predict(X_test)
        predsFM += np.array(predsFM_batch).flatten().tolist()
        predsF_batch = model2.predict(X_test)
        predsF += np.array(predsF_batch).flatten().tolist()

    print(np.array(predsFM))
    print('-' * 20)
    print(np.array(predsF))
    print('-' * 20)
    return np.array(predsFM), np.array(predsF)
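wordbatch_algo() assumes three more helpers the excerpt omits: load_test (a chunked reader, so the test set never loads whole) and fit_dummy/transform_dummy (one-hot encoders with a reusable value-to-column mapping). Minimal sketches under those assumptions; the file path and chunk size are placeholders:

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix


def load_test(chunksize=350000):
    # Stream the test file chunk by chunk.
    for chunk in pd.read_table('../input/test.tsv', engine='c',
                               chunksize=chunksize):
        yield chunk


def fit_dummy(values):
    # Build a value -> column mapping, then encode with it.
    d = {v: i for i, v in enumerate(sorted(set(values)))}
    return transform_dummy(values, d), d


def transform_dummy(values, d):
    # One-hot encode with a previously fitted mapping; values unseen
    # at fit time fall back to column 0 in this sketch.
    rows = np.arange(len(values))
    cols = np.array([d.get(v, 0) for v in values])
    data = np.ones(len(values))
    return csr_matrix((data, (rows, cols)), shape=(len(values), len(d)))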
"lowercase": False, "n_features": D, "norm": None, "binary": True }), minibatch_size=batchsize // 80, procs=8, verbose=0) clf = FM_FTRL( alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D, alpha_fm=0.02, L2_fm=0.0, init_fm=0.01, weight_fm=1.0, D_fm=8, e_noise=0.0, iters=3, inv_link="sigmoid", threads=4, ) p = None rcount = 0 for df_c in pd.read_csv( './input/train.csv', engine='c', chunksize=batchsize, #for df_c in pd.read_csv('../input/train.csv', engine='c', chunksize=batchsize, sep=","):