def job(args, train_csv, test_csv, embeddings, cache):
    """Read data, preprocess, train one model per fold and record results.

    Receives the parsed CLI `args` namespace and forwards its fields to the
    data/model/training helpers. Returns the record directory path of the
    LAST fold only (earlier folds' paths are overwritten each iteration).
    """
    data = Data(train_csv, test_csv, cache)
    # read and preprocess data
    to_cache = not args.no_cache
    data.read_embedding(embeddings, args.unk_std, args.max_vectors, to_cache)
    data.preprocess(args.tokenizer, args.var_length)
    data.embedding_lookup()
    # split train dataset
    data_iter = data.split(args.kfold, args.split_ratio, args.stratified, args.test, args.seed)
    # iterate through folds
    loss_function = nn.BCEWithLogitsLoss()
    for fold, d in enumerate(data_iter):
        print(f'\n__________ fold {fold} __________')
        # get dataloaders; a 2-tuple means the held-out test set comes from `data`
        if len(d) == 2:
            train, val = d
            test = data.test
        else:
            train, val, test = d
        dataloaders = iterate(train, val, test, args.batch_size)  # train, val and test dataloader
        # choose model, optimizer, lr scheduler (only trainable params go to the optimizer)
        model = choose_model(args.model, data.text, args.n_layers, args.hidden_dim, args.dropout)
        optimizer = choose_optimizer(filter(lambda p: p.requires_grad, model.parameters()), args)
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lrstep, gamma=0.1)
        learn = Learner(model, dataloaders, loss_function, optimizer, scheduler, args)
        learn.fit(args.epoch, args.n_eval, args.f1_tresh, args.early_stop, args.warmup_epoch, args.clip)
        # load best model checkpoint recorded during fit
        learn.model, info = learn.recorder.load()
        # save val predictions (ids mapped back to question-id strings)
        y_pred, y_true, ids = learn.predict_probs()
        val_ids = [data.qid.vocab.itos[i] for i in ids]
        pred_to_csv(val_ids, y_pred, y_true)
        # choose best threshold for val predictions
        best_th, max_f1 = choose_thresh(y_pred, y_true, [0.1, 0.5, 0.01], message=True)
        learn.recorder.append_info({'best_th': best_th, 'max_f1': max_f1})
        # predict test labels (note: uses args.f1_tresh, not the tuned best_th)
        test_label, test_prob, _, test_ids, tresh = learn.predict_labels(is_test=True, thresh=args.f1_tresh)
        if args.test:
            test_loss, test_f1, _, _, _ = learn.evaluate(learn.test_dl, args.f1_tresh)
            learn.recorder.append_info({'test_loss': test_loss, 'test_f1': test_f1}, message='Test set results: ')
        # save test predictions to submission.csv
        test_ids = [data.qid.vocab.itos[i] for i in test_ids]
        submit(test_ids, test_label, test_prob)
        record_path = learn.recorder.record(fold)  # directory path with all records
        print('\n')
    return record_path
def main():
    """Submit the CSV given as argv[1] (optional comment as argv[2]), then archive it.

    Fixes two defects in the original:
    - the `try` body wrapped the `utils.submit` call itself, so an IndexError
      raised *inside* utils.submit was swallowed and the file was submitted a
      second time without a comment; only the argv access is guarded now.
    - the local variable `submit` shadowed the commonly imported `submit`
      helper name; renamed to `submit_path`.
    """
    submit_path = sys.argv[1]
    try:
        comment = sys.argv[2]
    except IndexError:
        comment = None
    if comment is not None:
        utils.submit(file_path=submit_path, comment=comment)
    else:
        utils.submit(file_path=submit_path)
    # archive the submitted file
    shutil.move(submit_path, '../log_submit/')
def main():
    """Train the BERT-based toxicity model end-to-end and submit predictions.

    Loads train/test comment data, tokenizes with a pretrained BERT tokenizer,
    trains via utils.run_model_pytorch, evaluates the bias metric on OOF
    predictions and submits test predictions from the result directory.
    Relies on a module-level `args` namespace (args.debug).
    """
    COMMENT_TEXT_COL = 'comment_text'
    EMB_MAX_FEAT = 300
    MAX_LEN = 220
    MAX_FEATURES = 100000
    #BATCH_SIZE = 1024
    BATCH_SIZE = 256
    #BATCH_SIZE = 2048
    NUM_EPOCHS = 1
    LSTM_UNITS = 64
    if args.debug:
        print('running in debug mode')
    # timestamped result directory, prefixed in debug mode
    if args.debug:
        result_dir = os.path.join(utils.RESULT_DIR, 'debug-'+datetime.strftime(datetime.now(), '%Y%m%d%H%M%S'))
    else:
        result_dir = os.path.join(utils.RESULT_DIR, datetime.strftime(datetime.now(), '%Y%m%d%H%M%S'))
    os.mkdir(result_dir)
    print(f'created: {result_dir}')
    # convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
    #     os.path.join(utils.BERT_MODEL_PATH, 'bert_model.ckpt'),
    #     os.path.join(utils.BERT_MODEL_PATH, 'bert_config.json'),
    #     utils.PYTORCH_BERT_MODEL_PATH)
    train_data = ToxicDataset(mode='train', debug=args.debug)
    test_data = ToxicDataset(mode='test')
    train, test = train_data.data, test_data.data
    train = utils.preprocess_data(train, mode='train')
    test = utils.preprocess_data(test)
    #tokenizer = Tokenizer(num_words=MAX_FEATURES, lower=True)
    tokenizer = BertTokenizer.from_pretrained(utils.BERT_MODEL_PATH, do_lower_case=True)
    X_train, X_test, y_train = utils.run_bert_tokenizer(tokenizer, train, test, seq_len=MAX_LEN)
    #word_index = tokenizer.word_index
    word_index = None
    #print(word_index)
    # print(f'vocab size: {len(word_index)}')
    # embedding_matrix = utils.build_embeddings(word_index, emb_max_feat=EMB_MAX_FEAT)
    # print(embedding_matrix.shape)
    # BERT supplies its own embeddings, so the keras-style matrix is unused
    embedding_matrix = None
    sub_preds, oof_df = utils.run_model_pytorch(result_dir, X_train, X_test, y_train, embedding_matrix, word_index, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, max_len=MAX_LEN, lstm_units=LSTM_UNITS, oof_df=train)
    bias_metrics_df = utils.compute_bias_metrics_for_model(dataset=oof_df, subgroups=utils.IDENTITY_COLS, model=utils.PREDICT_COL, label_col=utils.TOXICITY_COLUMN)
    # NOTE(review): 'socre' is a typo for 'score'; kept since it is a local name only
    validation_final_socre = utils.get_final_metric(bias_metrics_df, utils.calculate_overall_auc(oof_df, utils.TOXICITY_COLUMN))
    print(f'validation final score: {validation_final_socre}')
    utils.submit(result_dir, sub_preds)
    print('finish!!!')
def main():
    """Recursive 28-day demand forecast from a saved LightGBM model; writes and submits the CSV.

    Relies on module-level `configs`, `FEATS_EXCLUDED`, `COLS_TEST1/2`,
    `make_lags`, `submission_file_name` and `submit`.
    """
    # load feathers (one feather per feature group, concatenated column-wise)
    files = sorted(glob('../feats/*.feather'))
    df = pd.concat([pd.read_feather(f) for f in tqdm(files, mininterval=60)], axis=1)
    df = df[configs['features']]
    feats = [f for f in df.columns if f not in FEATS_EXCLUDED]
    # load model
    reg = lgb.Booster(model_file='../output/lgbm_all_data.txt')
    # Recursive prediction: each day's prediction feeds the lag features of later days
    print('Recursive prediction...')
    for day in tqdm(range(1914, 1914 + 28)):
        # 28-day window up to and including `day`, needed by make_lags
        mask_test = (df['d_numeric'] >= day - 28) & (df['d_numeric'] <= day)
        tmp_df = df[mask_test]
        tmp_df = make_lags(tmp_df)
        df.loc[df['d_numeric'] == day, 'demand'] = reg.predict(tmp_df[tmp_df['d_numeric'] == day][feats], num_iteration=reg.best_iteration)
        del tmp_df
        gc.collect()
    # split test
    test_df = df[df['date'] >= '2016-04-25']
    del df
    gc.collect()
    # reshape prediction for submit: one row per id, one column per day
    preds = test_df[['id', 'd', 'demand']].reset_index()
    preds = preds.pivot(index='id', columns='d', values='demand').reset_index()
    # split test1 / test2
    preds1 = preds[['id'] + COLS_TEST1]
    preds2 = preds[['id'] + COLS_TEST2]
    # change column names to the F1..F28 submission format
    preds1.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]
    preds2.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]
    # replace test2 id
    preds2['id'] = preds2['id'].str.replace('_validation', '_evaluation')
    # merge
    preds = preds1.append(preds2)
    # save csv
    preds.to_csv(submission_file_name, index=False)
    # submission by API
    submit(submission_file_name, comment='model301 recursive prediction')
def model_xgb(train , test , flag):
    # Train an XGBoost regressor with a custom smoothed-MAPE objective/metric
    # (Python 2 style). train/test are (features, labels) pairs; flag selects
    # 'online' (predict + submit) or 'offline' (local scoring + importances).
    train_x , train_y = train[0] , train[1]
    test_x , test_y = test[0] , test[1]
    print train_x.shape
    print train_y[2].shape
    print fea_names
    dtrain = xgb.DMatrix(train_x , label = train_y[2].values)
    dtest = xgb.DMatrix(test_x)
    def mapeobj(preds , dtrain):
        # Custom objective: tanh-like saturation of the relative error
        # (k controls steepness), gradient clipped to [-1, 1]; rows with a
        # zero label get zero grad/hess so they are ignored.
        gaps = dtrain.get_label()
        delta = (preds - gaps) / gaps
        k = 6
        e = np.exp(k * delta)
        grad = (e - 1 / e) / (e + 1 / e)
        for i , t in enumerate(delta):
            if t > 1:
                grad[i] = 1
            elif t < -1:
                grad[i] = -1
        grad = grad / gaps
        # NOTE(review): gaps**2 appears in the NUMERATOR here; the derivative
        # of grad w.r.t. preds would put it in the denominator — verify.
        hess = (4 * k * gaps**2) / (e + 1 / e)**2
        for i , t in enumerate(delta):
            if abs(t) > 1:
                hess[i] = 0
        for i , g in enumerate(gaps):
            if g == 0:
                grad[i] = 0
                hess[i] = 0
        return grad , hess
    def evalmape(preds , dtrain):
        # MAPE eval metric; zero-label rows contribute zero error.
        gaps = dtrain.get_label()
        errs = abs(gaps - preds) / gaps
        for i , g in enumerate(gaps):
            if g == 0:
                errs[i] = 0
        err = np.mean(errs)
        return 'error' , err
    if flag == 'online':
        watchlist = [(dtrain , 'train')]
        m_xgb = xgb.train(Params.xgb_reg_params , dtrain , 800 , watchlist , mapeobj , evalmape)
        prd = m_xgb.predict(dtest)
        Util.submit(test_y.values , prd)
    elif flag == 'offline':
        dtest.set_label(test_y[2].values)
        watchlist = [(dtrain , 'train') , (dtest , 'eval')]
        #m_xgb = xgb.train(Params.xgb_reg_params , dtrain , 100 , watchlist)
        m_xgb = xgb.train(Params.xgb_reg_params , dtrain , 800 , watchlist , mapeobj , evalmape)
        prd = m_xgb.predict(dtest)
        #prd = postprocess(train , test_y.values , prd)
        print Util.score(test_y.values , prd)
        # feature importances, most important first
        imp = m_xgb.get_fscore()
        print sorted(imp.items() , key = lambda d:d[1] , reverse = True)
def predict_test(self, method, thresh, method_params):
    """Ensemble per-model test probabilities, threshold them and submit.

    Loads each prediction CSV from self.test_pred_paths, verifies that all
    files cover the same ids, combines the probability vectors with the
    chosen ensembling method, binarizes at `thresh` and submits.

    Bug fix vs. the original: `last_ids` was never updated inside the loop
    (so the consistency check could never fire), and the condition was
    inverted — it raised when the ids DID match instead of when they differed.
    """
    y_preds = []
    last_ids = None
    for pp in self.test_pred_paths:
        ids, y_prob, _ = load_pred_from_csv(pp)
        y_preds.append(y_prob)
        # all prediction files must be aligned on identical ids
        if last_ids is not None and not np.array_equal(last_ids, ids):
            raise Exception('Prediction ids should be the same for ensemble')
        last_ids = ids
    test_ens_prob = methods[method](y_preds, args=method_params)  # target probability after ensembling
    test_ens_label = (test_ens_prob > thresh).astype(int)
    submit(ids, test_ens_label, test_ens_prob)
def model_rf(train , test , flag):
    # Random-forest regressor on the 'gap' target (Python 2 style).
    # flag: 'online' -> postprocess + submit; 'offline' -> local scoring.
    train_x , train_y = train[0] , train[1]
    test_x , test_y = test[0] , test[1]
    rf = RandomForestRegressor()
    rf.set_params(**Params.rf_reg_params)
    print "start training"
    rf.fit(train_x , train_y['gap'].values)
    if flag == 'online':
        prd = rf.predict(test_x)
        prd = postprocess(train , test_y.values , prd)
        Util.submit(test_y.values , prd)
    elif flag == 'offline':
        prd = rf.predict(test_x)
        prd = postprocess(train , test_y.values , prd)
        print 'test : ' , Util.score(test_y.values , prd)
        # NOTE(review): train score below uses raw predictions (no postprocess)
        # and the full train_y frame rather than the 'gap' column — confirm intended.
        prd = rf.predict(train_x)
        print 'train : ' , Util.score(train_y.values , prd)
def submit_task():
    # HTTP endpoint (Python 2): read the task window, owner and topic from the
    # query string, hand the task to submit(), and return its status as JSON.
    # Missing parameters default to the empty string.
    start_ts = request.args.get('start_ts','')
    end_ts = request.args.get('end_ts','')
    submit_user = request.args.get('submit_user','')
    topic = request.args.get('topic','')
    status = submit(topic,start_ts,end_ts,submit_user)
    print status
    return json.dumps(status)
def submit_task():
    # HTTP endpoint (Python 2): same behavior as the other submit_task in this
    # file (PEP 8-spaced duplicate) — reads the task window, owner and topic
    # from the query string, submits the task and returns the status as JSON.
    start_ts = request.args.get('start_ts', '')
    end_ts = request.args.get('end_ts', '')
    submit_user = request.args.get('submit_user', '')
    topic = request.args.get('topic', '')
    status = submit(topic, start_ts, end_ts, submit_user)
    print status
    return json.dumps(status)
def main():
    """Convert day-over-day difference predictions to levels via cumulative sum, clip at zero, save and submit.

    Relies on module-level `submission_file_name` and `submit`.
    """
    # load submission files (predicted daily differences)
    sub = pd.read_csv('../output/submission_lgbm_diff.csv')
    # load train data (provides the last observed level, d_1913)
    df = pd.read_csv('../input/sales_train_validation.csv')
    # to cumsum: F1 = last observed day + predicted diff, then each F{i}
    # accumulates the previous level; negative levels are clipped to 0.
    # NOTE(review): relies on `sub` and `df` sharing row order — confirm.
    sub['F1'] += df['d_1913']
    sub.loc[:, 'F1'] = sub['F1'].where(sub['F1'] > 0, 0)
    for i in range(2, 29):
        sub[f'F{i}'] += sub[f'F{i-1}']
        sub.loc[:, f'F{i}'] = sub[f'F{i}'].where(sub[f'F{i}'] > 0, 0)
    # save csv
    sub.to_csv(submission_file_name, index=False)
    # submission by API
    submit(submission_file_name, comment='model401 weekly prediction')
def model_rf(train , test , flag):
    # Random-forest regressor with on-disk model caching via joblib (Python 2
    # style). flag: 'online' -> submit predictions; 'offline' -> local scoring.
    train_x , train_y = train[0] , train[1]
    test_x , test_y = test[0] , test[1]
    # reuse a previously trained model if one was dumped to disk
    if os.path.exists(configs['rf_model']):
        print "model exists"
        rf = joblib.load(configs['rf_model'])
    else:
        rf = RandomForestRegressor()
        rf.set_params(**Params.rf_reg_params)
        print "start training"
        rf.fit(train_x , train_y[3].values)
        joblib.dump(rf , configs['rf_model'] , compress=3)
    if flag == 'online':
        prd = rf.predict(test_x)
        #prd = postprocess(train , test_y.values , prd)
        Util.submit(test_y.values , prd)
    elif flag == 'offline':
        prd = rf.predict(test_x)
        #prd = postprocess(train , test_y.values , prd)
        print 'test : ', Util.score2(test_y.values , prd)
def output(train_df, test_df, models, model_params, feature_importance_df, train_preds, test_preds, scores, now, model_name):
    """Persist models, params, CV scores, importances, OOF and test predictions, then submit.

    NOTE(review): `test_preds_bin`, `is_debug` and `competition_name` are not
    parameters — they must exist at module level; confirm before reuse.
    """
    score = sum(scores) / len(scores)
    folder_path = make_output_dir(score, now, model_name)
    # pickle each fold model
    for i, m in enumerate(models):
        save2pkl('{0}/model_{1:0=2}.pkl'.format(folder_path, i), m)
    with open('{0}/model_params.json'.format(folder_path), 'w') as f:
        json.dump(model_params, f, indent=4)
    with open('{0}/model_valid_scores.json'.format(folder_path), 'w') as f:
        json.dump({i: s for i, s in enumerate(scores)}, f, indent=4)
    save_importances(feature_importance_df, '{}/importances.png'.format(folder_path), '{}/importance.csv'.format(folder_path))
    # The part below needs adjusting for each competition
    submission_file_name = '{0}/submit_{1:%Y-%m-%d-%H-%M-%S}_{2}.csv'.format(folder_path, now, score)
    test_df.loc[:, 'target'] = test_preds
    test_df.loc[:, 'Outlier_Likelyhood'] = test_preds_bin
    q = test_df['Outlier_Likelyhood'].quantile(.9999)  # 1.0930%
    # rows above the 99.99th percentile of outlier likelihood get target forced to 1
    test_df.loc[:, 'target'] = test_df['Outlier_Likelyhood'].apply(lambda x: 1 if x > q else x)
    test_df = test_df.reset_index()
    test_df[['card_id', 'target']].to_csv(submission_file_name, index=False)
    # save out-of-fold predictions alongside the submission
    train_df.loc[:, 'OOF_PRED'] = train_preds
    train_df = train_df.reset_index()
    train_df[['card_id', 'OOF_PRED']].to_csv('{0}/oof.csv'.format(folder_path), )
    # submit via the API
    if not is_debug:
        submit(competition_name, submission_file_name, comment='user02 cv: %.6f' % score)
# Compile and (optionally) train the two-headed Keras model, then either
# submit decoded test predictions, fit an SVM on the learned feature, or
# just report train/validation scores. Relies on trainX/trainY/validX/validY,
# missing_col, model_path, training/submit/svm flags defined earlier.
model.compile(Adam(1e-3), loss=['mse', 'mse'], loss_weights=[1, 1.7], metrics=[['acc'], []])
if os.path.exists(model_path):
    model.load_weights(model_path)
    print('\033[32;1mLoad Model\033[0m')
plot_model(model, 'model.jpg')
if training:
    checkpoint = ModelCheckpoint(model_path, 'val_loss', verbose=1, save_best_only=True, save_weights_only=True)
    reduce_lr = ReduceLROnPlateau('val_loss', 0.5, 10, verbose=1, min_lr=1e-6)
    early_stopping = EarlyStopping('val_loss', patience=100, restore_best_weights=True)
    logger = CSVLogger(model_path+'.csv', append=True)
    tensorboard = TensorBoard(model_path[:model_path.rfind('.')]+'_logs', batch_size=1024, update_freq='epoch')
    model.fit(trainX, [trainY, missing_col], batch_size=128, epochs=500, validation_data=(validX, [validY, valid_missing_col]), verbose=2, callbacks=[checkpoint, reduce_lr, logger, tensorboard, early_stopping])
if submit:
    # predict word indices for the test set and submit the decoded words
    testX = utils.load_test_data(submit)
    Y = np.array(list(map(idx2word.get, np.argmax(model.predict(testX, batch_size=1024)[0], axis=-1))))
    utils.submit(Y)
elif svm:
    # reuse the model's auxiliary output as feature column 0 for an SVM
    X, Y = utils.load_train_data(data_path)
    Y = np.array(list(map(word2idx.get, Y.ravel())))
    trainX = X[:, 1:]
    _, f1 = model.predict(trainX, batch_size=1024)
    X[:, 0] = f1.ravel()
    clf_svm(X, Y, save_model=svm)
else:
    model.load_weights(model_path)
    print(f'\n\033[32;1mTraining score: {model.evaluate(trainX, [trainY, missing_col], verbose=0, batch_size=1024)}')
    print(f'Validation Score: {model.evaluate(validX, [validY, valid_missing_col], verbose=0, batch_size=1024)}\033[0m')
# Launcher script for the datatrans.PreprocPerPatSeriesNearestRoad step.
# CLI: cache_dir host_name time_series environmental_data output_dir [extra args...]
import os
import sys
from utils import submit
cache_dir = sys.argv[1]
host_name = sys.argv[2]
time_series = sys.argv[3]
environmental_data = sys.argv[4]
output_dir = sys.argv[5]
# any extra CLI arguments are forwarded verbatim
submit(host_name, cache_dir, "datatrans.PreprocPerPatSeriesNearestRoad",
       "--patgeo_data=" + time_series,
       "--output_file=" + output_dir,
       "--nearestroad_data=" + environmental_data,
       *sys.argv[6:])
def mk_submit():
    """Train LightGBM with CV over LOOP seeds, rank-average predictions, write and optionally submit.

    Relies on module-level loader, param, LOOP, NFOLD, SUBMIT_FILE_PATH,
    EXE_SUBMIT, COMMENT and the `ex` helper module.
    """
    # =============================================================================
    # load
    # =============================================================================
    # train; drop the f702_* feature group
    X_train = loader.train()
    col = [c for c in X_train.columns if c.startswith('f702_')]
    X_train.drop(col, axis=1, inplace=True)
    y_train = utils.read_pickles('../data/label').TARGET
    # keep a head sample of the design matrix next to the submission for traceability
    X_train.head().to_csv(SUBMIT_FILE_PATH.replace('.csv', '_X.csv'), index=False, compression='gzip')
    if X_train.columns.duplicated().sum() > 0:
        raise Exception(f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')
    gc.collect()
    CAT = list(set(X_train.columns) & set(loader.category()))
    print('CAT :', CAT)
    COL = X_train.columns.tolist()
    # test (same column order as train)
    X_test = loader.test()[COL]
    # =============================================================================
    # training with cv
    # =============================================================================
    dtrain = lgb.Dataset(X_train, y_train, categorical_feature=CAT, free_raw_data=False)
    model_all = []
    y_pred = pd.Series(0, index=y_train.index)
    for i in range(LOOP):
        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param, dtrain, 9999, nfold=NFOLD, early_stopping_rounds=100, verbose_eval=50, seed=i)
        model_all += models
        # accumulate rank-transformed OOF predictions across seed loops
        y_pred += ex.eval_oob(X_train, y_train, models, i).rank()
        auc_mean = roc_auc_score(y_train, y_pred)
        result = f"CV auc-mean(loop {i}): {auc_mean} {ret['auc-mean'][-1]}"
        print(result)
        utils.send_line(result)
    y_pred /= y_pred.max()
    auc_mean = roc_auc_score(y_train, y_pred)
    result = f"CV auc-mean: {auc_mean}"
    print(result)
    utils.send_line(result)
    # =============================================================================
    # predict
    # =============================================================================
    sub = pd.read_pickle('../data/sub.p')
    gc.collect()
    label_name = 'TARGET'
    sub[label_name] = 0
    # rank-average all fold models' test predictions, then normalize to [0, 1]
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()
    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)
    sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')
    # =============================================================================
    # submission
    # =============================================================================
    if EXE_SUBMIT:
        print('submit')
        utils.submit(SUBMIT_FILE_PATH, COMMENT)
# Build the submission: predict with the trained XGBoost model, rank-normalize
# the scores, write the gzip CSV and optionally submit. Relies on X, model,
# train_head, SUBMIT_FILE_PATH and EXE_SUBMIT defined earlier in the file.
sub = X[['click_id']]
sub.click_id = sub.click_id.map(int)
X.drop('click_id', axis=1, inplace=True)
X.fillna(-1, inplace=True)
# align test columns with the training design matrix
dtest = xgb.DMatrix(X[train_head.columns])
del X; gc.collect()
sub['is_attributed'] = 0
y_pred = model.predict(dtest)
sub['is_attributed'] += pd.Series(y_pred).rank()
#sub['is_attributed'] /= LOOP
# scale ranks into (0, 1]
sub['is_attributed'] /= sub['is_attributed'].max()
sub['click_id'] = sub.click_id.map(int)
sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')
# =============================================================================
# submission
# =============================================================================
if EXE_SUBMIT:
    utils.submit(SUBMIT_FILE_PATH)
#==============================================================================
utils.end(__file__)
improved my scores. """ FILE_out = '../output/1022-1_Giba-post2.csv.gz' sub.loc[sub.object_id.isin(oid_gal), 'class_99'] = 0.017 sub.loc[sub.object_id.isin(oid_exgal), 'class_99'] = 0.17 sub.loc[sub.object_id.isin(oid_gal), [f'class_{i}' for i in utils.classes_exgal]] = 0 sub.loc[sub.object_id.isin(oid_exgal), [f'class_{i}' for i in utils.classes_gal]] = 0 sub.to_csv(FILE_out, index=False, compression='gzip') utils.submit(FILE_out, '0.017 and 0.17') # ============================================================================= # yuval post # ============================================================================= """ yuval says I believe a prerequisite for predicting Class 99 is having a very good model for the other classes. The good new is that a score of ~1.0 is achievable without really dealing with class 99. The only thing I did up to this point with this prediction is: class_99=np.where(other_classes.max>0.9 , 0.01, 0.1) [it is slightly better then uniform, If I use 0.8 the score degrades, and also other values of probabilities degrade the score]
# Attach the true target to the test frame, persist validation / stacking /
# submission artifacts, and optionally submit via the API. Relies on train,
# test, manage, feim, cv_score and the various naming variables defined above.
test[target] = manage.base_test[target].values
stack_cols = [key, target, pred_col]
# stacked OOF + test predictions for downstream ensembling
df_stack = pd.concat([train[stack_cols], test[stack_cols]], ignore_index=True, axis=0)
#========================================================================
# Saving
feim.to_csv(f'../valid/{start_time[4:12]}_valid_{model_type}_SET-{set_type}_feat{n_feature}_{comment}_CV{str(cv_score)[:7]}_LB.csv', index=True)
utils.to_pkl_gzip(obj=df_stack, path=f'../stack/{start_time[4:12]}_stack_{model_type}_SET-{set_type}_feat{n_feature}_{comment}_CV{str(cv_score)[:7]}_LB')
# write predictions onto the sample submission (index-aligned on `key`)
submit = pd.read_csv('../input/sample_submission.csv').set_index(key)
submit[target] = test[pred_col].values
submit_path = f'../submit/{start_time[4:12]}_submit_{model_type}_SET-{set_type}_feat{n_feature}_{comment}_CV{str(cv_score)[:7]}_LB.csv'
submit.to_csv(submit_path, index=True)
if is_submit:
    utils.submit(file_path=submit_path, comment=comment, COMPETITION_NAME=COMPETITION_NAME)
    # archive the submitted file
    shutil.move(submit_path, '../log_submit/')
#========================================================================
def DO(frm, to, fileno):
    """Build click-fraud features for train rows [frm, to), train LightGBM and predict.

    Loads the train slice and the full test set, engineers group-count /
    variance / mean / nunique / cumcount features plus a 'nextClick' time
    delta, trains via lgb_modelfit_nocv and writes/submits sub_it{fileno}.
    Relies on module-level `debug`, `val_size`, `lgb_modelfit_nocv`, `utils`.
    Returns the submission DataFrame.
    """
    # compact dtypes to keep the concatenated frame in memory
    dtypes = {
        'ip': 'uint32',
        'app': 'uint16',
        'device': 'uint16',
        'os': 'uint16',
        'channel': 'uint16',
        'is_attributed': 'uint8',
        'click_id': 'uint32',
    }
    print('loading train data...', frm, to)
    train_df = pd.read_csv("../input/train.csv.zip", parse_dates=['click_time'], skiprows=range(1, frm), nrows=to - frm, dtype=dtypes, usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed'])
    print('loading test data...')
    if debug:
        test_df = pd.read_csv("../input/test.csv.zip", nrows=100000, parse_dates=['click_time'], dtype=dtypes, usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id'])
    else:
        test_df = pd.read_csv("../input/test.csv.zip", parse_dates=['click_time'], dtype=dtypes, usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id'])
    # stack train + test so group features are computed consistently; len_train
    # remembers where to split them apart again
    len_train = len(train_df)
    train_df = train_df.append(test_df)
    del test_df
    gc.collect()
    print('Extracting new features...')
    train_df['hour'] = pd.to_datetime(train_df.click_time).dt.hour.astype('uint8')
    train_df['day'] = pd.to_datetime(train_df.click_time).dt.day.astype('uint8')
    gc.collect()
    # X0..X8: group features; QQ selects the aggregation
    # (0=count, 1=mean, 2=var, 3=skew, 4=nunique, 5=cumcount), cached to CSV
    naddfeat = 9
    for i in range(0, naddfeat):
        if i == 0:
            selcols = ['ip', 'channel']
            QQ = 4
        if i == 1:
            selcols = ['ip', 'device', 'os', 'app']
            QQ = 5
        if i == 2:
            selcols = ['ip', 'day', 'hour']
            QQ = 4
        if i == 3:
            selcols = ['ip', 'app']
            QQ = 4
        if i == 4:
            selcols = ['ip', 'app', 'os']
            QQ = 4
        if i == 5:
            selcols = ['ip', 'device']
            QQ = 4
        if i == 6:
            selcols = ['app', 'channel']
            QQ = 4
        if i == 7:
            selcols = ['ip', 'os']
            QQ = 5
        if i == 8:
            selcols = ['ip', 'device', 'os', 'app']
            QQ = 4
        print('selcols', selcols, 'QQ', QQ)
        filename = 'X%d_%d_%d.csv' % (i, frm, to)
        if os.path.exists(filename):
            # reuse the cached feature (cumcount files have no header/key columns)
            if QQ == 5:
                gp = pd.read_csv(filename, header=None)
                train_df['X' + str(i)] = gp
            else:
                gp = pd.read_csv(filename)
                train_df = train_df.merge(gp, on=selcols[0:len(selcols) - 1], how='left')
        else:
            # group by all but the last selcol and aggregate the last one
            if QQ == 0:
                gp = train_df[selcols].groupby(by=selcols[0:len(selcols)-1])[selcols[len(selcols)-1]].count().reset_index().\
                    rename(index=str, columns={selcols[len(selcols)-1]: 'X'+str(i)})
                train_df = train_df.merge(gp, on=selcols[0:len(selcols) - 1], how='left')
            if QQ == 1:
                gp = train_df[selcols].groupby(by=selcols[0:len(selcols)-1])[selcols[len(selcols)-1]].mean().reset_index().\
                    rename(index=str, columns={selcols[len(selcols)-1]: 'X'+str(i)})
                train_df = train_df.merge(gp, on=selcols[0:len(selcols) - 1], how='left')
            if QQ == 2:
                gp = train_df[selcols].groupby(by=selcols[0:len(selcols)-1])[selcols[len(selcols)-1]].var().reset_index().\
                    rename(index=str, columns={selcols[len(selcols)-1]: 'X'+str(i)})
                train_df = train_df.merge(gp, on=selcols[0:len(selcols) - 1], how='left')
            if QQ == 3:
                gp = train_df[selcols].groupby(by=selcols[0:len(selcols)-1])[selcols[len(selcols)-1]].skew().reset_index().\
                    rename(index=str, columns={selcols[len(selcols)-1]: 'X'+str(i)})
                train_df = train_df.merge(gp, on=selcols[0:len(selcols) - 1], how='left')
            if QQ == 4:
                gp = train_df[selcols].groupby(by=selcols[0:len(selcols)-1])[selcols[len(selcols)-1]].nunique().reset_index().\
                    rename(index=str, columns={selcols[len(selcols)-1]: 'X'+str(i)})
                train_df = train_df.merge(gp, on=selcols[0:len(selcols) - 1], how='left')
            if QQ == 5:
                gp = train_df[selcols].groupby(by=selcols[0:len(selcols) - 1])[selcols[len(selcols) - 1]].cumcount()
                train_df['X' + str(i)] = gp.values
            if not debug:
                gp.to_csv(filename, index=False)
        del gp
        gc.collect()
    # nextClick: seconds until the SAME (ip, app, device, os) combination clicks
    # again, computed by scanning the frame in reverse with a hash buffer
    print('doing nextClick')
    predictors = []
    new_feature = 'nextClick'
    filename = 'nextClick_%d_%d.csv' % (frm, to)
    if os.path.exists(filename):
        print('loading from save file')
        QQ = pd.read_csv(filename).values
    else:
        D = 2**26
        # NOTE(review): hashing into 2**26 buckets can collide across distinct
        # combinations; accepted as an approximation in the original.
        train_df['category'] = (train_df['ip'].astype(str) + "_" + train_df['app'].astype(str) + "_" + train_df['device'].astype(str) \
            + "_" + train_df['os'].astype(str)).apply(hash) % D
        # sentinel 3000000000 (~far future) marks "no later click seen yet"
        click_buffer = np.full(D, 3000000000, dtype=np.uint32)
        train_df['epochtime'] = train_df['click_time'].astype(np.int64) // 10**9
        next_clicks = []
        for category, t in zip(reversed(train_df['category'].values), reversed(train_df['epochtime'].values)):
            next_clicks.append(click_buffer[category] - t)
            click_buffer[category] = t
        del (click_buffer)
        QQ = list(reversed(next_clicks))
        if not debug:
            print('saving')
            pd.DataFrame(QQ).to_csv(filename, index=False)
    train_df[new_feature] = QQ
    predictors.append(new_feature)
    train_df[new_feature + '_shift'] = pd.DataFrame(QQ).shift(+1).values
    predictors.append(new_feature + '_shift')
    del QQ
    gc.collect()
    print('grouping by ip-day-hour combination...')
    gp = train_df[['ip', 'day', 'hour', 'channel']].groupby(by=['ip', 'day', 'hour'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_tcount'})
    train_df = train_df.merge(gp, on=['ip', 'day', 'hour'], how='left')
    del gp
    gc.collect()
    print('grouping by ip-app combination...')
    gp = train_df[['ip', 'app', 'channel']].groupby(by=['ip', 'app'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_count'})
    train_df = train_df.merge(gp, on=['ip', 'app'], how='left')
    del gp
    gc.collect()
    print('grouping by ip-app-os combination...')
    gp = train_df[['ip', 'app', 'os', 'channel']].groupby(by=['ip', 'app', 'os'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_os_count'})
    train_df = train_df.merge(gp, on=['ip', 'app', 'os'], how='left')
    del gp
    gc.collect()
    # Adding features with var and mean hour (inspired from nuhsikander's script)
    print('grouping by : ip_day_chl_var_hour')
    gp = train_df[['ip', 'day', 'hour', 'channel']].groupby(by=['ip', 'day', 'channel'])[['hour']].var().reset_index().rename(index=str, columns={'hour': 'ip_tchan_count'})
    train_df = train_df.merge(gp, on=['ip', 'day', 'channel'], how='left')
    del gp
    gc.collect()
    print('grouping by : ip_app_os_var_hour')
    gp = train_df[['ip', 'app', 'os', 'hour']].groupby(by=['ip', 'app', 'os'])[['hour']].var().reset_index().rename(index=str, columns={'hour': 'ip_app_os_var'})
    train_df = train_df.merge(gp, on=['ip', 'app', 'os'], how='left')
    del gp
    gc.collect()
    print('grouping by : ip_app_channel_var_day')
    gp = train_df[['ip', 'app', 'channel', 'day']].groupby(by=['ip', 'app', 'channel'])[['day']].var().reset_index().rename(index=str, columns={'day': 'ip_app_channel_var_day'})
    train_df = train_df.merge(gp, on=['ip', 'app', 'channel'], how='left')
    del gp
    gc.collect()
    print('grouping by : ip_app_chl_mean_hour')
    gp = train_df[['ip', 'app', 'channel', 'hour']].groupby(by=['ip', 'app', 'channel'])[['hour']].mean().reset_index().rename(index=str, columns={'hour': 'ip_app_channel_mean_hour'})
    print("merging...")
    train_df = train_df.merge(gp, on=['ip', 'app', 'channel'], how='left')
    del gp
    gc.collect()
    print("vars and data type: ")
    train_df.info()
    train_df['ip_tcount'] = train_df['ip_tcount'].astype('uint16')
    train_df['ip_app_count'] = train_df['ip_app_count'].astype('uint16')
    train_df['ip_app_os_count'] = train_df['ip_app_os_count'].astype('uint16')
    target = 'is_attributed'
    predictors.extend(['app', 'device', 'os', 'channel', 'hour', 'day', 'ip_tcount', 'ip_tchan_count', 'ip_app_count', 'ip_app_os_count', 'ip_app_os_var', 'ip_app_channel_var_day', 'ip_app_channel_mean_hour'])
    categorical = ['app', 'device', 'os', 'channel', 'hour', 'day']
    for i in range(0, naddfeat):
        predictors.append('X' + str(i))
    print('predictors', predictors)
    # split the stacked frame back into train / validation / test
    test_df = train_df[len_train:]
    val_df = train_df[(len_train - val_size):len_train]
    train_df = train_df[:(len_train - val_size)]
    print("train size: ", len(train_df))
    print("valid size: ", len(val_df))
    print("test size : ", len(test_df))
    sub = pd.DataFrame()
    sub['click_id'] = test_df['click_id'].astype('int')
    gc.collect()
    print("Training...")
    start_time = time.time()
    params = {
        'learning_rate': 0.20,
        #'is_unbalance': 'true', # replaced with scale_pos_weight argument
        'num_leaves': 7,  # 2^max_depth - 1
        'max_depth': 3,  # -1 means no limit
        'min_child_samples': 100,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 100,  # Number of bucketed bin for feature values
        'subsample': 0.7,  # Subsample ratio of the training instance.
        'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
        'colsample_bytree': 0.9,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'scale_pos_weight': 200  # because training data is extremely unbalanced
    }
    (bst, best_iteration) = lgb_modelfit_nocv(params, train_df, val_df, predictors, target, objective='binary', metrics='auc', early_stopping_rounds=30, verbose_eval=True, num_boost_round=1000, categorical_features=categorical)
    print('[{}]: model training time'.format(time.time() - start_time))
    del train_df
    del val_df
    gc.collect()
    # print('Plot feature importances...')
    # ax = lgb.plot_importance(bst, max_num_features=100)
    # plt.show()
    print("Predicting...")
    sub['is_attributed'] = bst.predict(test_df[predictors], num_iteration=best_iteration)
    if not debug:
        print("writing...")
        sub.to_csv('sub_it%d.csv.gz' % (fileno), index=False, compression='gzip')
        utils.submit('sub_it%d.csv.gz' % (fileno))
    print("done...")
    return sub
# Launcher script for the datatrans.PreprocPerPatSeriesACS2 step.
# CLI: cache_dir host_name time_series environmental_data acs_data output_dir [extra args...]
import os
import sys
from utils import submit
cache_dir = sys.argv[1]
host_name = sys.argv[2]
time_series = sys.argv[3]
environmental_data = sys.argv[4]
acs_data = sys.argv[5]
output_dir = sys.argv[6]
# any extra CLI arguments are forwarded verbatim
submit(host_name, cache_dir, "datatrans.PreprocPerPatSeriesACS2",
       "--patgeo_data=" + time_series,
       "--output_file=" + output_dir,
       "--geoid_data=" + environmental_data,
       "--acs_data=" + acs_data,
       *sys.argv[7:])
else: out_pred = df_pred[df_pred[key].isin( out_ids)]['pred_mean'].values out_score = np.sqrt(mean_squared_error(out_val, out_pred)) else: out_score = 0 else: out_score = 0 #======================================================================== # Submission test_pred = seed_pred / len(seed_list) submit[target] = test_pred submit_path = f'../submit/{start_time[4:12]}_submit_{model_type}_lr{learning_rate}_{feature_num}feats_{len(seed_list)}seed_{num_leaves}leaves_iter{iter_avg}_OUT{str(out_score)[:7]}_CV{cv_score}_LB.csv' submit.to_csv(submit_path, index=False) if go_submit: import shutil comment = sys.argv[1] if len(comment): try: lb_pb = utils.submit(file_path=submit_path, comment=comment) except IndexError: lb_pb = utils.submit(file_path=submit_path) shutil.move(submit, '../log_submit/') submit_path = submit_path.replace('LB', f'LB{lb_pb[0]}') submit.to_csv(submit_path, index=False) #========================================================================
def mk_submit():
    """Train LightGBM with user-level GroupKFold CV, rank-average test predictions and optionally submit.

    Relies on module-level loader, features, param, LOOP, NFOLD,
    SUBMIT_FILE_PATH, EXE_SUBMIT and COMMENT.
    """
    files_tr = ('../feature/train_' + features + '.f').tolist()
    files_te = ('../feature/test_' + features + '.f').tolist()
    # =============================================================================
    # load
    # =============================================================================
    # train: base frame plus per-feature feather files, concatenated column-wise
    X_train = loader.train()
    X_train_ = pd.concat([pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)], axis=1)
    X_train = pd.concat([X_train, X_train_], axis=1)
    y_train = utils.read_pickles('../data/label').TARGET
    # keep a head sample of the design matrix next to the submission for traceability
    X_train.head().to_csv(SUBMIT_FILE_PATH.replace('.csv', '_X.csv'), index=False, compression='gzip')
    if X_train.columns.duplicated().sum() > 0:
        raise Exception(f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')
    gc.collect()
    CAT = list(set(X_train.columns) & set(loader.category()))
    COL = X_train.columns.tolist()
    # test (same column order as train)
    X_test = loader.test()
    X_test_ = pd.concat([pd.read_feather(f) for f in tqdm(files_te, mininterval=60)], axis=1)
    X_test = pd.concat([X_test, X_test_], axis=1)[COL]
    # =============================================================================
    # groupKfold: group rows by user so one user never spans train and valid folds
    # =============================================================================
    sk_tbl = pd.read_csv('../data/user_id_v8.csv.gz')  # TODO: check
    user_tbl = sk_tbl.user_id.drop_duplicates().reset_index(drop=True).to_frame()
    sub_train = pd.read_csv('../input/application_train.csv.zip', usecols=['SK_ID_CURR']).set_index('SK_ID_CURR')
    sub_train['y'] = y_train.values
    group_kfold = GroupKFold(n_splits=NFOLD)
    # =============================================================================
    # training with cv
    # =============================================================================
    model_all = []
    auc_mean = 0
    for i in range(LOOP):
        dtrain = lgb.Dataset(X_train, y_train, categorical_feature=CAT, free_raw_data=False)
        # shuffle fold ids: give each user a random fold group for this loop
        ids = list(range(user_tbl.shape[0]))
        np.random.shuffle(ids)
        user_tbl['g'] = np.array(ids) % NFOLD
        sk_tbl_ = pd.merge(sk_tbl, user_tbl, on='user_id', how='left').set_index('SK_ID_CURR')
        sub_train['g'] = sk_tbl_.g
        folds = group_kfold.split(X_train, sub_train['y'], sub_train['g'])
        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param, dtrain, 9999, folds=folds, early_stopping_rounds=100, verbose_eval=50, seed=i)
        model_all += models
        auc_mean += ret['auc-mean'][-1]
    auc_mean /= LOOP
    result = f"CV auc-mean({COMMENT}): {auc_mean}"
    print(result)
    utils.send_line(result)
    # =============================================================================
    # predict
    # =============================================================================
    sub = pd.read_pickle('../data/sub.p')
    gc.collect()
    label_name = 'TARGET'
    sub[label_name] = 0
    # rank-average all fold models' test predictions, then normalize to [0, 1]
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()
    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)
    sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')
    # =============================================================================
    # submission
    # =============================================================================
    if EXE_SUBMIT:
        print('submit')
        utils.submit(SUBMIT_FILE_PATH, COMMENT)
# Optionally train the multi-output Keras imputation model, then either submit,
# fit an SVM on the reconstructed columns, or just report scores. Relies on
# model, trainX/trainY/validX/validY, missing_col, model_path and the
# training/submit/svm flags defined earlier in the file.
if os.path.exists(model_path):
    model.load_weights(model_path)
    print('\033[32;1mLoad Model\033[0m')
plot_model(model, 'model.jpg')
if training:
    checkpoint = ModelCheckpoint(model_path, 'val_loss', verbose=1, save_best_only=True, save_weights_only=True)
    reduce_lr = ReduceLROnPlateau('val_loss', 0.5, 10, verbose=1, min_lr=1e-6)
    logger = CSVLogger(model_path+'.csv', append=True)
    tensorboard = TensorBoard(model_path[:model_path.rfind('.')]+'_logs', batch_size=1024, update_freq='epoch')
    model.fit(trainX, [trainY, missing_col[:, 0], missing_col[:, 1], missing_col[:, 2]], batch_size=256, epochs=100, validation_data=(validX, [validY, valid_missing_col[:, 0], valid_missing_col[:, 1], valid_missing_col[:, 2]]), verbose=2, callbacks=[checkpoint, reduce_lr, logger, tensorboard])
if submit:
    # NOTE(review): `I` and `out` must come from the model-building code above
    # this chunk — confirm they are in scope.
    out = tf.cast(out*2, tf.int32)
    submit_model = Model(I, out)
    utils.submit(submit_model, submit)
elif svm:
    # replace columns 1/6/11 with the model's reconstructed outputs before SVM
    X, Y = utils.load_train_data(data_path)
    Y = Y.astype(int)
    trainX = np.delete(X, [1, 6, 11], axis=1)
    _, f2, f7, f12 = model.predict(trainX, batch_size=1024)
    X[:, 1] = f2.ravel()
    X[:, 6] = f7.ravel()
    X[:, 11] = f12.ravel()
    clf_svm(X, Y, save_model=svm)
else:
    model.load_weights(model_path)
    print(f'\n\033[32;1mTraining score: {model.evaluate(trainX, [trainY, missing_col[:, 0], missing_col[:, 1], missing_col[:, 2]], verbose=0)}')
    print(f'Validation Score: {model.evaluate(validX, [validY, valid_missing_col[:, 0], valid_missing_col[:, 1], valid_missing_col[:, 2]], verbose=0)}\033[0m')
import sys

from utils import submit

# Launcher for the datatrans.PreprocFIHR Spark job.
#
# Fix: the original script never defined `cache_dir` (NameError at the
# submit() call) and its argv handling was mangled — the starred unpack
# already captured the extras, yet `args.extend(sys.argv[6:])` appended
# them a second time. Unpack everything in one statement instead, with
# cache_dir first to match the sibling launchers (PreprocCSVTable,
# PreprocDailyEnvData).
#
# argv layout: cache_dir host_name input_dir resc_types skip_preproc output_dir [extras...]
cache_dir, host_name, input_dir, resc_types, skip_preproc, output_dir, *args = sys.argv[1:]

submit(host_name, cache_dir, "datatrans.PreprocFIHR",
       "--input_dir=" + input_dir,
       "--resc_types=" + resc_types,
       "--skip_preproc=" + skip_preproc,
       "--output_dir=" + output_dir,
       *args)
# Truncate every split to a whole number of batches so model.fit with
# batch_size=size never sees a partial batch.
# NOTE(review): `x_train`, `y_train`, `x_test`, `y_test`, `size`, `epochs`,
# `parseLabel`, `zsl_res`, `attr_acc`, `emb_acc`, `callbacks` and `submit`
# are defined earlier in this file (outside this excerpt) — confirm.
x_train = x_train[:size * int(len(x_train) / size)]
y_train = y_train[:size * int(len(y_train) / size)]
x_test = x_test[:size * int(len(x_test) / size)]
y_test = y_test[:size * int(len(y_test) / size)]
# print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

# Center both splits with the *training* mean (the same mean must be
# applied at inference time, hence it is passed to submit() below).
x_train_mean = np.mean(x_train, axis=0)
x_train -= x_train_mean
x_test -= x_train_mean

# Convert raw labels into the (attribute, embedding) targets the
# zero-shot model expects.
print("start parseLabel")
y_train, y_test = parseLabel(y_train, y_test)
print("end parseLabel")

# model = resnet_v1(depth=32, num_classes=num_classes, metrics='acc')
model = zsl_res(metrics={"attr_out": attr_acc, "emb_out": emb_acc})
# model = res_pretrain_finetune()
model.summary()

# Warm-start from epoch-50 weights of the pretrain/finetune run.
model.load_weights(
    "../data/saved_models_zsl/zsl_model_res_pretrain_finetune.050.h5",
    by_name=False)
# model = load_model("weights/my1/w1.h5")

model.fit(x_train, y_train,
          batch_size=size,
          epochs=epochs,
          validation_data=(x_test, y_test),
          shuffle=True,
          callbacks=callbacks())
model.save("../data/saved_models_zsl/zsl_model_res_pretrain_finetune2.h5")

# The submission helper needs the training mean to preprocess test images.
submit(model, x_train_mean)
def _path(used, adjacency):
    """Grow one path through a randomly chosen unvisited vertex.

    A root is drawn uniformly from the vertices not yet marked in
    ``used``; two independent DFS walks from that root are glued
    back-to-back (the first reversed, its duplicated root dropped).
    """
    candidates = [v for v in range(len(adjacency)) if not used[v]]
    root = np.random.choice(candidates)
    left = _dfs(root, used, adjacency)
    right = _dfs(root, used, adjacency)
    left.reverse()
    # left ends at root and right starts at it — drop one copy.
    return left[:-1] + right


def solve(adjacency):
    """Cover every vertex of the graph with paths; return the visit order."""
    used = np.zeros(len(adjacency), dtype=bool)
    order = []
    with tqdm.tqdm(total=len(adjacency)) as bar:
        while not used.all():
            segment = _path(used, adjacency)
            order += segment
            bar.update(len(segment))
    return order


if __name__ == "__main__":
    filename = input_files[1]
    alignments, tags = utils.read_input(filename)
    graph = convert_to_graph(tags)
    degree = list(map(len, graph))
    print(max(degree), min(degree), sum(degree) / len(degree))
    solution = solve(graph)
    print("Total Score for %s: %s" % (filename, utils.score(filename, solution)))
    utils.submit(filename, solution)
import os
import sys

from utils import submit

# Launcher for the datatrans.PreprocCSVTable Spark job.
# argv layout: cache_dir host_name patient_dir environment_dir
#              input_files deidentify output_dir [extras...]
(cache_dir, host_name, patient_dir, environment_dir,
 input_files, deidentify, output_dir) = sys.argv[1:8]

submit(host_name, cache_dir, "datatrans.PreprocCSVTable",
       "--patient_directory=" + patient_dir,
       "--environment_directory=" + environment_dir,
       "--input_files=" + input_files,
       "--deidentify=" + deidentify,
       "--output_directory=" + output_dir,
       *sys.argv[8:])
import sys

from utils import submit

# Launcher for the tic.Transform Spark job.
# argv layout: host cache_dir [transform args...]
host = sys.argv[1]
cache_dir = sys.argv[2]

submit(host, cache_dir, "tic.Transform", *sys.argv[3:])
import sys

from utils import submit

# Launcher for the datatrans.PreprocDailyEnvData Spark job.
# argv layout: cache_dir host_name data_dir year
cache_dir = sys.argv[1]
host_name = sys.argv[2]
data_dir = sys.argv[3]  # renamed from `dir`: don't shadow the builtin
year = sys.argv[4]

submit(host_name, cache_dir, "datatrans.PreprocDailyEnvData",
       "--input_directory={0}/cmaq{1}".format(data_dir, year),
       "--output_prefix={0}/cmaq{1}/".format(data_dir, year))
import numpy as np
import pandas as pd

import utils

EXE_SUBMIT = True

SEED = 71
np.random.seed(SEED)

FILE_in = '../output/matsuken-875_onodera-884_taguchi-888u_akiyama-889u.csv.gz'
FILE_out = '../output/LB839_c99_uniform.csv.gz'
COMMENT = 'np.random.uniform(1.2, 1.4)'

# Boost the class_99 probability by a per-row random factor in [1.2, 1.4),
# then renormalize each row so the class probabilities sum to one again.
sub = pd.read_csv(FILE_in)
sub.class_99 *= np.random.uniform(1.2, 1.4, size=sub.shape[0])

probs = sub.iloc[:, 1:]
sub.iloc[:, 1:] = probs.values / probs.sum(1).values[:, None]

sub.to_csv(FILE_out, index=False, compression='gzip')

# =============================================================================
# submission
# =============================================================================
if EXE_SUBMIT:
    print('submit')
    utils.submit(FILE_out, COMMENT)
def main():
    """Train an LGBM cross-prediction model on the home-credit feature set,
    optionally compute per-feature X-RAY curves, persist stacking artifacts,
    and write/submit the prediction csv.

    NOTE(review): relies on module-level globals defined outside this
    excerpt — `win_path`, `target`, `key`, `params`, `logger`, `model_type`,
    `ignore_list`, `num_boost_round`, `early_stopping_rounds`, `start_time`,
    `fname`, `learning_rate`, `xray`, `stack_name`, `HOME` — confirm.
    """
    #========================================================================
    # Data Load
    #========================================================================
    base = utils.read_df_pkl('../input/base_app*')
    win_path_list = glob.glob(win_path)
    # Partition the feature files into train/test by filename.
    train_path_list = []
    test_path_list = []
    for path in win_path_list:
        if path.count('train'):
            train_path_list.append(path)
        elif path.count('test'):
            test_path_list.append(path)

    # Rows with a null target are the test set.
    base_train = base[~base[target].isnull()].reset_index(drop=True)
    base_test = base[base[target].isnull()].reset_index(drop=True)

    # Load all feature files in parallel and concatenate column-wise
    # onto the base frames.
    train_feature_list = utils.pararell_load_data(path_list=train_path_list)
    test_feature_list = utils.pararell_load_data(path_list=test_path_list)
    train = pd.concat(train_feature_list, axis=1)
    train = pd.concat([base_train, train], axis=1)
    test = pd.concat(test_feature_list, axis=1)
    test = pd.concat([base_test, test], axis=1)

    ir_list = [col for col in test.columns if col.count('ir_')]
    # test[ir_list] = test[ir_list] + 0.005
    # train['CNT_PAYMENT@'] = train['CNT_PAYMENT@'].where( train['CNT_PAYMENT@']<=39, np.nan)
    # test['CNT_PAYMENT@'] = test['CNT_PAYMENT@'].where( test['CNT_PAYMENT@']<=39, np.nan)

    # for experiments
    # df = utils.read_df_pkl('../input/clean_app*').sample(50000)
    # train = df[df[target]>=0]
    # test = df[df[target].isnull()]

    # CV configuration.
    metric = 'auc'
    fold = 5
    fold_type = 'stratified'
    group_col_name = ''
    dummie = 1
    oof_flg = True
    LGBM = lgb_ex(logger=logger, metric=metric, model_type=model_type, ignore_list=ignore_list)

    # data_check may flag columns to drop (constant / leaky / ignored).
    train, test, drop_list = LGBM.data_check(train=train, test=test, target=target)
    if len(drop_list):
        train.drop(drop_list, axis=1, inplace=True)
        test.drop(drop_list, axis=1, inplace=True)

    #========================================================================
    # Train & Prediction Start
    #========================================================================
    LGBM = LGBM.cross_prediction(train=train,
                                 test=test,
                                 key=key,
                                 target=target,
                                 fold_type=fold_type,
                                 fold=fold,
                                 group_col_name=group_col_name,
                                 params=params,
                                 num_boost_round=num_boost_round,
                                 early_stopping_rounds=early_stopping_rounds,
                                 oof_flg=oof_flg)

    #========================================================================
    # Result
    #========================================================================
    cv_score = LGBM.cv_score
    result = LGBM.prediction
    cv_feim = LGBM.cv_feim
    feature_num = len(LGBM.use_cols)

    # Persist the fold-averaged feature importances.
    cv_feim.to_csv(
        f'../valid/{start_time[4:12]}_{model_type}_{fname}_feat{feature_num}_CV{cv_score}_lr{learning_rate}.csv',
        index=False)

    #========================================================================
    # X-RAY computation and output
    # Args:
    #     model    : trained model
    #     train    : dataset the model was trained on
    #     col_list : columns to compute X-RAY for. When not given, every
    #                column of the dataset is used, but considering the
    #                computation time, at most ~30 columns are recommended.
    #========================================================================
    if xray:
        train.reset_index(inplace=True)
        train = train[LGBM.use_cols]
        result_xray = pd.DataFrame()
        N_sample = 500000
        max_point = 30
        for fold_num in range(fold):
            model = LGBM.fold_model_list[fold_num]
            if fold_num == 0:
                xray_obj = Xray_Cal(logger=logger, ignore_list=ignore_list, model=model)
            # One X-RAY pass per fold model; columns are renamed so the
            # per-fold curves can be merged side by side.
            xray_obj, tmp_xray = xray_obj.get_xray(base_xray=train,
                                                   col_list=train.columns,
                                                   fold_num=fold_num,
                                                   N_sample=N_sample,
                                                   max_point=max_point,
                                                   Pararell=True)
            tmp_xray.rename(columns={'xray': f'xray_{fold_num}'}, inplace=True)
            if len(result_xray):
                result_xray = result_xray.merge(tmp_xray.drop('N', axis=1), on=['feature', 'value'], how='inner')
            else:
                result_xray = tmp_xray.copy()
            del tmp_xray
            gc.collect()

        # Average the per-fold X-RAY values and dump the table.
        xray_col = [col for col in result_xray.columns if col.count('xray')]
        result_xray['xray_avg'] = result_xray[xray_col].mean(axis=1)
        result_xray.to_csv(
            f'../output/{start_time[4:10]}_xray_{model_type}_CV{LGBM.cv_score}.csv',
            index=False)
        # X-RAY mode is analysis-only: stop before any submission work.
        sys.exit()

    submit = pd.read_csv('../input/sample_submission.csv')
    # submit = []

    #========================================================================
    # STACKING
    #========================================================================
    if len(stack_name) > 0:
        logger.info(f'result_stack shape: {LGBM.result_stack.shape}')
        # Persist out-of-fold + test predictions for downstream stacking.
        utils.to_pkl(
            path=f"../stack/{start_time[4:12]}_{stack_name}_{model_type}_CV{str(cv_score).replace('.', '-')}_{feature_num}features.fp",
            obj=LGBM.result_stack)
    logger.info(
        f'FEATURE IMPORTANCE PATH: {HOME}/kaggle/home-credit-default-risk/output/cv_feature{feature_num}_importances_auc_{cv_score}.csv'
    )

    #========================================================================
    # Submission
    #========================================================================
    if len(submit) > 0:
        if stack_name == 'add_nest':
            # Nested aggregation: average predictions per key, then align
            # onto the sample submission and fill unmatched rows with 0.
            test[target] = result
            test = test.reset_index()[[key, target]].groupby(key)[target].mean().reset_index()
            submit = submit[key].to_frame().merge(test, on=key, how='left')
            submit[target].fillna(0, inplace=True)
            submit_path = f'../submit/{start_time[4:12]}_submit_{fname}_{model_type}_rate{learning_rate}_{feature_num}features_CV{cv_score}_LB.csv'
            submit.to_csv(submit_path, index=False)
            utils.submit(file_path=submit_path)
        else:
            # NOTE(review): assumes `result` row order matches the sample
            # submission — confirm against cross_prediction.
            submit[target] = result
            submit.to_csv(
                f'../submit/{start_time[4:12]}_submit_{model_type}_rate{learning_rate}_{feature_num}features_CV{cv_score}_LB.csv',
                index=False)
def mk_submit():
    """Load train/test feature files, run seeded lgb.cv rounds, average
    rank-normalized test predictions across all fold models, write the
    submission csv, and optionally submit it.

    NOTE(review): relies on module-level globals defined outside this
    excerpt — `features`, `loader`, `new_train_users`, `param`, `LOOP`,
    `NFOLD`, `SUBMIT_FILE_PATH`, `COMMENT`, `EXE_SUBMIT` — confirm.
    """
    files_tr = ('../feature/train_' + features + '.f').tolist()
    files_te = ('../feature/test_' + features + '.f').tolist()

    # =============================================================================
    # load
    # =============================================================================
    # train: base frame plus all feather feature files, concatenated column-wise.
    X_train = loader.train()
    X_train_ = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)],
        axis=1)
    X_train = pd.concat([X_train, X_train_], axis=1)
    y_train = utils.read_pickles('../data/label').TARGET

    # remove old users
    X_train = X_train[new_train_users]
    y_train = y_train[new_train_users]

    # Dump a small sample of the design matrix next to the submission file
    # for later inspection of the exact columns used.
    X_train.head().to_csv(SUBMIT_FILE_PATH.replace('.csv', '_X.csv'),
                          index=False, compression='gzip')

    # Duplicated feature names would silently corrupt the lgb Dataset.
    if X_train.columns.duplicated().sum() > 0:
        raise Exception(
            f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')
    gc.collect()

    CAT = list(set(X_train.columns) & set(loader.category()))
    COL = X_train.columns.tolist()

    # test: same construction, reindexed to the train column order.
    X_test = loader.test()
    X_test_ = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_te, mininterval=60)],
        axis=1)
    X_test = pd.concat([X_test, X_test_], axis=1)[COL]

    # =============================================================================
    # training with cv
    # =============================================================================
    model_all = []
    auc_mean = 0
    for i in range(LOOP):
        dtrain = lgb.Dataset(X_train, y_train,
                             categorical_feature=CAT,
                             free_raw_data=False)
        gc.collect()
        # A different seed per loop both for lgb and for fold splitting.
        param['seed'] = i
        ret, models = lgb.cv(param, dtrain, 9999, nfold=NFOLD,
                             early_stopping_rounds=100, verbose_eval=50,
                             seed=i)
        model_all += models
        auc_mean += ret['auc-mean'][-1]
    auc_mean /= LOOP

    result = f"CV auc-mean({COMMENT}): {auc_mean}"
    print(result)
    utils.send_line(result)

    # =============================================================================
    # predict
    # =============================================================================
    sub = pd.read_pickle('../data/sub.p')
    gc.collect()
    label_name = 'TARGET'

    # Rank-average the fold models' predictions, then scale to (0, 1].
    sub[label_name] = 0
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()
    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)

    sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

    # =============================================================================
    # submission
    # =============================================================================
    if EXE_SUBMIT:
        print('submit')
        utils.submit(SUBMIT_FILE_PATH, COMMENT)
# Build the submission frame from the test features and the trained model.
# Fix: the original `X[['click_id']]` returns a slice; the subsequent
# attribute-assignment `sub.click_id = ...` then mutates through a possible
# view (chained-assignment hazard — the write may not stick and pandas
# raises SettingWithCopyWarning). Take an explicit copy instead.
sub = X[['click_id']].copy()
sub.click_id = sub.click_id.map(int)

X.drop('click_id', axis=1, inplace=True)
X.fillna(-1, inplace=True)
# Keep the column order identical to the one used at training time.
dtest = xgb.DMatrix(X[train_head.columns])
del X
gc.collect()

# Rank-normalize the predictions into (0, 1].
# NOTE(review): `pd.Series(y_pred).rank()` aligns on a fresh RangeIndex —
# assumes `sub` also carries a default RangeIndex; confirm upstream.
sub['is_attributed'] = 0
y_pred = model.predict(dtest)
sub['is_attributed'] += pd.Series(y_pred).rank()
#sub['is_attributed'] /= LOOP
sub['is_attributed'] /= sub['is_attributed'].max()
sub['click_id'] = sub.click_id.map(int)

sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

# =============================================================================
# submission
# =============================================================================
if EXE_SUBMIT:
    utils.submit(SUBMIT_FILE_PATH, COMMENT)

#==============================================================================
utils.end(__file__)
def main():
    """Merge the weekday/holiday fold predictions, pivot them into the M5
    submission layout, clip negative demand to zero, save the csv files,
    score the out-of-fold predictions (WRMSSE), submit, and notify.

    NOTE(review): relies on module-level globals defined outside this
    excerpt — `COLS_TEST1`, `COLS_TEST2`, `submission_file_name`,
    `oof_file_name`, `calc_score_cv`, `submit`, `line_notify` — confirm.
    """
    # load submission files
    print('load files...')
    sub_weekday = pd.read_csv(
        '../output/submission_lgbm_group_k_fold_weekday.csv')
    sub_holiday = pd.read_csv(
        '../output/submission_lgbm_group_k_fold_holiday.csv')

    # load oof files
    oof_weekday = pd.read_csv('../output/oof_lgbm_group_k_fold_weekday.csv')
    oof_holiday = pd.read_csv('../output/oof_lgbm_group_k_fold_holiday.csv')

    # merge the two daytype models into single long frames
    sub = sub_weekday.append(sub_holiday)
    oof = oof_weekday.append(oof_holiday)
    del sub_weekday, sub_holiday, oof_weekday, oof_holiday
    gc.collect()

    # pivot long (id, d, demand) into wide (id x day) frames
    print('to pivot...')
    sub = sub.pivot(index='id', columns='d', values='demand').reset_index()
    oof = oof.pivot(index='id', columns='d', values='demand').reset_index()

    # split test1 / test2.
    # Fix: take explicit copies — the original column slices were views, so
    # the column renames and the id rewrite below hit a chained-assignment
    # hazard (SettingWithCopyWarning / potentially lost writes).
    sub1 = oof[['id'] + COLS_TEST1].copy()
    sub2 = sub[['id'] + COLS_TEST2].copy()

    # change column names to the F1..F28 submission schema
    sub1.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]
    sub2.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]

    # replace test1 id: validation rows are identified by '_validation'
    sub1['id'] = sub1['id'].str.replace('_evaluation', '_validation')

    # merge both halves back into one submission frame
    sub = sub1.append(sub2)

    # postprocessing: demand cannot be negative — clip to zero
    cols_f = [f'F{i}' for i in range(1, 29)]
    cols_d = [c for c in oof.columns if 'd_' in c]
    sub.loc[:, cols_f] = sub[cols_f].where(sub[cols_f] > 0, 0)
    oof.loc[:, cols_d] = oof[cols_d].where(oof[cols_d] > 0, 0)

    # save csv
    sub.to_csv(submission_file_name, index=False)
    oof.to_csv(oof_file_name, index=False)

    # calc out of fold WRMSSE score
    print('calc oof cv scores...')
    scores = calc_score_cv(oof)
    score = np.mean(scores)
    print(f'scores: {scores}')

    # submission by API
    submit(submission_file_name, comment='model411 cv: %.6f' % score)

    # LINE notify
    line_notify('{} done. WRMSSE:{}'.format(sys.argv[0], round(score, 6)))