def main():
    logger.debug('config: {}'.format(options.config))
    logger.debug(feats)
    logger.debug(model_params)

    # load data for the specified features
    X_train_all, X_test = load_datasets(feats)
    y_train_all = load_target(target_name)
    cols = X_train_all.columns

    # stacking
    if "stacking" in config and config["stacking"] == True:
        oof_df, test_df = stack_load_df(config["stacking_name"])
        X_train_all = pd.concat([X_train_all, oof_df], axis=1)
        X_test = pd.concat([X_test, test_df], axis=1)

    if (model_name != "lightgbm") or ("sampling" in config):
        logger.debug("rank gauss")
        scaler = QuantileTransformer(n_quantiles=100,
                                     random_state=model_params["seed"],
                                     output_distribution="normal")
        all_df = pd.concat([X_train_all, X_test])
        all_df = all_df.fillna(all_df.median())  # fill missing values
        all_df[cols] = scaler.fit_transform(all_df[cols])  # scale
        X_train_all = all_df[:X_train_all.shape[0]].reset_index(drop=True)
        X_test = all_df[X_train_all.shape[0]:].reset_index(drop=True)

    logger.debug("X_train_all shape: {}".format(X_train_all.shape))
    print(X_train_all.info())

    # loop over seeds
    class_cols = [i for i in range(model_params["num_class"])]
    oof_df = pd.DataFrame(index=[i for i in range(X_train_all.shape[0])],
                          columns=class_cols)
    sub = pd.DataFrame(pd.read_feather('data/interim/test.feather')[ID_name])
    oof_df[class_cols] = 0
    sub[target_name] = 0
    for seed_num in range(config["seed_num"]):
        logger.debug(f"SEED: {seed_num}")
        one_oof_df, one_sub = train_and_predict(
            X_train_all, y_train_all, X_test, seed_num=seed_num)
        oof_df[class_cols] += one_oof_df[class_cols] / config["seed_num"]
        sub[target_name] += one_sub[target_name] / config["seed_num"]

    auc_score = evaluate_score(y_train_all.values, oof_df.values[:, 1], "auc")
    acc_score = evaluate_score(y_train_all.values, oof_df.values.argmax(axis=1), "acc")
    logloss_score = evaluate_score(y_train_all.values, oof_df.values[:, 1], "logloss")
    logger.debug('=== OOF CV scores ===')
    logger.debug(f"\t auc: {auc_score}, acc: {acc_score}, logloss: {logloss_score}")

    sub = sub.rename(columns={ID_name: 'Id', target_name: "label"})
    oof_df.to_csv(f'./data/output/oof_{config_filename}.csv', index=False)
    sub.to_csv(f'./data/output/sub_{config_filename}.csv', index=False)
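# The evaluate_score helper used above is not shown in this snippet. A minimal
# hypothetical sketch of such a dispatcher, assuming it simply maps the metric
# name to the corresponding sklearn function:
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score


def evaluate_score(y_true, y_pred, metric):
    # y_pred is a positive-class probability for "auc"/"logloss"
    # and a predicted class index for "acc"
    if metric == "auc":
        return roc_auc_score(y_true, y_pred)
    if metric == "acc":
        return accuracy_score(y_true, y_pred)
    if metric == "logloss":
        return log_loss(y_true, y_pred)
    raise ValueError(f"unknown metric: {metric}")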
def main():
    logging.debug('./logs/log_{0:%Y%m%d%H%M%S}.log'.format(now))
    logging.debug('config: {}'.format(options.config))
    logging.debug(feats)
    logging.debug(params)

    # load data for the specified features
    X_train_all, X_test = load_datasets(feats)
    y_train_all = load_target(target_name)
    logging.debug("X_train_all shape: {}".format(X_train_all.shape))

    stacking(X_train_all, y_train_all, X_test)
def main():
    logging.debug('./logs/log_{0:%Y%m%d%H%M%S}.log'.format(now))
    logging.debug('config: {}'.format(options.config))
    logging.debug(feats)
    logging.debug(params)

    # load data for the specified features
    X_train_all, X_test = load_datasets(feats)
    y_train_all = load_target(target_name)
    logging.debug("X_train_all shape: {}".format(X_train_all.shape))

    if config['model'] == 'LightGBM':
        train_and_predict_lightgbm(X_train_all, y_train_all, X_test)
    elif config['model'] in ['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet',
                             'KernelRidge', 'SVR', 'XGBoost', 'RandomForest',
                             'GradientBoosting', 'CatBoost']:
        train_and_predict_linear(X_train_all, y_train_all, X_test)
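# Several of these scripts rely on load_datasets / load_target helpers that are
# not included here. A minimal sketch, assuming a per-feature feather layout
# (features/<name>_train.feather and features/<name>_test.feather); the paths
# and file names are assumptions, not the actual project code.
import pandas as pd


def load_datasets(feats):
    # concatenate one feather file per feature, for train and test
    X_train = pd.concat(
        [pd.read_feather(f'features/{f}_train.feather') for f in feats], axis=1)
    X_test = pd.concat(
        [pd.read_feather(f'features/{f}_test.feather') for f in feats], axis=1)
    return X_train, X_test


def load_target(target_name):
    train = pd.read_feather('data/input/train.feather')
    return train[target_name]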
gc.enable()

from collections import OrderedDict

parser = argparse.ArgumentParser()
parser.add_argument('--config', default='./configs/config001.debug.json')
parser.add_argument('--debug', default='False')
parser.add_argument('--tuning', action='store_true')
options = parser.parse_args()

with open(options.config, "r") as fp:
    conf = json.load(fp, object_pairs_hook=OrderedDict)

trn, tst = load_feature_sets(options.config)
target = utils.load_target()
trn = pd.concat([target, trn], axis=1)
trn['target_outlier'] = 0
trn.loc[(trn.target < -30), 'target_outlier'] = 1
trn.drop(columns=['target'], inplace=True)


def get_feature_importances(trn, y, shuffle, seed=None):
    # Gather real features
    if shuffle:
        # Here you could as well use a binomial distribution
        y = y.copy().sample(frac=1.0)

    # Fit LightGBM in RF mode, yes it's quicker than sklearn RandomForest
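    # Continuation sketch (an assumption, not the original code): fit LightGBM
    # in random-forest mode and return one importance row per feature, as the
    # comment above suggests. All parameter values are placeholders.
    import lightgbm as lgb

    train_features = [c for c in trn.columns if c != 'target_outlier']
    dtrain = lgb.Dataset(trn[train_features], label=y, free_raw_data=False)
    rf_params = {
        'objective': 'binary',
        'boosting_type': 'rf',
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'feature_fraction': 0.8,
        'num_leaves': 127,
        'seed': seed if seed is not None else 123,
        'verbose': -1,
    }
    clf = lgb.train(rf_params, dtrain, num_boost_round=200)
    return pd.DataFrame({
        'feature': train_features,
        'importance_gain': clf.feature_importance(importance_type='gain'),
        'importance_split': clf.feature_importance(importance_type='split'),
    })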
files_tr = []
for pref in PREFS:
    files_tr += glob(f'../data/train_{pref}*.pkl')

files_te = [f'../feature/test_{c}.pkl' for c in COL]
sw = False
for i in files_te:
    if not os.path.exists(i):
        print(i)
        sw = True
if sw:
    raise Exception('missing test feature files')

X = pd.concat([pd.read_pickle(f) for f in tqdm(files_tr, mininterval=60)],
              axis=1)[COL]
y = utils.load_target().target
# X.drop(DROP, axis=1, inplace=True)

# ordinal-encode the target values
target_dict = {}
target_dict_r = {}
for i, e in enumerate(y.sort_values().unique()):
    target_dict[e] = i
    target_dict_r[i] = e

y = y.replace(target_dict)

if X.columns.duplicated().sum() > 0:
    raise Exception(f'duplicated!: {X.columns[X.columns.duplicated()]}')
print('no dup :) ')
print(f'X.shape {X.shape}')
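# Hedged usage sketch (not from the original script): predictions made on the
# ordinal-encoded target can be mapped back to the original values with
# target_dict_r. `pred_class` is a hypothetical array of predicted class indices.
import numpy as np

pred_class = np.zeros(len(y), dtype=int)  # placeholder predictions
pred_values = pd.Series(pred_class).map(target_dict_r)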
options = parser.parse_args()
config = json.load(open(options.config))

now = datetime.datetime.now()
logging.basicConfig(
    filename='./logs/log_{0:%Y%m%d%H%M%S}.log'.format(now),
    level=logging.DEBUG
)
logging.debug('./logs/log_{0:%Y%m%d%H%M%S}.log'.format(now))

feats = config['features']
logging.debug(feats)

target_name = config['target_name']
X_train_all, X_test = load_datasets(feats, target_name)
y_train_all = load_target(target_name)
logging.debug(X_train_all.shape)

y_preds = []
models = []

lgbm_params = config['lgbm_params']

# note: random_state only takes effect with shuffle=True
kf = KFold(n_splits=10, shuffle=True, random_state=0)
for train_index, valid_index in kf.split(X_train_all):
    X_train, X_valid = (
        X_train_all.iloc[train_index, :],
        X_train_all.iloc[valid_index, :]
    )
    y_train, y_valid = y_train_all[train_index], y_train_all[valid_index]

    # run LightGBM
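    # Sketch of the LightGBM step the comment above refers to (an assumption,
    # not the original code); `lgb` is assumed to be `import lightgbm as lgb`,
    # and the round/early-stopping values are placeholders.
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
    model = lgb.train(
        lgbm_params,
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        num_boost_round=10000,
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)],
    )
    y_preds.append(model.predict(X_test, num_iteration=model.best_iteration))
    models.append(model)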
li = []
for i in files_tr:
    for j in USE_PREF:
        if j in i:
            li.append(i)
            break
files_tr = li

for i, f in enumerate(files_tr):
    print(i, f)

X_train = pd.concat(
    [pd.read_feather(f) for f in tqdm(files_tr, mininterval=30)]
    + [joblib.load('../external/X_train_nejumi.pkl.gz')],
    axis=1)
y_train = utils.load_target()['HasDetections']

# drop
if len(col_drop) > 0:
    X_train.drop(col_drop, axis=1, inplace=True)

if X_train.columns.duplicated().sum() > 0:
    raise Exception(
        f'duplicated!: {X_train.columns[X_train.columns.duplicated()]}')
print('no dup :) ')
print(f'X_train.shape {X_train.shape}')
gc.collect()

CAT = list(set(X_train.columns) & set(utils_cat.ALL))
print(f'CAT: {CAT}')
def train_model(config, _debug, logger, start_dt, train_and_predict):
    """
    Train a model with features. The model and features are designated in config.
    """
    features = config['features']
    label_name = config['label_name']
    id_name = config['id_name']

    # load only train features and label
    x_train_all = load_features(features, _debug, target='train')
    y_train_all = load_target(label_name, _debug)
    gc.collect()
    logger.debug('x_train_all:{0}'.format(x_train_all.shape))
    logger.debug('y_train_all:{0}'.format(y_train_all.shape))

    # save feature names and index
    feature_names = x_train_all.columns.tolist()
    x_train_idx = x_train_all.index

    # convert from df to matrix
    x_train_all = df_to_matrix(x_train_all)

    # load model params
    params = config['params']
    seed = config['seed']
    model_name = config['model_name']

    # generate stratified k-fold instance
    n_splits = config['n_splits']
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # to store results
    y_te_prs = np.zeros(len(y_train_all))
    scores_tr, scores_te = defaultdict(list), defaultdict(list)
    importances_df = pd.DataFrame()
    trained_models = []

    # cross validation
    for _fold, (tr_idx, te_idx) in enumerate(skf.split(x_train_idx, y_train_all)):
        _fold += 1
        logger.debug('------ {0} / {1} fold ------'.format(_fold, n_splits))

        # extract dataset
        x_tr, x_te = x_train_all[tr_idx, :], x_train_all[te_idx, :]
        y_tr, y_te = y_train_all[tr_idx], y_train_all[te_idx]
        logger.debug('x_tr:{0} x_te:{1}'.format(x_tr.shape, x_te.shape))
        logger.debug('y_tr:{0} y_te:{1}'.format(y_tr.shape, y_te.shape))

        # train model
        y_tr_pr, y_te_pr, model = train_and_predict(x_tr, y_tr, x_te, params)

        # save prediction (each index appears in exactly one validation fold)
        y_te_prs[te_idx] = y_te_pr

        # compute metric
        scores_tr = calc_metrics(scores_tr, y_tr_pr, y_tr)
        scores_te = calc_metrics(scores_te, y_te_pr, y_te)
        logger.debug('[{0}f] train_acc:{1} test_acc:{2}'.format(
            _fold, scores_tr['acc'][-1], scores_te['acc'][-1]))
        logger.debug('[{0}f] train_auc:{1} test_auc:{2}'.format(
            _fold, scores_tr['auc'][-1], scores_te['auc'][-1]))

        # save model
        trained_models.append(model)

        # feature importance
        if hasattr(model, 'feature_importances_'):
            importances_df['{}_fold'.format(_fold)] = model.feature_importances_
        elif hasattr(model, 'coef_'):
            importances_df['{}_fold'.format(_fold)] = model.coef_.flatten()

        del x_tr, x_te, y_tr, y_te, y_tr_pr, y_te_pr, model
        gc.collect()

    # mean metrics
    scores_cv_tr = np.mean(pd.DataFrame(scores_tr), axis=0)
    scores_cv_te = np.mean(pd.DataFrame(scores_te), axis=0)
    logger.debug('------ cross validation ------')
    logger.debug('[cv] train_acc:{0}, test_acc:{1}'.format(
        scores_cv_tr['acc'], scores_cv_te['acc']))
    logger.debug('[cv] train_auc:{0}, test_auc:{1}'.format(
        scores_cv_tr['auc'], scores_cv_te['auc']))

    if importances_df.any(axis=None):
        # mean feature importance
        importances_df = pd.DataFrame({
            'feature': feature_names,
            'importance': np.mean(importances_df, axis=1)
        })

        # save
        file_name = 'importances_{0:%m%d_%H%M%S}_{1:.5f}_{2}'.format(
            start_dt, scores_cv_te['auc'], model_name)
        importances_df.to_csv('../../data/output/{0}.csv'.format(file_name),
                              index=False)

        # plot
        fig = plot_importances(importances_df, file_name)
        fig.savefig(
            '../../figures/feature_importance/{0}.png'.format(file_name))

    # save prediction on te dataset
    train_df = pd.read_pickle('../../data/input/train.pkl')
    if _debug:
        train_df = train_df.iloc[:int(train_df.shape[0] / 100)]
    y_te_prs_df = pd.DataFrame({
        'id': train_df[id_name],
        'pred': y_te_prs,
        'truth': y_train_all
    })
    logger.debug('y_te_prs_df:{0}'.format(y_te_prs_df.shape))
    del train_df
    gc.collect()

    # save prediction on cross-validation test
    y_te_prs_df.to_pickle(
        '../../data/output/val_{0:%m%d_%H%M%S}_{1:.5f}_{2}.pkl'.format(
            start_dt, scores_cv_te['auc'], model_name))
    del y_te_prs_df
    gc.collect()

    # save models
    model_path = '../../models/models_{0:%m%d_%H%M%S}_{1:.5f}_{2}.pkl'.format(
        start_dt, scores_cv_te['auc'], model_name)
    with open(model_path, 'wb') as f:
        pickle.dump(trained_models, f)
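# The calc_metrics helper used by train_model above is not part of this excerpt.
# A hypothetical sketch, assuming it appends this fold's scores to the running
# defaultdict(list) created in train_model:
from sklearn.metrics import accuracy_score, roc_auc_score


def calc_metrics(scores, y_pred, y_true):
    # append this fold's accuracy (at a 0.5 threshold) and AUC
    scores['acc'].append(accuracy_score(y_true, (y_pred > 0.5).astype(int)))
    scores['auc'].append(roc_auc_score(y_true, y_pred))
    return scores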
    # 'f006',
    # 'f007',
    # 'f008',
    # 'f009',
    # 'f010',
    # 'f011',
    # 'f012',
]

var_names = [f'var_{i:03}' for i in range(200)]

# =============================================================================
# load
# =============================================================================
y_train = utils.load_target()['target']


def load(var):
    files_tr = sorted(glob(f'../data/{var}/train_f*.pkl'))

    # USE_PREF
    li = []
    for i in files_tr:
        for j in USE_PREF:
            if j in i:
                li.append(i)
                break
    files_tr = li
NROUND = 500
ESR = 50
VERBOSE_EVAL = 25
feature_size = 30

file_tr = '../data/f008/train_f008_1.f'
file_te = '../data/f008/test_f008_1.f'
outpath_tr = '../data/train_f008_1.f'
outpath_te = '../data/test_f008_1.f'

# =============================================================================
# load
# =============================================================================
X_train = pd.read_feather(file_tr).sample(frac=0.5, random_state=SEED)
y_train = utils.load_target().sample(frac=0.5, random_state=SEED)['HasDetections']

if len(DROP) > 0:
    X_train.drop(DROP, axis=1, inplace=True)

if X_train.columns.duplicated().sum() > 0:
    raise Exception(
        f'duplicated!: {X_train.columns[X_train.columns.duplicated()]}')
print('no dup :) ')
print(f'X_train.shape {X_train.shape}')
# print(f'X_valid.shape {X_valid.shape}')
gc.collect()

CAT = list(set(X_train.columns) & set(utils_cat.ALL))
print(f'CAT: {CAT}')
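# NROUND, ESR, VERBOSE_EVAL and CAT are defined but not used in this excerpt.
# A hedged sketch of how they would typically feed into LightGBM cross
# validation (`param` is a placeholder parameter dict, not the project's
# real settings):
import lightgbm as lgb

param = {'objective': 'binary', 'metric': 'auc', 'seed': SEED, 'verbose': -1}
dtrain = lgb.Dataset(X_train, y_train, categorical_feature=CAT)
cv_result = lgb.cv(param, dtrain, num_boost_round=NROUND, nfold=5,
                   callbacks=[lgb.early_stopping(ESR),
                              lgb.log_evaluation(VERBOSE_EVAL)])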
from sklearn.metrics import roc_auc_score
import sys
sys.path.append(f'/home/{os.environ.get("USER")}/PythonLibrary')
import GA
import utils

# utils.start(__file__)

# =============================================================================
# load
# =============================================================================
X = pd.read_pickle('../external/share_904_oof_preds.pkl.gz')
oof_pred_array = X.values
y = utils.load_target()['target']

print(roc_auc_score(y, (9 * oof_pred_array / (1 - oof_pred_array)).prod(axis=1)))

# =============================================================================
# def
# =============================================================================
def myfitness(gtype):
    """
    gtype[:200]: weight
    gtype[200:]: binary (use or not)
    """
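    # Continuation sketch (an assumption, not the original code): score a
    # weighted odds-product blend of the selected OOF columns with AUC, in the
    # spirit of the odds-product check printed above.
    import numpy as np

    weight = np.asarray(gtype[:200], dtype=float)
    use = np.asarray(gtype[200:]) > 0
    if not use.any():
        return 0.0
    odds = oof_pred_array / (1 - oof_pred_array)
    blend = (odds[:, use] ** weight[use]).prod(axis=1)
    return roc_auc_score(y, blend)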
# prepare the log file
now = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
logger = logging.getLogger('main')
logger.setLevel(logging.DEBUG)
sc = logging.StreamHandler()
logger.addHandler(sc)
fh = logging.FileHandler(f'logs/log_{now}.log')
logger.addHandler(fh)
logger.debug(f'logs/log_{now}.log')
logger.debug(config_path)

# load datasets and target
feats = config['feats']
target_name = config['target_name']
train, test = load_datasets(feats)
target = load_target(target_name)
molecule_name = feather.read_dataframe(
    './data/input/train.feather')['molecule_name'].values

if is_debug_mode:
    print("Debug mode is ON!")
    train = train.iloc[:10000]
    test = test.iloc[:1000]
    target = target.iloc[:10000]
    molecule_name = molecule_name[:10000]

train_type = train['type'].values
test_type = test['type'].values
logger.debug(feats)

train.drop(['PropertyFunctor', 'type'], axis=1, inplace=True)  # always NaN
logger.debug(f'logs/log_{now}.log')
logger.debug(config_path)
logger.debug(f'is_debug_mode: {is_debug_mode}')
logger.debug(f'keep_nans: {keep_nans}')

# load datasets and target
if is_debug_mode:
    print("Debug mode is ON!")
    molecule_name = feather.read_dataframe(
        './data/input/train.feather')['molecule_name'].head(10000).values
else:
    molecule_name = feather.read_dataframe(
        './data/input/train.feather')['molecule_name'].values

feats = config['feats']
target_name = config['target_name']
train, test = load_datasets(feats, is_debug_mode)
target = load_target(target_name, is_debug_mode)

train_type = train['type'].values
test_type = test['type'].values
logger.debug(feats)

# train.drop(['PropertyFunctor', 'type'], axis=1, inplace=True)  # always NaN
# test.drop(['PropertyFunctor', 'type'], axis=1, inplace=True)  # always NaN

if keep_nans:
    # simply keep NaNs as they are and let LightGBM handle them
    categorical_cols = list(train.columns[train.dtypes == object])
    logger.debug(categorical_cols)
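    # Hedged continuation sketch (assumed, not the original code): cast the
    # object columns to a shared 'category' dtype so LightGBM can consume
    # them directly while the NaNs stay in place.
    for col in categorical_cols:
        cats = pd.concat([train[col], test[col]]).astype('category').cat.categories
        train[col] = pd.Categorical(train[col], categories=cats)
        test[col] = pd.Categorical(test[col], categories=cats)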
parser.add_argument('--config', default='./configs/default.json')
options = parser.parse_args()
config = json.load(open(options.config))

now = datetime.datetime.now()
logging.basicConfig(filename='./logs/log_{0:%Y%m%d%H%M%S}.log'.format(now),
                    level=logging.DEBUG)
logging.debug('./logs/log_{0:%Y%m%d%H%M%S}.log'.format(now))

feats = config['features']
logging.debug(feats)
# target_name = config['target_name']
feats_train, feats_test = load_datasets(feats)
y_train_all = load_target()

lr_Train = pd.concat([y_train_all, feats_train], axis=1)
lr_Train.head()

sc = MinMaxScaler(feature_range=(0, 1))
lr_Train_scaled = sc.fit_transform(lr_Train)

X_Train = []
y_Train = []
for i in range(timesteps, 1913 - startDay):
    # e.g. for i = 14 the window is [0:14], for i = 15 it is [1:15]
    X_Train.append(lr_Train_scaled[i - timesteps:i])
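    # Continuation sketch (an assumption, not the original code): collect the
    # matching next step as the target; which columns of the scaled row are
    # actually used as the target depends on the data layout.
    y_Train.append(lr_Train_scaled[i])

import numpy as np  # assumed to already be imported in the original script

# convert the sliding windows to arrays for a sequence model
X_Train, y_Train = np.array(X_Train), np.array(y_Train)
print(X_Train.shape, y_Train.shape)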