def train_lightgbm(X: pd.DataFrame, y: pd.Series, config: Config):
    """Tune, fit and store a LightGBM model in ``config["model"]``.

    1. Hyperparameters are tuned by ``hyperopt_lightgbm`` on the sample
       returned by ``data_sample(..., nrows=20000)``.
    2. A model is fitted on the ``data_split`` train part with early
       stopping on the validation part, and ``config.save()`` is called.
    3. Best effort: the model is refitted on all of ``X``/``y`` for 1.2x the
       best iteration count inside ``time_limit``. On ``TimeoutException``
       the early-stopped model from step 2 stays in ``config["model"]``.
    """
    regression = config.is_regression()
    base_params = {
        "objective": "regression" if regression else "binary",
        "metric": "rmse" if regression else "auc",
        "verbosity": -1,
        "seed": 1,
    }

    X_sample, y_sample = data_sample(X, y, config, nrows=20000)
    tuned = hyperopt_lightgbm(X_sample, y_sample, base_params, config)
    merged_params = {**base_params, **tuned}

    X_train, X_val, y_train, y_val = data_split(X, y, config)
    train_set = lgb.Dataset(X_train, label=y_train)
    valid_set = lgb.Dataset(X_val, label=y_val)
    config["model"] = lgb.train(
        dict(merged_params),
        train_set,
        5000,
        valid_set,
        early_stopping_rounds=100,
        verbose_eval=100,
    )
    # Persist the validated model before the optional full refit.
    config.save()

    try:
        with time_limit(config.time_left() - 10):
            full_set = lgb.Dataset(X, label=y)
            rounds = int(1.2 * config["model"].best_iteration)
            config["model"] = lgb.train(dict(merged_params), full_set, rounds)
    except TimeoutException:
        Log.print("Timed out!")
class AutoML:
    """Train/predict driver that keeps all state in a ``Config`` object."""

    def __init__(self, model_dir: str):
        """Create ``model_dir`` if needed and bind a ``Config`` to it."""
        os.makedirs(model_dir, exist_ok=True)
        self.config = Config(model_dir)

    def train(self, train_csv: str, mode: str):
        """Read ``train_csv``, preprocess it and fit on the ``target`` column.

        Args:
            train_csv: path of the training CSV.
            mode: stored as ``config["mode"]``.
        """
        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config.tmp_dir = os.path.join(self.config.model_dir, "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(train_csv, self.config)
        preprocess(df, self.config)

        y = df["target"]
        X = df.drop("target", axis=1)
        train(X, y, self.config)

    def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
        """Predict ``test_csv`` chunk by chunk and write ``prediction_csv``.

        Returns:
            ``(result, score)``: ``result`` has columns ``line_id`` and
            ``prediction``. ``score`` comes from ``validate`` when a target
            file exists next to the test file, otherwise ``None``.
        """
        self.config["task"] = "predict"
        # os.path.join keeps the scratch directory relative when
        # prediction_csv has no directory part. Plain concatenation with
        # "/tmp" would point at the filesystem root in that case.
        self.config.tmp_dir = os.path.join(os.path.dirname(prediction_csv), "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        result = {
            "line_id": [],
            "prediction": [],
        }

        reader = pd.read_csv(
            test_csv,
            encoding="utf-8",
            low_memory=False,
            dtype=self.config["dtype"],
            parse_dates=self.config["parse_dates"],
            chunksize=self.config["nrows"],
        )
        for X in reader:
            # Capture ids before preprocess() gets the chance to alter X.
            result["line_id"] += list(X["line_id"])
            preprocess(X, self.config)
            result["prediction"] += list(predict(X, self.config))

        result = pd.DataFrame(result)
        result.to_csv(prediction_csv, index=False)

        # The target file name is derived by textual substitution on the path.
        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"])
            logf('SCORE:', score)
        else:
            score = None

        return result, score

    @timeit
    def save(self):
        """Persist the configuration via ``Config.save``."""
        self.config.save()

    @timeit
    def load(self):
        """Restore the configuration via ``Config.load``."""
        self.config.load()
def pipeline(
        df: pd.DataFrame,
        config: Config,
        train_csv: str = None,
        test_csv: str = None,
        prediction_csv: str = None) -> (pd.DataFrame, Optional[np.float64]):
    """Run every stage listed in ``config['graph']`` against ``df``, in order.

    For each stage the name (``stage[0]``) is looked up in
    ``config['params']['pipeline']`` and its ``'node'`` entry in
    ``_node_map``; the resulting node is then invoked. The current stage name
    and index are published as ``config["stage"]`` / ``config["stage_nb"]``,
    and elapsed time is recorded through ``stage_time_inc``.

    Raises:
        ValueError: for an empty stage entry, a stage missing from
            ``config['params']['pipeline']``, or a node missing from
            ``_node_map``. The message is also left in ``config["stage"]``.

    NOTE(review): no ``return`` statement is visible in this block although
    the annotation promises a tuple; ``test_csv`` and ``prediction_csv`` are
    not used here.
    NOTE(review): the ``'{0}/n{1}'`` templates look like a typo for a newline
    escape; confirm before changing, since the text is user-visible.
    """
    if config.is_train():
        # Fresh per-stage storage for a training run.
        config['stages'] = {}

    for ids, stage in enumerate(config['graph']):
        if len(stage) == 0 or stage[0] is None or stage[0] == '':
            config["stage"] = '{0}/n{1}'.format(
                config["stage"],
                'Error value stage "{0}" in pipeline'.format(stage))
            raise ValueError(config["stage"])

        config["stage"] = stage[0]
        config["stage_nb"] = ids
        if config.is_train():
            config['stages'][config["stage"]] = {}
            config['stages'][config["stage"]]['time'] = 0

        start_time = time.time()
        if stage[0] == 'Start':
            # The 'Start' marker runs no node and records no time.
            continue
        # elif stage[0] == 'End':
        #     break
        elif not stage[0] in config['params']['pipeline']:
            config["stage"] = '{0}/n{1}'.format(
                config["stage"],
                'Unknow node "{0}" in pipeline'.format(stage[0]))
            raise ValueError(config["stage"])
        elif not config['params']['pipeline'][stage[0]]['node'] in _node_map:
            config["stage"] = '{0}/n{1}'.format(
                config["stage"],
                'Unknow node "{0}" in _node_map'.format(
                    config['params']['pipeline'][stage[0]]['node']))
            raise ValueError(config["stage"])

        node = _node_map[config['params']['pipeline'][stage[0]]['node']]
        if node.name == 'read_df':
            # Only a training run (re)reads the frame; the new frame is bound
            # to the local name ``df`` for the following stages.
            if config.is_train():
                df = node(train_csv, config)
        elif 'args' in config['params']['pipeline'][stage[0]] \
                and len(config['params']['pipeline'][stage[0]]['args'])!=0:
            # Stage-specific keyword arguments go to the underlying function.
            node.function(df, config, **config['params']['pipeline'][stage[0]]['args'])
        else:
            node(df, config)

        stage_time_inc(config, start_time, stage[0])
def feature_selection(df: pd.DataFrame, config: Config):
    """Drop unselected ``number_*`` / ``datetime_*`` columns from ``df`` in place.

    Training run: nothing happens for frames smaller than 2 GiB. Otherwise
    up to 20 samples of at most 3000 rows are transformed and passed to
    ``select_features``. The columns to drop are stored in
    ``config["drop_number_columns"]`` / ``config["drop_datetime_columns"]``,
    and the selected date parts in ``config["date_columns"]``.

    Any run: the column lists found in ``config`` are dropped from ``df``.
    """
    if config.is_train():
        size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        if size_mb < 2 * 1024:
            return

        selected_columns = []
        # The sample config is copied before the time budget is installed on
        # the real config.
        config_sample = copy.deepcopy(config)
        config.limit_time_fraction(0.1)

        for seed in range(20):
            if config.is_time_fraction_limit():
                break

            sample = df.sample(min(3000, len(df)), random_state=seed).copy()
            transform(sample, config_sample)
            y = sample["target"]
            X = sample.drop("target", axis=1)

            # Later rounds only look at columns that are not selected yet.
            if selected_columns:
                X = X.drop(selected_columns, axis=1)

            if len(X.columns) == 0:
                break
            selected_columns.extend(select_features(X, y, config["mode"]))

        Log.print("Selected columns: {}".format(selected_columns))

        unused_numbers = [
            name for name in df
            if name.startswith("number_") and name not in selected_columns
        ]
        if unused_numbers:
            config["drop_number_columns"] = unused_numbers

        config["date_columns"] = {}
        for name in selected_columns:
            if not name.startswith("datetime_"):
                continue
            pieces = name.split("_")
            date_col = pieces[0] + "_" + pieces[1]
            date_part = pieces[2]
            if date_col not in config["date_columns"]:
                config["date_columns"][date_col] = []
            config["date_columns"][date_col].append(date_part)

        unused_dates = [
            name for name in df
            if name.startswith("datetime_") and name not in config["date_columns"]
        ]
        if unused_dates:
            config["drop_datetime_columns"] = unused_dates

    for key, label in (("drop_number_columns", "Drop number columns: {}"),
                       ("drop_datetime_columns", "Drop datetime columns: {}")):
        if key in config:
            Log.print(label.format(config[key]))
            df.drop(config[key], axis=1, inplace=True)
class AutoML:
    """Train/predict driver that keeps all state in a ``Config`` object."""

    def __init__(self, model_dir: str):
        """Create ``model_dir`` if needed and bind a ``Config`` to it."""
        os.makedirs(model_dir, exist_ok=True)
        self.config = Config(model_dir)

    def train(self, train_csv: str, mode: str):
        """Read ``train_csv``, preprocess it and fit on the ``target`` column."""
        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config.tmp_dir = os.path.join(self.config.model_dir, "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(train_csv, self.config)
        preprocess(df, self.config)

        y = df["target"]
        X = df.drop("target", axis=1)
        train(X, y, self.config)

    def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
        """Predict ``test_csv`` in 100000-row slices and write ``prediction_csv``.

        Returns:
            ``(result, score)``: ``result`` has columns ``line_id`` and
            ``prediction``, sorted by ``line_id``. ``score`` comes from
            ``validate`` when a target file exists, otherwise ``None``.
        """
        self.config["task"] = "predict"
        # os.path.join keeps the scratch directory relative when
        # prediction_csv has no directory part. Plain concatenation with
        # "/tmp" would point at the filesystem root in that case.
        self.config.tmp_dir = os.path.join(os.path.dirname(prediction_csv), "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(test_csv, self.config)
        result = {
            "line_id": list(df["line_id"]),
            "prediction": [],
        }

        def chunker(seq, size):
            """Yield consecutive slices of ``seq`` holding at most ``size`` rows."""
            return (seq[pos:pos + size] for pos in range(0, len(seq), size))

        for chunk in chunker(df, 100000):
            # Work on a copy so preprocess() cannot modify the full frame.
            X = chunk.copy()
            preprocess(X, self.config)
            result["prediction"] += list(predict(X, self.config))

        result = pd.DataFrame(result)
        result.sort_values("line_id", inplace=True)
        result.to_csv(prediction_csv, index=False)

        # The target file name is derived by textual substitution on the path.
        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"])
        else:
            score = None

        return result, score

    def save(self):
        """Persist the configuration via ``Config.save``."""
        self.config.save()

    def load(self):
        """Restore the configuration via ``Config.load``."""
        self.config.load()
def __init__(self, model_dir: str, params: dict, verbose: int=0):
    """Build the ``Config`` and fill in missing default parameters.

    Defaults are applied only where a key is absent:
    ``params['memory']['max_size_mb'] = 2``,
    ``params['memory']['max_size_train_samples'] = 10000`` and
    ``params['field_target_name'] = 'target'``.
    """
    self.config = Config(model_dir, params, verbose=verbose)
    self.verbose = verbose

    settings = self.config['params']
    memory = settings.setdefault('memory', {})
    memory.setdefault('max_size_mb', 2)
    memory.setdefault('max_size_train_samples', 10000)
    settings.setdefault('field_target_name', 'target')
def train_h2o(X: pd.DataFrame, y: pd.Series, config: Config):
    """Fit an H2O AutoML model and store the saved model's path in ``config``.

    ``y`` is attached to ``X`` temporarily as a ``target`` column to build
    the H2O frame. The column is removed again before returning, including
    when an H2O step raises, so the caller's frame is never left modified.

    The value returned by ``h2o.save_model`` is stored under
    ``config['params']['pipeline'][config["stage"]]["model"]``.
    """
    h2o.init()

    X["target"] = y
    try:
        train = h2o.H2OFrame(X)
        train_x = train.columns
        train_y = "target"
        train_x.remove(train_y)

        if config["mode"] == "classification":
            train[train_y] = train[train_y].asfactor()

        aml = H2OAutoML(max_runtime_secs=int(config.time_left() * 0.9),
                        max_models=20,
                        nfolds=3,
                        exclude_algos=["GBM", "DeepLearning", "DRF"],
                        seed=42)
        aml.train(x=train_x, y=train_y, training_frame=train)

        config['params']['pipeline'][config["stage"]]["model"] = h2o.save_model(
            model=aml.leader, path=config.model_dir + "/h2o.model", force=True)
        if config.verbose:
            print(aml.leaderboard)
    finally:
        # Always undo the temporary mutation of the caller's frame.
        X.drop("target", axis=1, inplace=True)
def leak_detect(df: pd.DataFrame, config: Config) -> bool:
    """Look for a numeric column that reproduces ``target`` at a time lag.

    Predict run: returns whether a leak was recorded in ``config``.

    Otherwise, for each ``id_*`` column the rows sharing the first row's id
    are sorted by each ``datetime_*`` column. Every ``number_*`` column is
    shifted by lags -1..-9 and correlated with ``target``. The first
    correlation >= 0.99 is stored in ``config["leak"]`` and ``True`` is
    returned. ``False`` is returned when nothing qualifies.
    """
    if config.is_predict():
        return "leak" in config

    id_cols = [name for name in df if name.startswith('id_')]
    dt_cols = [name for name in df if name.startswith('datetime_')]
    if not (id_cols and dt_cols):
        return False

    num_cols = [name for name in df if name.startswith('number_')]
    for id_col in id_cols:
        # Only the group of the first row's id is inspected.
        group = df.groupby(by=id_col).get_group(df[id_col].iloc[0])
        for dt_col in dt_cols:
            ordered = group.sort_values(dt_col)
            for lag in range(-1, -10, -1):
                for col in num_cols:
                    corr = ordered['target'].corr(ordered[col].shift(lag))
                    if corr < 0.99 or corr != corr:
                        # Below threshold, or NaN (NaN never compares >=).
                        continue
                    config["leak"] = {
                        "num_col": col,
                        "lag": lag,
                        "id_col": id_col,
                        "dt_col": dt_col,
                    }
                    return True

    return False
def read_df(csv_path: str, config: Config) -> pd.DataFrame:
    """Load ``csv_path`` and pass the frame through ``optimize_dataframe``.

    ``preview_df`` is called first when ``config`` has no ``"dtype"`` entry.
    In a training run the row count is stored as ``config["nrows"]``.
    """
    dtype_known = "dtype" in config
    if not dtype_known:
        preview_df(csv_path, config)

    raw = pandas_read_csv(csv_path, config)
    df = optimize_dataframe(raw)

    if config.is_train():
        config["nrows"] = len(df)
    return df
def hyperopt_lightgbm(X: pd.DataFrame, y: pd.Series, params: Dict, config: Config):
    """Search LightGBM hyperparameters with hyperopt's TPE and return the best set.

    The data is split 50/50 through ``data_split``. Each trial trains up to
    300 rounds with early stopping and scores ``params["metric"]`` on the
    validation half, negated for classification so that lower is better.
    After ``config.limit_time_fraction(0.15)`` is exhausted, remaining
    trials return a placeholder loss without training.

    Returns:
        The hyperparameters resolved by ``space_eval`` for the best trial.
    """
    X_fit, X_hold, y_fit, y_hold = data_split(X, y, config, test_size=0.5)
    fit_set = lgb.Dataset(X_fit, label=y_fit)
    hold_set = lgb.Dataset(X_hold, label=y_hold)

    on_off = [True, False]
    space = {
        "learning_rate": hp.choice("learning_rate", np.arange(0.01, 0.05, 0.01)),
        "boost_from_average": hp.choice("boost_from_average", on_off),
        "is_unbalance": hp.choice("is_unbalance", on_off),
        "zero_as_missing": hp.choice("zero_as_missing", on_off),
        "max_depth": hp.choice("max_depth", [-1, 2, 3, 4, 5, 6, 7]),
        "num_leaves": hp.choice("num_leaves", [11, 31, 51, 101, 151, 201]),
        "feature_fraction": hp.choice("feature_fraction", np.arange(0.5, 1.0, 0.1)),
        "bagging_fraction": hp.choice("bagging_fraction", np.arange(0.5, 1.0, 0.1)),
        "bagging_freq": hp.choice("bagging_freq", [1, 3, 5, 10, 20, 50]),
        "reg_alpha": hp.uniform("reg_alpha", 0, 10),
        "reg_lambda": hp.uniform("reg_lambda", 0, 10),
        "min_child_weight": hp.uniform("min_child_weight", 0, 10),
    }

    config.limit_time_fraction(0.15)

    def objective(hyperparams):
        """Train one candidate and report its loss to hyperopt."""
        if config.is_time_fraction_limit():
            # Out of budget: report a placeholder loss instead of training.
            placeholder = np.inf if config.is_regression() else 0
            return {'loss': placeholder, 'status': STATUS_OK}

        booster = lgb.train(
            {**params, **hyperparams},
            fit_set,
            300,
            hold_set,
            early_stopping_rounds=100,
            verbose_eval=False,
        )
        score = booster.best_score["valid_0"][params["metric"]]
        Log.print(score)

        loss = -score if config.is_classification() else score
        return {'loss': loss, 'status': STATUS_OK}

    trials = Trials()
    best = hyperopt.fmin(
        fn=objective,
        space=space,
        trials=trials,
        algo=tpe.suggest,
        max_evals=100,
        verbose=1,
        rstate=np.random.RandomState(1),
    )

    hyperparams = space_eval(space, best)
    best_loss = trials.best_trial['result']['loss']
    Log.print("{:0.4f} {}".format(best_loss, hyperparams))
    return hyperparams
def read_df(csv_path: str, config: Config) -> pd.DataFrame:
    """Load ``csv_path`` with ``pandas_read_csv`` and return the frame.

    ``preview_df`` is called first when ``config`` has no ``"dtype"`` entry.
    In a training run the row count is stored as ``config["nrows"]`` and a
    copy of the ``target`` column as ``config["target_data"]``.
    """
    dtype_known = "dtype" in config
    if not dtype_known:
        preview_df(csv_path, config)

    df = pandas_read_csv(csv_path, config)
    if not config.is_train():
        return df

    config["nrows"] = len(df)
    config["target_data"] = df['target'].copy()
    return df
def read_df(csv_path: str, config: Config) -> pd.DataFrame:
    """Load ``csv_path`` with ``pandas_read_csv`` and return the frame.

    ``preview_df`` is called first when ``config`` has no ``"dtype"`` entry.
    In a training run ``config["nrows_stage_nb"]`` is reset to 0 and the row
    count is stored as ``config["nrows"]``.
    """
    dtype_known = "dtype" in config
    if not dtype_known:
        preview_df(csv_path, config)

    df = pandas_read_csv(csv_path, config)
    if not config.is_train():
        return df

    config["nrows_stage_nb"] = 0
    config["nrows"] = len(df)
    return df
def main(parse):
    """Load a trained word2vec model and print a few sanity checks.

    The model class is picked from ``mconf.model`` (``'cbow'`` or skip-gram
    otherwise) and weights are loaded from ``'trained.pth'``. ``parse`` is
    not used in this function.
    """
    # load configs
    dconf = Config('config/data.json')
    mconf = Config('config/word2vec.json')

    # load w2v model
    model_cls = CbowModel if mconf.model == 'cbow' else SkipGramModel
    w2v = model_cls(dconf, mconf)
    w2v.load('trained.pth')

    # test w2v
    word = 'hospital'
    print(w2v.nearest(word))
    for other in ('attacks', word):
        print(w2v.similarity(word, other).item())
def feature_selection(df: pd.DataFrame, config: Config):
    """Drop unselected ``number_*`` / ``id_*`` / ``datetime_*`` columns in place.

    Training run: nothing happens for frames smaller than 2 GiB. Otherwise
    up to three 5% samples are passed through ``preprocess_pipeline`` and
    ``select_features``, stopping early once the selected columns alone fit
    in 2 GiB. The columns to drop are stored in
    ``config["drop_number_columns"]`` / ``config["drop_datetime_columns"]``.

    Any run: the column lists found in ``config`` are dropped from ``df``.
    """
    size_limit_mb = 2 * 1024

    if config.is_train():
        df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        if df_size_mb < size_limit_mb:
            return

        selected_columns = []
        for i in range(3):
            config_sample = copy.deepcopy(config)
            df_sample = df.sample(frac=0.05, random_state=i).copy(deep=True)
            df_sample = preprocess_pipeline(df_sample, config_sample)
            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)

            if len(X.columns) > 0:
                selected_columns += select_features(X, y, config["mode"])
            else:
                break

            # Size of the frame if only the selected columns were kept.
            # ``axis`` must be a keyword: pandas 2.0 no longer accepts it
            # positionally.
            unselected = list(set(df) - set(selected_columns))
            df_size_mb = df.drop(
                unselected, axis=1,
                errors='ignore').memory_usage(deep=True).sum() / 1024 / 1024
            if df_size_mb < size_limit_mb:
                break

        selected_columns = list(set(selected_columns))
        log("Selected columns: {}".format(selected_columns))

        drop_number_columns = [
            c for c in df
            if (c.startswith("number_") or c.startswith("id_"))
            and c not in selected_columns
        ]
        if len(drop_number_columns) > 0:
            config["drop_number_columns"] = drop_number_columns

        drop_datetime_columns = [
            c for c in df
            if c.startswith("datetime_") and c not in selected_columns
        ]
        if len(drop_datetime_columns) > 0:
            config["drop_datetime_columns"] = drop_datetime_columns

    if "drop_number_columns" in config:
        log("Drop number columns: {}".format(config["drop_number_columns"]))
        df.drop(config["drop_number_columns"], axis=1, inplace=True)

    if "drop_datetime_columns" in config:
        log("Drop datetime columns: {}".format(
            config["drop_datetime_columns"]))
        df.drop(config["drop_datetime_columns"], axis=1, inplace=True)
def main(args):
    """Build a word2vec model, train and save it unless testing, then print checks.

    The model class is picked from ``mconf.model`` (``'cbow'`` or skip-gram
    otherwise) and receives ``args.mode``. When ``args.mode`` is not
    ``'test'`` the model is trained and saved to ``dconf.saved_file``.
    """
    # load configs
    dconf_path = 'config/data.json'
    mconf_path = 'config/word2vec.json'
    dconf = Config(dconf_path)
    mconf = Config(mconf_path)

    # load w2v model and train
    if mconf.model == 'cbow':
        w2v = CbowModel(dconf, mconf, args.mode)
    else:
        w2v = SkipGramModel(dconf, mconf, args.mode)

    if args.mode != 'test':
        w2v.train()
        w2v.save(dconf.saved_file)

    # test w2v: nearest neighbours and two similarity checks
    word = 'hospital'
    print(w2v.nearest(word))
    print(w2v.similarity(word, 'attacks').item())
    print(w2v.similarity(word, word).item())
def fillna(df: pd.DataFrame, config: Config, args: dict = None):
    """Fill missing values in ``df`` in place.

    With a non-empty ``args`` mapping (column-name prefix -> options), the
    fill values are computed during training by ``calc_columns_metric``
    from the optional ``'agg'`` / ``'value'`` options. They are stored under
    ``config['stages'][config["stage"]][prefix]`` and applied with
    ``fillna_columns`` in every run.

    Without ``args`` the defaults are ``-1`` for ``number_*`` columns,
    ``""`` for ``string_*`` columns and 1970-01-01 for ``datetime_*``.

    Args:
        df: frame to modify in place.
        config: run configuration.
        args: optional per-prefix options. ``None`` or an empty mapping
            selects the defaults.
    """
    # ``None`` replaces a shared mutable ``{}`` default.
    if args:
        for k, v in args.items():
            if config.is_train():
                lst_columns = [c for c in df if c.startswith(k)]
                config['stages'][config["stage"]][k] = {
                    'lst_columns': lst_columns
                }
                if len(lst_columns) != 0 and ('agg' in v or 'value' in v):
                    s_fillna_values = calc_columns_metric(
                        df,
                        lst_columns,
                        metric=v['agg'] if 'agg' in v else None,
                        value=v['value'] if 'value' in v else None)
                    config['stages'][config["stage"]][k][
                        'fillna_values'] = deepcopy(s_fillna_values)

            # Applied in every run, using what the training run stored.
            if len(config['stages'][config["stage"]][k]['lst_columns']) != 0:
                fillna_columns(
                    df, config['stages'][config["stage"]][k]['fillna_values'])
    else:
        defaults = (
            ("number_", -1),
            ("string_", ""),
            ("datetime_", datetime.datetime(1970, 1, 1)),
        )
        for prefix, fill_value in defaults:
            for c in [c for c in df if c.startswith(prefix)]:
                # Assign the result back to the frame. An in-place fillna on
                # ``df[c]`` works on a temporary Series and does not update
                # ``df`` under pandas Copy-on-Write.
                df[c] = df[c].fillna(fill_value)
def subsample(df: pd.DataFrame, config: Config, max_size_mb: float=2.0):
    """Shrink ``df`` in place to roughly ``max_size_mb`` during training.

    Only training runs are affected. When the frame's deep memory usage
    exceeds ``max_size_mb``, rows are discarded via ``train_test_split``
    with a fixed seed, keeping ``sample_rows`` of them.
    ``config["nrows"]`` receives the resulting row count.
    """
    if not config.is_train():
        return

    df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
    if not df_size_mb > max_size_mb:
        config["nrows"] = len(df)
        return

    mem_per_row = df_size_mb / len(df)
    sample_rows = int(max_size_mb / mem_per_row)
    message = "Size limit exceeded: {:0.2f} Mb. Dataset rows: {}. Subsample to {} rows."
    log(message.format(df_size_mb, len(df), sample_rows))

    # The second part of the split holds the rows to discard.
    discarded = train_test_split(df, train_size=sample_rows, random_state=1)[1]
    df.drop(discarded.index, inplace=True)
    config["nrows"] = sample_rows
def check_columns_exists(df: pd.DataFrame,
                         config: Config,
                         key_stage: str,
                         drop_columns_test: bool = True):
    """Record the training columns for ``key_stage`` or check a later frame against them.

    Training run: stores the set of ``df`` columns (minus the target field)
    under ``config['params']['pipeline'][config["stage"]]['columns_exists'][key_stage]``.

    Other runs: compares ``df`` with the stored set. Columns that were seen
    in training but are absent now raise. Extra columns are dropped in place
    when ``drop_columns_test`` is true, otherwise they raise too.

    Raises:
        ValueError: the target column is missing in training, recorded
            columns are missing from ``df``, extra columns are present with
            ``drop_columns_test=False``, or ``key_stage`` was never recorded.
    """
    field_target_name = config['params']['field_target_name']

    if config.is_train():
        if not 'columns_exists' in config['params']['pipeline'][
                config["stage"]]:
            config['params']['pipeline'][
                config["stage"]]['columns_exists'] = {}
        if not field_target_name in df.columns:
            raise ValueError(
                'Column y="{0}" not exists in train dataset'.format(
                    field_target_name))
        # Remember every feature column present at this point of training.
        config['params']['pipeline'][config["stage"]]['columns_exists'][key_stage] = \
            set([x for x in df.columns if x!=field_target_name])

    elif 'columns_exists' in config['params']['pipeline'][config["stage"]]:
        if key_stage in config['params']['pipeline'][
                config["stage"]]['columns_exists']:
            # Columns recorded in training but absent from this frame.
            set_columns = config['params']['pipeline'][
                config["stage"]]['columns_exists'][key_stage] - set(df.columns)
            if len(set_columns) != 0:
                raise ValueError(
                    'Columns "{0}" not exists in test dataset on stage {1}'.
                    format(str(set_columns), key_stage))

            # Columns present in this frame but never recorded in training.
            set_columns = set(df.columns) - config['params']['pipeline'][
                config["stage"]]['columns_exists'][key_stage]
            if len(set_columns) != 0:
                if drop_columns_test:
                    df.drop(columns=[x for x in set_columns], inplace=True)
                else:
                    raise ValueError(
                        'Columns "{0}" not exists in train dataset on stage {1}'
                        .format(str(set_columns), key_stage))
        else:
            raise ValueError(
                'Preprocess stage "{0}" not exists'.format(key_stage))
def scale(df: pd.DataFrame, config: Config):
    """Standardize numeric columns of ``df`` in place as ``float32``.

    Candidate columns start with ``number_``, are not ``int8`` and are not
    listed in ``config["categorical_columns"]``. In a training run a
    ``StandardScaler`` is fitted and stored, together with the column list,
    under ``config['stages'][config["stage"]]``. The stored scaler and
    column list are then used for the transform in every run.
    """
    warnings.filterwarnings(action='ignore', category=DataConversionWarning)

    scale_columns = []
    for name in df:
        if not name.startswith("number_"):
            continue
        if df[name].dtype == np.int8:
            continue
        if name in config["categorical_columns"]:
            continue
        scale_columns.append(name)

    if not scale_columns:
        return

    stage_state = config['stages'][config["stage"]]
    if config.is_train():
        stage_state['scale_columns'] = deepcopy(scale_columns)
        stage_state['model'] = StandardScaler(copy=False)
        stage_state['model'].fit(df[scale_columns].astype(np.float32))

    # Always transform the columns remembered from training.
    fitted_columns = stage_state['scale_columns']
    features = df[fitted_columns].astype(np.float32)
    df[fitted_columns] = stage_state['model'].transform(features).astype(np.float32)
def subsample(df: pd.DataFrame, config: Config):
    """Shrink ``df`` in place during training when it exceeds the memory limit.

    The limit is ``config['params']['memory']['max_size_mb']``; the current
    size and target row count come from ``get_sample_rows``. Rows are
    discarded via ``train_test_split`` with a fixed seed.
    ``config["nrows"]`` is updated and ``config["nrows_stage_nb"]`` is set
    to the current ``config["stage_nb"]``.
    """
    if config.is_train():
        # df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        df_size_mb, sample_rows = get_sample_rows(df, config)
        if df_size_mb > config['params']['memory']['max_size_mb']:
            # mem_per_row = df_size_mb / len(df)
            # sample_rows = int(config['params']['memory']['max_size_mb'] / mem_per_row)
            log("Size limit exceeded: {:0.2f} Mb. Dataset rows: {}. Subsample to {} rows." \
                .format(df_size_mb, len(df), sample_rows), config.verbose)
            # The second part of the split holds the rows to discard.
            _, df_drop = train_test_split(df, train_size=sample_rows, random_state=1)
            df.drop(df_drop.index, inplace=True)
            config["nrows"] = sample_rows
        elif config["nrows_stage_nb"] == 0:
            # nrows_stage_nb is still 0: take the larger of the two counts.
            config["nrows"] = max(sample_rows, len(df))
        else:
            # nrows_stage_nb is non-zero: never raise the stored count.
            config["nrows"] = min(sample_rows, config["nrows"])
        # Remember which stage last updated nrows.
        config["nrows_stage_nb"] = config["stage_nb"]
import pickle
from lib.util import Config
from lib.kor2eng import LangTranslator
from lib.util import load_data
from lib.data_preprocess import Vocab, preprocessor
from lib.model.seq2seq import BiLSTMSeq2Seq
from transformers.lib.model.transformer import Transformer

# Make this file's directory importable.
# NOTE(review): ``os``, ``sys`` and ``torch`` are used below but their imports
# are not visible in this part of the file; confirm they exist above.
file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)

# load configs
dconf_path = 'config/data.json'
mconf_path = 'config/lm.json'
dconf = Config(dconf_path)
mconf = Config(mconf_path)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")
print('Using device:', device)

# try:
#     with open('preprocessed_data.pickle', 'rb') as f:
#         saved_obj = pickle.load(f)
#     ko_corpus, ko_vocab, en_corpus, en_vocab = saved_obj
# except:
#
#     # load & preprocess corpus
#     ko_corpus = preprocessor(load_data(dconf.train_ko_path), lang='ko')
#     en_corpus = preprocessor(load_data(dconf.train_en_path), lang='en')
def train_lightgbm(X: pd.DataFrame, y: pd.Series, config: Config):
    """Tune LightGBM, then fit one model per fold of a 5-fold CV.

    Hyperparameters come from ``hyperopt_lightgbm`` on the ``data_sample``
    subset. The fold models are collected in ``config["model"]`` (a list)
    and the per-fold validation scores plus their mean are written to
    ``<config['path_pred']>/cv_score_<i>.csv``.

    NOTE(review): both time checks end in ``pass`` (the ``break`` is
    commented out), so no fold or bagging round is ever skipped.
    """
    params = {
        "objective": "regression" if config["mode"] == "regression" else "binary",
        "metric": "rmse" if config["mode"] == "regression" else "auc",
        "verbosity": -1,
        "seed": 1,
    }

    X_sample, y_sample = data_sample(X, y)
    hyperparams = hyperopt_lightgbm(X_sample, y_sample, params, config)

    # Single bagging round; the loop variable also seeds the model.
    for i in range(1):
        print(
            '################################################################## cv '
            + str(i))
        t1_bagging = time.time()
        params['seed'] = i + 1

        # cv
        nfold = 5
        if config["mode"] == 'classification':
            skf = StratifiedKFold(n_splits=nfold,
                                  shuffle=True,
                                  random_state=777)
        else:
            skf = KFold(n_splits=nfold, shuffle=True, random_state=777)
        skf_split = skf.split(X, y)

        log('####################################################################### begin cv'
            )
        log('####### cur time = ' +
            str(datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S")))

        score_list = []
        config["model"] = []
        for fid, (train_idx, valid_idx) in enumerate(skf_split):
            t1_cv = time.time()
            print("FoldID:{}".format(fid))
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]
            dtrain = lgb.Dataset(X_train, label=y_train)
            dvalid = lgb.Dataset(X_valid, label=y_valid, reference=dtrain)
            cur_model = lgb.train({
                **params,
                **hyperparams
            },
                                  dtrain,
                                  3000,
                                  dvalid,
                                  early_stopping_rounds=50,
                                  verbose_eval=100)
            config["model"].append(cur_model)
            score_list.append(cur_model.best_score)
            # gc.collect()
            sys.stdout.flush()
            t2_cv = time.time()
            time_left = config.time_left()
            print('######### cv' + str(time_left))
            # Projected time for the remaining folds vs. the time left.
            if (t2_cv - t1_cv) * (nfold - fid + 1) >= time_left:
                pass  #break

        log('######################################################################### end cv'
            )
        log('####### cur time = ' +
            str(datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S")))

        # The comprehension's ``i`` is local to it and does not clobber the
        # loop variable used for the file name below.
        valid_auc = np.array(
            [i['valid_0'][params['metric']] for i in score_list])
        print('valid', valid_auc, np.mean(valid_auc))

        # Per-fold scores followed by their mean.
        cv_score = pd.DataFrame(
            {'cv': np.hstack([valid_auc, np.mean(valid_auc)])})
        path = config['path_pred']
        print(path)
        cv_score.to_csv(path + '/cv_score_' + str(i) + '.csv', index=False)

        t2_bagging = time.time()
        time_left = config.time_left()
        print('#########bagging' + str(time_left))
        if (t2_bagging - t1_bagging) * 1.5 >= time_left:
            #break
            pass
def __init__(self, model_dir: str):
    """Create ``model_dir`` if it does not exist and bind a ``Config`` to it."""
    os.makedirs(name=model_dir, exist_ok=True)
    config = Config(model_dir)
    self.config = config
def time_series_detect(df: pd.DataFrame, config: Config):
    """Detect a useful sort order and shifted features, then apply the shifts.

    Training run: for each candidate sort key, ``df`` is sorted in place and
    two LightGBM models are compared on the last ``sample_size`` rows, one
    with the ``_shift`` columns and one without. Sort keys whose score ratio
    reaches 1.03 are kept. The best one is stored in
    ``config["sort_values"]`` and the most important shifted columns in
    ``config["shift_columns"]``.

    Any run: when ``config`` holds ``"shift_columns"``, ``shift_columns`` is
    applied to ``df``.
    """
    sample_size = 10000
    model_params = {
        "objective": "regression" if config["mode"] == "regression" else "binary",
        "metric": "rmse" if config["mode"] == "regression" else "auc",
        "learning_rate": 0.01,
        "verbosity": -1,
        "seed": 1,
        "max_depth": -1,
    }

    if config.is_train():
        datetime_columns = [c for c in df if c.startswith("datetime_")]
        id_columns = [c for c in df if c.startswith("id_")]

        # Candidate sort keys: each datetime column alone and with each id.
        sort_columns = []
        for dc in datetime_columns:
            sort_columns.append([dc])
            for ic in id_columns:
                sort_columns.append([ic, dc])
        else:
            # ``for ... else`` without ``break``: this always runs and adds
            # the id-only sort keys.
            for ic in id_columns:
                sort_columns.append([ic])

        scores = []
        config.limit_time_fraction(0.1)
        for sc in sort_columns:
            if config.is_time_fraction_limit():
                break

            Log.silent(True)
            df.sort_values(sc, inplace=True)

            config_sample = copy.deepcopy(config)
            df_sample = df.iloc[-sample_size:].copy() if len(df) > sample_size else df.copy()
            df_sample = df_sample[[c for c in df_sample if c.startswith("number_") or c == "target" or c in sc]]
            shift_columns(df_sample, group= sc[0] if len(sc) > 1 else None)
            transform(df_sample, config_sample)

            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)
            X_train, X_test, y_train, y_test = ts_split(X, y, test_size=0.5)

            # Model that sees the shifted columns.
            model_sorted = lgb.train(model_params, lgb.Dataset(X_train, label=y_train), 3000, lgb.Dataset(X_test, label=y_test),
                                     early_stopping_rounds=100, verbose_eval=False)
            score_sorted = model_sorted.best_score["valid_0"][model_params["metric"]]

            # Baseline model without any ``_shift`` column.
            sampled_columns = [c for c in X if "_shift" not in c]
            model_sampled = lgb.train(model_params, lgb.Dataset(X_train[sampled_columns], label=y_train), 3000, lgb.Dataset(X_test[sampled_columns], label=y_test),
                                      early_stopping_rounds=100, verbose_eval=False)
            score_sampled = model_sampled.best_score["valid_0"][model_params["metric"]]

            # Negate the classification metric so that lower is better.
            if config.is_classification():
                score_sorted = -score_sorted
                score_sampled = -score_sampled

            Log.silent(False)
            Log.print("Sort: {}. Score sorted: {:0.4f}. Score sampled: {:0.4f}".format(sc, score_sorted, score_sampled))

            score_ratio = score_sampled / score_sorted if config.is_regression() else abs(score_sorted / score_sampled)
            if score_ratio >= 1.03:
                Log.print(score_ratio)
                scores.append((score_sorted, sc))

        if len(scores) > 0:
            # Lowest score first: the best sort key ends up at index 0.
            scores = sorted(scores, key=lambda x: x[0])
            Log.print("Scores: {}".format(scores))
            config["sort_values"] = scores[0][1]
            df.sort_values(config["sort_values"], inplace=True)

            config_sample = copy.deepcopy(config)
            df_sample = df.iloc[-sample_size:].copy() if len(df) > sample_size else df.copy()
            shift_columns(df_sample, group=config["sort_values"][0] if len(config["sort_values"]) > 1 else None)
            transform(df_sample, config_sample)

            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)

            # Keep shifted columns in the top quartile of positive gain.
            model = lgb.train(model_params, lgb.Dataset(X, label=y), 1000)
            fi = pd.Series(model.feature_importance(importance_type="gain"), index=X.columns)
            fi = fi[fi > 0].sort_values()
            selected_columns = fi[fi >= fi.quantile(0.75)].index.tolist()

            selected_shift_columns = [c.replace("_shift", "") for c in selected_columns if "_shift" in c]
            if len(selected_shift_columns) > 0:
                Log.print("Shift columns: {}".format(selected_shift_columns))
                config["shift_columns"] = selected_shift_columns

    if "shift_columns" in config:
        shift_columns(df, group=config["sort_values"][0] if len(config["sort_values"]) > 1 else None, number_columns=config["shift_columns"])
import torch import pickle from lib.util import Config from lib.kor2eng import LangTranslator from lib.util import load_data from lib.data_preprocess import Vocab, preprocessor from lib.model.seq2seq import BiLSTMSeq2Seq from transformers.lib.model.transformer import Transformer # load configs dconf_path = 'config/data.json' mconf_path = 'config/lm.json' dconf = Config(dconf_path) mconf = Config(mconf_path) device = torch.device("cuda") if torch.cuda.is_available() else torch.device( "cpu") print('Using device:', device) with open('preprocessed_data.pickle', 'rb') as f: saved_obj = pickle.load(f) _, ko_vocab, _, en_vocab = saved_obj # define lm model if mconf.model == 'seq2seq': model = BiLSTMSeq2Seq( len(ko_vocab) + 1, len(en_vocab) + 1, mconf.emb_dim, mconf.d_m) elif mconf.model == 'transformers': model = Transformer(mconf.d_m,
def train_lightgbm(X: pd.DataFrame, y: pd.Series, stored_models_key: str,
                   save_to_disk: bool, config: Config):
    """Run a time-boxed hyperopt search over LightGBM CV models.

    One hyperopt evaluation is added per loop iteration until fewer than
    about 25 seconds of ``config["time_limit"]`` would remain. Each
    evaluation trains through ``train_lightgbm_cv`` and, if time allows,
    hands its models to ``save_model`` under ``stored_models_key``.
    ``config[stored_models_key]`` is reset to an empty list at the start.

    NOTE(review): no ``return`` is visible in this block; ``seed`` and
    ``best`` are assigned but not used here.
    """
    config[stored_models_key] = []

    # Build the dataset once; free_raw_data=False keeps the raw data available.
    data = lgb.Dataset(X, label=y, free_raw_data=False)
    data.construct()
    gc.collect()

    params = {
        "objective": config["objective"],
        "metric": config["metric"],
        "seed": config["seed"],
        'num_threads': config['n_threads'],
        "verbosity": -1,
    }
    seed = config["seed"]

    space = {
        "learning_rate": hp.uniform("learning_rate", 0.01, 0.4),
        "max_depth": hp.choice("max_depth", [-1, 2, 3, 4, 5, 6, 10]),
        "num_leaves": hp.choice("num_leaves", np.linspace(4, 200, 50, dtype=int)),
        "feature_fraction": hp.quniform("feature_fraction", 0.1, 1., 0.1),
        "bagging_fraction": hp.quniform("bagging_fraction", 0.1, 1., 0.1),
        "bagging_freq": hp.choice("bagging_freq", np.linspace(0, 20, 10, dtype=int)),
        "reg_alpha": hp.uniform("reg_alpha", 0, 30),
        "reg_lambda": hp.uniform("reg_lambda", 0, 30),
        "min_child_weight": hp.uniform('min_child_weight', 1e-10, 20),
        "max_bin": hp.choice('max_bin', [50, 100, 255]),
        # Nested choice: the picked dict is merged into the flat
        # hyperparameters inside ``objective``.
        'boosting_type': hp.choice(
            'boosting_type',
            [
                {
                    'boosting_type': 'gbdt',
                },
                {
                    'boosting_type': 'dart',
                    'drop_rate': hp.uniform('drop_rate', 0.01, 0.6),
                    'max_drop': hp.choice(
                        "max_drop",
                        np.linspace(5, config["train_num_boost_round"] * .9, 10, dtype=int)),
                    'skip_drop': hp.uniform('skip_drop', 0.1, 0.7),
                },
                # {'boosting_type': 'rf',
                #  'bagging_freq': 1,
                # },
                # {'boosting_type': 'goss',
                #  'bagging_freq': 0,
                # },
            ]),
        #train params
        'early_stopping_rounds': hp.choice("early_stopping_rounds", [None, 50]),
        'cv_splits': hp.choice("cv_splits", np.linspace(3, 12, 10, dtype=int)),  # [4,8]
        'shuffle': hp.choice("shuffle", [True, False]),
    }
    if config.is_classification():
        space['scale_pos_weight'] = hp.uniform('scale_pos_weight', 0.5, 10)
    else:
        space['objective'] = hp.choice(
            "objective",
            [
                'regression',
                'huber',
                # 'fair',
                # 'regression_l1',
            ])

    def objective(space_sample):
        """Train one sampled configuration with CV and report it to hyperopt."""
        iteration_start = time.time()
        hyperparams = copy.deepcopy(space_sample)

        # Flatten the nested boosting_type choice into the parameter dict.
        boosting_type = {}
        if 'boosting_type' in hyperparams.keys():
            boosting_type = hyperparams.pop('boosting_type')
        hyperparams = {**params, **hyperparams, **boosting_type}

        scores, models, y_oof = train_lightgbm_cv(data=data,
                                                  hyperparams=hyperparams,
                                                  config=config)
        # Negate the classification score so that lower loss is better.
        if config.is_classification():
            scores['oof'] = -scores['oof']

        iteration_time = time.time() - iteration_start
        log('iteration time %.1f, loss %.5f' % (iteration_time, scores['oof']))

        elapsed_time = (time.time() - config['start_time'])
        have_time = (config["time_limit"] - elapsed_time - iteration_time) > 25
        if have_time:
            save_model(models, hyperparams, scores, y_oof, stored_models_key,
                       save_to_disk, config)
            status = STATUS_OK
        else:
            # Not enough time left: the trial is reported as failed.
            status = STATUS_FAIL

        return {
            'loss': scores['oof'],
            'runtime': iteration_time,
            'scores': scores,
            'models': models,
            'y_oof': y_oof,
            'status': status
        }

    have_time = True
    eval_n = 0
    trials = Trials()
    while have_time:
        iteration_start = time.time()
        # max_evals grows by one per pass, so each fmin call adds one trial
        # to the shared ``trials`` history.
        best = hyperopt.fmin(
            fn=objective,
            space=space,
            trials=trials,
            algo=tpe.suggest,
            max_evals=eval_n + 1,
            verbose=1,
            rstate=np.random.RandomState(eval_n)
        )  #TODO: (bug) if seed the same - in some cases it samples same values forever
        iteration_time = time.time() - iteration_start
        elapsed_time = (time.time() - config['start_time'])
        have_time = (config["time_limit"] - elapsed_time - iteration_time) > 25
        eval_n += 1
class AutoML:
    """Train/predict driver with holiday post-processing, backed by ``Config``."""

    def __init__(self, model_dir: str):
        """Create ``model_dir`` if needed and bind a ``Config`` to it."""
        os.makedirs(model_dir, exist_ok=True)
        self.config = Config(model_dir)

    def train(self, train_csv: str, mode: str):
        """Load the holiday list, read ``train_csv``, preprocess and fit.

        The holiday strings come from ``./holiday.csv`` (column ``holiday``)
        and are stored as a set in ``config['holiday']``.
        """
        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config["model"] = {}
        self.config["ensemble"] = {"lgb": 1}
        self.config.tmp_dir = os.path.join(self.config.model_dir, "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        # load holiday
        path_holiday = './holiday.csv'
        holiday = pd.read_csv(path_holiday,
                              encoding='utf-8',
                              low_memory=False,
                              dtype={'holiday': str})['holiday'].values
        self.config['holiday'] = set(holiday)

        df = read_df(train_csv, self.config)
        print(df.shape)

        holiday_detect(df, self.config)
        preprocess(df, self.config)

        y = df["target"]
        X = df.drop("target", axis=1)
        train(X, y, self.config)

    def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
        """Predict ``test_csv`` chunk by chunk and write ``prediction_csv``.

        When ``config`` holds ``'holiday_detect'`` (the name of a datetime
        column), predictions are set to 0 for rows that fall on a listed
        holiday or on a Saturday/Sunday.

        Returns:
            ``(result, score)``: ``score`` comes from ``validate`` when a
            target file exists, otherwise ``None``.
        """
        self.config["task"] = "predict"
        # os.path.join keeps the scratch directory relative when
        # prediction_csv has no directory part. Plain concatenation with
        # "/tmp" would point at the filesystem root in that case.
        self.config.tmp_dir = os.path.join(os.path.dirname(prediction_csv), "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        result = {"line_id": [], "prediction": []}
        if 'holiday_detect' in self.config:
            result["datetime"] = []

        for X in pd.read_csv(test_csv,
                             encoding="utf-8",
                             low_memory=False,
                             dtype=self.config["dtype"],
                             parse_dates=self.config["parse_dates"],
                             chunksize=self.config["nrows"]):
            # Capture ids and raw datetimes before preprocess() alters X.
            result["line_id"] += list(X["line_id"])
            if 'holiday_detect' in self.config:
                dt_fea = self.config['holiday_detect']
                result["datetime"] += list(X[dt_fea])

            preprocess(X, self.config)
            result["prediction"] += list(predict(X, self.config))

        result = pd.DataFrame(result)

        # post process for holiday
        if 'holiday_detect' in self.config:
            holiday = self.config['holiday']
            for idx, row in result.iterrows():
                dt = row['datetime']
                # Date part of the timestamp's string form.
                dt_str = str(dt).split(' ')[0].strip()
                if dt_str in holiday or dt.weekday() == 5 or dt.weekday() == 6:
                    result.loc[idx, 'prediction'] = 0
            result.drop(["datetime"], axis=1, inplace=True)

        result.to_csv(prediction_csv, index=False)

        # The target file name is derived by textual substitution on the path.
        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"])
        else:
            score = None

        return result, score

    @timeit
    def save(self):
        """Persist the configuration via ``Config.save``."""
        self.config.save()

    @timeit
    def load(self):
        """Restore the configuration via ``Config.load``."""
        self.config.load()
def non_negative_target_detect(df: pd.DataFrame, config: Config):
    """Record in ``config["non_negative_target"]`` whether no target is below zero.

    Only training runs write the flag; other runs leave ``config`` untouched.
    """
    if not config.is_train():
        return

    negative_count = df["target"].lt(0).sum()
    config["non_negative_target"] = negative_count == 0
class AutoML:
    """Train/predict driver that keeps all state in a ``Config`` object."""

    def __init__(self, model_dir: str):
        """Create ``model_dir`` if needed and bind a ``Config`` to it."""
        os.makedirs(model_dir, exist_ok=True)
        self.config = Config(model_dir)

    def train(self, train_csv: str, mode: str):
        """Read ``train_csv``, preprocess it and fit on the ``target`` column.

        The LightGBM objective/metric pair is derived from ``mode``, and the
        final feature column order is stored in ``config["columns"]``.
        """
        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config["objective"] = "regression" if mode == "regression" else "binary"
        self.config["metric"] = "rmse" if mode == "regression" else "auc"
        self.config.tmp_dir = os.path.join(self.config.model_dir, "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(train_csv, self.config)
        df = preprocess(df, self.config)

        y = df["target"].copy()
        X = df.drop("target", axis=1).copy()
        # Release the combined frame before training starts.
        del df
        gc.collect()

        self.config["columns"] = list(X)
        train(X, y, self.config)

    def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
        """Predict the whole of ``test_csv`` at once and write ``prediction_csv``.

        Returns:
            ``(result, score)``: ``result`` has columns ``line_id`` and
            ``prediction``. ``score`` comes from ``validate`` when a target
            file exists, otherwise ``None``.
        """
        self.config["task"] = "predict"
        # os.path.join keeps the scratch directory relative when
        # prediction_csv has no directory part. Plain concatenation with
        # "/tmp" would point at the filesystem root in that case.
        self.config.tmp_dir = os.path.join(os.path.dirname(prediction_csv), "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)
        self.config["prediction_csv"] = prediction_csv
        self.config["line_id"] = []
        self.config["start_time"] = time.time()

        result = {
            "line_id": [],
            "prediction": [],
        }

        X = pd.read_csv(
            test_csv,
            encoding="utf-8",
            low_memory=False,
            dtype=self.config["dtype"],
            parse_dates=self.config["parse_dates"],
        )
        self.config["line_id"] = X["line_id"].values
        result["line_id"] = (X["line_id"].values)

        X = preprocess(X, self.config)
        X = X[self.config["columns"]]  # for right columns order
        result["prediction"] = predict(X, self.config)

        result = pd.DataFrame(result)
        result.to_csv(prediction_csv, index=False)

        # The target file name is derived by textual substitution on the path.
        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"],
                             self.config)
        else:
            score = None

        return result, score

    @timeit
    def save(self):
        """Persist the configuration via ``Config.save``."""
        self.config.save()

    @timeit
    def load(self):
        """Restore the configuration via ``Config.load``."""
        self.config.load()
class AutoML:
    """Pipeline-driven train/predict entry point with per-stage timing."""

    def __init__(self, model_dir: str, params: dict, verbose: int = 0):
        """Build the ``Config`` and fill in defaults for missing parameters.

        Args:
            model_dir: Passed through to ``Config``.
            params: Passed through to ``Config``; read back via
                ``config['params']`` and completed in place with defaults.
            verbose: Verbosity level, stored on the instance and passed to
                ``Config``.
        """
        self.config = Config(model_dir, params, verbose=verbose)
        self.verbose = verbose

        # Only fill in keys that are absent; present values are left alone.
        cfg_params = self.config['params']
        memory = cfg_params.setdefault('memory', {})
        memory.setdefault('max_size_mb', 2)
        memory.setdefault('max_size_train_samples', 10000)
        cfg_params.setdefault('field_target_name', 'target')

    @timeit
    def train(self, train_csv: str, mode: str):
        """Read ``train_csv`` and run the pipeline on it in train mode.

        Args:
            train_csv: Path to the training CSV.
            mode: Stored under ``config["mode"]``.
        """
        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config['stages_time'] = {}

        self.config.tmp_dir = os.path.join(self.config.model_dir, "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        start_time = time.time()
        df = read_df(train_csv, self.config)
        stage_time_inc(self.config, start_time, 'train read_df')

        pipeline(df, self.config)

        if self.config.verbose:
            self.stages_time_print()

    @timeit
    def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
        """Run the pipeline over ``test_csv`` in chunks and write predictions.

        Args:
            test_csv: Path to the CSV to score; must contain ``"line_id"``.
            prediction_csv: Output path for the ``line_id``/``prediction``
                table.

        Returns:
            ``(result, score)`` where ``result`` is the written frame and
            ``score`` is the value returned by ``validate`` when the derived
            target file exists, otherwise ``None``.
        """
        self.config["task"] = "predict"
        self.config.tmp_dir = os.path.join(os.path.dirname(prediction_csv), "tmp")
        self.config['stages_time'] = {}
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        result = {
            "line_id": [],
            "prediction": [],
        }

        start_time = time.time()
        chunks = pd.read_csv(
            test_csv,
            encoding="utf-8",
            low_memory=False,
            dtype=self.config["dtype"],
            parse_dates=self.config["parse_dates"],
            chunksize=self.config["nrows"],
        )
        for X in chunks:
            # Time spent fetching this chunk from the reader.
            stage_time_inc(self.config, start_time, 'test pd.read_csv')

            result["line_id"].extend(X["line_id"])
            pipeline(X, self.config)
            # Predictions are read from the column named by the first
            # element of the last graph entry, looked up after the pipeline
            # has run on this chunk.
            result["prediction"].extend(X[self.config['graph'][-1][0]])

            start_time = time.time()

        result = pd.DataFrame(result)
        result.to_csv(prediction_csv, index=False)
        stage_time_inc(self.config, start_time, 'result.to_csv')

        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            start_time = time.time()
            score = validate(result, target_csv, self.config["mode"], self.config.verbose)
            stage_time_inc(self.config, start_time, 'validate')
        else:
            score = None

        if self.config.verbose:
            self.stages_time_print()

        return result, score

    def stages_time_print(self, sort_by_time=True):
        """Print the recorded stage timings as an aligned table.

        Does nothing when the config has no ``'stages_time'`` entry.

        Args:
            sort_by_time: When true, list stages from slowest to fastest;
                otherwise keep the dict's own order.
        """
        if 'stages_time' not in self.config.data.keys():
            return

        d = self.config['stages_time']
        print('\n', '-'*3, 'Pipeline stages time, sec:', '-'*3)

        # default=0 keeps an empty timing table from raising ValueError.
        l_just = max((len(name) for name in d), default=0) + 4

        if sort_by_time:
            names = sorted(d, key=d.get, reverse=True)
        else:
            names = list(d)

        for name in names:
            print(name.replace('\n', '_').ljust(l_just), '{:<10} {:.2f}'.format(' ', d[name]))
        print('-'*34, '\n')

    def pipeline_draw(self, file_name='AutoML_pipeline.gv', view=False):
        """Build a ``Digraph`` with one edge per entry of ``config['graph']``.

        Args:
            file_name: Passed to ``Digraph`` as ``filename``.
            view: When true, call ``view()`` on the graph before returning.

        Returns:
            The constructed ``Digraph``.
        """
        g = Digraph('G', filename=file_name)
        for edge in self.config['graph']:
            g.edge(edge[0], edge[1])
        if view:
            g.view()
        return g

    @timeit
    def save(self):
        """Delegate to ``Config.save``."""
        self.config.save()

    @timeit
    def load(self):
        """Delegate to ``Config.load``, then re-apply this instance's verbosity."""
        self.config.load()
        self.config.verbose = self.verbose