def train(model_dict, dfX, cols_family, post_process_fun):
    """ Train the model using model_dict, save model, save prediction
    :param model_dict:  dict containing params
    :param dfX:  pd.DataFrame
    :param cols_family:  dict of list containing column names
    :param post_process_fun:
    :return: dfXtrain, dfXval DataFrames containing predictions, and the stats dict.
    """
    model_pars, compute_pars = model_dict['model_pars'], model_dict['compute_pars']
    data_pars                = model_dict['data_pars']
    model_name, model_path   = model_pars['model_class'], model_dict['global_pars']['path_train_model']
    metric_list              = compute_pars['metric_list']

    assert 'cols_model_type2' in data_pars, 'Missing cols_model_type2, split of columns by data type '
    log2(data_pars['cols_model_type2'])

    log("#### Model Input preparation ##################################################")
    log2(dfX.shape)
    dfX    = dfX.sample(frac=1.0)
    itrain = int(0.6 * len(dfX))
    ival   = int(0.8 * len(dfX))
    colsX  = data_pars['cols_model']
    coly   = data_pars['coly']
    log2('Model colsX', colsX)
    log2('Model coly',  coly)
    log2('Model column type: ', data_pars['cols_model_type2'])

    ### Only parameters, no data
    data_pars_ref = copy.deepcopy(data_pars)

    #### TODO : Lazy Dict to handle large datasets
    data_pars['data_type'] = 'ram'
    data_pars['train'] = {'Xtrain': dfX[colsX].iloc[:itrain, :],
                          'ytrain': dfX[coly].iloc[:itrain],
                          'Xtest':  dfX[colsX].iloc[itrain:ival, :],
                          'ytest':  dfX[coly].iloc[itrain:ival],
                          'Xval':   dfX[colsX].iloc[ival:, :],
                          'yval':   dfX[coly].iloc[ival:],
                          }

    log("#### Init, Train ############################################################")
    # from config_model import map_model
    modelx = map_model(model_name)
    log2(modelx)
    modelx.reset()
    ### data_pars_ref has NO data.
    modelx.init(model_pars, data_pars=data_pars_ref, compute_pars=compute_pars)

    ### Using actual data in data_pars['train']
    modelx.fit(data_pars, compute_pars)

    log("#### Predict ################################################################")
    ypred, ypred_proba = modelx.predict(dfX[colsX], data_pars=data_pars_ref, compute_pars=compute_pars)

    dfX[coly + '_pred'] = ypred  # y_norm(ypred, inverse=True)
    dfX[coly]           = dfX[coly].apply(lambda x: post_process_fun(x))
    dfX[coly + '_pred'] = dfX[coly + '_pred'].apply(lambda x: post_process_fun(x))

    if ypred_proba is None:            ### No proba
        ypred_proba_val = None

    elif len(ypred_proba.shape) <= 1:  #### Single-dim proba
        ypred_proba_val      = ypred_proba[ival:]
        dfX[coly + '_proba'] = ypred_proba

    elif len(ypred_proba.shape) > 1:   ### Multiple proba columns
        from util_feature import np_conv_to_one_col
        ypred_proba_val      = ypred_proba[ival:, :]
        dfX[coly + '_proba'] = np_conv_to_one_col(ypred_proba, ";")  ### merge into one string "p1;p2;p3;p4"

    log(dfX.head(3).T)
    log2("Actual    : ", dfX[coly])
    log2("Prediction: ", dfX[coly + '_pred'])

    log("#### Metrics ###############################################################")
    from util_feature import metrics_eval
    metrics_test = metrics_eval(metric_list,
                                ytrue       = dfX[coly].iloc[ival:],
                                ypred       = dfX[coly + '_pred'].iloc[ival:],
                                ypred_proba = ypred_proba_val)
    stats = {'metrics_test': metrics_test}
    log(stats)

    log("### Saving model, dfX, columns #############################################")
    log2(model_path + "/model.pkl")
    os.makedirs(model_path, exist_ok=True)
    save(colsX, model_path + "/colsX.pkl")
    save(coly,  model_path + "/coly.pkl")
    modelx.save(model_path, stats)

    log("### Reload model ###########################################################")
    log2(modelx.model.model_pars, modelx.model.compute_pars)
    modelx = map_model(model_name)
    modelx.load_model(model_path)
    log("Reload model pars",  modelx.model.model_pars)
    log2("Reload model",      modelx.model)

    return dfX.iloc[:ival, :].reset_index(), dfX.iloc[ival:, :].reset_index(), stats
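#######################################################################################
# Usage sketch for the train() defined just above (illustrative only, not part of the
# original pipeline). It builds a minimal model_dict with the keys train() reads
# (model_pars, compute_pars, data_pars, global_pars). The model_class string, the output
# path and the column lists are hypothetical placeholders; they assume such a classifier
# wrapper is registered in map_model().
def example_train_usage():
    import pandas as pd

    dfX = pd.DataFrame({'feat1':   [0.1, 0.5, 0.3, 0.9] * 25,
                        'feat2':   [1, 0, 1, 1] * 25,
                        'ytarget': [0, 1, 0, 1] * 25})

    model_dict = {
        'model_pars':   {'model_class': 'model_sklearn.py::LightGBMClassifier',   # hypothetical name
                         'model_pars':  {'n_estimators': 50}},
        'compute_pars': {'metric_list': ['accuracy_score', 'roc_auc_score']},
        'data_pars':    {'cols_model':       ['feat1', 'feat2'],
                         'coly':             'ytarget',
                         'cols_model_type2': {'colnum': ['feat1'], 'colcat': ['feat2']}},
        'global_pars':  {'path_train_model': 'ztmp/model_train/'}                 # hypothetical path
    }

    dfXtrain, dfXval, stats = train(model_dict, dfX, cols_family={},
                                    post_process_fun=lambda x: int(x))
    log(dfXtrain.shape, dfXval.shape, stats)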
def train(model_dict, dfX, cols_family, post_process_fun):
    """ Train the model using model_dict, save model, save prediction
    :param model_dict:  dict containing params
    :param dfX:  pd.DataFrame
    :param cols_family:  dict of list containing column names
    :param post_process_fun:
    :return: dfXtrain, dfXval DataFrames containing predictions.
    """
    model_pars, compute_pars = model_dict['model_pars'], model_dict['compute_pars']
    data_pars                = model_dict['data_pars']
    model_name, model_path   = model_pars['model_class'], model_dict['global_pars']['path_train_model']
    metric_list              = compute_pars['metric_list']

    log("#### Data preparation #########################################################")
    log(dfX.shape)
    dfX    = dfX.sample(frac=1.0)
    itrain = int(0.6 * len(dfX))
    ival   = int(0.8 * len(dfX))
    colsX  = data_pars['cols_model']
    coly   = data_pars['coly']
    log('Model colsX', colsX)
    log('Model coly',  coly)

    data_pars['data_type'] = 'ram'
    data_pars['train'] = {'Xtrain': dfX[colsX].iloc[:itrain, :],
                          'ytrain': dfX[coly].iloc[:itrain],
                          'Xtest':  dfX[colsX].iloc[itrain:ival, :],
                          'ytest':  dfX[coly].iloc[itrain:ival],
                          'Xval':   dfX[colsX].iloc[ival:, :],
                          'yval':   dfX[coly].iloc[ival:],
                          }

    log("#### Init, Train ############################################################")
    # from config_model import map_model
    modelx = map_model(model_name)
    log(modelx)
    modelx.reset()
    modelx.init(model_pars, compute_pars=compute_pars)

    if 'optuna' in model_name:
        modelx.fit(data_pars, compute_pars)
        # No need anymore
        # modelx.model.model_pars['optuna_model'] = modelx.fit(data_pars, compute_pars)
    else:
        modelx.fit(data_pars, compute_pars)

    log("#### Predict ################################################################")
    ypred, ypred_proba = modelx.predict(dfX[colsX], compute_pars=compute_pars)

    dfX[coly + '_pred'] = ypred  # y_norm(ypred, inverse=True)
    dfX[coly]           = dfX[coly].apply(lambda x: post_process_fun(x))
    dfX[coly + '_pred'] = dfX[coly + '_pred'].apply(lambda x: post_process_fun(x))

    if ypred_proba is None:
        ypred_proba_val = None

    elif len(ypred_proba.shape) <= 1:
        ypred_proba_val      = ypred_proba[ival:]
        dfX[coly + '_proba'] = ypred_proba

    elif len(ypred_proba.shape) > 1:
        from util_feature import np_conv_to_one_col
        ypred_proba_val      = ypred_proba[ival:, :]
        dfX[coly + '_proba'] = np_conv_to_one_col(ypred_proba, ";")  ### merge into one string "p1;p2;p3;p4"

    log(dfX.head(3).T)
    log("Actual    : ", dfX[coly])
    log("Prediction: ", dfX[coly + '_pred'])

    log("#### Metrics #############################################################")
    from util_feature import metrics_eval
    metrics_test = metrics_eval(metric_list,
                                ytrue       = dfX[coly].iloc[ival:],
                                ypred       = dfX[coly + '_pred'].iloc[ival:],
                                ypred_proba = ypred_proba_val)
    stats = {'metrics_test': metrics_test}
    log(stats)

    log("### Saving model, dfX, columns ###########################################")
    log(model_path + "/model.pkl")
    os.makedirs(model_path, exist_ok=True)
    save(colsX, model_path + "/colsX.pkl")
    save(coly,  model_path + "/coly.pkl")
    modelx.save(model_path, stats)

    log("### Reload model ############################################")
    log(modelx.model.model_pars, modelx.model.compute_pars)
    a = load(model_path + "/model.pkl")
    log("Reload model pars", a.model_pars)

    return dfX.iloc[:ival, :].reset_index(), dfX.iloc[ival:, :].reset_index()
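#######################################################################################
# Reload sketch (illustrative only): the train() variants above persist colsX.pkl,
# coly.pkl and model.pkl under path_train_model. This shows how those artifacts could be
# read back with the same load() helper used above; the default path is a hypothetical
# placeholder matching the usage sketch.
def example_reload_artifacts(model_path='ztmp/model_train/'):
    colsX = load(model_path + "/colsX.pkl")   # list of feature column names
    coly  = load(model_path + "/coly.pkl")    # target column name
    model = load(model_path + "/model.pkl")   # raw model object written by modelx.save()
    log("Reloaded", coly, colsX)
    return model, colsX, coly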
def train(model_dict, dfX, cols_family, post_process_fun):
    """ Train the model using model_dict, save model, save transformed output
    :param model_dict:  dict containing params
    :param dfX:  pd.DataFrame
    :param cols_family:  dict of list containing column names
    :param post_process_fun:
    :return: dfX2train, dfX2val DataFrames containing the transformed output, and the stats dict.
    """
    model_pars, compute_pars = model_dict['model_pars'], model_dict['compute_pars']
    data_pars                = model_dict['data_pars']
    model_name, model_path   = model_pars['model_class'], model_dict['global_pars']['path_train_model']
    metric_list              = compute_pars['metric_list']

    assert 'cols_model_type2' in data_pars, 'Missing cols_model_type2, split of columns by data type '
    log2(data_pars['cols_model_type2'])

    log("#### Model Input preparation #########################################################")
    log(dfX.shape)
    dfX    = dfX.sample(frac=1.0)
    itrain = int(0.6 * len(dfX))
    ival   = int(0.8 * len(dfX))
    colsX  = data_pars['cols_model']
    coly   = data_pars['coly']
    log('Model colsX', colsX)
    log('Model coly',  coly)
    log('Model column type: ', data_pars['cols_model_type2'])

    data_pars['data_type'] = 'ram'
    data_pars['train'] = {'Xtrain': dfX[colsX].iloc[:itrain, :],
                          'ytrain': dfX[coly].iloc[:itrain],
                          'Xtest':  dfX[colsX].iloc[itrain:ival, :],
                          'ytest':  dfX[coly].iloc[itrain:ival],
                          'Xval':   dfX[colsX].iloc[ival:, :],
                          'yval':   dfX[coly].iloc[ival:],
                          }

    log("#### Init, Train ############################################################")
    # from config_model import map_model
    modelx = map_model(model_name)
    log(modelx)
    modelx.reset()
    modelx.init(model_pars, compute_pars=compute_pars)

    if 'optuna' in model_name:
        modelx.fit(data_pars, compute_pars)
        # No need anymore
        # modelx.model.model_pars['optuna_model'] = modelx.fit(data_pars, compute_pars)
    else:
        modelx.fit(data_pars, compute_pars)

    log("#### Transform ################################################################")
    dfX2       = modelx.transform(dfX[colsX], compute_pars=compute_pars)
    dfX2.index = dfX.index
    for coli in dfX2.columns:
        dfX2[coli] = dfX2[coli].apply(lambda x: post_process_fun(x))

    log("Actual    : ", dfX[colsX])
    log("Prediction: ", dfX2)

    log("#### Metrics ###############################################################")
    from util_feature import metrics_eval
    stats = {}
    ### This transform variant does not create a coly + '_pred' column itself,
    ### so metrics are only computed when predictions are already present in dfX.
    if coly + '_pred' in dfX.columns:
        metrics_test = metrics_eval(metric_list,
                                    ytrue       = dfX[coly].iloc[ival:],
                                    ypred       = dfX[coly + '_pred'].iloc[ival:],
                                    ypred_proba = None)
        stats = {'metrics_test': metrics_test}
    log(stats)

    log("### Saving model, dfX, columns #############################################")
    log(model_path + "/model.pkl")
    os.makedirs(model_path, exist_ok=True)
    save(colsX, model_path + "/colsX.pkl")
    save(coly,  model_path + "/coly.pkl")
    modelx.save(model_path, stats)

    log("### Reload model ###############################################")
    log(modelx.model.model_pars, modelx.model.compute_pars)
    a = load(model_path + "/model.pkl")
    log("Reload model pars", a.model_pars)

    return dfX2.iloc[:ival, :].reset_index(), dfX2.iloc[ival:, :].reset_index(), stats
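#######################################################################################
# Usage sketch for the transform variant just above (illustrative only): unlike the
# predict variants, it returns the transformed features dfX2 split into train/val parts.
# model_dict is assumed to follow the same layout as in example_train_usage(), with
# model_pars['model_class'] pointing to a transformer/encoder wrapper known to map_model().
def example_train_transform_usage(model_dict, dfX):
    dfX2train, dfX2val, stats = train(model_dict, dfX, cols_family={},
                                      post_process_fun=lambda x: x)
    log(dfX2train.shape, dfX2val.shape, stats)
    return dfX2train, dfX2val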