Exemple #1
0
def train(model_dict, dfX, cols_family, post_process_fun):
    """Fit the configured model on a shuffled split of dfX, then predict, score, save and reload.

    Rows of dfX are shuffled and split by position: the first 60% are the train
    rows, the next 20% the test rows and the last 20% the validation rows.
    Metrics are computed on the validation rows only.

    Side effects visible in this function: 'data_type' and 'train' entries are
    written into model_dict['data_pars'], and colsX.pkl / coly.pkl plus the
    model itself are saved under the train model path.

    :param model_dict:  dict with 'model_pars', 'compute_pars', 'data_pars', 'global_pars'
    :param dfX:  pd.DataFrame holding the feature columns and the target column
    :param cols_family: dict of list containing column names (not read in this function)
    :param post_process_fun: callable applied element-wise to the target and to the prediction
    :return: (dfX rows before the validation cut, dfX validation rows, stats dict).
             Both frames include the prediction columns and are passed through reset_index().
    """
    model_pars = model_dict['model_pars']
    compute_pars = model_dict['compute_pars']
    data_pars = model_dict['data_pars']
    model_name = model_pars['model_class']
    model_path = model_dict['global_pars']['path_train_model']
    metric_list = compute_pars['metric_list']

    assert 'cols_model_type2' in data_pars, 'Missing cols_model_type2, split of columns by data type '
    log2(data_pars['cols_model_type2'])

    log("#### Model Input preparation ##################################################")
    log2(dfX.shape)
    dfX = dfX.sample(frac=1.0)  # frac=1.0 : keep every row, in shuffled order
    n_rows = len(dfX)
    itrain = int(0.6 * n_rows)  # end of the train rows
    ival = int(0.8 * n_rows)  # start of the validation rows
    colsX = data_pars['cols_model']
    coly = data_pars['coly']
    log2('Model colsX', colsX)
    log2('Model coly', coly)
    log2('Model column type: ', data_pars['cols_model_type2'])

    # Snapshot of the parameters taken BEFORE the data frames are attached below.
    data_pars_ref = copy.deepcopy(data_pars)

    # Attach the in-memory splits; keys come out as Xtrain/ytrain, Xtest/ytest, Xval/yval.
    data_pars['data_type'] = 'ram'
    row_ranges = (
        ('train', slice(None, itrain)),
        ('test', slice(itrain, ival)),
        ('val', slice(ival, None)),
    )
    splits = {}
    for tag, rows in row_ranges:
        splits['X' + tag] = dfX[colsX].iloc[rows, :]
        splits['y' + tag] = dfX[coly].iloc[rows]
    data_pars['train'] = splits

    log("#### Init, Train ############################################################")
    modelx = map_model(model_name)
    log2(modelx)
    modelx.reset()
    # init receives the data-free snapshot, fit receives the dict carrying the splits.
    modelx.init(model_pars, data_pars=data_pars_ref, compute_pars=compute_pars)
    modelx.fit(data_pars, compute_pars)

    log("#### Predict ################################################################")
    # Prediction runs over ALL rows (train, test and validation).
    ypred, ypred_proba = modelx.predict(
        dfX[colsX], data_pars=data_pars_ref, compute_pars=compute_pars)

    col_pred = coly + '_pred'
    col_proba = coly + '_proba'

    dfX[col_pred] = ypred
    dfX[coly] = dfX[coly].apply(lambda v: post_process_fun(v))
    dfX[col_pred] = dfX[col_pred].apply(lambda v: post_process_fun(v))

    ypred_proba_val = None  # stays None when the model returns no probabilities
    if ypred_proba is not None:
        if len(ypred_proba.shape) <= 1:
            # One probability per row.
            ypred_proba_val = ypred_proba[ival:]
            dfX[col_proba] = ypred_proba
        else:
            # Several probabilities per row: merged into one ";"-joined column.
            from util_feature import np_conv_to_one_col
            ypred_proba_val = ypred_proba[ival:, :]
            dfX[col_proba] = np_conv_to_one_col(ypred_proba, ";")
            log(dfX.head(3).T)

    log2("Actual    : ", dfX[coly])
    log2("Prediction: ", dfX[col_pred])

    log("#### Metrics ###############################################################")
    from util_feature import metrics_eval
    y_true_val = dfX[coly].iloc[ival:]
    y_pred_val = dfX[col_pred].iloc[ival:]
    metrics_test = metrics_eval(metric_list,
                                ytrue=y_true_val,
                                ypred=y_pred_val,
                                ypred_proba=ypred_proba_val)
    stats = {'metrics_test': metrics_test}
    log(stats)

    log("### Saving model, dfX, columns #############################################")
    log2(model_path + "/model.pkl")
    os.makedirs(model_path, exist_ok=True)
    save(colsX, model_path + "/colsX.pkl")
    save(coly, model_path + "/coly.pkl")
    modelx.save(model_path, stats)

    log("### Reload model,            ###############################################")
    log2(modelx.model.model_pars, modelx.model.compute_pars)
    # Load the saved model back and log its parameters.
    modelx = map_model(model_name)
    modelx.load_model(model_path)
    log("Reload model pars", modelx.model.model_pars)
    log2("Reload model", modelx.model)

    df_before_val = dfX.iloc[:ival, :].reset_index()
    df_val = dfX.iloc[ival:, :].reset_index()
    return df_before_val, df_val, stats
Exemple #2
0
def train(model_dict, dfX, cols_family, post_process_fun):
    """Fit the configured model on a shuffled split of dfX, then predict, score and save it.

    Rows of dfX are shuffled and split by position: the first 60% are the train
    rows, the next 20% the test rows and the last 20% the validation rows.
    Metrics are computed on the validation rows only.

    Side effects visible in this function: 'data_type' and 'train' entries are
    written into model_dict['data_pars'], and colsX.pkl / coly.pkl plus the
    model itself are saved under the train model path.

    :param model_dict:  dict with 'model_pars', 'compute_pars', 'data_pars', 'global_pars'
    :param dfX:  pd.DataFrame holding the feature columns and the target column
    :param cols_family: dict of list containing column names (not read in this function)
    :param post_process_fun: callable applied element-wise to the target and to the prediction
    :return: (dfX rows before the validation cut, dfX validation rows).
             Both frames include the prediction columns and are passed through reset_index().
    """
    model_pars = model_dict['model_pars']
    compute_pars = model_dict['compute_pars']
    data_pars = model_dict['data_pars']
    model_name = model_pars['model_class']
    model_path = model_dict['global_pars']['path_train_model']
    metric_list = compute_pars['metric_list']

    log("#### Data preparation #########################################################")
    log(dfX.shape)
    dfX = dfX.sample(frac=1.0)  # frac=1.0 : keep every row, in shuffled order
    n_rows = len(dfX)
    itrain = int(0.6 * n_rows)  # end of the train rows
    ival = int(0.8 * n_rows)  # start of the validation rows
    colsX = data_pars['cols_model']
    coly = data_pars['coly']
    log('Model colsX', colsX)
    log('Model coly', coly)

    # Attach the in-memory splits; keys come out as Xtrain/ytrain, Xtest/ytest, Xval/yval.
    data_pars['data_type'] = 'ram'
    row_ranges = (
        ('train', slice(None, itrain)),
        ('test', slice(itrain, ival)),
        ('val', slice(ival, None)),
    )
    splits = {}
    for tag, rows in row_ranges:
        splits['X' + tag] = dfX[colsX].iloc[rows, :]
        splits['y' + tag] = dfX[coly].iloc[rows]
    data_pars['train'] = splits

    log("#### Init, Train ############################################################")
    modelx = map_model(model_name)
    log(modelx)
    modelx.reset()
    modelx.init(model_pars, compute_pars=compute_pars)

    # The 'optuna' and regular models go through the very same fit call.
    modelx.fit(data_pars, compute_pars)

    log("#### Predict ################################################################")
    # Prediction runs over ALL rows (train, test and validation).
    ypred, ypred_proba = modelx.predict(dfX[colsX], compute_pars=compute_pars)

    col_pred = coly + '_pred'
    col_proba = coly + '_proba'

    dfX[col_pred] = ypred
    dfX[coly] = dfX[coly].apply(lambda v: post_process_fun(v))
    dfX[col_pred] = dfX[col_pred].apply(lambda v: post_process_fun(v))

    ypred_proba_val = None  # stays None when the model returns no probabilities
    if ypred_proba is not None:
        if len(ypred_proba.shape) <= 1:
            # One probability per row.
            ypred_proba_val = ypred_proba[ival:]
            dfX[col_proba] = ypred_proba
        else:
            # Several probabilities per row: merged into one ";"-joined column.
            from util_feature import np_conv_to_one_col
            ypred_proba_val = ypred_proba[ival:, :]
            dfX[col_proba] = np_conv_to_one_col(ypred_proba, ";")
            log(dfX.head(3).T)

    log("Actual    : ", dfX[coly])
    log("Prediction: ", dfX[col_pred])

    log("#### Metrics #############################################################")
    from util_feature import metrics_eval
    y_true_val = dfX[coly].iloc[ival:]
    y_pred_val = dfX[col_pred].iloc[ival:]
    metrics_test = metrics_eval(metric_list,
                                ytrue=y_true_val,
                                ypred=y_pred_val,
                                ypred_proba=ypred_proba_val)
    stats = {'metrics_test': metrics_test}
    log(stats)

    log("### Saving model, dfX, columns ###########################################")
    log(model_path + "/model.pkl")
    os.makedirs(model_path, exist_ok=True)
    save(colsX, model_path + "/colsX.pkl")
    save(coly, model_path + "/coly.pkl")
    modelx.save(model_path, stats)

    log("### Reload model,            ############################################")
    log(modelx.model.model_pars, modelx.model.compute_pars)
    # Read the saved file back and log the reloaded parameters.
    reloaded = load(model_path + "/model.pkl")
    log("Reload model pars", reloaded.model_pars)

    df_before_val = dfX.iloc[:ival, :].reset_index()
    df_val = dfX.iloc[ival:, :].reset_index()
    return df_before_val, df_val
Exemple #3
0
def train(model_dict, dfX, cols_family, post_process_fun):
    """Fit a transform-type model on a shuffled split of dfX, transform dfX, save the model.

    Rows of dfX are shuffled and split by position: the first 60% are the train
    rows, the next 20% the test rows and the last 20% the validation rows.
    After fitting, all rows are passed through modelx.transform() and
    post_process_fun is applied element-wise to every transformed column.

    Side effects visible in this function: 'data_type' and 'train' entries are
    written into model_dict['data_pars'], and colsX.pkl / coly.pkl plus the
    model itself are saved under the train model path.

    :param model_dict:  dict with 'model_pars', 'compute_pars', 'data_pars', 'global_pars'
    :param dfX:  pd.DataFrame holding the feature columns and the target column
    :param cols_family: dict of list containing column names (not read in this function)
    :param post_process_fun: callable applied element-wise to each transformed column
    :return: (transformed rows before the validation cut, transformed validation rows, stats dict).
             Both frames are passed through reset_index(). stats['metrics_test'] is None
             when dfX carries no '<coly>_pred' column to evaluate.
    """
    model_pars, compute_pars = model_dict['model_pars'], model_dict[
        'compute_pars']
    data_pars = model_dict['data_pars']
    model_name, model_path = model_pars['model_class'], model_dict[
        'global_pars']['path_train_model']
    metric_list = compute_pars['metric_list']

    assert 'cols_model_type2' in data_pars, 'Missing cols_model_type2, split of columns by data type '
    log2(data_pars['cols_model_type2'])

    log("#### Model Input preparation #########################################################"
        )
    log(dfX.shape)
    dfX = dfX.sample(frac=1.0)  # frac=1.0 : keep every row, in shuffled order
    itrain = int(0.6 * len(dfX))  # end of the train rows
    ival = int(0.8 * len(dfX))  # start of the validation rows
    colsX = data_pars['cols_model']
    coly = data_pars['coly']
    log('Model colsX', colsX)
    log('Model coly', coly)
    log('Model column type: ', data_pars['cols_model_type2'])

    # Attach the in-memory splits consumed by modelx.fit().
    data_pars['data_type'] = 'ram'
    data_pars['train'] = {
        'Xtrain': dfX[colsX].iloc[:itrain, :],
        'ytrain': dfX[coly].iloc[:itrain],
        'Xtest': dfX[colsX].iloc[itrain:ival, :],
        'ytest': dfX[coly].iloc[itrain:ival],
        'Xval': dfX[colsX].iloc[ival:, :],
        'yval': dfX[coly].iloc[ival:],
    }

    log("#### Init, Train ############################################################"
        )
    modelx = map_model(model_name)
    log(modelx)
    modelx.reset()
    modelx.init(model_pars, compute_pars=compute_pars)

    # The former 'optuna' / regular branches were identical: a single fit call is enough.
    modelx.fit(data_pars, compute_pars)

    log("#### Transform ################################################################"
        )
    dfX2 = modelx.transform(dfX[colsX], compute_pars=compute_pars)
    # Re-align the transformed frame on the shuffled index of dfX.
    # NOTE(review): assumes transform() returns a DataFrame with one row per input row - confirm.
    dfX2.index = dfX.index

    for coli in dfX2.columns:
        dfX2[coli] = dfX2[coli].apply(lambda x: post_process_fun(x))

    log("Actual    : ", dfX[colsX])
    log("Prediction: ", dfX2)

    log("#### Metrics ###############################################################"
        )
    # This function only transforms: it never creates a '<coly>_pred' column nor any
    # probability array. The previous unconditional metrics call therefore always
    # failed (KeyError on the missing column / NameError on ypred_proba_val).
    # Metrics are now computed only when the caller's dfX already has that column.
    col_pred = coly + '_pred'
    if col_pred in dfX.columns:
        from util_feature import metrics_eval
        metrics_test = metrics_eval(metric_list,
                                    ytrue=dfX[coly].iloc[ival:],
                                    ypred=dfX[col_pred].iloc[ival:],
                                    ypred_proba=None)
    else:
        log("Metrics skipped, no prediction column: ", col_pred)
        metrics_test = None
    stats = {'metrics_test': metrics_test}
    log(stats)

    log("### Saving model, dfX, columns #############################################"
        )
    log(model_path + "/model.pkl")
    os.makedirs(model_path, exist_ok=True)
    save(colsX, model_path + "/colsX.pkl")
    save(coly, model_path + "/coly.pkl")
    modelx.save(model_path, stats)

    log("### Reload model,            ###############################################"
        )
    log(modelx.model.model_pars, modelx.model.compute_pars)
    # Read the saved file back and log the reloaded parameters.
    reloaded = load(model_path + "/model.pkl")
    log("Reload model pars", reloaded.model_pars)

    return dfX2.iloc[:ival, :].reset_index(), dfX2.iloc[
        ival:, :].reset_index(), stats