Beispiel #1
0
def target_mean(df_train, df_test):
    """Encode the discrete columns against the label and mean-fill missing values.

    Steps, applied to both frames:

    1. Drop the ``company_id`` and ``obs_date`` columns.
    2. Treat every training column whose name starts with ``'d'`` as discrete,
       fit a ``CategoryEncoder`` on the training frame with the
       ``('target', {'smoothing': 0.5})`` configuration against ``'label'``,
       and transform both frames with it.
    3. Drop the original discrete columns from the transformed frames.
    4. For each column of the processed training frame, fill missing values
       in each frame with that frame's own column mean.

    :param df_train: training frame; must contain ``company_id``, ``obs_date``
        and ``label``
    :param df_test: test frame; must contain ``company_id`` and ``obs_date``

    returns: ``(train, test)`` restricted to the columns selected by
        ``get_ind_col`` on the processed training frame
    """
    id_columns = ['company_id', 'obs_date']
    df_train = df_train.drop(columns=id_columns)
    df_test = df_test.drop(columns=id_columns)

    encoder = CategoryEncoder()
    discrete_cols = [name for name in df_train.columns if name[0] == 'd']
    encoder.fit(
        df=df_train,
        y='label',
        targets=discrete_cols,
        configurations=[('target', {'smoothing': 0.5})],
    )

    encoded_train = encoder.transform(df_train, y='label')
    encoded_test = encoder.transform(df_test)

    # Only the encoder's output columns are kept; the raw discrete ones go.
    train_out = encoded_train.drop(columns=discrete_cols)
    test_out = encoded_test.drop(columns=discrete_cols)

    # Each frame is imputed with its own mean, training frame first.
    for name in train_out.columns:
        for frame in (train_out, test_out):
            frame[name] = frame[name].fillna(frame[name].mean())

    selected = get_ind_col(train_out)
    return train_out[selected], test_out[selected]
Beispiel #2
0
def get_ind_cols(df):
    """Alias for ``get_ind_col``: forward ``df`` and return its result unchanged."""
    selected = get_ind_col(df)
    return selected
def permutate_selector(train_df, eval_df, y, variables=None, metric='acc', **kwargs):  # TODO Add more metric
    """
    Return the importance of variables based on permutation loss.

    A baseline logistic regression is fitted on the selected variables and
    scored on ``eval_df``. Then, for every selected variable, that column is
    shuffled in a copy of the training data and the model is refitted (warm
    started from the baseline coefficients) and scored again. The refits are
    submitted as Ray tasks.

    :param train_df: training data set
    :param eval_df: eval data set
    :param y: name of the target variable
    :param variables: the variables to run the selection on; if None, every
        column returned by ``get_ind_col`` except the target variable is used
    :param metric: the metric to determine the order; higher value indicates
        better performance. Only ``'acc'`` is implemented; any other value
        yields ``None`` scores
    :param **kwargs: arguments for ``LogisticRegression``

    returns: dict mapping ``'origin'`` to the baseline score and each
        permuted variable to the score obtained after shuffling it
    """

    def fit_and_predict(train_df, eval_df, y, variables, metric, start=None, **kwargs):
        # Deliberately a plain function, not a Ray task: it is called directly
        # both on the driver (baseline fit) and from inside the remote task.
        if start is None:
            clf = LogisticRegression(**kwargs)
        else:
            # ``warm_start`` is a boolean flag; the actual starting point is
            # taken from pre-existing ``coef_`` / ``intercept_`` attributes.
            coef, intercept = start
            clf = LogisticRegression(**{**kwargs, 'warm_start': True})
            # Copy: arrays fetched from the Ray object store may be read-only.
            clf.coef_ = np.array(coef)
            clf.intercept_ = np.array(intercept)
        clf.fit(train_df[variables], train_df[y])
        y_pred = clf.predict(eval_df[variables])

        if metric == 'acc':  # TODO Add more metric
            score = accuracy_score(eval_df[y], y_pred)
        else:
            score = None
        return score, (clf.coef_, clf.intercept_)

    @ray.remote
    def fit_permute_and_predict(train_df, eval_df, y, variables, metric, start, permute_var, **kwargs):
        # Work on a copy so the shared training frame is never modified.
        shuffled_df = train_df.copy()
        shuffled_df[permute_var] = np.random.permutation(shuffled_df[permute_var])
        score, _ = fit_and_predict(shuffled_df, eval_df, y, variables, metric, start, **kwargs)
        return permute_var, score

    ind_col = get_ind_col(train_df)
    if variables is not None:
        var_to_use = [x for x in ind_col if x in variables]
    else:
        var_to_use = [x for x in ind_col if x != y]

    # Only start (and later stop) Ray if the caller has not already done so.
    owns_ray = not ray.is_initialized()
    if owns_ray:
        ray.init()
    try:
        score, warm_start = fit_and_predict(train_df, eval_df, y, var_to_use, metric, None, **kwargs)
        result_dict = {'origin': score}

        # Put the large, shared inputs into the object store once instead of
        # re-serialising them for every task.
        train_df_id = ray.put(train_df)
        eval_df_id = ray.put(eval_df)
        var_to_use_id = ray.put(var_to_use)
        start_id = ray.put(warm_start)

        pending = [
            fit_permute_and_predict.remote(
                train_df_id, eval_df_id, y, var_to_use_id, metric, start_id, permute_var, **kwargs
            )
            for permute_var in var_to_use
        ]
        result_dict.update(ray.get(pending))
    finally:
        if owns_ray:
            ray.shutdown()
    return result_dict