def target_mean(df_train, df_test):
    """Encode the discrete columns of a train/test pair and impute missing values.

    Steps, in order:

    1. Drop the ``company_id`` and ``obs_date`` columns from both frames.
    2. Treat every training column whose name starts with ``'d'`` as discrete
       and fit a ``CategoryEncoder`` on them against the ``'label'`` column
       using the ``('target', {'smoothing': 0.5})`` configuration.
    3. Transform both frames and drop the original discrete columns.
    4. For each column of the transformed training frame, fill missing values
       in each frame with that frame's own column mean.
    5. Keep only the columns returned by ``get_ind_col`` for the training frame.

    :param df_train: training frame; must contain ``company_id``, ``obs_date``
        and ``label``
    :param df_test: test frame; must contain ``company_id`` and ``obs_date``
    :returns: tuple ``(train, test)`` of the processed frames
    """
    id_columns = ['company_id', 'obs_date']
    df_train = df_train.drop(columns=id_columns)
    df_test = df_test.drop(columns=id_columns)

    # Discrete features are identified purely by the leading 'd' of their name.
    discrete_cols = [name for name in df_train.columns if name[0] == 'd']

    encoder = CategoryEncoder()
    encoder.fit(
        df=df_train,
        y='label',
        targets=discrete_cols,
        configurations=[('target', {'smoothing': 0.5})],
    )

    encoded_train = encoder.transform(df_train, y='label')
    encoded_test = encoder.transform(df_test)

    # The raw discrete columns are discarded after encoding.
    train_out = encoded_train.drop(columns=discrete_cols)
    test_out = encoded_test.drop(columns=discrete_cols)

    # NOTE(review): the test frame is imputed with its own means rather than
    # the training means -- confirm this is intended.
    for col in train_out.columns:
        train_out[col] = train_out[col].fillna(train_out[col].mean())
        test_out[col] = test_out[col].fillna(test_out[col].mean())

    selected = get_ind_col(train_out)
    return train_out[selected], test_out[selected]
def get_ind_cols(df):
    """Plural-named alias: forward ``df`` to ``get_ind_col`` and return its result."""
    ind_cols = get_ind_col(df)
    return ind_cols
def permutate_selector(train_df, eval_df, y, variables=None, metric='acc', **kwargs):
    """Score each variable by the metric obtained after permuting it.

    A baseline logistic regression is fitted on ``train_df`` and scored on
    ``eval_df``. Then, for every selected variable, that column is shuffled in
    a copy of the training data, the model is refitted (warm-started from the
    baseline estimator) and scored again. The permuted fits run as Ray tasks.

    If Ray is not already initialised it is started for the duration of the
    call and shut down afterwards; an already running Ray session is reused
    and left untouched.

    :param train_df: training data set
    :param eval_df: evaluation data set
    :param y: name of the target variable
    :param variables: the variables to evaluate; if None, every column
        returned by ``get_ind_col(train_df)`` except the target is used
    :param metric: the metric used for scoring; higher values indicate better
        performance. Only ``'acc'`` (accuracy) is implemented; any other value
        yields a score of ``None``.
    :param **kwargs: arguments forwarded to ``LogisticRegression``
    :returns: dict mapping ``'origin'`` to the baseline score and each
        evaluated variable name to the score obtained with it permuted
    :raises ValueError: if no variable is left to evaluate
    """
    # TODO Add more metric
    import copy

    def fit_and_predict(train_df, eval_df, y, variables, metric, start=None, **kwargs):
        # Plain (non-remote) helper so that it can be called both from the
        # driver and from inside a Ray task; remote functions cannot be
        # invoked directly.
        if start is None:
            clf = LogisticRegression(**kwargs)
        else:
            # ``warm_start`` is a boolean flag that reuses the estimator's
            # previous solution, so continue from a private copy of the
            # already fitted baseline estimator.
            clf = copy.deepcopy(start)
            clf.set_params(warm_start=True)
        clf.fit(train_df[variables], train_df[y])
        y_pred = clf.predict(eval_df[variables])
        if metric == 'acc':  # TODO Add more metric
            score = accuracy_score(eval_df[y], y_pred)
        else:
            score = None
        return score, clf

    @ray.remote
    def fit_permute_and_predict(train_df, eval_df, y, variables, metric, start,
                                permute_var, **kwargs):
        # Work on a copy: objects fetched from the Ray object store may be
        # read-only, and the caller's frame must not be modified.
        permuted_df = train_df.copy()
        permuted_df[permute_var] = np.random.permutation(
            train_df[permute_var].to_numpy()
        )
        score, _ = fit_and_predict(
            permuted_df, eval_df, y, variables, metric, start, **kwargs
        )
        return permute_var, score

    ind_col = get_ind_col(train_df)
    # The target is never used as a feature, even if listed in ``variables``.
    if variables is not None:
        var_to_use = [x for x in ind_col if x in variables and x != y]
    else:
        var_to_use = [x for x in ind_col if x != y]
    if not var_to_use:
        raise ValueError('no variables left to evaluate')

    result_dict = {}
    score, baseline_clf = fit_and_predict(
        train_df, eval_df, y, var_to_use, metric, None, **kwargs
    )
    result_dict['origin'] = score

    # Only manage the Ray lifecycle when this call is the one that started it.
    started_ray = not ray.is_initialized()
    if started_ray:
        ray.init()
    try:
        # Put the shared inputs in the object store once instead of
        # serialising them again for every task.
        train_df_id = ray.put(train_df)
        eval_df_id = ray.put(eval_df)
        var_to_use_id = ray.put(var_to_use)
        start_id = ray.put(baseline_clf)
        pending = [
            fit_permute_and_predict.remote(
                train_df_id,
                eval_df_id,
                y,
                var_to_use_id,
                metric,
                start_id,
                permute_var,
                **kwargs,
            )
            for permute_var in var_to_use
        ]
        for var, score in ray.get(pending):
            result_dict[var] = score
    finally:
        if started_ray:
            ray.shutdown()
    return result_dict