def make_features(self, df_train_input, df_test_input):
    """Target-encode every category column against every engagement target.

    For each (target, category) pair this calls ``target_encoding`` with the
    shared fold splits and stores the returned train/test results under the
    column name ``"{target}__{category}"``.

    Before encoding, rows whose category value occurs fewer than
    ``THREASHOLD`` times in ``df_train_input`` get their target set to NaN.
    This is done on a copy, so the caller's training frame is not modified
    by this method.

    Args:
        df_train_input: Training frame containing the category and target
            columns listed below.
        df_test_input: Test frame; handed to ``target_encoding`` as-is.

    Returns:
        ``(df_train_features, df_test_features)``: two DataFrames with one
        column per (target, category) pair.
    """
    df_train_features = pd.DataFrame()
    df_test_features = pd.DataFrame()

    folds_train = self._download_from_gs(
        feather_file_name="StratifiedGroupKFold_training.ftr")

    category_columns = [
        "language",
        "engaged_user_id",
        "engaging_user_id",
    ]

    target_columns = [
        "reply_engagement",
        "retweet_engagement",
        "retweet_with_comment_engagement",
        "like_engagement",
    ]

    # Every target uses the same fold column, so the splits are built (and
    # logged) once instead of once per target.
    folds_col = ["StratifiedGroupKFold_retweet_with_comment_engagement"]
    assert len(folds_col) == 1, "The number of fold column must be one"
    folds = folds_train[folds_col]
    n_fold = folds.max().values[0] + 1
    folds_ids = []
    for i in range(n_fold):
        # Boolean-masking a frame leaves NaN where the mask is False;
        # dropna() then keeps only the rows that satisfied the comparison
        # (rows with a missing fold value end up in neither index).
        trn_idx = folds[folds != i].dropna().index
        val_idx = folds[folds == i].dropna().index
        folds_ids.append((trn_idx, val_idx))
        print(f"{i+1}fold: n_trn={len(trn_idx)}, n_val={len(val_idx)}")

    # The rare-category rows depend only on the (unmodified) training frame,
    # so they are computed once per category rather than once per pair.
    rare_rows = {}
    for cat_col in category_columns:
        count = df_train_input[cat_col].value_counts()
        unseen_cat_list = count[count < THREASHOLD].index.tolist()
        rare_rows[cat_col] = (
            df_train_input[cat_col].isin(unseen_cat_list).values,
            len(count),
            len(unseen_cat_list),
        )

    for target_col in target_columns:
        print(f'============= {target_col} =============')

        for cat_col in category_columns:
            rare_mask, n_categories, n_rare = rare_rows[cat_col]

            # Blank the target for rare categories on a copy only.
            # Presumably this keeps them out of the encoded means - confirm
            # against target_encoding.
            df_train_input_cp = df_train_input.copy()
            df_train_input_cp.loc[rare_mask, target_col] = np.nan
            print(target_col, cat_col, n_categories, n_rare)

            # NOTE(review): target_encoding presumably writes a
            # f"{cat_col}_ta" helper column onto the frames it receives.
            # It is not dropped here, so df_test_input may still carry it
            # after this call - confirm whether callers rely on that.
            train_result, test_result = target_encoding(
                cat_col, df_train_input_cp, df_test_input, target_col,
                folds_ids)

            df_train_features[f"{target_col}__{cat_col}"] = train_result
            df_test_features[f"{target_col}__{cat_col}"] = test_result

    print(df_train_features.isnull().sum())
    print(df_test_features.isnull().sum())

    return df_train_features, df_test_features
def make_features(self, df_train_input, df_test_input):
    """Target-encode every category column against every engagement target.

    For each (target, category) pair this calls ``target_encoding`` with the
    shared fold splits, stores the returned train/test results under the
    column name ``"{target}__{category}"``, and drops the
    ``"{category}_ta"`` column from both input frames afterwards.

    Args:
        df_train_input: Training frame containing the category and target
            columns listed below. The ``_ta`` helper column is added and
            removed in place.
        df_test_input: Test frame, treated the same way.

    Returns:
        ``(df_train_features, df_test_features)``: two DataFrames with one
        column per (target, category) pair.
    """
    df_train_features = pd.DataFrame()
    df_test_features = pd.DataFrame()

    folds_train = self._download_from_gs(
        feather_file_name="StratifiedGroupKFold_training.ftr")

    category_columns = [
        "language",
        "engaged_user_id",
        "engaging_user_id",
    ]

    target_columns = [
        "reply_engagement",
        "retweet_engagement",
        "retweet_with_comment_engagement",
        "like_engagement",
    ]

    # The splits depend only on the fold file, not on the target, so they
    # are built (and logged) once instead of once per target.
    folds_col = ["StratifiedGroupKFold_retweet_with_comment_engagement"]
    assert len(folds_col) == 1, "The number of fold column must be one"
    folds = folds_train[folds_col]
    n_fold = folds.max().values[0] + 1
    folds_ids = []
    for i in range(n_fold):
        # Boolean-masking a frame leaves NaN where the mask is False;
        # dropna() then keeps only the rows that satisfied the comparison
        # (rows with a missing fold value end up in neither index).
        trn_idx = folds[folds != i].dropna().index
        val_idx = folds[folds == i].dropna().index
        folds_ids.append((trn_idx, val_idx))
        print(f"{i+1}fold: n_trn={len(trn_idx)}, n_val={len(val_idx)}")

    for target_col in target_columns:
        print(f'============= {target_col} =============')

        for cat_col in category_columns:
            train_result, test_result = target_encoding(
                cat_col, df_train_input, df_test_input, target_col,
                folds_ids)

            # Remove the helper column so the inputs leave this method
            # with the same columns they came in with.
            df_train_input.drop(columns=[f"{cat_col}_ta"], inplace=True)
            df_test_input.drop(columns=[f"{cat_col}_ta"], inplace=True)

            df_train_features[f"{target_col}__{cat_col}"] = train_result
            df_test_features[f"{target_col}__{cat_col}"] = test_result

    return df_train_features, df_test_features
def make_features(self, df_train_input, df_test_input):
    """Target-encode ``engaging_user_id`` against ``diff_time``.

    The two arguments are not used as data: both frames are re-read from
    BigQuery (``self.train_table`` / ``self.test_table``) at the start.
    Fold splits come from the ``TimeGroupKFold_val_position`` column of the
    downloaded fold file.

    Returns:
        ``(train_features, test_features)``: two DataFrames with one column
        per (target, category) pair, named ``"{target}__{category}"``.
    """
    train_frame = self._read_features_from_bigquery(self.train_table)
    test_frame = self._read_features_from_bigquery(self.test_table)

    train_out = pd.DataFrame()
    test_out = pd.DataFrame()

    fold_table = self._download_from_gs(
        feather_file_name="TimeGroupKFold_training.ftr")

    encoded_categories = ["engaging_user_id"]
    encoding_targets = ["diff_time"]

    for target_col in encoding_targets:
        print(f'============= {target_col} =============')

        # Build the (train index, validation index) pair for every fold.
        fold_columns = ["TimeGroupKFold_val_position"]
        assert len(fold_columns) == 1, "The number of fold column must be one"
        fold_values = fold_table[fold_columns]

        splits = []
        for i in range(fold_values.max().values[0] + 1):
            # Masking leaves NaN where the comparison fails; dropna() then
            # keeps only the rows that matched.
            outside_fold = fold_values[fold_values != i]
            inside_fold = fold_values[fold_values == i]
            trn_idx = outside_fold.dropna().index
            val_idx = inside_fold.dropna().index
            splits.append((trn_idx, val_idx))
            print(f"{i+1}fold: n_trn={len(trn_idx)}, n_val={len(val_idx)}")

        for cat_col in encoded_categories:
            encoded_train, encoded_test = target_encoding(
                cat_col, train_frame, test_frame, target_col, splits)

            # Drop the helper column from both frames before the next pair.
            helper_column = f"{cat_col}_ta"
            for frame in (train_frame, test_frame):
                frame.drop(columns=[helper_column], inplace=True)

            feature_name = f"{target_col}__{cat_col}"
            train_out[feature_name] = encoded_train
            test_out[feature_name] = encoded_test

    # Log the per-column null counts of the produced features.
    for features in (train_out, test_out):
        print(features.isnull().sum())

    return train_out, test_out
def make_features(self, df_train_input, df_test_input):
    """Aggregate hashtag-level target encodings per (tweet, engaging user).

    Steps:
      1. Read the intermediate train/test tables from BigQuery and build
         the combined key ``engaging_user_id__hashtag``
         (``engaging_user_id + "_" + hashtag``).
      2. Translate the row-level fold splits of ``df_train_input`` into
         splits over the intermediate table via ``tweet_id``, so that all
         rows of one tweet fall on the same side of a split.
      3. For each engagement target, target-encode the combined key, take
         the min/max/mean of the encoded value per
         ``(tweet_id, engaging_user_id)``, and left-join those statistics
         onto the input frames.

    Args:
        df_train_input: Training frame with ``tweet_id`` and
            ``engaging_user_id``. It is indexed positionally with the fold
            file's index, so it is assumed to be in the same row order as
            the fold file - confirm.
        df_test_input: Test frame with ``tweet_id`` and
            ``engaging_user_id``.

    Returns:
        ``(df_train_features, df_test_features)``: columns are named
        ``"{target}_{min|max|mean}"``; rows follow the input frames
        (left join), with NaN where no aggregate exists.
    """
    train_data = self._read_inter_table_from_bigquery(self.train_table)
    test_data = self._read_inter_table_from_bigquery(self.test_table)

    category_column = "engaging_user_id__hashtag"
    target_encoding_column = f"{category_column}_ta"
    merge_keys = ["tweet_id", "engaging_user_id"]
    feature_names = ["min", "max", "mean"]

    train_data[category_column] = (
        train_data["engaging_user_id"] + "_" + train_data["hashtag"])
    test_data[category_column] = (
        test_data["engaging_user_id"] + "_" + test_data["hashtag"])

    df_train_features = pd.DataFrame()
    df_test_features = pd.DataFrame()

    folds_train = self._download_from_gs(
        feather_file_name="StratifiedGroupKFold_training.ftr")

    target_columns = [
        "reply_engagement",
        "retweet_engagement",
        "retweet_with_comment_engagement",
        "like_engagement",
    ]

    # The splits do not depend on the target, so they are built (and
    # logged) once instead of once per target.
    # NOTE(review): this relies on target_encoding leaving train_data's
    # index and tweet_id column untouched - confirm.
    folds_col = ["StratifiedGroupKFold_retweet_with_comment_engagement"]
    assert len(folds_col) == 1, "The number of fold column must be one"
    folds = folds_train[folds_col]
    n_fold = folds.max().values[0] + 1

    folds_tweet_ids = []
    for i in range(n_fold):
        # Boolean-masking a frame leaves NaN where the mask is False;
        # dropna() then keeps only the rows that satisfied the comparison.
        trn_idx = folds[folds != i].dropna().index
        val_idx = folds[folds == i].dropna().index
        print(f"{i+1}fold: n_trn={len(trn_idx)}, n_val={len(val_idx)}")

        # Tweets that belong to each side of the split in the input frame.
        trn_tweet_id = df_train_input.iloc[trn_idx]["tweet_id"].unique()
        val_tweet_id = df_train_input.iloc[val_idx]["tweet_id"].unique()
        print(
            f"{i+1}fold: n_tweet_trn={len(trn_tweet_id)}, n_tweet_val={len(val_tweet_id)}"
        )

        # Rows of the intermediate table that belong to those tweets.
        trn_tweet_idx = train_data.index[
            train_data["tweet_id"].isin(trn_tweet_id).values]
        val_tweet_idx = train_data.index[
            train_data["tweet_id"].isin(val_tweet_id).values]
        folds_tweet_ids.append((trn_tweet_idx, val_tweet_idx))
        # The second label used to read "n_tweet_trn" as well.
        print(
            f"{i+1}fold: n_tweet_trn={len(trn_tweet_idx)}, n_tweet_val={len(val_tweet_idx)}"
        )

    for target_col in target_columns:
        print(f'============= {target_col} =============')

        # Only the side effect is used: the call leaves the encoded value
        # in target_encoding_column on both intermediate tables.
        target_encoding(category_column, train_data, test_data, target_col,
                        folds_tweet_ids)

        train_agg = train_data.groupby(merge_keys)[
            target_encoding_column].agg(feature_names).reset_index()
        test_agg = test_data.groupby(merge_keys)[
            target_encoding_column].agg(feature_names).reset_index()

        # Clear the helper column before the next target is encoded.
        train_data.drop(columns=[target_encoding_column], inplace=True)
        test_data.drop(columns=[target_encoding_column], inplace=True)

        # One left join per frame, shared by all three statistics.
        train_merged = pd.merge(
            df_train_input, train_agg, on=merge_keys, how="left")
        test_merged = pd.merge(
            df_test_input, test_agg, on=merge_keys, how="left")

        for fe in feature_names:
            df_train_features[f"{target_col}_{fe}"] = train_merged[fe].values
            df_test_features[f"{target_col}_{fe}"] = test_merged[fe].values

    print(df_train_features.isnull().sum())
    print(df_test_features.isnull().sum())

    return df_train_features, df_test_features
def make_features(self, df_train_input, df_test_input):
    """Target-encode ``engaged_user_id`` on per-tweet like totals.

    Steps:
      1. Sum the target columns per ``(tweet_id, engaged_user_id)`` for
         both input frames.
      2. Translate the row-level fold splits of ``df_train_input`` into
         splits over the aggregated training table via ``tweet_id``.
      3. Target-encode each category column on the aggregated tables and
         left-join the encoded value back onto the input frames as
         ``"{target}__{category}"``.
      4. Add ``like_follower_ratio``: the encoded like value divided by
         ``engaged_follower_count + 1e-2``.

    Args:
        df_train_input: Training frame with ``tweet_id``,
            ``engaged_user_id``, ``like_engagement`` and
            ``engaged_follower_count``. It is indexed positionally with
            the fold file's index, so it is assumed to be in the same row
            order as the fold file - confirm.
        df_test_input: Test frame with the same columns.

    Returns:
        ``(df_train_features, df_test_features)``.
    """
    df_train_features = pd.DataFrame()
    df_test_features = pd.DataFrame()

    folds_train = self._download_from_gs(
        feather_file_name="StratifiedGroupKFold_training.ftr")

    category_columns = [
        "engaged_user_id",
    ]

    target_columns = [
        "like_engagement",
    ]

    # Engagement totals per tweet (grouped by tweet_id and engaged_user_id).
    df_train_input_tweet_id = df_train_input.groupby(
        ["tweet_id", "engaged_user_id"])[target_columns].sum().reset_index()
    df_test_input_tweet_id = df_test_input.groupby(
        ["tweet_id", "engaged_user_id"])[target_columns].sum().reset_index()

    # The splits do not depend on the target, so they are built (and
    # logged) once, ahead of the target loop.
    # NOTE(review): this relies on target_encoding leaving the aggregated
    # table's index and tweet_id column untouched - confirm.
    folds_col = ["StratifiedGroupKFold_retweet_with_comment_engagement"]
    assert len(folds_col) == 1, "The number of fold column must be one"
    folds = folds_train[folds_col]
    n_fold = folds.max().values[0] + 1

    folds_tweet_ids = []
    for i in range(n_fold):
        # Boolean-masking a frame leaves NaN where the mask is False;
        # dropna() then keeps only the rows that satisfied the comparison.
        trn_idx = folds[folds != i].dropna().index
        val_idx = folds[folds == i].dropna().index
        print(f"{i+1}fold: n_trn={len(trn_idx)}, n_val={len(val_idx)}")

        # Tweets that belong to each side of the split in the input frame.
        trn_tweet_id = df_train_input.iloc[trn_idx]["tweet_id"].unique()
        val_tweet_id = df_train_input.iloc[val_idx]["tweet_id"].unique()
        print(
            f"{i+1}fold: n_tweet_trn={len(trn_tweet_id)}, n_tweet_val={len(val_tweet_id)}"
        )

        # Rows of the aggregated table that belong to those tweets.
        trn_tweet_idx = df_train_input_tweet_id.index[
            df_train_input_tweet_id["tweet_id"].isin(trn_tweet_id).values]
        val_tweet_idx = df_train_input_tweet_id.index[
            df_train_input_tweet_id["tweet_id"].isin(val_tweet_id).values]
        folds_tweet_ids.append((trn_tweet_idx, val_tweet_idx))
        # The second label used to read "n_tweet_trn" as well.
        print(
            f"{i+1}fold: n_tweet_trn={len(trn_tweet_idx)}, n_tweet_val={len(val_tweet_idx)}"
        )

    for target_col in target_columns:
        print(f'============= {target_col} =============')

        for cat_col in category_columns:
            encoded_col = f"{cat_col}_ta"
            join_keys = ["tweet_id", cat_col]

            # Target-encode cat_col using the per-tweet engagement totals.
            # Only the side effect is used: the call leaves the encoded
            # value in encoded_col on both aggregated tables.
            target_encoding(cat_col, df_train_input_tweet_id,
                            df_test_input_tweet_id, target_col,
                            folds_tweet_ids)

            # Bring the encoded value back to the row level of the inputs.
            df_train_features[f"{target_col}__{cat_col}"] = pd.merge(
                df_train_input,
                df_train_input_tweet_id[[encoded_col, "tweet_id", cat_col]],
                on=join_keys,
                how="left")[encoded_col].values
            df_test_features[f"{target_col}__{cat_col}"] = pd.merge(
                df_test_input,
                df_test_input_tweet_id[[encoded_col, "tweet_id", cat_col]],
                on=join_keys,
                how="left")[encoded_col].values

            # Clear the helper column before the next pair is encoded.
            df_train_input_tweet_id.drop(columns=[encoded_col], inplace=True)
            df_test_input_tweet_id.drop(columns=[encoded_col], inplace=True)

    # eps keeps the denominator non-zero for accounts with no followers.
    # NOTE(review): the division aligns on index labels. The feature frames
    # carry a default RangeIndex (their columns were assigned from arrays),
    # so the inputs are assumed to be indexed 0..n-1 as well - confirm.
    eps = 1e-2
    df_train_features["like_follower_ratio"] = (
        df_train_features["like_engagement__engaged_user_id"] /
        (df_train_input["engaged_follower_count"] + eps))
    df_test_features["like_follower_ratio"] = (
        df_test_features["like_engagement__engaged_user_id"] /
        (df_test_input["engaged_follower_count"] + eps))

    print(df_train_features.isnull().sum())
    print(df_test_features.isnull().sum())

    return df_train_features, df_test_features