def make_order_interval_features(orders: pd.DataFrame) -> pd.DataFrame:
    """Per-client statistics over the gaps (in days) between consecutive orders.

    Returns one row per client with mean/median/std/min/max of the gap and
    the gap between the last two orders; clients with a single order get
    all-NaN stats, which are filled with -3.
    """
    ordered = orders.sort_values(['client_id', 'datetime'])
    # Gap to the previous order of the *same* client; NaT for each client's
    # first order, which becomes NaN after total_seconds().
    gaps = ordered.groupby('client_id', sort=False)['datetime'].diff()
    ordered['orders_interval'] = gaps.dt.total_seconds() / SECONDS_IN_DAY
    features = ordered.groupby('client_id', sort=False).agg(
        {
            'orders_interval': [
                'mean',    # mean interval between orders
                'median',
                'std',     # constancy of orders
                'min',
                'max',
                'last',    # interval between last 2 orders
            ]
        }
    )
    drop_column_multi_index_inplace(features)
    features.reset_index(inplace=True)
    features.fillna(-3, inplace=True)
    return features
def make_store_features(orders: pd.DataFrame) -> pd.DataFrame:
    """Client-level features describing how orders are spread across stores.

    Combines simple per-client stats of per-store order counts with the
    latent store features produced by ``make_latent_store_features``.
    """
    # Number of orders per (client, store) pair.
    per_store = (
        orders
        .groupby(['client_id', 'store_id'])
        .agg({'transaction_id': ['count']})
    )
    drop_column_multi_index_inplace(per_store)
    per_store.reset_index(inplace=True)

    # Aggregate the per-store counts up to the client level.
    simple_features = (
        per_store
        .groupby(['client_id'])
        .agg({'transaction_id_count': ['max', 'mean', 'median']})
    )
    drop_column_multi_index_inplace(simple_features)
    simple_features.reset_index(inplace=True)
    prefixed = [f'store_{col}' for col in simple_features.columns[1:]]
    simple_features.columns = ['client_id'] + prefixed

    latent_features = make_latent_store_features(orders)
    return pd.merge(simple_features, latent_features, on='client_id')
def make_small_product_features(purchases: pd.DataFrame) -> pd.DataFrame:
    """For each client: the largest total quantity bought of any single product."""
    # Total quantity per (client, product) pair.
    per_product = (
        purchases
        .groupby(['client_id', 'product_id'])
        .agg({'product_quantity': ['sum']})
    )
    drop_column_multi_index_inplace(per_product)
    per_product.reset_index(inplace=True)

    # Keep only the maximum of those totals for each client.
    features = (
        per_product
        .groupby(['client_id'])
        .agg({'product_quantity_sum': ['max']})
    )
    drop_column_multi_index_inplace(features)
    features.reset_index(inplace=True)
    return features
def make_features_for_orders_with_express_points_spent(
    orders: pd.DataFrame
) -> pd.DataFrame:
    """Features computed only over orders where express points were spent.

    Clients with no such orders still appear in the result (via the final
    right-merge against all client ids) with NaN feature values.
    """
    eps_orders = orders.loc[orders['express_points_spent'] != 0]
    features = eps_orders.groupby(['client_id']).agg(
        {'purchase_sum': ['median'], 'datetime': ['max']}
    )
    drop_column_multi_index_inplace(features)
    features.reset_index(inplace=True)

    since_last = MAILING_DATETIME - features['datetime_max']
    features['days_from_last_express_points_spent'] = since_last.dt.days
    features.drop(columns=['datetime_max'], inplace=True)
    features.rename(
        columns={'purchase_sum_median': 'median_purchase_sum_eps'},
        inplace=True,
    )

    # Interval stats restricted to eps orders, suffixed so they do not clash
    # with the plain order-interval features built elsewhere.
    interval_features = make_order_interval_features(eps_orders)
    interval_features.rename(
        columns={
            col: f'{col}_eps'
            for col in interval_features
            if col != 'client_id'
        },
        inplace=True,
    )
    features = pd.merge(features, interval_features, on='client_id')

    # Re-attach every known client so the result covers the full client set.
    all_clients = pd.Series(orders['client_id'].unique(), name='client_id')
    return features.merge(all_clients, how='right')
def make_really_purchase_features(purchases: pd.DataFrame) -> pd.DataFrame:
    """Client features built from individual purchase rows.

    Combines per-client price-bucket indicators, median item price and
    product counts with per-order product statistics.
    """
    simple_purchases = purchases.reindex(
        columns=['client_id', 'product_id', 'trn_sum_from_iss']
    )
    prices_bounds = [0, 98, 195, 490, 950, 1900, 4400, FLOAT32_MAX]
    agg_dict = {}
    # One 0/1 indicator per half-open price bucket [lo, hi).
    for lo, hi in zip(prices_bounds, prices_bounds[1:]):
        name = f'price_from_{lo}'
        in_bucket = (
            (simple_purchases['trn_sum_from_iss'] >= lo)
            & (simple_purchases['trn_sum_from_iss'] < hi)
        )
        simple_purchases[name] = in_bucket.astype(int)
        agg_dict[name] = ['sum', 'mean']
    agg_dict['trn_sum_from_iss'] = ['median']  # median product price
    agg_dict['product_id'] = ['count', 'nunique']
    simple_features = simple_purchases.groupby('client_id').agg(agg_dict)
    drop_column_multi_index_inplace(simple_features)
    simple_features.reset_index(inplace=True)

    # Per-order stats: products per order and max quantity of one product.
    per_order = purchases.groupby(['client_id', 'transaction_id']).agg(
        {
            'product_id': ['count'],
            'product_quantity': ['max'],
        }
    )
    drop_column_multi_index_inplace(per_order)
    per_order.reset_index(inplace=True)
    complex_features = per_order.groupby('client_id').agg(
        {
            # mean products in order
            'product_id_count': ['mean', 'median'],
            # mean max number of one product
            'product_quantity_max': ['mean', 'median'],
        }
    )
    drop_column_multi_index_inplace(complex_features)
    complex_features.reset_index(inplace=True)

    return pd.merge(simple_features, complex_features, on='client_id')
def make_order_features(orders: pd.DataFrame) -> pd.DataFrame:
    """Client-level features aggregated over whole orders.

    Covers order counts, point flows (regular/express x received/spent),
    purchase sums, store variety, recency of the last order, and a family
    of count/sum ratios between the point columns.

    Fixes: removed pointless ``f''`` prefixes on constant strings (they had
    no placeholders); flagged — without changing — the inconsistent inf caps.
    """
    orders = orders.copy()  # flag columns are added below; keep caller's frame intact
    o_gb = orders.groupby('client_id')
    agg_dict = {
        'transaction_id': ['count'],  # number of orders
        'regular_points_received': ['sum', 'max', 'median'],
        'express_points_received': ['sum', 'max', 'median'],
        'regular_points_spent': ['sum', 'min', 'median'],
        'express_points_spent': ['sum', 'min', 'median'],
        'purchase_sum': ['sum', 'max', 'median'],
        'store_id': ['nunique'],  # number of unique stores
        'datetime': ['max'],  # datetime of last order
    }
    # is regular/express points spent/received: 0/1 flag per order
    for points_type in POINT_TYPES:
        for event_type in POINT_EVENT_TYPES:
            col_name = f'{points_type}_points_{event_type}'
            new_col_name = f'is_{points_type}_points_{event_type}'
            orders[new_col_name] = (orders[col_name] != 0).astype(int)
            agg_dict[new_col_name] = ['sum']
    features = o_gb.agg(agg_dict)
    drop_column_multi_index_inplace(features)
    features.reset_index(inplace=True)

    features['days_from_last_order'] = (
        MAILING_DATETIME - features['datetime_max']
    ).dt.total_seconds() // SECONDS_IN_DAY
    features.drop(columns=['datetime_max'], inplace=True)

    # proportion of regular/express points spent to all transactions
    for points_type in POINT_TYPES:
        for event_type in POINT_EVENT_TYPES:
            col_name = f'is_{points_type}_points_{event_type}_sum'
            new_col_name = f'proportion_count_{points_type}_points_{event_type}'
            features[new_col_name] = (
                features[col_name] / features['transaction_id_count']
            )

    # NOTE(review): this ratio caps inf at FLOAT32_MAX while the
    # spent-to-received ratios below cap at 1000 — looks inconsistent, but
    # both caps are preserved as-is so already-derived feature values match.
    features['ratio_count_express_to_regular_points_spent'] = (
        features['is_express_points_spent_sum']
        / features['is_regular_points_spent_sum']
    ).replace(np.inf, FLOAT32_MAX)

    for points_type in POINT_TYPES:
        spent_col = f'is_{points_type}_points_spent_sum'
        received_col = f'is_{points_type}_points_received_sum'
        new_col_name = f'ratio_count_{points_type}_points_spent_to_received'
        features[new_col_name] = (
            features[spent_col] / features[received_col]
        ).replace(np.inf, 1000)

    # ratio of points spent to total purchase sum; no inf cap here —
    # presumably purchase_sum_sum is nonzero for any client with orders
    # (TODO confirm upstream).
    for points_type in POINT_TYPES:
        spent_col = f'{points_type}_points_spent_sum'
        new_col_name = f'ratio_sum_{points_type}_points_spent_to_purchases_sum'
        features[new_col_name] = (
            features[spent_col] / features['purchase_sum_sum']
        )

    features['ratio_sum_express_points_spent_to_sum_regular_points_spent'] = (
        features['express_points_spent_sum']
        / features['regular_points_spent_sum']
    )
    return features