def fit(self, df):
    """Fit a logistic regression classifier on impression-level features.

    The binary target marks whether an impressed item equals the item
    that was actually clicked in that clickout event.

    :param df: Data frame of training sessions
    """
    feats = fb.build_features(df)

    # Binary target: 1 when the impressed item is the clicked one.
    f.print_time("target column")
    hit = feats["referenced_item"] == feats["impressed_item"]
    feats.loc[:, "is_clicked"] = hit.astype(int)

    feature_cols = [
        "position",
        "prices",
        "interaction_count",
        "is_last_interacted",
    ]

    # Tiny tol and huge C effectively disable regularization, so this is
    # close to a plain maximum-likelihood fit.
    f.print_time("training logistic regression model")
    model = LogisticRegression(solver="lbfgs", max_iter=100, tol=1e-11, C=1e10)
    self.logreg = model.fit(feats[feature_cols], feats["is_clicked"])
def fit(self, df):
    """Learn item-to-item similarities from session co-occurrence.

    Two items count as similar when they are referenced within the same
    sessions; the actual similarity math lives in ``nn.calc_item_sims``.

    :param df: Data frame of training sessions
    """
    # Keep only the columns this method needs.
    f.print_time("start")
    wanted = [
        'user_id', 'session_id', 'timestamp', 'step', 'action_type',
        'reference'
    ]
    subset = df.loc[:, wanted]

    # Restrict to the action types whose reference field holds an item ID.
    f.print_time("filter interactions")
    item_actions = [
        'clickout item', 'interaction item deals', 'interaction item image',
        'interaction item info', 'interaction item rating', 'search for item'
    ]
    interactions = subset.loc[subset.action_type.isin(item_actions), :]
    interactions = interactions.rename(columns={'reference': 'item'})
    interactions = interactions.drop(columns='action_type')

    # Similarity is driven by items co-occurring in the same session.
    self.df_item_sim = nn.calc_item_sims(interactions, "item", "session_id")
def fit(self, df):
    """Learn item-to-item similarities from shared metadata properties.

    :param df: Data frame of item metadata with a pipe-separated
        ``properties`` column
    """
    # One row per (item, property) pair.
    f.print_time("explode properties")
    exploded = dff.explode(df, "properties")

    # Items sharing many properties end up with high similarity scores.
    self.df_item_sim = nn.calc_item_sims(exploded, "item_id", "properties")
def predict(self, df):
    """Rank each impression list by distinct-user counts from training.

    :param df: Data frame of test sessions
    :return: Data frame with one recommendation string per target row
    """
    # Keep only the columns this method needs.
    f.print_time("start")
    wanted = [
        'user_id', 'session_id', 'timestamp', 'step', 'action_type',
        'reference', "impressions"
    ]
    subset = df.loc[:, wanted]

    # Rows whose withheld clickout reference must be predicted.
    f.print_time("target rows")
    targets = dff.get_target_rows(subset)

    # One row per impressed item.
    f.print_time("explode impression array")
    exploded = dff.explode(targets, "impressions")
    exploded = exploded.rename(columns={"impressions": "impressed_item"})

    # Attach the popularity score learned during fit; items never seen in
    # training simply get NaN from the left join.
    scored = exploded.merge(
        self.df_pop,
        left_on="impressed_item",
        right_on="item",
        how="left",
    )

    # Collapse back to one recommendation string per target row.
    f.print_time("summarize recommendations")
    return dff.summarize_recs(scored, "n_users")
def predict(self, df):
    """Return items in the impressions list in their original order.

    Acts as the trivial "do nothing" baseline: the recommendation is the
    impression list itself, only reformatted from pipe- to
    space-separated.

    :param df: Data frame of test sessions
    :return: Data frame with one recommendation string per target row
    """
    # Target row, withheld item ID that needs to be predicted
    f.print_time("target rows")
    df_target = dff.get_target_rows(df.copy())

    # Vectorized string replace instead of a per-row ``apply`` — same
    # result, but avoids the Python-level loop over rows.
    f.print_time("summarize recommendations")
    df_target["item_recommendations"] = (
        df_target["impressions"].str.replace("|", " ", regex=False))
    cols_rec = [
        "user_id", "session_id", "timestamp", "step", "item_recommendations"
    ]
    df_rec = df_target.loc[:, cols_rec]
    return df_rec
def predict(self, df):
    """Score each impressed item with the trained lightGBM ranker.

    :param df: Data frame of test sessions
    :return: Data frame with one recommendation string per target row
    """
    feats = fb.build_features(df)

    # Target rows are those whose clickout reference was withheld.
    feats = feats[feats.referenced_item.isna()]

    model_inputs = [
        "position", "prices", "interaction_count", "is_last_interacted"
    ]

    # Higher score means the ranker believes a click is more likely.
    feats.loc[:, "click_propensity"] = self.gbm.predict(feats[model_inputs])

    # Collapse back to one recommendation string per target row.
    f.print_time("summarize recommendations")
    return dff.summarize_recs(feats, "click_propensity")
def fit(self, df):
    """Tally clickouts per item as a popularity signal.

    :param df: Data frame of training sessions
    """
    # Keep only the columns this method needs.
    f.print_time("start")
    wanted = [
        'user_id', 'session_id', 'timestamp', 'step', 'action_type',
        'reference'
    ]
    subset = df.loc[:, wanted]

    # Count how often each item appears as a clickout reference.
    f.print_time("clicks per item")
    clickouts = subset.loc[subset["action_type"] == "clickout item", :]
    counts = clickouts.groupby("reference").size()
    counts = counts.reset_index(name="n_clicks")
    self.df_pop = counts.rename(columns={"reference": "item"})
def fit(self, df):
    """Train the lightGBM ranking model on impression-level features.

    :param df: Data frame of training sessions
    """
    feats = fb.build_features(df)

    # Binary target: 1 when the impressed item is the clicked one.
    f.print_time("target column")
    hit = feats["referenced_item"] == feats["impressed_item"]
    feats.loc[:, "is_clicked"] = hit.astype(int)

    model_inputs = [
        "position",
        "prices",
        "interaction_count",
        "is_last_interacted",
    ]

    # Besides X and y, LGBMRanker needs the per-query group sizes, i.e.
    # how many impressions belong to each clickout event.
    f.print_time("lightGBM format")
    train_x = feats[model_inputs]
    train_y = feats.is_clicked
    group_sizes = (feats
                   .groupby(["user_id", "session_id", "timestamp", "step"])
                   .size()
                   .reset_index(name="query_length")
                   .query_length)

    # Training the actual model
    f.print_time("training lightGBM model")
    self.gbm = lgb.LGBMRanker()
    self.gbm.fit(train_x, train_y, group=group_sizes, verbose=True)
def predict(self, df):
    """Score each impressed item with the trained logistic regression.

    :param df: Data frame of test sessions
    :return: Data frame with one recommendation string per target row
    """
    feats = fb.build_features(df)

    # Target rows are those whose clickout reference was withheld.
    feats = feats[feats.referenced_item.isna()]

    model_inputs = [
        "position", "prices", "interaction_count", "is_last_interacted"
    ]

    # Probability of the positive (clicked) class for every impression.
    f.print_time("predict clickout item")
    proba = self.logreg.predict_proba(feats[model_inputs])
    feats.loc[:, "click_probability"] = proba[:, 1]

    # Collapse back to one recommendation string per target row.
    f.print_time("summarize recommendations")
    return dff.summarize_recs(feats, "click_probability")
def predict(self, df):
    """Randomly shuffle the impressions list for each target row.

    Serves as the random baseline. The RNG is seeded so repeated runs
    produce identical output.

    :param df: Data frame of test sessions
    :return: Data frame with one recommendation string per target row
    """
    # Target row, withheld item ID that needs to be predicted
    f.print_time("target rows")
    df_target = dff.get_target_rows(df.copy())

    # Shuffle each impression list. ``random.sample`` returns a new list
    # in random order — the idiomatic shuffle, clearer than sorting by a
    # random key.
    f.print_time("summarize recommendations")
    random.seed(10121)
    df_target.loc[:, "item_recommendations"] = (
        df_target["impressions"]
        .str.split("|")
        .map(lambda items: ' '.join(random.sample(items, len(items)))))
    cols_rec = [
        "user_id", "session_id", "timestamp", "step", "item_recommendations"
    ]
    df_rec = df_target.loc[:, cols_rec]
    return df_rec
def calc_item_sims(df, item_col, reference_col):
    """Calculate similarity of items based on nearest neighbor algorithm.

    The final data frame will have similarity scores for pairs of items.

    :param df: Data frame of training data
    :param item_col: Name of data frame column that contains the item ID
    :param reference_col: Name of the reference column, depending on the model
        either
        1. session_id for the similarity based on session co-occurrences
        2. properties for the similarity based on item metadata
    :return: Data frame with item pairs and similarity scores
        (columns: item_ref, item_sim, similarity)
    """
    # Create data frame with item and reference indices: assign each unique
    # item / reference a dense integer index so they can be used as sparse
    # matrix coordinates below.
    f.print_time("item and reference indices")
    unique_items = df[item_col].unique()
    unique_refs = df[reference_col].unique()
    d_items = {item_col: unique_items, 'item_idx': range(0, len(unique_items))}
    d_refs = {
        reference_col: unique_refs,
        'ref_idx': range(0, len(unique_refs))
    }
    df_items = pd.DataFrame(data=d_items)
    df_refs = pd.DataFrame(data=d_refs)
    df = (df.merge(df_items, how="inner",
                   on=item_col).merge(df_refs, how="inner", on=reference_col))
    # Binary occurrence entries; duplicates dropped so each (item, reference)
    # pair contributes at most one 1.0 to the matrix.
    df_idx = (df.loc[:, ["item_idx", "ref_idx"]].assign(
        data=lambda x: 1.).drop_duplicates())

    # Build item co-occurrence matrix: M is items x references, so M^T * M
    # counts, for each item pair, how many references they share.
    f.print_time("item co-occurrence matrix")
    mat_coo = sparse.coo_matrix(
        (df_idx.data, (df_idx.item_idx, df_idx.ref_idx)))
    mat_item_coo = mat_coo.T.dot(mat_coo)

    # Calculate Cosine similarities: the diagonal holds each item's total
    # occurrence count, so scaling rows and columns by 1/sqrt(count)
    # normalizes co-occurrence counts to cosine similarity.
    f.print_time("Cosine similarity")
    inv_occ = np.sqrt(1 / mat_item_coo.diagonal())
    cosine_sim = mat_item_coo.multiply(inv_occ)
    cosine_sim = cosine_sim.T.multiply(inv_occ)

    # Create item similarity data frame: map matrix indices back to the
    # original item IDs (item_ref = row item, item_sim = column item).
    f.print_time("item similarity data frame")
    idx_ref, idx_item, sim = sparse.find(cosine_sim)
    d_item_sim = {'idx_ref': idx_ref, 'idx_item': idx_item, 'similarity': sim}
    df_item_sim = pd.DataFrame(data=d_item_sim)
    df_item_sim = (df_item_sim.merge(
        df_items.assign(item_ref=df_items[item_col]),
        how="inner",
        left_on="idx_ref",
        right_on="item_idx").merge(
            df_items.assign(item_sim=df_items[item_col]),
            how="inner",
            left_on="idx_item",
            right_on="item_idx").loc[:, ["item_ref", "item_sim", "similarity"]])
    return df_item_sim
def predict_nn(df, df_item_sim): """Calculate predictions based on the item similarity scores.""" # Select columns that are of interest for this function f.print_time("start") cols = [ 'user_id', 'session_id', 'timestamp', 'step', 'action_type', 'reference', 'impressions' ] df_cols = df.loc[:, cols] # Get previous reference per user f.print_time("previous reference") df_cols["previous_reference"] = (df_cols.sort_values( by=["user_id", "session_id", "timestamp"], ascending=[True, True, True]).groupby(["user_id"])["reference"].shift(1)) # Target row, withheld item ID that needs to be predicted f.print_time("target rows") df_target = dff.get_target_rows(df_cols) # Explode to impression level f.print_time("explode impression array") df_impressions = dff.explode(df_target, "impressions") df_item_sim["item_ref"] = df_item_sim["item_ref"].astype(str) df_item_sim["item_sim"] = df_item_sim["item_sim"].astype(str) # Get similarities f.print_time("get similarities") df_impressions = (df_impressions.merge( df_item_sim, how="left", left_on=["previous_reference", "impressions"], right_on=["item_ref", "item_sim"]).fillna(value={ 'similarity': 0 }).sort_values(by=["user_id", "timestamp", "step", "similarity"], ascending=[True, True, True, False])) # Summarize recommendations f.print_time("summarize recommendations") df_rec = dff.group_concat(df_impressions, ["user_id", "session_id", "timestamp", "step"], "impressions") df_rec = (df_rec.rename(columns={ 'impressions': 'item_recommendations' }).loc[:, [ "user_id", "session_id", "timestamp", "step", "item_recommendations" ]]) return df_rec
def build_features(df):
    """Build features for the lightGBM and logistic regression model.

    Produces one row per impressed item of each clickout event, with the
    features: position, prices, interaction_count, is_last_interacted.

    :param df: Data frame of session actions
    :return: Data frame at impression level with feature columns plus
        referenced_item (NaN for target rows) and impressed_item
    """
    # Select columns that are of interest for this method
    f.print_time("start")
    cols = [
        'user_id', 'session_id', 'timestamp', 'step', 'action_type',
        'reference', 'impressions', 'prices'
    ]
    df_cols = df.loc[:, cols]

    # We are only interested in action types, for which the reference is an
    # item ID
    f.print_time("filter interactions")
    item_interactions = [
        'clickout item', 'interaction item deals', 'interaction item image',
        'interaction item info', 'interaction item rating', 'search for item'
    ]
    df_actions = (df_cols.loc[df_cols.action_type.isin(item_interactions), :].
                  copy().rename(columns={'reference': 'referenced_item'}))

    f.print_time("cleaning")
    # Clean of instances that have no reference; clickouts with a missing
    # reference are kept because they are the prediction targets
    idx_rm = (df_actions.action_type !=
              "clickout item") & (df_actions.referenced_item.isna())
    df_actions = df_actions[~idx_rm]

    # Get item ID of previous interaction of a user
    # NOTE(review): grouping is by user_id only, so the "previous" item can
    # come from an earlier session of the same user — confirm intended
    f.print_time("previous interactions")
    df_actions.loc[:, "previous_item"] = (df_actions.sort_values(
        by=["user_id", "session_id", "timestamp", "step"],
        ascending=[True, True, True,
                   True]).groupby(["user_id"])["referenced_item"].shift(1))

    # Combine the impressions and item column, they both contain item IDs
    # and we can expand the impression lists in the next step to get the total
    # interaction count for an item
    f.print_time("combining columns - impressions")
    df_actions.loc[:, "interacted_item"] = np.where(
        df_actions.impressions.isna(), df_actions.referenced_item,
        df_actions.impressions)
    df_actions = df_actions.drop(columns="impressions")

    # Price array expansion will get easier without NAs
    f.print_time("combining columns - prices")
    df_actions.loc[:, "prices"] = np.where(df_actions.prices.isna(), "",
                                           df_actions.prices)

    # Convert pipe separated lists into columns (one row per list element)
    f.print_time("explode arrays")
    df_items = dff.explode_mult(df_actions,
                                ["interacted_item", "prices"]).copy()

    # Feature: Number of previous interactions with an item
    # (cumcount counts strictly earlier rows for the same user/item pair)
    f.print_time("interaction count")
    df_items.loc[:, "interaction_count"] = (df_items.groupby(
        ["user_id", "interacted_item"]).cumcount())

    # Reduce to impression level again: only clickout rows carry an
    # impression list
    f.print_time("reduce to impressions")
    df_impressions = (
        df_items[df_items.action_type == "clickout item"].copy().drop(
            columns="action_type").rename(
                columns={"interacted_item": "impressed_item"}))

    # Feature: Position of item in the original list.
    # Items are in original order after the explode for each index
    f.print_time("position feature")
    df_impressions.loc[:, "position"] = (df_impressions.groupby(
        ["user_id", "session_id", "timestamp", "step"]).cumcount() + 1)

    # Feature: Is the impressed item the last interacted item
    f.print_time("last interacted item feature")
    df_impressions.loc[:, "is_last_interacted"] = (
        df_impressions["previous_item"] == df_impressions["impressed_item"]
    ).astype(int)

    # Prices arrive as strings from the exploded pipe-separated list
    f.print_time("change price datatype")
    df_impressions.loc[:, "prices"] = df_impressions.prices.astype(int)

    return_cols = [
        "user_id",
        "session_id",
        "timestamp",
        "step",
        "position",
        "prices",
        "interaction_count",
        "is_last_interacted",
        "referenced_item",
        "impressed_item",
    ]
    df_return = df_impressions[return_cols]
    return df_return