def _fit(self, mode): """ has target indices equal to the 10% of the session with: - no num ref - more than 1 step but anyway we train on all of them ;) """ def RepresentsInt(s): try: int(s) return True except ValueError: return False train = data.train_df('small') test = data.test_df('small') tgt_indices = data.target_indices('small') real_test_to_keep = [] for idx in tgt_indices: usr_sess_indices = [] theres_int = False a_user = test.at[idx, 'user_id'] a_sess = test.at[idx, 'session_id'] usr_sess_indices.append(idx) j = idx-1 pos_moved = 0 while j >= 0: try: new_user = test.at[j, 'user_id'] new_sess = test.at[j, 'session_id'] if new_user == a_user and new_sess == a_sess: usr_sess_indices.append(j) reference = test.at[j, 'reference'] if RepresentsInt(reference): theres_int = True j -= 1 pos_moved += 1 else: if not (pos_moved == 0 or theres_int): real_test_to_keep += sorted(usr_sess_indices) break except: if j < test.index.values[0]: if not (pos_moved == 0 or theres_int): real_test_to_keep += sorted(usr_sess_indices) break else: j -= 1 self.train_indices = train.index.values real_test_indices = retrieve_real_test_indices(mode, 'no_cluster') all_test_indices = data.test_df(mode).index.values self.test_indices = sorted(list(set(all_test_indices) - set(real_test_indices)) + real_test_to_keep) self.target_indices = sorted(list(set(self.test_indices) & set(tgt_indices)))
def fit(self):
    df_test = data.test_df(self.mode, cluster=self.cluster)

    print("{}: creating grouped sessions with interaction lists".format(self.name))
    session_groups = self.get_groupby_sessions_references(
        data.test_df(self.mode, cluster=self.cluster))

    # get the target sessions
    target_indices = data.target_indices(self.mode, self.cluster)
    df_test_target = df_test[df_test.index.isin(target_indices)]

    # reason on session_ids, since we need the last interactions of each session
    df_test_target = df_test_target.set_index("session_id")

    # build a dictionary to re-map sessions onto their original indices
    if len(df_test_target.index) != len(target_indices):
        print("Indices are not the same length as the sessions, aborting...")
        return
    self.dictionary_indices = dict(zip(df_test_target.index, target_indices))

    list_sessions = session_groups.index
    recs_tuples = []
    print("{}: fitting the model".format(self.name))
    for i in tqdm(df_test_target.index):
        # sessions without interactions get an empty recommendation list
        if i not in list_sessions:
            recs_tuples.append((self.dictionary_indices.get(i), []))
        else:
            # interacted elements of the session, without duplicates
            interacted_elements = np.asarray(session_groups.at[i, "sequence"])
            interacted_elements = np.asarray(
                self._set_no_reordering(x for x in interacted_elements))
            impressions = np.asarray(df_test_target.at[i, "impressions"].split("|"))
            # make sure the impressions contain all the interacted elements
            # (anything else must be cut off from the relevant items)
            mask_only_in_impression = np.in1d(
                interacted_elements, impressions, assume_unique=True)
            interacted_elements = interacted_elements[mask_only_in_impression]
            # recommend the last interacted elements first
            # (i.e. invert the order of the relevant elements)
            real_recommended = np.flipud(interacted_elements).astype(int)
            recs_tuples.append((self.dictionary_indices.get(i),
                                list(real_recommended)[:self.k_first_only_to_recommend]))
    self.recs_batch = recs_tuples
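# A minimal standalone sketch (not part of the original module) of the ordering
# logic in fit() above. It assumes _set_no_reordering keeps the first occurrence
# of each element; the sample references are hypothetical.
def _order_preserving_dedup_demo():
    import numpy as np
    interacted = ['8341', '1002', '8341', '77', '1002']
    deduped = list(dict.fromkeys(interacted))                # first-occurrence dedup, order kept
    recommended = np.flipud(np.asarray(deduped, dtype=int))  # most recent interaction first
    assert list(recommended) == [77, 1002, 8341]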
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])

    temp = df[df.action_type == "clickout item"]
    temp = temp.drop_duplicates("session_id", keep="last")
    temp = temp[["user_id", "session_id", "step", "impressions"]]

    actions = list()
    for index, row in tqdm(temp.iterrows(), desc="Scanning clickouts"):
        if index > 0:
            if int(row.step) == 1:
                # the clickout is the first action of the session
                actions.append(0)
            else:
                # check whether the previous action involved the first impression
                impression = list(map(int, row.impressions.split("|")))[0]
                reference = df.loc[index - 1, "reference"]
                if isinstance(reference, str) and reference.isdigit() \
                        and int(reference) == impression:
                    actions.append(1)
                else:
                    actions.append(0)
        else:
            actions.append(0)

    temp = temp.drop(["step", "impressions"], axis=1)
    temp["last_action_involving_first_impression"] = actions
    return temp
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])

    # remove the last clickouts and the last part of each session
    new_df = remove_last_part_of_clk_sessions(df)
    new_df = new_df.drop(find(new_df))
    no_last_clks_numeric = new_df[new_df.reference.str.isnumeric() == True][[
        'user_id', 'session_id', 'action_type', 'reference'
    ]]
    # keep it fast and avoid loops: drop duplicates and keep the last
    # occurrence of each user-session-item tuple
    last_actions = no_last_clks_numeric.drop_duplicates(
        ['user_id', 'session_id', 'reference'], keep='last')
    last_actions = last_actions.rename(columns={
        'reference': 'item_id',
        'action_type': 'last_action_involving_impression'
    })
    last_actions.item_id = last_actions.item_id.astype(int)

    # get the last clickouts and expand their impressions
    last_clk = df.loc[find(df)]
    clk_expanded = expand_impressions(last_clk)[['user_id', 'session_id', 'item_id']]

    # merge and fill NaNs with 'no_action', as in the original feature
    feature = pd.merge(clk_expanded, last_actions, how='left',
                       on=['user_id', 'session_id', 'item_id'])
    feature.last_action_involving_impression = \
        feature.last_action_involving_impression.astype(object).fillna('no_action')
    return feature
def _fit(self, mode):
    self.train_indices = data.train_df(mode).index.values
    df = data.test_df(mode)
    self.test_indices = df.index.values

    # the missing-reference clickouts are the candidate targets
    just_missing_refs = df[df['reference'].isnull()]
    just_missing_refs = just_missing_refs[just_missing_refs['action_type'] == 'clickout item']

    # keep only the clickouts whose immediately preceding interaction in the
    # same session has a numeric reference
    idx_last_ref_numeric = []
    for idx, row in just_missing_refs.iterrows():
        sess = row['session_id']
        i = 1
        while True:
            if not self.existsIndex(df, idx - i):
                break
            prev_row = df.loc[idx - i]
            if prev_row['session_id'] != sess:
                break
            if self.RepresentsInt(prev_row['reference']):
                if i == 1:
                    idx_last_ref_numeric.append(idx)
                break
            else:
                i += 1
    self.target_indices = idx_last_ref_numeric
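# A compact sketch (toy rows, not real dataset rows) of the targeting rule in
# _fit above: a missing-reference clickout is kept only when the row
# immediately before it, in the same session, has a numeric reference.
def _target_rule_demo():
    rows = [
        {'session_id': 's1', 'reference': '8341'},  # numeric interaction
        {'session_id': 's1', 'reference': None},    # clickout to predict -> kept
        {'session_id': 's2', 'reference': 'sort'},  # non-numeric interaction
        {'session_id': 's2', 'reference': None},    # clickout to predict -> skipped
    ]
    def keep(idx):
        prev = rows[idx - 1]
        return (prev['session_id'] == rows[idx]['session_id']
                and str(prev['reference']).isdigit())
    assert keep(1) is True and keep(3) is False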
def fit(self): """ Create list of tuples for recommendations ordering them by impressions """ df_test = data.test_df(self.mode) target_indices = data.target_indices(self.mode, self.cluster) df_test_target = df_test[df_test.index.isin(target_indices)] # Initializing list of recs recs_tuples = [] for i in tqdm(df_test_target.index): impressions = df_test_target.at[i, "impressions"] impressions = list(map(int, impressions.split('|'))) prices = df_test_target.at[i, "prices"] prices = list(map(int, prices.split('|'))) temp_dict = {} for j in range(len(impressions)): temp_dict[impressions[j]] = int(prices[j]) ordered_recs = sorted(temp_dict, key=temp_dict.__getitem__) recs_tuples.append((i, ordered_recs)) self.recs_batch = recs_tuples return recs_tuples
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])

    target_indices = list(df[df.action_type == "clickout item"]
                          .drop_duplicates("session_id", keep="last").index)
    temp = df.loc[target_indices]

    # price statistics of the impressions other than the first one
    avg_prices = list()
    min_prices = list()
    max_prices = list()
    for t in tqdm(zip(temp["session_id"], temp["prices"]),
                  desc="Retrieving price stats of non-first impressions per clickout"):
        prices = list(map(int, t[1].split("|")))
        prices = prices[1:]  # exclude the first impression
        if len(prices) > 0:
            avg_prices.append(mean(prices))
            min_prices.append(min(prices))
            max_prices.append(max(prices))
        else:
            avg_prices.append(0)
            min_prices.append(0)
            max_prices.append(0)

    if len(target_indices) != len(avg_prices):
        print("Something went wrong, blame Piccio")
        exit(69)

    temp = df.loc[target_indices, ["user_id", "session_id"]]
    temp["avg_prices_other_impressions"] = avg_prices
    temp["min_price"] = min_prices
    temp["max_price"] = max_prices
    return temp
def extract_feature(self):
    self.current_directory = Path(__file__).absolute().parent
    self.data_dir = self.current_directory.joinpath('..', '..', 'stacking', self.mode)

    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    last_indices = find(df)

    # extract the scores
    self.train_dir = self.data_dir.joinpath('test')
    for file in glob.glob(str(self.train_dir) + '/rnn*'):
        rnn = np.load(file)
        rnn = pd.DataFrame(rnn, columns=['index', 'item_recommendations', 'scores'])
        rnn = rnn.astype({'index': int})
        rnn = rnn[rnn['index'].isin(last_indices)]
        rnn_idx = list(rnn['index'])
        print(f'Rnn indices are : {len(set(rnn_idx))}')
        print(f'Last indices are : {len(last_indices)}')
        common = set(rnn_idx) & set(last_indices)
        print(f'In common : {len(common)}')

    t = assign_score(rnn, 'rnn')
    t = t.sort_values(by='index')
    df['index'] = df.index.values
    df = df[['user_id', 'session_id', 'index']]
    df = pd.merge(t, df, how='left', on=['index'])
    num_idx = len(set(df['index'].values))
    print(num_idx)
    return df[['user_id', 'session_id', 'item_id', 'score_rnn']]
def extract_feature(self):
    feature = TopPopPerImpression(mode=self.mode, cluster=self.cluster).read_feature()
    items = dict()
    for t in tqdm(zip(feature.item_id, feature.top_pop_per_impression),
                  desc="Creating item dict..."):
        items[t[0]] = t[1]

    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])

    target_indices = list(df[df.action_type == "clickout item"]
                          .drop_duplicates("session_id", keep="last").index)
    temp = df[df.index.isin(target_indices)]

    first_pop = list()
    max_pop_in_impressions = list()
    for t in tqdm(temp.impressions):
        impressions = list(map(int, t.split("|")))
        # popularity of the first impression
        fi = impressions[0]
        fi_pop = items[fi] if fi in items else 0
        first_pop.append(fi_pop)
        # maximum popularity among all the impressions
        max_pop = fi_pop
        for i in impressions[1:]:
            if i in items:
                max_pop = max(items[i], max_pop)
        max_pop_in_impressions.append(max_pop)

    temp = temp[["user_id", "session_id"]]
    temp["pop_first_impression"] = first_pop
    temp["max_pop_in_impressions"] = max_pop_in_impressions
    return temp
def extract_feature(self):
    def func(x):
        change_of_sort_order_actions = x[x['action_type'] == 'change of sort order']
        if len(change_of_sort_order_actions) > 0:
            y = x[x['action_type'] == 'clickout item']
            if len(y) > 0:
                # consider only the actions that happened before the last clickout
                clk = y.tail(1)
                head_index = x.head(1).index
                x = x.loc[head_index.values[0]:clk.index.values[0] - 1]
                change_of_sort_order_actions = x[x['action_type'] == 'change of sort order']
            if len(change_of_sort_order_actions) > 0:
                # the active sort order is the last one set before the clickout
                change_of_sort_order_actions = change_of_sort_order_actions.tail(1)
                return change_of_sort_order_actions['reference'].values[0]
        return 'our recommendations'

    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    s = df.groupby(['user_id', 'session_id']).progress_apply(func)
    return pd.DataFrame({
        'user_id': [x[0] for x in s.index.values],
        'session_id': [x[1] for x in s.index.values],
        'sort_order_active_when_clickout': s.values
    })
def supersampling(mode):
    print("Supersampling for mode: {}".format(mode))
    train = data.train_df(mode)
    class_to_sessions = get_class_to_sessions_dict(train)
    session_to_indices = get_session_to_indices_dict(train)
    sessions_to_be_resampled = resample_session(class_to_sessions.copy(), train)
    new = duplicate_sessions(sessions_to_be_resampled.copy(), train, session_to_indices)

    test = data.test_df(mode)
    # shift the duplicated rows after the highest existing index
    max_index = max(max(test.index), max(train.index))
    new.index += max_index + 1
    new = pd.concat([train, new])
    train_len = len(new)
    old_starting_index = test.index[0]
    new = pd.concat([new, test])

    print("Supersampling ended for mode={}, saving df".format(mode))
    new_train = new.iloc[:train_len]
    new_test = new.iloc[train_len:]
    # new_starting_index = new_test.index[0]
    # offset = new_starting_index - old_starting_index
    # target_indices += offset
    target_indices = data.target_indices(mode, "no_cluster")
    np.save(path + "/" + mode + "/target_indices", target_indices)
    new_train.to_csv(path + "/" + mode + "/train.csv", index=True)
    new_test.to_csv(path + "/" + mode + "/test.csv", index=True)
def preprocess_cv(mode='full'):
    def save_folds(df, user_session_df, train_index, test_index, count, mode):
        u_s_train = list(user_session_df.loc[train_index]['user_session'].values)
        u_s_test = list(user_session_df.loc[test_index]['user_session'].values)
        path = 'dataset/preprocessed/{}/{}'.format('fold_' + str(count), mode)
        check_folder(path)

        train = df[df['user_session'].isin(u_s_train)]
        train = train.drop(['user_session'], axis=1)
        train.to_csv(os.path.join(path, 'train.csv'))
        train_indices = train.index.values
        np.save(os.path.join(path, 'train_indices'), train_indices)

        test = df[df['user_session'].isin(u_s_test)].copy()
        target_indices = sorted(find(test))
        test.loc[target_indices, 'reference'] = np.nan
        test = test.drop(['user_session'], axis=1)
        test.to_csv(os.path.join(path, 'test.csv'))
        test_indices = test.index.values
        np.save(os.path.join(path, 'test_indices'), test_indices)
        np.save(os.path.join(path, 'target_indices'), target_indices)

        print(f'Train shape : {train.shape} , Test shape : {test.shape}')
        print(f'Last clickout indices : {len(target_indices)}')

    train_df = data.train_df(mode=mode, cluster='no_cluster')
    train_df['user_session'] = train_df['user_id'].values + '_' + train_df['session_id'].values
    test_df = data.test_df(mode=mode, cluster='no_cluster')
    test_df['user_session'] = test_df['user_id'].values + '_' + test_df['session_id'].values
    df = pd.concat([train_df, test_df])

    # extract the user_sessions referring to the target_indices
    target_indices = data.target_indices(mode=mode, cluster='no_cluster')
    test_target_u_s = test_df.loc[target_indices].drop_duplicates(
        'user_session')['user_session'].to_list()
    print(f'Number of user_session in target_indices : {len(test_target_u_s)}')

    # remove those sessions from df
    df = df[~df['user_session'].isin(test_target_u_s)]
    user_session_df = df.drop_duplicates('user_session')
    user_session_df = user_session_df.reset_index(drop=True)
    print(f'Number of user_session NOT in target_indices : {user_session_df.shape[0]}')

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for i, (train_index, test_index) in enumerate(kf.split(user_session_df)):
        print(f'train indices : {len(train_index)}, test indices : {len(test_index)}')
        save_folds(df, user_session_df, train_index, test_index, i, mode)
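# A minimal sketch (synthetic ids) of the grouping idea behind preprocess_cv:
# the folds are split over unique user_session keys rather than over raw rows,
# so every row of a session lands entirely on one side of the split.
def _session_level_kfold_demo():
    import pandas as pd
    from sklearn.model_selection import KFold
    df = pd.DataFrame({'user_session': ['a_1', 'a_1', 'b_2', 'c_3', 'c_3', 'd_4']})
    unique_us = df.drop_duplicates('user_session').reset_index(drop=True)
    kf = KFold(n_splits=2, shuffle=True, random_state=42)
    for train_idx, test_idx in kf.split(unique_us):
        test_keys = set(unique_us.loc[test_idx, 'user_session'])
        in_test = df['user_session'].isin(test_keys)
        # no session is split across train and test
        assert set(df[in_test].user_session).isdisjoint(df[~in_test].user_session)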
def extract_feature(self):
    def func(x):
        def last_important_steps(x):
            # last occurrence of each "important" action type
            y = x[x.action_type == 'filter selection'].tail(1)
            i = x[x.action_type == 'search for item'].tail(1)
            d = x[x.action_type == 'search for destination'].tail(1)
            p = x[x.action_type == 'search for poi'].tail(1)
            steps = [y, i, d, p]
            # the session "starts" at the most recent important step, if any
            _from = 1
            _from_serie = x.head(1)
            for s in steps:
                if not s.step.empty:
                    if s.step.values[0] > _from:
                        _from = s.step.values[0]
                        _from_serie = s
            return pd.Series({
                'session_length_timestamp':
                    int(x.tail(1)['timestamp'].values[0]) - int(_from_serie['timestamp'].values[0]),
                'session_length_step':
                    int(x.tail(1).step) - int(_from) + 1
            })

        _important_steps = x.groupby(['user_id', 'session_id']).progress_apply(last_important_steps)
        return pd.DataFrame(_important_steps).reset_index()

    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    return func(df)
def _fit(self, mode): """ Cluster and predict for the test sessions without any numerical reference interactions self.train_indices: will contain all the train interactions self.test_indices: will contain all the test interactions self.target_indices: will contain the test interactions of sessions without any other numerical reference interaction """ # use only train of only cluster train = data.train_df(mode) train_groups = train.groupby(['session_id', 'user_id'], as_index=False).progress_apply( self.func_remove_steps_over_clk) self.train_indices = [x[1] for x in train_groups.index.values] # Those are groups of train I need, now let's keep only last clickout as part of the session test = data.test_df(mode) test_df = test.groupby(['session_id', 'user_id']) test_df = test_df.progress_apply(self.func_remove_steps_over_clk_test) if test_df.shape[0] > 0: self.target_indices = test_df[test_df.action_type == 'clickout item'].index.values # test_df has only those indices belonging to desired sessions cluster self.test_indices = list(list(zip(*test_df.index.values))[2])
def create_weights_position(train_df, mode, cluster):
    train = data.train_df(mode, cluster)
    test = data.test_df(mode, cluster)
    df = pd.concat([train, test])

    # for each user-session, get the position of the clicked item
    df_clks = df[(df['reference'].str.isnumeric() == True)
                 & (df['action_type'] == 'clickout item')][
        ['user_id', 'session_id', 'reference', 'impressions']]
    df_clks.impressions = df_clks.impressions.str.split('|')
    new_col = []
    for t in tqdm(zip(df_clks.reference, df_clks.impressions)):
        if t[0] in t[1]:
            new_col.append(t[1].index(t[0]) + 1)
        else:
            new_col.append(-1)
    df_clks['pos_clicked'] = new_col
    pos_clicked_list = df_clks.pos_clicked.tolist()

    # create the dictionary {position: score}, where the score function is
    # 1 - (#clicks at pos / tot_rows)
    dict_pos_score = {}
    for i in tqdm(range(1, 26)):
        dict_pos_score[i] = 1 - (pos_clicked_list.count(i) / len(pos_clicked_list))

    # group per user-session
    group = train_df.drop_duplicates(['user_id', 'session_id'])[
        ['user_id', 'session_id']].reset_index(drop=True)

    # assign a weight to each session based on the clicked position
    gr = train_df[train_df.label == 1][['user_id', 'session_id', 'impression_position']]
    new_col = []
    for p in gr.impression_position:
        if p not in range(1, 26):
            new_col.append(0)
        else:
            new_col.append(dict_pos_score[p])
    gr['weight'] = new_col
    final = pd.merge(group, gr, how='left', on=['user_id', 'session_id']).fillna(0)
    sample_weights = final['weight'].values
    return sample_weights
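# Worked mini-example (hypothetical counts) of the position-weight formula used
# above, weight(pos) = 1 - count(pos) / total_clicks: frequently clicked
# positions receive LOWER weights.
def _position_weight_demo():
    pos_clicked_list = [1, 1, 1, 2, 3]
    total = len(pos_clicked_list)
    dict_pos_score = {i: 1 - pos_clicked_list.count(i) / total for i in range(1, 26)}
    assert dict_pos_score[1] == 0.4   # position 1 is clicked most often
    assert dict_pos_score[3] == 0.8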
def extract_feature(self):
    o = ImpressionFeature(self.mode)
    f = o.read_feature()
    f = f.drop(['properties'], axis=1)
    f['popularity'] = 0
    pop = dict(zip(f.item_id.values, f.popularity.values))

    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])

    # count the clickout interactions, excluding the last clickout of each session
    last_clickout_indices = find(df)
    df_dropped_last_clickouts = df.drop(last_clickout_indices)
    df_no_last_clickouts = df_dropped_last_clickouts[
        (df_dropped_last_clickouts.action_type == 'clickout item')
        & ~(df_dropped_last_clickouts.reference.isnull())]
    references = df_no_last_clickouts.reference.values
    for r in references:
        pop[int(r)] += 1

    final_df = pd.DataFrame(
        list(pop.items()),
        columns=['item_id', 'top_pop_interaction_clickout_per_impression'])
    return final_df
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])

    last_clickout_indices = find(df)
    clickout_rows = df.loc[last_clickout_indices,
                           ['user_id', 'session_id', 'reference', 'action_type', 'impressions']]
    reference_rows = df[(df.reference.str.isnumeric() == True)
                        & (df.action_type == 'clickout item')]

    # number of clickouts per item over the whole dataset
    df_item_clicks = (
        reference_rows
        .groupby(["reference"])
        .size()
        .reset_index(name="n_interactions_per_item")
    )
    df_item_clicks = df_item_clicks.rename(columns={'reference': 'item_id'})
    df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)

    clk_expanded = expand_impressions(clickout_rows)
    final_feature = pd.merge(clk_expanded, df_item_clicks, how='left',
                             on=['item_id']).fillna(0)
    final_feature.n_interactions_per_item = final_feature.n_interactions_per_item.astype(int)
    final_feature = final_feature.drop(['index'], axis=1)
    final_feature.reference = final_feature.reference.astype(int)

    # exclude the session's own click from the count of the clicked item
    new_column = []
    for t in zip(final_feature.item_id, final_feature.reference,
                 final_feature.n_interactions_per_item):
        if t[0] == t[1]:
            new_column.append(int(t[2] - 1))
        else:
            new_column.append(int(t[2]))
    final_feature['personalized_popularity'] = new_column

    final_feature_reduced = final_feature[
        ['user_id', 'session_id', 'item_id', 'personalized_popularity']]
    return final_feature_reduced
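# Worked example (made-up counts) of the "personalized popularity" above: the
# global clickout count is decremented by one when the impression is the item
# actually clicked in the session, so the session's own click is excluded.
def _personalized_popularity_demo():
    item_id, clicked_reference, n_interactions = 42, 42, 7
    personalized = n_interactions - 1 if item_id == clicked_reference else n_interactions
    assert personalized == 6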
def recommend_batch(self):
    svm_filename = 'svmlight_test.txt'
    _path = self.data_dir.joinpath(svm_filename)
    X_test, y_test = load_svmlight_file(str(_path))

    target_indices = data.target_indices(self.mode, self.cluster)
    target_indices.sort()
    test = data.test_df(self.mode, self.cluster)
    print('data for test ready')

    scores = list(self.model.predict(X_test))
    final_predictions = []
    count = 0
    for index in tqdm(target_indices):
        impressions = list(map(int, test.loc[index]['impressions'].split('|')))
        # the scores are laid out contiguously, one block per clickout
        predictions = scores[count:count + len(impressions)]
        couples = list(zip(predictions, impressions))
        couples.sort(key=lambda x: x[0], reverse=True)
        _, sorted_impr = zip(*couples)
        final_predictions.append((index, list(sorted_impr)))
        count += len(impressions)
    return final_predictions
def extract_feature(self):
    def count_freq(x):
        r = []
        y = x[x['action_type'] == 'clickout item']
        if len(y) > 0:
            # count how many times each impression of the last clickout
            # appeared in the previous clickouts of the session
            clk = y.tail(1)
            x = x[x['step'] < int(clk['step'])]
            list_impressions = list(x[~x.impressions.isnull()].impressions.values)
            impressions = ('|'.join(list_impressions)).split('|')
            impr = clk.impressions.values[0].split('|')
            for i in impr:
                r.append((i, impressions.count(i) + 1))
        return r

    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    df = df.drop(['timestamp', 'reference', 'platform', 'city', 'device',
                  'current_filters', 'prices'], axis=1)

    s = df.groupby(['user_id', 'session_id']).progress_apply(count_freq)
    s = s.apply(pd.Series).reset_index() \
         .melt(id_vars=['user_id', 'session_id'], value_name='tuple') \
         .sort_values(by=['user_id', 'session_id']).dropna()
    s[['item_id', 'times_impression_appeared_in_clickouts_session']] = \
        pd.DataFrame(s['tuple'].tolist(), index=s.index)
    s = s.drop(['variable', 'tuple'], axis=1)
    s = s.reset_index(drop=True)
    return s
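# An equivalent counting sketch using collections.Counter (assumed sample
# strings, same '|'-separated format): each impression of the final clickout
# gets (occurrences in the session's earlier clickouts) + 1.
def _impression_count_demo():
    from collections import Counter
    earlier = '11|22|33|22'.split('|')   # impressions of the earlier clickouts
    final_impr = ['22', '44']            # impressions of the last clickout
    counts = Counter(earlier)
    feature = [(i, counts[i] + 1) for i in final_impr]
    assert feature == [('22', 3), ('44', 1)]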
def merge_consecutive_equal_actions():
    tqdm.pandas()
    test = data.test_df('full')
    test_grouped_by_session_id = test.groupby('session_id')
    merged = test_grouped_by_session_id.progress_apply(_merge_consecutive_equal_actions)
    cf.check_folder('dataset/cleaned_csv')
    merged.to_csv('dataset/cleaned_csv/test.csv')
def train_indices(mode='local', cluster='no_cluster'):
    df_train = data.train_df(mode=mode, cluster=cluster)
    df_test = data.test_df(mode=mode, cluster=cluster)
    target_indices = data.target_indices(mode=mode, cluster=cluster)
    df = pd.concat([df_train, df_test])
    # train on every last clickout that is not a prediction target
    idx = find_last_clickout_indices(df)
    train_idx = set(idx) - set(target_indices)
    return train_idx
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    idxs_click = find_last_clickout_indices(df)
    temp = df[['user_id', 'session_id', 'step', 'timestamp']]

    session_id_l = []
    length_step_l = []
    length_timestamp_l = []
    timestamp_last_action_l = []
    final_timestamp_l = []
    user_id_l = []
    for i in tqdm(idxs_click):
        user_id = temp.at[i, 'user_id']
        session_id = temp.at[i, 'session_id']
        step = temp.at[i, 'step']
        f_timestamp = temp.at[i, 'timestamp']
        # timestamp of the first action of the session
        i_timestamp = temp.at[i - (step - 1), 'timestamp']
        if step > 1:
            timestamp_last_action = temp.at[i - 1, 'timestamp']
        else:
            timestamp_last_action = f_timestamp
        user_id_l.append(user_id)
        session_id_l.append(session_id)
        length_step_l.append(int(step))
        length_timestamp_l.append(int(f_timestamp - i_timestamp))
        timestamp_last_action_l.append(int(timestamp_last_action))
        final_timestamp_l.append(int(f_timestamp))

    final_df = pd.DataFrame({
        'user_id': user_id_l,
        'session_id': session_id_l,
        'length_step': length_step_l,
        'length_timestamp': length_timestamp_l,
        'timestamp_last_action': timestamp_last_action_l,
        'final_timestamp': final_timestamp_l
    })
    final_df['mean_time_action'] = final_df['length_timestamp'] / final_df['length_step']
    final_df['elapsed_last_action_click'] = \
        final_df['final_timestamp'] - final_df['timestamp_last_action']
    final_df['elapsed_last_action_click_log'] = \
        np.log(final_df['elapsed_last_action_click'] + 1)
    final_df['variance_last_action'] = \
        (final_df['elapsed_last_action_click'] - final_df['mean_time_action']) ** 2
    final_df['std_last_action'] = \
        abs(final_df['elapsed_last_action_click'] - final_df['mean_time_action'])
    final_df.drop(['timestamp_last_action', 'final_timestamp', 'mean_time_action',
                   'length_step', 'length_timestamp', 'elapsed_last_action_click'],
                  axis=1, inplace=True)
    return final_df
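# Worked arithmetic (made-up timestamps) for the derived columns above:
# mean_time_action is the session duration divided by its steps, and
# std_last_action is the absolute deviation of the last gap from that mean.
def _session_stats_demo():
    length_timestamp, length_step = 120, 4
    elapsed_last_action_click = 45
    mean_time_action = length_timestamp / length_step   # 30.0 seconds per action
    variance_last_action = (elapsed_last_action_click - mean_time_action) ** 2
    std_last_action = abs(elapsed_last_action_click - mean_time_action)
    assert (mean_time_action, variance_last_action, std_last_action) == (30.0, 225.0, 15.0)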
def fit_predict(self, multithreading=True, save_folder='scores/'):
    # compute the scores of the 5 folds, in parallel or sequentially
    if multithreading:
        self.scores = Parallel(backend='multiprocessing', n_jobs=-1, max_nbytes=None)(
            delayed(self._fit_model)(i) for i in range(5))
    else:
        self.scores = [self._fit_model(i) for i in range(5)]
    print(len(self.scores))

    model = self.model_class(mode=self.mode, cluster='no_cluster', **self.init_params)
    model.fit()
    scores_test = model.get_scores_batch()
    self.scores.append(scores_test)
    # flatten the per-fold score lists
    self.scores = [item for sublist in self.scores for item in sublist]

    scores = pd.DataFrame(self.scores, columns=['index', 'item_recommendations', 'scores'])
    scores = scores.sort_values(by='index')
    print(scores)
    idx_scores = set(scores['index'].values)

    train_full = data.train_df(mode='full', cluster='no_cluster')
    test_full = data.test_df(mode='full', cluster='no_cluster')
    full = pd.concat([train_full, test_full])
    full = full[['user_id', 'session_id', 'action_type']]
    last_clk_full = full.loc[idx_scores]

    # check that all the rows are clickouts
    num_not_clk_row = last_clk_full[last_clk_full['action_type'] != 'clickout item'].shape[0]
    print(f'Number of not clickout rows is : {num_not_clk_row}')
    if num_not_clk_row != 0:
        print("Error, some indices are not clickouts")

    last_clk_full = last_clk_full.drop(['action_type'], axis=1)
    last_clk_full['index'] = last_clk_full.index
    merged = last_clk_full.merge(scores, on=['index'])

    model_name = model.name
    df = assign_score(merged, self.model_name)
    df = df.drop(['index'], axis=1)

    if save_folder is not None:
        check_folder(save_folder)
        filepath = os.path.join(save_folder, model_name + '.csv.gz')
        print('Saving scores to', filepath, end=' ', flush=True)
        df.to_csv(filepath, index=False, compression='gzip')
        print('Done!', flush=True)
    return df
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    temp = df.fillna('0')

    idxs_click = sorted(find_last_clickout_indices(temp))
    idxs_numeric_reference = temp[temp['reference'].str.isnumeric() == True].index

    count = 0
    last_click = idxs_click[0]
    impr_features = {}   # per-session accumulator: item -> interaction stats
    impr_feature = []    # one dict per impression of each last clickout
    for i in tqdm(sorted(idxs_numeric_reference)):
        if i == last_click:
            # at the last clickout, flush the accumulated stats for each impression
            impressions = list(map(int, temp.at[i, 'impressions'].split('|')))
            click_timestamp = temp.at[i, 'timestamp']
            click_step = temp.at[i, 'step']
            for impr in impressions:
                if impr not in impr_features:
                    impr_feature.append({'num_interactions_impr': 0,
                                         'step_from_last_interaction': -1,
                                         'timestamp_from_last_interaction': -1,
                                         'last_action_type_with_impr': 'None'})
                else:
                    impr_features[impr]['timestamp_from_last_interaction'] = \
                        click_timestamp - impr_features[impr]['timestamp_from_last_interaction']
                    impr_features[impr]['step_from_last_interaction'] = \
                        click_step - impr_features[impr]['step_from_last_interaction']
                    impr_feature.append(impr_features[impr])
            impr_features = {}
            count += 1
            if count < len(idxs_click):
                last_click = idxs_click[count]
            continue
        # accumulate the interactions with each referenced item
        ref = int(temp.at[i, 'reference'])
        if ref in impr_features:
            impr_features[ref]['num_interactions_impr'] += 1
            impr_features[ref]['step_from_last_interaction'] = df.at[i, 'step']
            impr_features[ref]['timestamp_from_last_interaction'] = df.at[i, 'timestamp']
            impr_features[ref]['last_action_type_with_impr'] = df.at[i, 'action_type']
        else:
            impr_features[ref] = {'num_interactions_impr': 1,
                                  'step_from_last_interaction': df.at[i, 'step'],
                                  'timestamp_from_last_interaction': df.at[i, 'timestamp'],
                                  'last_action_type_with_impr': df.at[i, 'action_type']}

    final_df = expand_impressions(temp[['user_id', 'session_id', 'impressions']].loc[idxs_click])
    print(len(final_df))
    print(len(impr_feature))
    final_df['dict'] = impr_feature
    features_df = pd.DataFrame(
        final_df.progress_apply(lambda x: tuple(x['dict'].values()), axis=1).tolist(),
        columns=list(final_df.iloc[0].dict.keys()))
    final_df_ = pd.concat([final_df, features_df], axis=1).drop('dict', axis=1)
    final_df_ = final_df_.drop(['num_interactions_impr', 'last_action_type_with_impr'], axis=1)
    return final_df_
def extract_feature(self):
    list_of_sorting_filters_wout_pop = [
        'Sort by Price', 'Sort by Distance', 'Sort by Rating',
        'Best Value', 'Focus on Rating', 'Focus on Distance'
    ]
    list_of_sorting_filters = [
        'Sort by Price', 'Sort by Distance', 'Sort by Rating',
        'Best Value', 'Focus on Rating', 'Focus on Distance',
        'Sort by Popularity'
    ]

    def mask_sorting(x):
        # default to 'Sort by Popularity' when no explicit sorting filter is active
        if np.isin(x, list_of_sorting_filters_wout_pop).any():
            return x
        else:
            return ['Sort by Popularity']

    start = time.time()
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])

    indices_last_clks = find(df)
    d = df[df.action_type == 'clickout item'].drop(indices_last_clks)
    d_splitted = d.current_filters.progress_apply(lambda x: str(x).split('|'))
    md = d_splitted.progress_apply(mask_sorting)

    df_f = df.loc[md.index]
    df_ref = df_f.reference
    # for every clicked reference, count how often each sorting filter was active
    dict_ref_to_filters = dict(
        zip(df_ref.unique(),
            [dict(zip(list_of_sorting_filters, np.zeros(len(list_of_sorting_filters))))
             for i in range(len(df_ref.unique()))]))
    for index, row in tqdm(df_f.iterrows(), total=df_f.shape[0]):
        for i in md.loc[index]:
            if i in list_of_sorting_filters:
                dict_ref_to_filters[row.reference][i] += 1

    df_feature = pd.DataFrame.from_dict(dict_ref_to_filters, orient='index')
    df_feature = df_feature.astype(int).reset_index().rename(
        index=str, columns={"index": "item_id"})

    # extend the feature with the items that were never clicked
    set_of_not_clicked_items = set(data.accomodations_df().item_id) - set(df_feature.item_id)
    extension = pd.DataFrame(data=sorted([i for i in set_of_not_clicked_items]),
                             columns=['item_id'])
    extd = df_feature.append(extension, ignore_index=True, sort=True)
    f = extd.fillna(0).reset_index().drop(columns=['index'])
    feature = f[np.insert(f.columns[:-1].values, 0, f.columns[-1])].astype(int)

    _time = time.time() - start
    elapsed = time.strftime('%Mm %Ss', time.gmtime(_time))
    print(f"elapsed in: {elapsed}")
    return feature
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])

    # get the clickout rows (WITHOUT the last clickouts)
    last_indices = find(df)
    df_non_last_clk = df.drop(last_indices)
    df_clickout = df_non_last_clk[
        df_non_last_clk['action_type'] == 'clickout item'][['reference', 'city']]
    df_clickout = df_clickout.rename(columns={'reference': 'item_id'})
    df_clickout = df_clickout.dropna()  # remove NaNs, which should not be there anyway
    df_clickout.item_id = df_clickout.item_id.astype(int)

    # open the impressions df
    o = ImpressionFeature(mode=self.mode)
    df_accomodations = o.read_feature(True)
    df_accomodations = df_accomodations.drop(
        ['properties1 Star', 'properties2 Star', 'properties3 Star',
         'properties4 Star', 'properties5 Star'], axis=1)

    # get the properties of all the clicked items
    df_clicks_properties = pd.merge(df_clickout, df_accomodations, how='left', on=['item_id'])
    df_clicks_properties = df_clicks_properties.sort_values(by=['city'])
    df_clicks_properties = df_clicks_properties.drop('item_id', axis=1)

    # sum all the properties per city
    grouped_by_city = df_clicks_properties.groupby('city').sum()

    # build a df with city : array_of_features
    df_city_features = pd.DataFrame(columns=['city', 'properties_array'])
    df_city_features.city = grouped_by_city.index
    df_city_features.properties_array = grouped_by_city.values.tolist()

    # now take the last clickouts
    clickout_rows = df.loc[last_indices,
                           ['user_id', 'session_id', 'city', 'action_type', 'impressions']][
        df.action_type == 'clickout item']
    clk_expanded = expand_impressions(clickout_rows)
    clk_expanded_wt_city_feat = pd.merge(clk_expanded, df_city_features, how='left', on=['city'])

    # build a df with item_id : array_of_features
    array = df_accomodations.drop(['item_id'], axis=1).values
    df_item_features = pd.DataFrame(columns=['item_id', 'features_array'])
    df_item_features['item_id'] = df_accomodations['item_id'].values
    df_item_features['features_array'] = list(array)

    final_feature = pd.merge(clk_expanded_wt_city_feat, df_item_features,
                             how='left', on=['item_id'])
    # cities with no properties get an all-zeros vector (one slot per property)
    for n in tqdm(final_feature[final_feature['properties_array'].isnull()].index.tolist()):
        final_feature.at[n, 'properties_array'] = [0] * 152
    # cast the lists to numpy arrays to use the cosine (it is written for doubles)
    final_feature.properties_array = final_feature.properties_array.progress_apply(
        lambda x: np.asarray(x))

    # compute the similarity column
    new_col = []
    if self.metric == 'cosine':
        shrink = 0  # TRY ME
        for t in tqdm(zip(final_feature.properties_array, final_feature.features_array)):
            new_col.append(cosine_similarity(t[0].astype(np.double),
                                             t[1].astype(np.double), shrink))
    if self.metric == 'euclidean':
        for t in tqdm(zip(final_feature.properties_array, final_feature.features_array)):
            new_col.append(np.linalg.norm(t[0] - t[1]))

    # final feature
    new_feature = final_feature[['user_id', 'session_id', 'item_id']].copy()
    new_feature['city_similarity'] = new_col
    return new_feature
def __init__(self, filepaths, cluster):
    self.filepaths = filepaths
    self.cluster = cluster
    self.target_sessions = list(
        data.test_df("full", "no_cluster")
            .loc[data.target_indices("full", "no_cluster")].session_id.values)
    # TODO: check that the filepaths exist
    self.absolute_path = 'submission/'
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])

    # get ALL the clickouts
    reference_rows = df[(df.reference.str.isnumeric() == True)
                        & (df.action_type == 'clickout item')][
        ['user_id', 'session_id', 'reference', 'impressions']]
    # get the last clickouts
    last_clickout_indices = find(df)
    clickout_rows = df.loc[last_clickout_indices,
                           ['user_id', 'session_id', 'reference', 'impressions']]
    clk_expanded = expand_impressions(clickout_rows)

    # count how many times each item appeared in the impressions
    impression_lists = reference_rows.impressions.str.split('|').tolist()
    big_list = [x for l in impression_lists for x in l]
    c = dict(Counter(big_list))
    df_times_in_impressions = pd.DataFrame.from_dict(
        c, orient='index', columns=['number_of_times_in_impr'])
    df_times_in_impressions['item_id'] = df_times_in_impressions.index.astype(int)
    df_times_in_impressions = df_times_in_impressions.reindex(
        columns=['item_id', 'number_of_times_in_impr'])

    feature_times_per_imp = pd.merge(clk_expanded, df_times_in_impressions,
                                     how='left', on=['item_id']).fillna(0)
    feature_times_per_imp.number_of_times_in_impr = \
        feature_times_per_imp.number_of_times_in_impr.astype(int)
    feature_times_per_imp = feature_times_per_imp[
        ['user_id', 'session_id', 'item_id', 'number_of_times_in_impr']]

    # count how many times each item was clicked
    df_item_clicks = (
        reference_rows
        .groupby(["reference"])
        .size()
        .reset_index(name="n_clickouts")
    )
    df_item_clicks = df_item_clicks.rename(columns={'reference': 'item_id'})
    df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)

    merged = pd.merge(df_times_in_impressions, df_item_clicks,
                      how='left', on=['item_id']).fillna(0)
    merged.n_clickouts = merged.n_clickouts.astype(int)
    final_feature = pd.merge(clk_expanded, merged, how='left', on=['item_id']).fillna(0)

    new_col = []
    final_feature.reference = final_feature.reference.astype(int)
    final_feature.item_id = final_feature.item_id.astype(int)
    for t in tqdm(zip(final_feature.reference, final_feature.item_id,
                      final_feature.number_of_times_in_impr, final_feature.n_clickouts)):
        if t[0] == t[1]:
            # same reference: decrement both the click count and the
            # impression count by 1
            if t[2] != 1:
                new_col.append(round(((t[3] - 1) * 100) / (t[2] - 1), 5))
            else:
                new_col.append(0)
        else:
            if 0 not in [t[2], t[3]] and t[2] != 1:
                new_col.append(round((t[3] * 100) / (t[2] - 1), 5))
            else:
                new_col.append(0)
    final_feature['adj_perc_click_appeared'] = new_col
    final_feature = final_feature[['user_id', 'session_id', 'item_id', 'adj_perc_click_appeared']]
    return final_feature
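# Worked example (made-up counts) of the adjusted percentage computed above:
# when the impression is the clicked item itself, one click and one appearance
# are discounted before taking clicks / appearances * 100.
def _adj_perc_demo():
    n_appearances, n_clicks = 5, 3
    adj_same = round((n_clicks - 1) * 100 / (n_appearances - 1), 5)   # clicked item
    adj_other = round(n_clicks * 100 / (n_appearances - 1), 5)        # other impressions
    assert adj_same == 50.0 and adj_other == 75.0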
def extract_feature(self):
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    idxs_click = find_last_clickout_indices(df)
    df = df.loc[idxs_click][['user_id', 'session_id', 'impressions', 'prices']]

    impression_price_position_list = []
    fraction_pos_price_list = []
    for i in tqdm(df.index):
        impr = list(map(int, df.at[i, 'impressions'].split('|')))
        prices = list(map(int, df.at[i, 'prices'].split('|')))
        impression_position = np.arange(len(impr)) + 1
        # sort the impressions by price (cheapest first)
        couples = zip(prices, impression_position, impr)
        couples = sorted(couples, key=lambda a: a[0])
        prices_ordered, position, impressions_ordered = zip(*couples)
        # price_pos[i] = price rank of the impression shown at position i+1
        _, price_pos = list(zip(*sorted(list(zip(position, impression_position)),
                                        key=lambda a: a[0])))
        fraction_pos_price = list(impression_position / price_pos)
        fraction_pos_price_list.append(np.array(fraction_pos_price))
        impression_price_position_list.append(np.array(price_pos))

    df['impression_pos_price'] = impression_price_position_list
    df['impressions'] = df['impressions'].str.split('|')
    df['prices'] = df['prices'].str.split('|')

    # expand to one row per impression
    final_df = pd.DataFrame({
        col: np.repeat(df[col], df['impressions'].str.len())
        for col in df.columns.drop(['impressions', 'prices'])
    }).assign(**{
        'item_id': np.concatenate(df['impressions'].values),
        'price': np.concatenate(df['prices'].values),
        'impression_pos_price': np.concatenate(df['impression_pos_price'].values)
    })
    final_df['item_id'] = pd.to_numeric(final_df['item_id'])
    final_df['impression_pos_price'] = pd.to_numeric(final_df['impression_pos_price'])
    final_df['price'] = pd.to_numeric(final_df['price'])
    return final_df
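# A small sketch (hypothetical prices) of the price-rank computation above:
# price_pos[i] is the rank of the impression at display position i when the
# impressions are sorted by ascending price.
def _price_rank_demo():
    import numpy as np
    prices = [90, 50, 70]
    positions = np.arange(len(prices)) + 1            # 1-based display positions
    by_price = sorted(zip(prices, positions))         # cheapest first
    price_rank = {pos: rank + 1 for rank, (_, pos) in enumerate(by_price)}
    assert [price_rank[p] for p in positions] == [3, 1, 2]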
def extract_feature(self):
    def get_pos(item, rec):
        # 1-based position of each item inside its impression list, -1 if absent
        res = np.empty(item.shape)
        for i in tqdm(range(len(item))):
            if str(item[i]) in rec[i]:
                res[i] = rec[i].index(str(item[i])) + 1
            else:
                res[i] = -1
        return res.astype(int)

    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    last_clickout_indices = find(df)

    # all the clickout rows with a numeric reference
    all_clk_rows = df[(df.reference.str.isnumeric() == True)
                      & (df.action_type == 'clickout item')]
    all_clk_rows = all_clk_rows[['user_id', 'session_id', 'reference', 'impressions']]
    all_clk_rows.impressions = all_clk_rows.impressions.str.split('|')
    pos_col = get_pos(all_clk_rows.reference.values, all_clk_rows.impressions.values)
    all_clk_rows = all_clk_rows.drop('impressions', axis=1)
    all_clk_rows['position'] = pos_col

    # count the clicks per item that happened after position 1
    all_clk_rows_after_1 = all_clk_rows[all_clk_rows.position > 1]
    df_clicks_after_1 = (
        all_clk_rows_after_1
        .groupby(["reference"])
        .size()
        .reset_index(name="n_clicks_per_item")
    )
    df_clicks_after_1.reference = df_clicks_after_1.reference.astype(int)
    df_clicks_after_1 = df_clicks_after_1.rename(columns={'reference': 'item_id'})

    last_clk_rows = df.loc[last_clickout_indices,
                           ['user_id', 'session_id', 'reference', 'impressions']]
    last_clk_rows['imp_list'] = last_clk_rows.impressions.str.split('|')
    clk_expanded = expand_impressions(last_clk_rows)
    clk_expanded = clk_expanded.drop('index', axis=1)
    pos_col = get_pos(clk_expanded.item_id.values, clk_expanded.imp_list.values)
    clk_expanded['position'] = pos_col
    clk_expanded = clk_expanded.drop('imp_list', axis=1)

    merged = pd.merge(clk_expanded, df_clicks_after_1, how='left', on='item_id').fillna(0)
    # exclude the session's own click from the count
    new_col = []
    merged.item_id = merged.item_id.astype(int)
    merged.reference = merged.reference.astype(int)
    for t in tqdm(zip(merged.item_id, merged.reference, merged.position,
                      merged.n_clicks_per_item)):
        if t[0] == t[1] and t[2] > 1:
            new_col.append(int(t[3] - 1))
        else:
            new_col.append(int(t[3]))
    merged['n_clicks_after_first_pos'] = new_col
    feature = merged[['user_id', 'session_id', 'item_id', 'n_clicks_after_first_pos']]
    return feature