def split_only_sequential(self, URM, URM_df):
    """Build a train/test split for the first 5000 (sequential) target playlists.

    For each sequential playlist, the last 20% of its tracks in listening
    order (per ``Helper.get_sorted_tracks_in_playlist``) are withheld as test
    items; the remaining interactions form ``self.URM_train``.

    Side effects: sets ``self.target_playlists``, ``self.URM_train`` (CSR,
    float64) and ``self.dict_test`` (playlist_id -> list of withheld tracks).
    """
    helper = Helper()
    # The first 5000 target playlists are the "sequential" ones.
    sequential_playlists = helper.get_target_playlists_list()[:5000]
    self.target_playlists = sequential_playlists
    # playlist_id -> list of its track_ids, taken from the URM dataframe.
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    relevant_items = defaultdict(list)
    for playlist_id in sequential_playlists:
        tracks = np.array(grouped[playlist_id])
        to_be_removed = int(len(tracks) * 0.2)
        # Guard: with to_be_removed == 0 the slice [-0:] would return the
        # WHOLE list, emptying short playlists into the test set.
        if to_be_removed > 0:
            # Last `to_be_removed` tracks in sequential (listening) order.
            to_be_removed_tracks = helper.get_sorted_tracks_in_playlist(
                playlist_id)[-to_be_removed:]
            for track in to_be_removed_tracks:
                relevant_items[playlist_id].append(track)
                # NOTE: removes every occurrence of `track` in the playlist.
                tracks = np.delete(tracks, np.where(tracks == track))
        grouped[playlist_id] = tracks
    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr().astype(np.float64)
    self.dict_test = relevant_items
def split_randomic_exactly_last(self, URM, URM_df):
    """Build a train/test split for target playlists from index 5000 onward.

    For each such playlist, 20% of its tracks are removed uniformly at
    random and kept as test items; the remainder forms ``self.URM_train``.

    Side effects: sets ``self.target_playlists``, ``self.URM_train`` (CSR,
    float64) and ``self.dict_test`` (playlist_id -> list of withheld tracks).
    """
    helper = Helper()
    # Targets 5000: are the non-sequential ("randomic") ones.
    self.target_playlists = helper.get_target_playlists_list()[5000:]
    selected_playlists = self.target_playlists
    # playlist_id -> list of its track_ids, taken from the URM dataframe.
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    relevant_items = defaultdict(list)
    for playlist_id in selected_playlists:
        tracks = np.array(grouped[playlist_id])
        to_be_removed = int(len(tracks) * 0.2)
        for _ in range(to_be_removed):
            # Pick one remaining track at random and move it to the test set.
            index = randint(0, len(tracks) - 1)
            relevant_items[playlist_id].append(tracks[index])
            tracks = np.delete(tracks, index)
        grouped[playlist_id] = tracks
    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr().astype(np.float64)
    self.dict_test = relevant_items
def split_cluster_randomic_only_last(self, URM, URM_df):
    """Build a random split whose playlist-length distribution mirrors the
    last 5000 target playlists.

    Playlists are rejection-sampled from the whole URM (excluding the first
    5000 sequential targets) until every length bucket of the reference
    distribution is filled; then 20% of each selected playlist's tracks are
    withheld at random as test items.

    Side effects: sets ``self.target_playlists``, ``self.URM_train`` (CSR,
    float64) and ``self.dict_test`` (playlist_id -> list of withheld tracks).
    """
    segment = 1  # bucket width of the length distribution
    selected_playlists = np.array([])
    available_playlists = np.arange(URM.shape[0])
    target_analyzer = TargetAnalyzer()
    # Reference length distribution of the last 5000 target playlists.
    dist = target_analyzer.get_distribution_array_only_last(segment)
    helper = Helper()
    # Sequential targets must never be selected by this splitter.
    target_playlists = helper.get_target_playlists_list()[:5000]
    print("Clustering with segment = " + str(segment))
    for key in tqdm(range(len(dist))):
        # `> 0` (not `!= 0`): a zero/negative bucket must not loop forever.
        while dist[key] > 0:
            random_index = randint(0, len(available_playlists) - 1)
            playlist_id = available_playlists[random_index]
            # Length bucket of the playlist *after* the 80/20 split.
            target_segment = int(0.8 * len(URM[playlist_id].data))
            if target_segment == key and playlist_id not in target_playlists:
                available_playlists = np.delete(
                    available_playlists,
                    np.where(available_playlists == playlist_id))
                selected_playlists = np.append(selected_playlists, playlist_id)
                dist[key] -= 1
    self.target_playlists = selected_playlists.astype(int)
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    relevant_items = defaultdict(list)
    for playlist_id in selected_playlists:
        tracks = np.array(grouped[playlist_id])
        to_be_removed = int(len(tracks) * 0.2)
        for _ in range(to_be_removed):
            # Pick one remaining track at random and move it to the test set.
            index = randint(0, len(tracks) - 1)
            relevant_items[playlist_id].append(tracks[index])
            tracks = np.delete(tracks, index)
        grouped[playlist_id] = tracks
    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr().astype(np.float64)
    self.dict_test = relevant_items
def split_randomic_all_playlists_longer_10000(self, URM, URM_df,
                                              threshold_length=10,
                                              n_playlists=10000):
    """Build a random split over non-target playlists longer than a threshold.

    Scans playlist ids in order and selects the first ``n_playlists`` that
    (a) are not Kaggle target playlists and (b) contain more than
    ``threshold_length`` tracks. 20% of each selected playlist's tracks are
    withheld at random as test items.

    Args:
        URM: sparse interaction matrix (rows are playlists).
        URM_df: dataframe with `playlist_id` / `track_id` columns.
        threshold_length: minimum playlist length (exclusive) for selection.
        n_playlists: how many playlists to select (default 10000, matching
            the original hard-coded behaviour).

    Side effects: sets ``self.target_playlists``, ``self.URM_train`` (CSR,
    float64) and ``self.dict_test`` (playlist_id -> list of withheld tracks).
    """
    selected_playlists = np.array([])
    available_playlists = np.arange(URM.shape[0])
    helper = Helper()
    target_playlists_kaggle = helper.get_target_playlists_list()
    for playlist_id in available_playlists:
        if len(selected_playlists) == n_playlists:
            break
        if playlist_id not in target_playlists_kaggle and len(
                URM[playlist_id].indices) > threshold_length:
            selected_playlists = np.append(selected_playlists, playlist_id)
    self.target_playlists = selected_playlists.astype(int)
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    relevant_items = defaultdict(list)
    for playlist_id in selected_playlists:
        tracks = np.array(grouped[playlist_id])
        to_be_removed = int(len(tracks) * 0.2)
        for _ in range(to_be_removed):
            # Pick one remaining track at random and move it to the test set.
            index = randint(0, len(tracks) - 1)
            relevant_items[playlist_id].append(tracks[index])
            tracks = np.delete(tracks, index)
        grouped[playlist_id] = tracks
    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr().astype(np.float64)
    self.dict_test = relevant_items
def split_cluster_randomic(self, URM, URM_df):
    """Build a random split whose playlist-length distribution matches the
    full target distribution.

    Playlists are rejection-sampled until every length bucket of the
    reference distribution is filled, skipping any playlist whose 80% train
    portion would have <= ``min_playlist_len_after_split`` tracks. 20% of
    each selected playlist's tracks are then withheld at random as test
    items, and both a train and a test matrix are materialised.

    Side effects: sets ``self.target_playlists``, ``self.URM_train``,
    ``self.URM_test`` (both CSR, float64) and ``self.dict_test``.
    """
    selected_playlists = np.array([])
    available_playlists = np.arange(URM.shape[0])
    target_analyzer = TargetAnalyzer()
    segment_size = 1  # bucket width of the length distribution
    min_playlist_len_after_split = 5
    # Reference distribution; per the original note, its total gives
    # n_target = 10000.
    dist = target_analyzer.get_distribution_array(
        segment_size=segment_size)
    n_target = np.sum(dist)
    while n_target > 0:
        # Rejection sampling: draw playlists until one is long enough after
        # the 80/20 split and falls in a bucket that still needs members.
        random_index = randint(0, len(available_playlists) - 1)
        playlist_id = available_playlists[random_index]
        target_len = len(URM[playlist_id].data) * 0.8
        if target_len > min_playlist_len_after_split:
            target_segment = int(target_len / segment_size)
            while dist[target_segment] <= 0:
                # Bucket already full: resample (target_segment only
                # updates when the new draw passes the length check,
                # mirroring the original control flow).
                random_index = randint(0, len(available_playlists) - 1)
                playlist_id = available_playlists[random_index]
                target_len = len(URM[playlist_id].data) * 0.8
                if target_len > min_playlist_len_after_split:
                    target_segment = int(target_len / segment_size)
            n_target -= 1
            dist[target_segment] -= 1
            selected_playlists = np.append(selected_playlists, playlist_id)
            available_playlists = np.delete(
                available_playlists,
                np.where(available_playlists == playlist_id))
    self.target_playlists = selected_playlists.astype(int)
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    grouped_test = grouped.copy()
    relevant_items = defaultdict(list)
    for playlist_id in selected_playlists:
        tracks = np.array(grouped[playlist_id])
        to_be_removed = int(len(tracks) * 0.2)
        for _ in range(to_be_removed):
            # Pick one remaining track at random and move it to the test set.
            index = randint(0, len(tracks) - 1)
            relevant_items[playlist_id].append(tracks[index])
            tracks = np.delete(tracks, index)
        grouped[playlist_id] = tracks
        grouped_test[playlist_id] = relevant_items[playlist_id]
    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr()
    self.dict_test = relevant_items
    # NOTE(review): non-selected playlists keep their FULL track list in
    # grouped_test (only selected ones are overwritten above) — confirm this
    # is the intended shape of URM_test.
    self.URM_test = MultiLabelBinarizer(
        classes=all_tracks, sparse_output=True).fit_transform(grouped_test)
    self.URM_test = self.URM_test.tocsr().astype(np.float64)
    self.URM_train = self.URM_train.astype(np.float64)
def split_sequential(self, URM, URM_df):
    """Build a mixed train/test split.

    The 5000 sequential target playlists are always selected and lose their
    last 20% of tracks in listening order; additional playlists are sampled
    so the selection matches the length distribution of the last 5000
    targets, and lose a random 20% instead.

    Side effects: sets ``self.target_playlists``, ``self.URM_train`` (CSR,
    float64) and ``self.dict_test`` (playlist_id -> list of withheld tracks).
    """
    segment = 1  # bucket width of the length distribution
    selected_playlists = np.array([])
    available_playlists = np.arange(URM.shape[0])
    target_analyzer = TargetAnalyzer()
    # Reference length distribution of the last 5000 target playlists.
    dist = target_analyzer.get_distribution_array_only_last(segment)
    helper = Helper()
    target_playlists = helper.get_target_playlists_list()[:5000]
    # Sequential targets are always part of the selection.
    for playlist_id in target_playlists:
        playlist_id = int(playlist_id)
        available_playlists = np.delete(
            available_playlists,
            np.where(available_playlists == playlist_id))
        selected_playlists = np.append(selected_playlists, playlist_id)
    print("Clustering with segment = " + str(segment))
    for key in tqdm(range(len(dist))):
        # `> 0` (not `!= 0`): a zero/negative bucket must not loop forever.
        while dist[key] > 0:
            random_index = randint(0, len(available_playlists) - 1)
            playlist_id = available_playlists[random_index]
            # Length bucket of the playlist *after* the 80/20 split.
            target_segment = int(0.8 * len(URM[playlist_id].data))
            if target_segment == key:
                available_playlists = np.delete(
                    available_playlists,
                    np.where(available_playlists == playlist_id))
                selected_playlists = np.append(selected_playlists, playlist_id)
                dist[key] -= 1
    self.target_playlists = selected_playlists.astype(int)
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    relevant_items = defaultdict(list)
    for playlist_id in selected_playlists:
        tracks = np.array(grouped[playlist_id])
        to_be_removed = int(len(tracks) * 0.2)
        if playlist_id in target_playlists:
            # Sequential playlist: withhold the last tracks in listening
            # order. Guard: with to_be_removed == 0 the slice [-0:] would
            # return the WHOLE list, emptying short playlists into the test
            # set.
            if to_be_removed > 0:
                to_be_removed_tracks = helper.get_sorted_tracks_in_playlist(
                    playlist_id)[-to_be_removed:]
                for track in to_be_removed_tracks:
                    relevant_items[playlist_id].append(track)
                    # NOTE: removes every occurrence of `track`.
                    tracks = np.delete(tracks, np.where(tracks == track))
        else:
            # Non-sequential playlist: withhold a random 20%.
            for _ in range(to_be_removed):
                index = randint(0, len(tracks) - 1)
                relevant_items[playlist_id].append(tracks[index])
                tracks = np.delete(tracks, index)
        grouped[playlist_id] = tracks
    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr().astype(np.float64)
    self.dict_test = relevant_items