def split_only_sequential(self, URM, URM_df):
    """Split the sequential target playlists into train and test sets.

    The first 5000 entries of ``Helper().get_target_playlists_list()`` are
    used as target playlists. For each of them, the last 20% (rounded down)
    of the tracks returned by ``helper.get_sorted_tracks_in_playlist`` are
    held out as test items and removed from the training interactions.
    Playlists whose 20% share rounds down to zero keep all their tracks and
    get no test items.

    Args:
        URM: Not used by this method; the signature matches
            ``split_sequential``.
        URM_df: DataFrame with ``playlist_id`` and ``track_id`` columns.

    Side effects:
        Sets ``self.target_playlists`` (the selected target list),
        ``self.URM_train`` (CSR matrix, float64) and ``self.dict_test``
        (``defaultdict(list)`` mapping playlist id to held-out tracks).
        Reads ``self.tracks_df["track_id"]`` to build the class list passed
        to the binarizer.
    """
    helper = Helper()
    sequential_playlists = helper.get_target_playlists_list()[:5000]
    self.target_playlists = sequential_playlists

    # One list of track ids per playlist, indexed by playlist_id.
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

    relevant_items = defaultdict(list)

    for playlist_id in sequential_playlists:
        # Tracks of this playlist as taken from the URM dataframe.
        tracks = np.array(grouped[playlist_id])
        to_be_removed = int(len(tracks) * 0.2)

        # Guard against a zero holdout: ``seq[-0:]`` is the WHOLE sequence,
        # which would move every track of a short playlist to the test set.
        if to_be_removed > 0:
            # Last `to_be_removed` tracks in sequential order; they are
            # recorded as test items and dropped from the training tracks.
            to_be_removed_tracks = helper.get_sorted_tracks_in_playlist(
                playlist_id)[-to_be_removed:]
            for track in to_be_removed_tracks:
                relevant_items[playlist_id].append(track)
                tracks = np.delete(tracks, np.where(tracks == track))

        grouped[playlist_id] = tracks

    all_tracks = self.tracks_df["track_id"].unique()

    matrix = MultiLabelBinarizer(
        classes=all_tracks, sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr()
    self.URM_train = self.URM_train.astype(np.float64)
    self.dict_test = relevant_items
def split_sequential(self, URM, URM_df):
    """Split the URM into train and test sets, mixing two holdout schemes.

    The selected playlists are:

    * the first 5000 entries of ``Helper().get_target_playlists_list()``
      (the sequential targets), whose held-out tracks are the last 20%
      (rounded down) of ``helper.get_sorted_tracks_in_playlist``;
    * additional playlists drawn at random from the remaining rows of
      ``URM`` so that, for every ``key``, ``dist[key]`` playlists with
      ``int(0.8 * row_size) == key`` are picked, where ``dist`` comes from
      ``TargetAnalyzer().get_distribution_array_only_last(segment)``.
      For these, 20% (rounded down) of the tracks are held out at random.

    If fewer matching playlists are available than ``dist[key]`` asks for,
    all matching ones are taken and a warning is printed.

    Args:
        URM: Sparse user-rating matrix with one row per playlist; only its
            shape and per-row stored-entry counts are used.
        URM_df: DataFrame with ``playlist_id`` and ``track_id`` columns.

    Side effects:
        Sets ``self.target_playlists`` (int array of selected playlist ids),
        ``self.URM_train`` (CSR matrix, float64) and ``self.dict_test``
        (``defaultdict(list)`` mapping playlist id to held-out tracks).
        Reads ``self.tracks_df["track_id"]`` to build the class list passed
        to the binarizer.
    """
    segment = 1

    target_analyzer = TargetAnalyzer()
    # Distribution computed only over the last 5000 playlists.
    dist = target_analyzer.get_distribution_array_only_last(segment)

    helper = Helper()
    target_playlists = helper.get_target_playlists_list()[:5000]
    target_ids = [int(playlist_id) for playlist_id in target_playlists]
    # Set for O(1) membership tests in the holdout loop below.
    target_id_set = set(target_ids)

    # The sequential targets are always selected and are never candidates
    # for the random draw.
    available_playlists = np.arange(URM.shape[0])
    available_playlists = available_playlists[
        ~np.isin(available_playlists, target_ids)]
    selected_playlists = list(target_ids)

    # Stored entries per row, computed once instead of slicing the sparse
    # matrix on every random draw; equals len(URM[row].data) for each row.
    row_sizes = np.asarray(URM.getnnz(axis=1)).ravel()
    segments = (0.8 * row_sizes).astype(int)

    print("Clustering with segment = " + str(segment))
    for key in tqdm(range(len(dist))):
        wanted = dist[key]
        matching = int(np.count_nonzero(segments[available_playlists] == key))
        if wanted > matching:
            # Asking for more playlists than exist would make the rejection
            # sampling below spin forever.
            print("Warning: segment {} needs {} playlists but only {} are "
                  "available; taking all of them".format(key, wanted, matching))
            wanted = matching

        # Rejection sampling: draw uniformly from the available playlists
        # and keep the draw only when it falls in the current segment.
        # NOTE(review): assumes `randint` has an inclusive upper bound
        # (random.randint) -- confirm against the file's imports.
        while wanted > 0:
            random_index = randint(0, len(available_playlists) - 1)
            playlist_id = available_playlists[random_index]
            if segments[playlist_id] == key:
                # Ids in available_playlists are unique, so deleting by
                # position removes exactly this playlist.
                available_playlists = np.delete(
                    available_playlists, random_index)
                selected_playlists.append(playlist_id)
                wanted -= 1

    self.target_playlists = np.array(selected_playlists, dtype=int)

    # One list of track ids per playlist, indexed by playlist_id.
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

    relevant_items = defaultdict(list)

    # Iterate integer ids so the helper calls and the dict_test keys do not
    # receive floats.
    for playlist_id in self.target_playlists.tolist():
        # Tracks of this playlist as taken from the URM dataframe.
        tracks = np.array(grouped[playlist_id])
        to_be_removed = int(len(tracks) * 0.2)

        if playlist_id in target_id_set:
            # Guard against a zero holdout: ``seq[-0:]`` is the WHOLE
            # sequence, which would move every track to the test set.
            if to_be_removed > 0:
                # Last `to_be_removed` tracks in sequential order; they are
                # recorded as test items and dropped from the training tracks.
                to_be_removed_tracks = helper.get_sorted_tracks_in_playlist(
                    playlist_id)[-to_be_removed:]
                for track in to_be_removed_tracks:
                    relevant_items[playlist_id].append(track)
                    tracks = np.delete(tracks, np.where(tracks == track))
        else:
            # Non-sequential playlist: hold out tracks chosen at random.
            for _ in range(to_be_removed):
                index = randint(0, len(tracks) - 1)
                removed_track = tracks[index]
                relevant_items[playlist_id].append(removed_track)
                tracks = np.delete(tracks, index)

        grouped[playlist_id] = tracks

    all_tracks = self.tracks_df["track_id"].unique()

    matrix = MultiLabelBinarizer(
        classes=all_tracks, sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr()
    self.URM_train = self.URM_train.astype(np.float64)
    self.dict_test = relevant_items