def split_only_sequential(self, URM, URM_df):
        """Hold-out split for the first 5000 (sequential) target playlists.

        For each sequential target playlist, the last 20% of its tracks
        (in playlist order) is moved to the test dictionary; the remaining
        tracks form the training URM.

        Args:
            URM: user-rating matrix (unused here; kept for interface parity
                with the other split_* methods).
            URM_df: dataframe with 'playlist_id' and 'track_id' columns.

        Side effects:
            Sets self.target_playlists, self.URM_train (csr, float64) and
            self.dict_test (playlist_id -> list of held-out track_ids).
        """
        helper = Helper()

        sequential_playlists = helper.get_target_playlists_list()[:5000]
        self.target_playlists = sequential_playlists

        # playlist_id -> list of its track_ids, taken from the URM dataframe.
        grouped = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

        relevant_items = defaultdict(list)
        for playlist_id in sequential_playlists:
            # Tracks of this playlist as stored in the URM dataframe.
            tracks = np.array(grouped[playlist_id])
            to_be_removed = int(len(tracks) * 0.2)

            # The last `to_be_removed` tracks in sequential order become the
            # test items and are dropped from the training playlist.
            to_be_removed_tracks = helper.get_sorted_tracks_in_playlist(
                playlist_id)[-to_be_removed:]
            for track in to_be_removed_tracks:
                relevant_items[playlist_id].append(track)
                # NOTE(review): np.delete removes *every* occurrence of the
                # track -- assumes playlists contain no duplicate tracks.
                tracks = np.delete(tracks, np.where(tracks == track))
            grouped[playlist_id] = tracks

        all_tracks = self.tracks_df["track_id"].unique()
        matrix = MultiLabelBinarizer(classes=all_tracks,
                                     sparse_output=True).fit_transform(grouped)
        self.URM_train = matrix.tocsr().astype(np.float64)
        self.dict_test = relevant_items
    def split_randomic_exactly_last(self, URM, URM_df):
        # splitting URM in test set e train set
        selected_playlists = np.array([])

        helper = Helper()

        self.target_playlists = helper.get_target_playlists_list()[5000:]
        selected_playlists = self.target_playlists
        grouped = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

        relevant_items = defaultdict(list)
        count = 0
        for playlist_id in selected_playlists:
            tracks = np.array(grouped[playlist_id])
            to_be_removed = int(len(tracks) * 0.2)
            for i in range(to_be_removed):
                index = randint(0, len(tracks) - 1)
                removed_track = tracks[index]
                relevant_items[playlist_id].append(removed_track)
                tracks = np.delete(tracks, index)
            grouped[playlist_id] = tracks
            count += 1

        all_tracks = self.tracks_df["track_id"].unique()

        matrix = MultiLabelBinarizer(classes=all_tracks,
                                     sparse_output=True).fit_transform(grouped)
        self.URM_train = matrix.tocsr()
        self.URM_train = self.URM_train.astype(np.float64)
        self.dict_test = relevant_items
    def split_cluster_randomic_only_last(self, URM, URM_df):
        # splitting URM in test set e train set
        segment = 1
        # splitting URM in test set e train set
        selected_playlists = np.array([])
        available_playlists = np.arange(URM.shape[0])
        target_analyzer = TargetAnalyzer()

        # Gets distribution of only last 5000 playlists
        dist = target_analyzer.get_distribution_array_only_last(segment)

        helper = Helper()
        target_playlists = helper.get_target_playlists_list(
        )[:5000]  # WILL REMOVE THEM

        print("Clustering with segment = " + str(segment))
        for key in tqdm(range(len(dist))):
            while dist[key] != 0:
                random_index = randint(0, len(available_playlists) - 1)
                playlist_id = available_playlists[random_index]
                target_segment = int(0.8 * len(URM[playlist_id].data))
                if target_segment == key and playlist_id not in target_playlists:
                    available_playlists = np.delete(
                        available_playlists,
                        np.where(available_playlists == playlist_id))
                    selected_playlists = np.append(selected_playlists,
                                                   playlist_id)
                    dist[key] -= 1

        self.target_playlists = selected_playlists.astype(int)
        grouped = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

        relevant_items = defaultdict(list)

        for playlist_id in selected_playlists:
            # Tracks = lista delle tracks prese dalla URM
            tracks = np.array(grouped[playlist_id])

            to_be_removed = int(len(tracks) * 0.2)
            for i in range(to_be_removed):
                index = randint(0, len(tracks) - 1)
                removed_track = tracks[index]
                relevant_items[playlist_id].append(removed_track)
                tracks = np.delete(tracks, index)
            grouped[playlist_id] = tracks

        all_tracks = self.tracks_df["track_id"].unique()
        matrix = MultiLabelBinarizer(classes=all_tracks,
                                     sparse_output=True).fit_transform(grouped)
        self.URM_train = matrix.tocsr()
        self.URM_train = self.URM_train.astype(np.float64)
        self.dict_test = relevant_items
    def split_randomic_all_playlists_longer_10000(self,
                                                  URM,
                                                  URM_df,
                                                  threshold_length=10):
        # splitting URM in test set e train set
        selected_playlists = np.array([])
        available_playlists = np.arange(URM.shape[0])

        helper = Helper()
        target_playlists_kaggle = helper.get_target_playlists_list()
        for playlist_id in available_playlists:
            if len(selected_playlists) == 10000:
                break
            if playlist_id not in target_playlists_kaggle and len(
                    URM[playlist_id].indices) > threshold_length:
                selected_playlists = np.append(selected_playlists, playlist_id)

        self.target_playlists = selected_playlists.astype(int)

        grouped = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

        relevant_items = defaultdict(list)

        for playlist_id in selected_playlists:
            tracks = np.array(grouped[playlist_id])

            to_be_removed = int(len(tracks) * 0.2)
            for i in range(to_be_removed):
                index = randint(0, len(tracks) - 1)
                removed_track = tracks[index]
                relevant_items[playlist_id].append(removed_track)
                tracks = np.delete(tracks, index)
            grouped[playlist_id] = tracks

        all_tracks = self.tracks_df["track_id"].unique()

        matrix = MultiLabelBinarizer(classes=all_tracks,
                                     sparse_output=True).fit_transform(grouped)
        self.URM_train = matrix.tocsr()
        self.URM_train = self.URM_train.astype(np.float64)
        self.dict_test = relevant_items
    def split_cluster_randomic(self, URM, URM_df):
        # splitting URM in test set e train set
        selected_playlists = np.array([])
        available_playlists = np.arange(URM.shape[0])
        target_analyzer = TargetAnalyzer()
        segment_size = 1
        min_playlist_len_after_split = 5
        dist = target_analyzer.get_distribution_array(
            segment_size=segment_size)
        # in this way n_target = 10000
        helper = Helper()
        target_playlists = helper.get_target_playlists_list()[:5000]
        n_target = np.sum(dist)  # - len(target_playlists)

        while n_target > 0:
            random_index = randint(0, len(available_playlists) - 1)
            playlist_id = available_playlists[random_index]
            target_len = len(URM[playlist_id].data) * 0.8
            if target_len > min_playlist_len_after_split:
                target_segment = int(target_len / segment_size)
                while dist[target_segment] <= 0:
                    random_index = randint(0, len(available_playlists) - 1)
                    playlist_id = available_playlists[random_index]
                    target_len = len(URM[playlist_id].data) * 0.8
                    if target_len > min_playlist_len_after_split:
                        target_segment = int(target_len / segment_size)
                n_target -= 1
                dist[target_segment] -= 1
                selected_playlists = np.append(selected_playlists, playlist_id)
                available_playlists = np.delete(
                    available_playlists,
                    np.where(available_playlists == playlist_id))

        self.target_playlists = selected_playlists.astype(int)
        grouped = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

        grouped_test = grouped.copy()

        relevant_items = defaultdict(list)
        count = 0
        for playlist_id in selected_playlists:
            tracks = np.array(grouped[playlist_id])
            # if playlist_id in target_playlists:
            # to_be_removed = int(len(tracks) * 0.2)
            # to_be_removed_tracks = helper.get_sorted_tracks_in_playlist(playlist_id)[-to_be_removed:]
            # for track in to_be_removed_tracks:
            #     relevant_items[playlist_id].append(track)
            #     tracks = np.delete(tracks, np.where(tracks == track))
            # for i in range(to_be_removed):
            #    removed_track = tracks[-1]
            #    relevant_items[playlist_id].append(removed_track)
            #    tracks = np.delete(tracks, len(tracks) - 1)
            # else:
            to_be_removed = int(len(tracks) * 0.2)
            for i in range(to_be_removed):
                index = randint(0, len(tracks) - 1)
                removed_track = tracks[index]
                relevant_items[playlist_id].append(removed_track)
                tracks = np.delete(tracks, index)
            grouped[playlist_id] = tracks
            grouped_test[playlist_id] = relevant_items[playlist_id]
            count += 1
        all_tracks = self.tracks_df["track_id"].unique()
        matrix = MultiLabelBinarizer(classes=all_tracks,
                                     sparse_output=True).fit_transform(grouped)
        self.URM_train = matrix.tocsr()
        self.dict_test = relevant_items
        # bib URM
        # self.URM_train = helper.get_urm_csr_bib(URM = self.URM_train)
        # plotter = TargetAnalyzer()
        # plotter.plot_standard_distribution()
        # plotter.plot_distribution(self.URM_train, self.target_playlists)
        self.URM_test = MultiLabelBinarizer(
            classes=all_tracks, sparse_output=True).fit_transform(grouped_test)
        self.URM_test = self.URM_test.tocsr()
        self.URM_test = self.URM_test.astype(np.float64)
        self.URM_train = self.URM_train.astype(np.float64)
    def split_sequential(self, URM, URM_df):
        segment = 1
        # splitting URM in test set e train set
        selected_playlists = np.array([])
        available_playlists = np.arange(URM.shape[0])
        target_analyzer = TargetAnalyzer()

        #Gets distribution of only last 5000 playlists
        dist = target_analyzer.get_distribution_array_only_last(segment)

        helper = Helper()
        target_playlists = helper.get_target_playlists_list()[:5000]
        #n_target = np.sum(dist) - len(target_playlists)

        # Removing from the cluster distribution the len of the sequential target
        for playlist_id in target_playlists:
            playlist_id = int(playlist_id)
            available_playlists = np.delete(
                available_playlists,
                np.where(available_playlists == playlist_id))
            selected_playlists = np.append(selected_playlists, playlist_id)
            #target_len = len(URM[playlist_id].data)
            #dist[target_len] -= 1

        print("Clustering with segment = " + str(segment))
        for key in tqdm(range(len(dist))):
            while dist[key] != 0:
                random_index = randint(0, len(available_playlists) - 1)
                playlist_id = available_playlists[random_index]
                target_segment = int(0.8 * len(URM[playlist_id].data))
                if target_segment == key:
                    available_playlists = np.delete(
                        available_playlists,
                        np.where(available_playlists == playlist_id))
                    selected_playlists = np.append(selected_playlists,
                                                   playlist_id)
                    dist[key] -= 1

        self.target_playlists = selected_playlists.astype(int)
        grouped = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

        relevant_items = defaultdict(list)

        for playlist_id in selected_playlists:
            #Tracks = lista delle tracks prese dalla URM
            tracks = np.array(grouped[playlist_id])
            if playlist_id in target_playlists:
                to_be_removed = int(len(tracks) * 0.2)

                #Torna le #to_be_removed tracks ordinate sequenzialmente. e le toglie dalla lista delle tracks
                to_be_removed_tracks = helper.get_sorted_tracks_in_playlist(
                    playlist_id)[-to_be_removed:]
                for track in to_be_removed_tracks:
                    relevant_items[playlist_id].append(track)
                    tracks = np.delete(tracks, np.where(tracks == track))
            else:
                to_be_removed = int(len(tracks) * 0.2)
                for i in range(to_be_removed):
                    index = randint(0, len(tracks) - 1)
                    removed_track = tracks[index]
                    relevant_items[playlist_id].append(removed_track)
                    tracks = np.delete(tracks, index)
            grouped[playlist_id] = tracks

        all_tracks = self.tracks_df["track_id"].unique()
        matrix = MultiLabelBinarizer(classes=all_tracks,
                                     sparse_output=True).fit_transform(grouped)
        self.URM_train = matrix.tocsr()
        self.URM_train = self.URM_train.astype(np.float64)
        self.dict_test = relevant_items