# Example #1 (score: 0)
class Evaluator(object):
    def __init__(self):
        """Evaluation harness: builds train/test splits of the URM and scores
        recommenders with MAP on the held-out tracks.
        """
        # Populated by one of the split_* methods below.
        self.URM_train = None
        self.URM_test = None
        # playlist_id -> list of held-out (relevant) track_ids
        self.dict_test = None
        self.target_playlists = None
        # Fraction of interactions kept for training (20% held out per playlist).
        self.train_test_split = 0.80
        # Recommendation cutoff (@10); not referenced within this file.
        self.at = 10
        self.tracks_df = pd.read_csv("../data/tracks.csv")

        # Initial ensemble weights; presumably one per recommender in a hybrid
        # — TODO confirm against the tuning code that consumes them.
        self.starting_weight_low = np.array([0.22, 0.14, 0.215, 0.45])

    def get_URM_train(self):
        return self.URM_train

    def get_dict_test(self):
        return self.dict_test

    def get_target_playlists(self):
        return self.target_playlists

    def get_URM_test(self):
        return self.URM_test

    #Split totally randomic, no cluster
    def split_randomic(self, URM, URM_df):
        """Randomly select ~20% of all playlists as evaluation targets and
        hold out 20% of each selected playlist's tracks as test items.

        Only playlists with more than 7 tracks are eligible, so every target
        keeps a usable training profile after the holdout.

        :param URM: sparse user-rating matrix (playlists x tracks)
        :param URM_df: dataframe with 'playlist_id' / 'track_id' columns
        Side effects: sets self.target_playlists, self.URM_train, self.dict_test.
        """
        selected_playlists = np.array([])
        available_playlists = np.arange(URM.shape[0])
        # Number of targets mirrors the size of the submission file to post.
        n_target = len(available_playlists) * (1 - self.train_test_split)
        while n_target > 0:
            random_index = randint(0, len(available_playlists) - 1)

            # chosen playlist_id
            playlist_id = available_playlists[random_index]
            tracks_left = len(URM[playlist_id].data)
            # NOTE(review): if fewer than n_target playlists have > 7 tracks,
            # this loop cannot terminate.
            if tracks_left > 7:
                available_playlists = np.delete(
                    available_playlists,
                    np.where(available_playlists == playlist_id))
                selected_playlists = np.append(selected_playlists, playlist_id)
                n_target -= 1

        self.target_playlists = selected_playlists.astype(int)

        # playlist_id -> list of its track_ids
        grouped = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

        relevant_items = defaultdict(list)

        for playlist_id in selected_playlists:
            tracks = np.array(grouped[playlist_id])

            # Hold out 20% of the playlist's tracks as relevant test items.
            to_be_removed = int(len(tracks) * 0.2)
            for i in range(to_be_removed):
                index = randint(0, len(tracks) - 1)
                removed_track = tracks[index]
                relevant_items[playlist_id].append(removed_track)
                tracks = np.delete(tracks, index)
            grouped[playlist_id] = tracks

        all_tracks = self.tracks_df["track_id"].unique()

        # Binarize the remaining tracks into a sparse train URM.
        matrix = MultiLabelBinarizer(classes=all_tracks,
                                     sparse_output=True).fit_transform(grouped)
        self.URM_train = matrix.tocsr().astype(np.float64)
        self.dict_test = relevant_items

    # Split considering all playlists EXCEPT TARGET (because they will be surely afflicted by Kaggle test split)
    def split_randomic_all_playlists(self, URM, URM_df):
        """Use every playlist that is NOT a Kaggle target as an evaluation
        target, holding out 20% of each one's tracks as test items.

        :param URM: sparse user-rating matrix (playlists x tracks)
        :param URM_df: dataframe with 'playlist_id' / 'track_id' columns
        Side effects: sets self.target_playlists, self.URM_train, self.dict_test.
        """
        helper = Helper()
        # Set gives O(1) membership tests; the original list scan made the
        # selection pass quadratic.
        target_playlists_kaggle = set(helper.get_target_playlists_list())

        # Accumulate in a list: np.append copies the whole array on each call.
        selected_playlists = np.array([
            playlist_id for playlist_id in np.arange(URM.shape[0])
            if playlist_id not in target_playlists_kaggle
        ])

        self.target_playlists = selected_playlists.astype(int)

        # playlist_id -> list of its track_ids
        grouped = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

        relevant_items = defaultdict(list)

        for playlist_id in selected_playlists:
            tracks = np.array(grouped[playlist_id])

            # Hold out 20% of the playlist's tracks as relevant test items.
            to_be_removed = int(len(tracks) * 0.2)
            for i in range(to_be_removed):
                index = randint(0, len(tracks) - 1)
                removed_track = tracks[index]
                relevant_items[playlist_id].append(removed_track)
                tracks = np.delete(tracks, index)
            grouped[playlist_id] = tracks

        all_tracks = self.tracks_df["track_id"].unique()

        matrix = MultiLabelBinarizer(classes=all_tracks,
                                     sparse_output=True).fit_transform(grouped)
        self.URM_train = matrix.tocsr().astype(np.float64)
        self.dict_test = relevant_items

    def split_randomic_all_playlists_longer(self,
                                            URM,
                                            URM_df,
                                            threshold_length=10):
        """Target every non-Kaggle-target playlist LONGER than
        threshold_length, holding out 20% of each one's tracks.

        :param URM: sparse user-rating matrix (playlists x tracks)
        :param URM_df: dataframe with 'playlist_id' / 'track_id' columns
        :param threshold_length: minimum (exclusive) playlist length
        Side effects: sets self.target_playlists, self.URM_train, self.dict_test.
        """
        helper = Helper()
        # Set gives O(1) membership tests instead of O(n) list scans.
        target_playlists_kaggle = set(helper.get_target_playlists_list())

        # Accumulate in a list: np.append copies the whole array on each call.
        selected_playlists = np.array([
            playlist_id for playlist_id in np.arange(URM.shape[0])
            if playlist_id not in target_playlists_kaggle
            and len(URM[playlist_id].indices) > threshold_length
        ])

        self.target_playlists = selected_playlists.astype(int)

        # playlist_id -> list of its track_ids
        grouped = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

        relevant_items = defaultdict(list)

        for playlist_id in selected_playlists:
            tracks = np.array(grouped[playlist_id])

            # Hold out 20% of the playlist's tracks as relevant test items.
            to_be_removed = int(len(tracks) * 0.2)
            for i in range(to_be_removed):
                index = randint(0, len(tracks) - 1)
                removed_track = tracks[index]
                relevant_items[playlist_id].append(removed_track)
                tracks = np.delete(tracks, index)
            grouped[playlist_id] = tracks

        all_tracks = self.tracks_df["track_id"].unique()

        matrix = MultiLabelBinarizer(classes=all_tracks,
                                     sparse_output=True).fit_transform(grouped)
        self.URM_train = matrix.tocsr().astype(np.float64)
        self.dict_test = relevant_items

    def split_randomic_all_playlists_longer_10000(self,
                                                  URM,
                                                  URM_df,
                                                  threshold_length=10):
        """Target up to 10000 non-Kaggle-target playlists LONGER than
        threshold_length (taken in ascending id order), holding out 20% of
        each one's tracks.

        :param URM: sparse user-rating matrix (playlists x tracks)
        :param URM_df: dataframe with 'playlist_id' / 'track_id' columns
        :param threshold_length: minimum (exclusive) playlist length
        Side effects: sets self.target_playlists, self.URM_train, self.dict_test.
        """
        helper = Helper()
        # Set gives O(1) membership tests instead of O(n) list scans.
        target_playlists_kaggle = set(helper.get_target_playlists_list())

        # Accumulate in a list: np.append copies the whole array on each call.
        selected = []
        for playlist_id in np.arange(URM.shape[0]):
            # Cap the evaluation set at 10000 playlists.
            if len(selected) == 10000:
                break
            if playlist_id not in target_playlists_kaggle and len(
                    URM[playlist_id].indices) > threshold_length:
                selected.append(playlist_id)
        selected_playlists = np.array(selected)

        self.target_playlists = selected_playlists.astype(int)

        # playlist_id -> list of its track_ids
        grouped = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

        relevant_items = defaultdict(list)

        for playlist_id in selected_playlists:
            tracks = np.array(grouped[playlist_id])

            # Hold out 20% of the playlist's tracks as relevant test items.
            to_be_removed = int(len(tracks) * 0.2)
            for i in range(to_be_removed):
                index = randint(0, len(tracks) - 1)
                removed_track = tracks[index]
                relevant_items[playlist_id].append(removed_track)
                tracks = np.delete(tracks, index)
            grouped[playlist_id] = tracks

        all_tracks = self.tracks_df["track_id"].unique()

        matrix = MultiLabelBinarizer(classes=all_tracks,
                                     sparse_output=True).fit_transform(grouped)
        self.URM_train = matrix.tocsr().astype(np.float64)
        self.dict_test = relevant_items

    def split_randomic_all_playlists_shorter(self,
                                             URM,
                                             URM_df,
                                             threshold_length=10):
        """Target every non-Kaggle-target playlist of length at most
        threshold_length, holding out 20% of each one's tracks.

        NOTE(review): playlists with fewer than 5 tracks get
        int(len * 0.2) == 0 held-out items, so they contribute empty
        relevant-item lists.

        :param URM: sparse user-rating matrix (playlists x tracks)
        :param URM_df: dataframe with 'playlist_id' / 'track_id' columns
        :param threshold_length: maximum (inclusive) playlist length
        Side effects: sets self.target_playlists, self.URM_train, self.dict_test.
        """
        helper = Helper()
        # Set gives O(1) membership tests instead of O(n) list scans.
        target_playlists_kaggle = set(helper.get_target_playlists_list())

        # Accumulate in a list: np.append copies the whole array on each call.
        selected_playlists = np.array([
            playlist_id for playlist_id in np.arange(URM.shape[0])
            if playlist_id not in target_playlists_kaggle
            and len(URM[playlist_id].indices) <= threshold_length
        ])

        self.target_playlists = selected_playlists.astype(int)

        # playlist_id -> list of its track_ids
        grouped = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

        relevant_items = defaultdict(list)

        for playlist_id in selected_playlists:
            tracks = np.array(grouped[playlist_id])

            # Hold out 20% of the playlist's tracks as relevant test items.
            to_be_removed = int(len(tracks) * 0.2)
            for i in range(to_be_removed):
                index = randint(0, len(tracks) - 1)
                removed_track = tracks[index]
                relevant_items[playlist_id].append(removed_track)
                tracks = np.delete(tracks, index)
            grouped[playlist_id] = tracks

        all_tracks = self.tracks_df["track_id"].unique()

        matrix = MultiLabelBinarizer(classes=all_tracks,
                                     sparse_output=True).fit_transform(grouped)
        self.URM_train = matrix.tocsr().astype(np.float64)
        self.dict_test = relevant_items

    def split_randomic_exactly_last(self, URM, URM_df):
        """Use exactly the LAST 5000 Kaggle target playlists as evaluation
        targets, holding out 20% of each one's tracks at random.

        :param URM: sparse user-rating matrix (unused here; kept for a
            signature consistent with the other split_* methods)
        :param URM_df: dataframe with 'playlist_id' / 'track_id' columns
        Side effects: sets self.target_playlists, self.URM_train, self.dict_test.
        """
        helper = Helper()

        # Targets are the last 5000 entries of the Kaggle target list.
        self.target_playlists = helper.get_target_playlists_list()[5000:]
        selected_playlists = self.target_playlists
        grouped = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

        relevant_items = defaultdict(list)
        for playlist_id in selected_playlists:
            tracks = np.array(grouped[playlist_id])
            # Hold out 20% of the playlist's tracks as relevant test items.
            to_be_removed = int(len(tracks) * 0.2)
            for i in range(to_be_removed):
                index = randint(0, len(tracks) - 1)
                removed_track = tracks[index]
                relevant_items[playlist_id].append(removed_track)
                tracks = np.delete(tracks, index)
            grouped[playlist_id] = tracks

        all_tracks = self.tracks_df["track_id"].unique()

        matrix = MultiLabelBinarizer(classes=all_tracks,
                                     sparse_output=True).fit_transform(grouped)
        self.URM_train = matrix.tocsr().astype(np.float64)
        self.dict_test = relevant_items

    #Split getting 5000 sequentially, 5000 randomic with cluster segment = 1
    def split_sequential(self, URM, URM_df):
        """Hybrid split mimicking the Kaggle target: the first 5000 target
        playlists are split sequentially (their LAST 20% of tracks held out in
        playlist order), while additional random playlists are drawn so their
        post-split length distribution matches that of the last 5000 targets
        (cluster segment = 1).

        Side effects: sets self.target_playlists, self.URM_train, self.dict_test.
        """
        segment = 1
        # splitting URM into test set and train set
        selected_playlists = np.array([])
        available_playlists = np.arange(URM.shape[0])
        target_analyzer = TargetAnalyzer()

        #Gets distribution of only last 5000 playlists
        dist = target_analyzer.get_distribution_array_only_last(segment)

        helper = Helper()
        target_playlists = helper.get_target_playlists_list()[:5000]
        #n_target = np.sum(dist) - len(target_playlists)

        # Removing from the cluster distribution the len of the sequential target
        for playlist_id in target_playlists:
            playlist_id = int(playlist_id)
            available_playlists = np.delete(
                available_playlists,
                np.where(available_playlists == playlist_id))
            selected_playlists = np.append(selected_playlists, playlist_id)
            #target_len = len(URM[playlist_id].data)
            #dist[target_len] -= 1

        print("Clustering with segment = " + str(segment))
        # Draw random playlists until every length bucket of the target
        # distribution is filled.
        # NOTE(review): if no remaining playlist matches a still-open bucket,
        # the inner while loop never terminates.
        for key in tqdm(range(len(dist))):
            while dist[key] != 0:
                random_index = randint(0, len(available_playlists) - 1)
                playlist_id = available_playlists[random_index]
                # Bucket = playlist length after the 20% holdout.
                target_segment = int(0.8 * len(URM[playlist_id].data))
                if target_segment == key:
                    available_playlists = np.delete(
                        available_playlists,
                        np.where(available_playlists == playlist_id))
                    selected_playlists = np.append(selected_playlists,
                                                   playlist_id)
                    dist[key] -= 1

        self.target_playlists = selected_playlists.astype(int)
        grouped = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

        relevant_items = defaultdict(list)

        for playlist_id in selected_playlists:
            # tracks = list of the playlist's tracks taken from the URM
            tracks = np.array(grouped[playlist_id])
            if playlist_id in target_playlists:
                to_be_removed = int(len(tracks) * 0.2)

                # Take the last #to_be_removed tracks in sequential order and
                # remove them from the training track list.
                to_be_removed_tracks = helper.get_sorted_tracks_in_playlist(
                    playlist_id)[-to_be_removed:]
                for track in to_be_removed_tracks:
                    relevant_items[playlist_id].append(track)
                    tracks = np.delete(tracks, np.where(tracks == track))
            else:
                # Non-sequential playlist: hold out 20% of tracks at random.
                to_be_removed = int(len(tracks) * 0.2)
                for i in range(to_be_removed):
                    index = randint(0, len(tracks) - 1)
                    removed_track = tracks[index]
                    relevant_items[playlist_id].append(removed_track)
                    tracks = np.delete(tracks, index)
            grouped[playlist_id] = tracks

        all_tracks = self.tracks_df["track_id"].unique()
        matrix = MultiLabelBinarizer(classes=all_tracks,
                                     sparse_output=True).fit_transform(grouped)
        self.URM_train = matrix.tocsr()
        self.URM_train = self.URM_train.astype(np.float64)
        self.dict_test = relevant_items

    def split_only_sequential(self, URM, URM_df):
        """Sequential-only split: for the first 5000 Kaggle target playlists,
        hold out the LAST 20% of each playlist's tracks (in playlist order) as
        the test set.

        :param URM: sparse user-rating matrix (unused here; kept for a
            signature consistent with the other split_* methods)
        :param URM_df: dataframe with 'playlist_id' / 'track_id' columns
        Side effects: sets self.target_playlists, self.URM_train, self.dict_test.
        """
        helper = Helper()

        sequential_playlists = helper.get_target_playlists_list()[:5000]

        self.target_playlists = sequential_playlists

        # playlist_id -> list of its track_ids
        grouped = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

        relevant_items = defaultdict(list)
        for playlist_id in sequential_playlists:

            # tracks = list of the playlist's tracks taken from the URM
            tracks = np.array(grouped[playlist_id])
            to_be_removed = int(len(tracks) * 0.2)

            # Take the last #to_be_removed tracks in sequential order and
            # remove them from the training track list.
            to_be_removed_tracks = helper.get_sorted_tracks_in_playlist(
                playlist_id)[-to_be_removed:]
            for track in to_be_removed_tracks:
                relevant_items[playlist_id].append(track)
                tracks = np.delete(tracks, np.where(tracks == track))
            grouped[playlist_id] = tracks

        all_tracks = self.tracks_df["track_id"].unique()
        matrix = MultiLabelBinarizer(classes=all_tracks,
                                     sparse_output=True).fit_transform(grouped)
        self.URM_train = matrix.tocsr().astype(np.float64)
        self.dict_test = relevant_items

    # OLD_SPLIT (Piccio)
    def split(self, URM, URM_df):
        """Old cluster-based split: randomly select playlists so that the
        post-split length distribution matches the target distribution
        (bucketed by segment_size = 2), then hold out 20% of each selected
        playlist's tracks at random. Unlike the newer split_* methods this
        also builds self.URM_test from the held-out tracks.

        Side effects: sets self.target_playlists, self.URM_train,
        self.URM_test, self.dict_test.
        """
        # splitting URM into test set and train set
        selected_playlists = np.array([])
        available_playlists = np.arange(URM.shape[0])
        target_analyzer = TargetAnalyzer()
        segment_size = 2
        min_playlist_len_after_split = 5
        # Length-bucket histogram the random selection must reproduce.
        dist = target_analyzer.get_distribution_array(
            segment_size=segment_size)
        # in this way n_target = 10000
        helper = Helper()
        # NOTE(review): target_playlists is fetched but only used by the
        # commented-out subtraction below.
        target_playlists = helper.get_target_playlists_list()[:5000]
        n_target = np.sum(dist)  # - len(target_playlists)

        # Draw playlists at random until every length bucket is filled.
        # NOTE(review): the inner re-draw loop can spin indefinitely if no
        # remaining playlist fits a still-open bucket.
        while n_target > 0:
            random_index = randint(0, len(available_playlists) - 1)
            playlist_id = available_playlists[random_index]
            # Expected playlist length after the 20% holdout.
            target_len = len(URM[playlist_id].data) * 0.8
            if target_len > min_playlist_len_after_split:
                target_segment = int(target_len / segment_size)
                # Bucket already full: re-draw until an open bucket is hit.
                while dist[target_segment] <= 0:
                    random_index = randint(0, len(available_playlists) - 1)
                    playlist_id = available_playlists[random_index]
                    target_len = len(URM[playlist_id].data) * 0.8
                    if target_len > min_playlist_len_after_split:
                        target_segment = int(target_len / segment_size)
                n_target -= 1
                dist[target_segment] -= 1
                selected_playlists = np.append(selected_playlists, playlist_id)
                available_playlists = np.delete(
                    available_playlists,
                    np.where(available_playlists == playlist_id))

        self.target_playlists = selected_playlists.astype(int)
        # playlist_id -> list of its track_ids
        grouped = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

        grouped_test = grouped.copy()

        relevant_items = defaultdict(list)
        count = 0
        for playlist_id in selected_playlists:
            tracks = np.array(grouped[playlist_id])

            # Hold out 20% of the playlist's tracks at random.
            to_be_removed = int(len(tracks) * 0.2)
            for i in range(to_be_removed):
                index = randint(0, len(tracks) - 1)
                removed_track = tracks[index]
                relevant_items[playlist_id].append(removed_track)
                tracks = np.delete(tracks, index)
            grouped[playlist_id] = tracks
            grouped_test[playlist_id] = relevant_items[playlist_id]
            count += 1
        all_tracks = self.tracks_df["track_id"].unique()
        matrix = MultiLabelBinarizer(classes=all_tracks,
                                     sparse_output=True).fit_transform(grouped)
        self.URM_train = matrix.tocsr()
        self.dict_test = relevant_items
        # Test URM is binarized from the held-out tracks only.
        self.URM_test = MultiLabelBinarizer(
            classes=all_tracks, sparse_output=True).fit_transform(grouped_test)
        self.URM_test = self.URM_test.tocsr()
        self.URM_test = self.URM_test.astype(np.float64)
        self.URM_train = self.URM_train.astype(np.float64)

    #Get all randomic playlists applying cluster, segment = 2
    def split_cluster_randomic(self, URM, URM_df):
        """Fully random cluster split (segment = 1): select playlists so the
        post-split length distribution matches the target distribution, then
        hold out 20% of each selected playlist's tracks at random. Builds
        both self.URM_train and self.URM_test.

        Side effects: sets self.target_playlists, self.URM_train,
        self.URM_test, self.dict_test.
        """
        # splitting URM into test set and train set
        selected_playlists = np.array([])
        available_playlists = np.arange(URM.shape[0])
        target_analyzer = TargetAnalyzer()
        segment_size = 1
        min_playlist_len_after_split = 5
        # Length-bucket histogram the random selection must reproduce.
        dist = target_analyzer.get_distribution_array(
            segment_size=segment_size)
        # in this way n_target = 10000
        helper = Helper()
        # NOTE(review): target_playlists is fetched but only used by the
        # commented-out sequential branch below.
        target_playlists = helper.get_target_playlists_list()[:5000]
        n_target = np.sum(dist)  # - len(target_playlists)

        # Draw playlists at random until every length bucket is filled.
        # NOTE(review): the inner re-draw loop can spin indefinitely if no
        # remaining playlist fits a still-open bucket.
        while n_target > 0:
            random_index = randint(0, len(available_playlists) - 1)
            playlist_id = available_playlists[random_index]
            # Expected playlist length after the 20% holdout.
            target_len = len(URM[playlist_id].data) * 0.8
            if target_len > min_playlist_len_after_split:
                target_segment = int(target_len / segment_size)
                # Bucket already full: re-draw until an open bucket is hit.
                while dist[target_segment] <= 0:
                    random_index = randint(0, len(available_playlists) - 1)
                    playlist_id = available_playlists[random_index]
                    target_len = len(URM[playlist_id].data) * 0.8
                    if target_len > min_playlist_len_after_split:
                        target_segment = int(target_len / segment_size)
                n_target -= 1
                dist[target_segment] -= 1
                selected_playlists = np.append(selected_playlists, playlist_id)
                available_playlists = np.delete(
                    available_playlists,
                    np.where(available_playlists == playlist_id))

        self.target_playlists = selected_playlists.astype(int)
        # playlist_id -> list of its track_ids
        grouped = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

        grouped_test = grouped.copy()

        relevant_items = defaultdict(list)
        count = 0
        for playlist_id in selected_playlists:
            tracks = np.array(grouped[playlist_id])
            # if playlist_id in target_playlists:
            # to_be_removed = int(len(tracks) * 0.2)
            # to_be_removed_tracks = helper.get_sorted_tracks_in_playlist(playlist_id)[-to_be_removed:]
            # for track in to_be_removed_tracks:
            #     relevant_items[playlist_id].append(track)
            #     tracks = np.delete(tracks, np.where(tracks == track))
            # for i in range(to_be_removed):
            #    removed_track = tracks[-1]
            #    relevant_items[playlist_id].append(removed_track)
            #    tracks = np.delete(tracks, len(tracks) - 1)
            # else:
            # Hold out 20% of the playlist's tracks at random.
            to_be_removed = int(len(tracks) * 0.2)
            for i in range(to_be_removed):
                index = randint(0, len(tracks) - 1)
                removed_track = tracks[index]
                relevant_items[playlist_id].append(removed_track)
                tracks = np.delete(tracks, index)
            grouped[playlist_id] = tracks
            grouped_test[playlist_id] = relevant_items[playlist_id]
            count += 1
        all_tracks = self.tracks_df["track_id"].unique()
        matrix = MultiLabelBinarizer(classes=all_tracks,
                                     sparse_output=True).fit_transform(grouped)
        self.URM_train = matrix.tocsr()
        self.dict_test = relevant_items
        # bib URM
        # self.URM_train = helper.get_urm_csr_bib(URM = self.URM_train)
        # plotter = TargetAnalyzer()
        # plotter.plot_standard_distribution()
        # plotter.plot_distribution(self.URM_train, self.target_playlists)
        # Test URM is binarized from the held-out tracks only.
        self.URM_test = MultiLabelBinarizer(
            classes=all_tracks, sparse_output=True).fit_transform(grouped_test)
        self.URM_test = self.URM_test.tocsr()
        self.URM_test = self.URM_test.astype(np.float64)
        self.URM_train = self.URM_train.astype(np.float64)

    # Get all randomic playlists of the LAST 5000 target playlists, applying cluster segment = 1

    def split_cluster_randomic_only_last(self, URM, URM_df):
        """Random cluster split matching the length distribution of the LAST
        5000 target playlists (segment = 1), explicitly excluding the first
        5000 (sequential) Kaggle targets from selection.

        Side effects: sets self.target_playlists, self.URM_train, self.dict_test.
        """
        # splitting URM into test set and train set
        segment = 1
        # splitting URM into test set and train set
        selected_playlists = np.array([])
        available_playlists = np.arange(URM.shape[0])
        target_analyzer = TargetAnalyzer()

        # Gets distribution of only last 5000 playlists
        dist = target_analyzer.get_distribution_array_only_last(segment)

        helper = Helper()
        target_playlists = helper.get_target_playlists_list(
        )[:5000]  # WILL REMOVE THEM

        print("Clustering with segment = " + str(segment))
        # Draw random non-sequential playlists until every length bucket of
        # the last-5000 distribution is filled.
        # NOTE(review): if no remaining playlist matches a still-open bucket,
        # the inner while loop never terminates.
        for key in tqdm(range(len(dist))):
            while dist[key] != 0:
                random_index = randint(0, len(available_playlists) - 1)
                playlist_id = available_playlists[random_index]
                # Bucket = playlist length after the 20% holdout.
                target_segment = int(0.8 * len(URM[playlist_id].data))
                if target_segment == key and playlist_id not in target_playlists:
                    available_playlists = np.delete(
                        available_playlists,
                        np.where(available_playlists == playlist_id))
                    selected_playlists = np.append(selected_playlists,
                                                   playlist_id)
                    dist[key] -= 1

        self.target_playlists = selected_playlists.astype(int)
        grouped = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

        relevant_items = defaultdict(list)

        for playlist_id in selected_playlists:
            # tracks = list of the playlist's tracks taken from the URM
            tracks = np.array(grouped[playlist_id])

            # Hold out 20% of the playlist's tracks at random.
            to_be_removed = int(len(tracks) * 0.2)
            for i in range(to_be_removed):
                index = randint(0, len(tracks) - 1)
                removed_track = tracks[index]
                relevant_items[playlist_id].append(removed_track)
                tracks = np.delete(tracks, index)
            grouped[playlist_id] = tracks

        all_tracks = self.tracks_df["track_id"].unique()
        matrix = MultiLabelBinarizer(classes=all_tracks,
                                     sparse_output=True).fit_transform(grouped)
        self.URM_train = matrix.tocsr()
        self.URM_train = self.URM_train.astype(np.float64)
        self.dict_test = relevant_items

    def MAP(self, recommended_items, relevant_items):

        is_relevant = np.in1d(recommended_items,
                              relevant_items,
                              assume_unique=True)

        # Cumulative sum: precision at 1, at 2, at 3 ...
        p_at_k = is_relevant * np.cumsum(
            is_relevant, dtype=np.float32) / (1 + np.arange(len(is_relevant)))
        map_score = np.sum(p_at_k) / np.min(
            [len(relevant_items), len(is_relevant)])
        return map_score

    def evaluate(self, playlist_id, recommended_items):
        relevant_items = self.dict_test[playlist_id]
        map = self.MAP(recommended_items, relevant_items)
        return map

    # Evaluates just the weights assigned in the run method. No array of weights needed.
    def global_evaluate_single(self, recommender):
        """Fit the recommender on self.URM_train and return its mean MAP over
        all target playlists.

        :param recommender: object exposing fit(URM) and
            recommend(playlist_id) -> ordered item list
        :return: mean MAP (assumes self.target_playlists is non-empty)
        """
        MAP_final = 0
        recommender.fit(self.URM_train)
        for target in tqdm(self.target_playlists):
            recommended_items = recommender.recommend(target)
            MAP_final += self.evaluate(target, recommended_items)
        # Average over the number of targets.
        MAP_final /= len(self.target_playlists)
        return MAP_final