def split_randomic_exactly_last(self, URM, URM_df):
        """Build a train/test split by holding out random tracks of target playlists.

        The playlists considered are the entries from position 5000 onward of
        ``Helper().get_target_playlists_list()``. For each of them,
        ``int(len(tracks) * 0.2)`` tracks are drawn at random (without
        replacement) and moved from the training data into the test dictionary.

        Args:
            URM: Not used by this method.
            URM_df: DataFrame with ``playlist_id`` and ``track_id`` columns.

        Side effects:
            Sets ``self.target_playlists``, ``self.URM_train`` (CSR matrix cast
            to float64) and ``self.dict_test`` (playlist id -> held-out tracks).
        """
        self.target_playlists = Helper().get_target_playlists_list()[5000:]

        # One list of track ids per playlist, indexed by playlist id.
        playlist_tracks = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda df: list(df['track_id']))

        held_out = defaultdict(list)
        for playlist_id in self.target_playlists:
            remaining = np.array(playlist_tracks[playlist_id])
            n_held_out = int(len(remaining) * 0.2)
            for _ in range(n_held_out):
                # Draw a position among the tracks still in the playlist.
                position = randint(0, len(remaining) - 1)
                held_out[playlist_id].append(remaining[position])
                remaining = np.delete(remaining, position)
            playlist_tracks[playlist_id] = remaining

        # Columns of the training matrix follow the order of the unique track ids.
        track_ids = self.tracks_df["track_id"].unique()
        binarizer = MultiLabelBinarizer(classes=track_ids, sparse_output=True)
        train_matrix = binarizer.fit_transform(playlist_tracks)
        self.URM_train = train_matrix.tocsr().astype(np.float64)
        self.dict_test = held_out
    def split_only_sequential(self, URM, URM_df):
        """Build a train/test split by holding out the trailing tracks of target playlists.

        The playlists considered are the first 5000 entries of
        ``Helper().get_target_playlists_list()``. For each of them the last
        ``int(len(tracks) * 0.2)`` entries returned by
        ``helper.get_sorted_tracks_in_playlist(playlist_id)`` are removed from
        the training data and recorded as the relevant (test) items.

        Playlists too short to yield at least one held-out track keep all their
        tracks in the training data and get no entry in the test dictionary.

        Args:
            URM: Not used by this method.
            URM_df: DataFrame with ``playlist_id`` and ``track_id`` columns.

        Side effects:
            Sets ``self.target_playlists``, ``self.URM_train`` (CSR matrix cast
            to float64) and ``self.dict_test`` (playlist id -> held-out tracks).
        """
        helper = Helper()

        sequential_playlists = helper.get_target_playlists_list()[:5000]
        self.target_playlists = sequential_playlists

        grouped = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

        relevant_items = defaultdict(list)
        for playlist_id in sequential_playlists:
            # Track ids of this playlist as found in the URM dataframe.
            tracks = np.array(grouped[playlist_id])
            to_be_removed = int(len(tracks) * 0.2)

            # Guard against to_be_removed == 0: the slice [-0:] is the same as
            # [0:] and would hold out *every* track of a short playlist.
            if to_be_removed > 0:
                to_be_removed_tracks = helper.get_sorted_tracks_in_playlist(
                    playlist_id)[-to_be_removed:]
                for track in to_be_removed_tracks:
                    relevant_items[playlist_id].append(track)
                    # Drops every occurrence of the track from the train list.
                    tracks = np.delete(tracks, np.where(tracks == track))
            grouped[playlist_id] = tracks

        all_tracks = self.tracks_df["track_id"].unique()
        matrix = MultiLabelBinarizer(classes=all_tracks,
                                     sparse_output=True).fit_transform(grouped)
        self.URM_train = matrix.tocsr().astype(np.float64)
        self.dict_test = relevant_items
Example #3
0
    def recommend(self, playlist_id, at=10):
        """Rank items for a playlist by a weighted blend of recommender scores.

        The expected ratings of six component recommenders are combined with
        the weights in ``self.w_seq`` (when ``Helper().is_sequential`` is true
        for the playlist), ``self.w_long`` (more than 10 items in the
        playlist's URM row) or ``self.w_short`` (otherwise). Items already in
        the playlist's URM row are filtered out of the ranking.

        Args:
            playlist_id: Playlist identifier; converted with ``int()``.
            at: Maximum number of items to return.

        Returns:
            The first ``at`` item indices of the descending ranking, excluding
            items already present in ``self.URM[playlist_id]``.

        Side effects:
            Stores each component's ratings and the blended ``hybrid_ratings``
            as attributes on ``self``.
        """
        playlist_id = int(playlist_id)
        helper = Helper()

        # Reset before the blend is recomputed for this playlist.
        self.hybrid_ratings = None

        def expected(recommender):
            return recommender.get_expected_ratings(playlist_id)

        # Pick the recommender family and weight set for this playlist.
        # (The SVD-ICM recommender is currently not part of the blend.)
        if helper.is_sequential(playlist_id):
            self.userCF_ratings = expected(self.userCF_sequential)
            self.itemCF_ratings = expected(self.itemCF_sequential)
            self.cbf_ratings = expected(self.cbf_sequential)
            self.slim_elastic_ratings = expected(self.slim_elastic_sequential)
            self.ALS_ratings = expected(self.ALS_sequential)
            self.slim_ratings = expected(self.slim_sequential)
            weights = self.w_seq
        else:
            self.userCF_ratings = expected(self.userCF)
            self.itemCF_ratings = expected(self.itemCF)
            self.cbf_ratings = expected(self.cbf)
            self.slim_elastic_ratings = expected(self.slim_elastic)
            self.ALS_ratings = expected(self.ALS)
            self.slim_ratings = expected(self.slim_random)
            is_long = len(self.URM[playlist_id].indices) > 10
            weights = self.w_long if is_long else self.w_short

        # Weighted sum, accumulated in a fixed order after the user-CF term.
        remaining_terms = (
            (self.itemCF_ratings, "item_cf"),
            (self.cbf_ratings, "cbf"),
            (self.slim_ratings, "slim"),
            (self.ALS_ratings, "als"),
            (self.slim_elastic_ratings, "elastic"),
        )
        self.hybrid_ratings = self.userCF_ratings * weights["user_cf"]
        for ratings, weight_key in remaining_terms:
            self.hybrid_ratings += ratings * weights[weight_key]

        # Highest blended score first.
        ranking = np.flip(np.argsort(self.hybrid_ratings), 0)

        # Drop the items the playlist already contains.
        already_in_playlist = self.URM[playlist_id].indices
        unseen = np.in1d(ranking,
                         already_in_playlist,
                         assume_unique=True,
                         invert=True)
        return ranking[unseen][0:at]
    def split_cluster_randomic_only_last(self, URM, URM_df):
        """Sample playlists to match a length distribution, then hold out random tracks.

        ``dist`` comes from ``TargetAnalyzer().get_distribution_array_only_last``.
        For every index ``key`` of ``dist``, playlists are drawn at random until
        ``dist[key]`` of them have been accepted; a playlist is accepted when
        ``int(0.8 * len(URM[playlist_id].data)) == key`` and it is not among the
        first 5000 entries of the helper's target playlist list. From each
        accepted playlist ``int(len(tracks) * 0.2)`` random tracks are moved to
        the test dictionary.

        Args:
            URM: Matrix indexable by playlist row, exposing ``shape`` and
                per-row ``data``.
            URM_df: DataFrame with ``playlist_id`` and ``track_id`` columns.

        Side effects:
            Prints a progress message, and sets ``self.target_playlists``,
            ``self.URM_train`` (CSR, float64) and ``self.dict_test``.
        """
        segment = 1
        candidates = np.arange(URM.shape[0])

        # Number of playlists wanted per length bucket.
        dist = TargetAnalyzer().get_distribution_array_only_last(segment)

        # These playlists must never be picked by the sampler.
        excluded = Helper().get_target_playlists_list()[:5000]

        picked = []
        print("Clustering with segment = " + str(segment))
        for key in tqdm(range(len(dist))):
            while dist[key] != 0:
                playlist_id = candidates[randint(0, len(candidates) - 1)]
                bucket = int(0.8 * len(URM[playlist_id].data))
                if bucket != key or playlist_id in excluded:
                    continue  # Not usable for this bucket; draw again.
                candidates = candidates[candidates != playlist_id]
                picked.append(playlist_id)
                dist[key] -= 1

        # Ids are held as floats here (and therefore as float keys in
        # dict_test); target_playlists gets the integer version.
        selected_playlists = np.array(picked, dtype=np.float64)
        self.target_playlists = selected_playlists.astype(int)

        playlist_tracks = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda df: list(df['track_id']))

        held_out = defaultdict(list)
        for playlist_id in selected_playlists:
            # Track ids of this playlist as found in the URM dataframe.
            remaining = np.array(playlist_tracks[playlist_id])
            for _ in range(int(len(remaining) * 0.2)):
                position = randint(0, len(remaining) - 1)
                held_out[playlist_id].append(remaining[position])
                remaining = np.delete(remaining, position)
            playlist_tracks[playlist_id] = remaining

        track_ids = self.tracks_df["track_id"].unique()
        binarizer = MultiLabelBinarizer(classes=track_ids, sparse_output=True)
        self.URM_train = binarizer.fit_transform(
            playlist_tracks).tocsr().astype(np.float64)
        self.dict_test = held_out
    def split_randomic_all_playlists_longer_10000(self,
                                                  URM,
                                                  URM_df,
                                                  threshold_length=10):
        """Select up to 10000 long, non-target playlists and hold out random tracks.

        Playlists are scanned in row order. A playlist is selected when it is
        not in ``Helper().get_target_playlists_list()`` and its URM row holds
        more than ``threshold_length`` items; scanning stops once 10000
        playlists are selected. From each selected playlist
        ``int(len(tracks) * 0.2)`` random tracks are moved to the test
        dictionary.

        Args:
            URM: Matrix indexable by playlist row, exposing ``shape`` and
                per-row ``indices``.
            URM_df: DataFrame with ``playlist_id`` and ``track_id`` columns.
            threshold_length: Minimum (exclusive) number of items a playlist
                must have to be selected.

        Side effects:
            Sets ``self.target_playlists`` (integer array), ``self.URM_train``
            (CSR, float64) and ``self.dict_test`` (playlist id -> held-out
            tracks, keyed by integer playlist id).
        """
        max_selected = 10000

        # A set makes the per-playlist exclusion test O(1) instead of a scan
        # of the whole target list for every row of the URM.
        target_playlists_kaggle = set(Helper().get_target_playlists_list())

        # Plain ints are collected so that playlist ids are never turned into
        # floats before being used as index labels and dictionary keys.
        selected_playlists = []
        for playlist_id in range(URM.shape[0]):
            if len(selected_playlists) == max_selected:
                break
            if (playlist_id not in target_playlists_kaggle
                    and len(URM[playlist_id].indices) > threshold_length):
                selected_playlists.append(playlist_id)

        self.target_playlists = np.array(selected_playlists, dtype=int)

        grouped = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

        relevant_items = defaultdict(list)
        for playlist_id in selected_playlists:
            tracks = np.array(grouped[playlist_id])
            to_be_removed = int(len(tracks) * 0.2)
            for _ in range(to_be_removed):
                # Draw a position among the tracks still in the playlist.
                index = randint(0, len(tracks) - 1)
                relevant_items[playlist_id].append(tracks[index])
                tracks = np.delete(tracks, index)
            grouped[playlist_id] = tracks

        all_tracks = self.tracks_df["track_id"].unique()
        matrix = MultiLabelBinarizer(classes=all_tracks,
                                     sparse_output=True).fit_transform(grouped)
        self.URM_train = matrix.tocsr().astype(np.float64)
        self.dict_test = relevant_items
    def split_cluster_randomic(self, URM, URM_df):
        """Sample playlists to match a length distribution and build train/test matrices.

        ``dist`` comes from ``TargetAnalyzer().get_distribution_array``.
        Playlists are drawn at random until ``np.sum(dist)`` of them have been
        accepted. A draw is accepted when 80% of the playlist's length exceeds
        5 and the corresponding ``dist`` bucket still has a positive count.
        From each accepted playlist ``int(len(tracks) * 0.2)`` random tracks
        are moved from the training data to the test data.

        Args:
            URM: Matrix indexable by playlist row, exposing ``shape`` and
                per-row ``data``.
            URM_df: DataFrame with ``playlist_id`` and ``track_id`` columns.

        Side effects:
            Sets ``self.target_playlists``, ``self.URM_train`` and
            ``self.URM_test`` (both CSR, float64) and ``self.dict_test``.
        """
        segment_size = 1
        min_playlist_len_after_split = 5

        candidates = np.arange(URM.shape[0])
        dist = TargetAnalyzer().get_distribution_array(
            segment_size=segment_size)

        # The result is not used below; the call is retained as-is.
        # NOTE(review): confirm whether it can be dropped safely.
        helper = Helper()
        _ = helper.get_target_playlists_list()[:5000]

        # Rejection sampling: keep drawing until every requested slot is filled.
        still_needed = np.sum(dist)
        picked = []
        while still_needed > 0:
            playlist_id = candidates[randint(0, len(candidates) - 1)]
            length_after_split = len(URM[playlist_id].data) * 0.8
            if length_after_split <= min_playlist_len_after_split:
                continue  # Too short once 20% is held out; draw again.
            bucket = int(length_after_split / segment_size)
            if dist[bucket] <= 0:
                continue  # This length bucket is already full; draw again.
            still_needed -= 1
            dist[bucket] -= 1
            picked.append(playlist_id)
            candidates = candidates[candidates != playlist_id]

        # Ids are held as floats here (and therefore as float keys in
        # dict_test); target_playlists gets the integer version.
        selected_playlists = np.array(picked, dtype=np.float64)
        self.target_playlists = selected_playlists.astype(int)

        train_tracks = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda df: list(df['track_id']))
        test_tracks = train_tracks.copy()

        held_out = defaultdict(list)
        for playlist_id in selected_playlists:
            remaining = np.array(train_tracks[playlist_id])
            for _ in range(int(len(remaining) * 0.2)):
                position = randint(0, len(remaining) - 1)
                held_out[playlist_id].append(remaining[position])
                remaining = np.delete(remaining, position)
            train_tracks[playlist_id] = remaining
            # Selected playlists keep only their held-out tracks on the test side.
            test_tracks[playlist_id] = held_out[playlist_id]

        track_ids = self.tracks_df["track_id"].unique()
        train_matrix = MultiLabelBinarizer(
            classes=track_ids, sparse_output=True).fit_transform(train_tracks)
        self.URM_train = train_matrix.tocsr()
        self.dict_test = held_out

        test_matrix = MultiLabelBinarizer(
            classes=track_ids, sparse_output=True).fit_transform(test_tracks)
        self.URM_test = test_matrix.tocsr().astype(np.float64)
        self.URM_train = self.URM_train.astype(np.float64)
    def split_sequential(self, URM, URM_df):
        """Split using the sequential targets plus randomly sampled playlists.

        The first 5000 entries of ``Helper().get_target_playlists_list()`` are
        always selected. Further playlists are then drawn at random so that,
        for every index ``key`` of the distribution returned by
        ``TargetAnalyzer().get_distribution_array_only_last``, ``dist[key]``
        playlists with ``int(0.8 * len(URM[playlist_id].data)) == key`` are
        added.

        Hold-out rule, with ``k = int(len(tracks) * 0.2)``:
          * sequential targets lose the last ``k`` entries returned by
            ``helper.get_sorted_tracks_in_playlist(playlist_id)``;
          * sampled playlists lose ``k`` tracks chosen at random.
        When ``k`` is 0 nothing is held out for that playlist.

        NOTE(review): the sampling loop only ends for a bucket once enough
        matching playlists were drawn, so it does not terminate if the
        remaining pool holds too few playlists for some bucket.

        Args:
            URM: Matrix indexable by playlist row, exposing ``shape`` and
                per-row ``data``.
            URM_df: DataFrame with ``playlist_id`` and ``track_id`` columns.

        Side effects:
            Prints a progress message, and sets ``self.target_playlists``
            (integer array), ``self.URM_train`` (CSR, float64) and
            ``self.dict_test`` (keyed by integer playlist id).
        """
        segment = 1
        available_playlists = np.arange(URM.shape[0])
        target_analyzer = TargetAnalyzer()

        # Distribution used to decide how many playlists to sample per bucket.
        dist = target_analyzer.get_distribution_array_only_last(segment)

        helper = Helper()
        sequential_ids = [
            int(playlist_id)
            for playlist_id in helper.get_target_playlists_list()[:5000]
        ]
        # Set for O(1) membership tests in the hold-out loop below.
        sequential_set = set(sequential_ids)

        # Ids are kept as plain ints so they are never passed around as floats
        # (to the helper, as index labels, or as dictionary keys).
        selected_playlists = []
        for playlist_id in sequential_ids:
            # Sequential targets are always selected and leave the random pool.
            available_playlists = available_playlists[
                available_playlists != playlist_id]
            selected_playlists.append(playlist_id)

        print("Clustering with segment = " + str(segment))
        for key in tqdm(range(len(dist))):
            while dist[key] != 0:
                random_index = randint(0, len(available_playlists) - 1)
                playlist_id = int(available_playlists[random_index])
                if int(0.8 * len(URM[playlist_id].data)) != key:
                    continue  # Wrong bucket; draw again.
                available_playlists = np.delete(available_playlists,
                                                random_index)
                selected_playlists.append(playlist_id)
                dist[key] -= 1

        self.target_playlists = np.array(selected_playlists, dtype=int)
        grouped = URM_df.groupby(
            'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))

        relevant_items = defaultdict(list)
        for playlist_id in selected_playlists:
            # Track ids of this playlist as found in the URM dataframe.
            tracks = np.array(grouped[playlist_id])
            to_be_removed = int(len(tracks) * 0.2)
            if playlist_id in sequential_set:
                # Guard against to_be_removed == 0: the slice [-0:] is the
                # same as [0:] and would hold out *every* track.
                if to_be_removed > 0:
                    to_be_removed_tracks = helper.get_sorted_tracks_in_playlist(
                        playlist_id)[-to_be_removed:]
                    for track in to_be_removed_tracks:
                        relevant_items[playlist_id].append(track)
                        tracks = np.delete(tracks, np.where(tracks == track))
            else:
                for _ in range(to_be_removed):
                    index = randint(0, len(tracks) - 1)
                    relevant_items[playlist_id].append(tracks[index])
                    tracks = np.delete(tracks, index)
            grouped[playlist_id] = tracks

        all_tracks = self.tracks_df["track_id"].unique()
        matrix = MultiLabelBinarizer(classes=all_tracks,
                                     sparse_output=True).fit_transform(grouped)
        self.URM_train = matrix.tocsr().astype(np.float64)
        self.dict_test = relevant_items
Example #8
0
# Base names of the working directory and of this script.
currentDir = os.path.basename(os.getcwd())
currentFileName = os.path.basename(__file__)

# Add the parent directory of this script to the import path
# (presumably so that the `lib` package imported below can be found).
libDir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
sys.path.append(libDir)
# Parenthesised form: prints the same text on Python 2 and is valid Python 3.
print(libDir)

from lib.router import Router
router = Router()
ROUTES = router.getRoutes()

# NOTE(review): currentFullRoute is not defined in the visible part of this
# file; presumably it is assigned earlier -- confirm.
from lib.loader import Loader
loader = Loader(currentFullRoute, ROUTES)

from lib.helper import Helper
helper = Helper()

from lib.filewriter import FileWriter
filewriter = FileWriter()

from lib.reporter import Reporter
reporter = Reporter(ROUTES)

# return to current path

sys.path.append(currentFullRoute)

# ---------------------------------------------------------------------- CUSTOM LIBS

from math import *
from graphviz import Digraph
Example #9
0
from lib.helper import Helper
from lib.encoder import Encoder
import sys

if __name__ == "__main__":
    helper = Helper(sys.argv)
    helper.ParseConfig().ShowConfigDescription()

    encoder = Encoder(helper)
    encoder.append_vars(helper.GetConfig("vars"))

    template = helper.LoadFile(helper.GetConfig("template"))
    data = template

    for evasion in helper.GetConfig("evasion"):
        data += helper.LoadFile("templates/evasions/" + evasion + ".vba")

    data = encoder.replace_var(data, "offset", encoder.get_encoding_offset())
    data = encoder.encode_user_vars(data)
    data = encoder.append_def_use_tag(data)
    data = encoder.rand_vars(data)
    data = encoder.rand_int(data)
    data = encoder.rand_smallint(data)

    encodedvars = helper.GetConfig("encodedvars")
    for var in encodedvars:
        data = encoder.replace_var(data, var, encodedvars[var], True)

    if "-s" in sys.argv or "--split_strings" in sys.argv:
        data = encoder.split_strings(data)
    if "-x" in sys.argv or "--strings_to_hex" in sys.argv: