def split_randomic_exactly_last(self, URM, URM_df):
    """Split the URM into train/test sets for the non-sequential targets.

    Only the target playlists *after* the first 5000 (the "randomic" ones)
    are split: 20% of each playlist's tracks are removed uniformly at random
    from the training data and recorded as relevant (test) items.

    Side effects: sets ``self.URM_train`` (csr, float64), ``self.dict_test``
    (playlist_id -> list of held-out track_ids) and ``self.target_playlists``.

    :param URM: user-rating matrix (unused here; kept for interface symmetry
                with the other split_* methods)
    :param URM_df: dataframe with 'playlist_id' / 'track_id' columns
    """
    helper = Helper()
    # Targets after index 5000 are the non-sequential ("randomic") playlists.
    self.target_playlists = helper.get_target_playlists_list()[5000:]
    selected_playlists = self.target_playlists
    # playlist_id -> list of its track_ids
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    relevant_items = defaultdict(list)
    for playlist_id in selected_playlists:
        tracks = np.array(grouped[playlist_id])
        # Hold out 20% of the playlist, one random pick at a time.
        to_be_removed = int(len(tracks) * 0.2)
        for _ in range(to_be_removed):
            index = randint(0, len(tracks) - 1)
            relevant_items[playlist_id].append(tracks[index])
            tracks = np.delete(tracks, index)
        grouped[playlist_id] = tracks
    # Re-encode the remaining interactions as a sparse binary URM.
    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr().astype(np.float64)
    self.dict_test = relevant_items
def split_only_sequential(self, URM, URM_df):
    """Split the URM into train/test sets for the sequential targets only.

    The first 5000 target playlists are ordered, so the held-out 20% is the
    *tail* of each playlist (last tracks in listening order) rather than a
    random sample.

    Side effects: sets ``self.URM_train`` (csr, float64), ``self.dict_test``
    and ``self.target_playlists``.

    :param URM: user-rating matrix (unused here; kept for interface symmetry)
    :param URM_df: dataframe with 'playlist_id' / 'track_id' columns
    """
    helper = Helper()
    # The first 5000 targets are the sequential playlists.
    sequential_playlists = helper.get_target_playlists_list()[:5000]
    self.target_playlists = sequential_playlists
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    relevant_items = defaultdict(list)
    for playlist_id in sequential_playlists:
        # Tracks of this playlist taken from the URM dataframe.
        tracks = np.array(grouped[playlist_id])
        to_be_removed = int(len(tracks) * 0.2)
        # BUGFIX: with to_be_removed == 0 the slice [-0:] would return the
        # WHOLE list, moving every track of a short playlist into the test set.
        if to_be_removed > 0:
            # Last `to_be_removed` tracks in sequential (listening) order.
            to_be_removed_tracks = helper.get_sorted_tracks_in_playlist(
                playlist_id)[-to_be_removed:]
        else:
            to_be_removed_tracks = []
        for track in to_be_removed_tracks:
            relevant_items[playlist_id].append(track)
            tracks = np.delete(tracks, np.where(tracks == track))
        grouped[playlist_id] = tracks
    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr().astype(np.float64)
    self.dict_test = relevant_items
def recommend(self, playlist_id, at=10):
    """Return the top-`at` recommended track indices for `playlist_id`.

    Scores from several recommenders (user CF, item CF, CBF, SLIM,
    elastic-net SLIM, ALS) are combined linearly.  The weight set depends on
    the playlist kind: sequential playlists use ``self.w_seq``; random
    playlists use ``self.w_long`` (profile longer than 10 tracks) or
    ``self.w_short``.  Tracks already in the playlist are filtered out.

    :param playlist_id: id of the playlist to recommend for
    :param at: number of recommendations to return (default 10)
    :return: ndarray of at most `at` unseen track indices, best first
    """
    playlist_id = int(playlist_id)
    helper = Helper()
    # DUE TO TIME CONSTRAINTS THE CODE STRUCTURE HERE IS REDUNDANT.
    # TODO: exploit inheritance to reduce duplication — extract ratings and
    # combine them by iterating over a list of recommenders.
    # COMBINE RATINGS IN DIFFERENT WAYS (sequential / random short / random long)
    if helper.is_sequential(playlist_id):
        self.userCF_ratings = self.userCF_sequential.get_expected_ratings(
            playlist_id)
        self.itemCF_ratings = self.itemCF_sequential.get_expected_ratings(
            playlist_id)
        self.cbf_ratings = self.cbf_sequential.get_expected_ratings(
            playlist_id)
        self.slim_elastic_ratings = self.slim_elastic_sequential.get_expected_ratings(
            playlist_id)
        self.ALS_ratings = self.ALS_sequential.get_expected_ratings(
            playlist_id)
        self.slim_ratings = self.slim_sequential.get_expected_ratings(
            playlist_id)
        w_right = self.w_seq
    else:
        self.userCF_ratings = self.userCF.get_expected_ratings(playlist_id)
        self.itemCF_ratings = self.itemCF.get_expected_ratings(playlist_id)
        self.cbf_ratings = self.cbf.get_expected_ratings(playlist_id)
        self.slim_elastic_ratings = self.slim_elastic.get_expected_ratings(
            playlist_id)
        self.ALS_ratings = self.ALS.get_expected_ratings(playlist_id)
        self.slim_ratings = self.slim_random.get_expected_ratings(
            playlist_id)
        # Weight profile chosen by playlist length (long vs short profile).
        if len(self.URM[playlist_id].indices) > 10:
            w_right = self.w_long
        else:
            w_right = self.w_short
    # Weighted linear combination of the individual recommenders' scores.
    self.hybrid_ratings = self.userCF_ratings * w_right["user_cf"]
    self.hybrid_ratings += self.itemCF_ratings * w_right["item_cf"]
    self.hybrid_ratings += self.cbf_ratings * w_right["cbf"]
    self.hybrid_ratings += self.slim_ratings * w_right["slim"]
    self.hybrid_ratings += self.ALS_ratings * w_right["als"]
    self.hybrid_ratings += self.slim_elastic_ratings * w_right["elastic"]
    # Rank items best-first (argsort is ascending, hence the flip).
    recommended_items = np.flip(np.argsort(self.hybrid_ratings), 0)
    # REMOVING SEEN: keep only items absent from the playlist's profile.
    unseen_items_mask = np.in1d(recommended_items,
                                self.URM[playlist_id].indices,
                                assume_unique=True,
                                invert=True)
    recommended_items = recommended_items[unseen_items_mask]
    return recommended_items[0:at]
def split_cluster_randomic_only_last(self, URM, URM_df):
    """Build a random train/test split whose playlist-length distribution
    matches the distribution of the last 5000 target playlists.

    Playlists are drawn at random (excluding the first 5000 sequential
    targets) until every length bucket of the target distribution is filled;
    then 20% of each selected playlist is held out at random.

    Side effects: sets ``self.URM_train`` (csr, float64), ``self.dict_test``
    and ``self.target_playlists``.
    """
    # splitting URM into test set and train set
    segment = 1
    # splitting URM into test set and train set
    selected_playlists = np.array([])
    available_playlists = np.arange(URM.shape[0])
    target_analyzer = TargetAnalyzer()
    # Gets distribution of only last 5000 playlists
    dist = target_analyzer.get_distribution_array_only_last(segment)
    helper = Helper()
    target_playlists = helper.get_target_playlists_list(
    )[:5000]  # WILL REMOVE THEM
    print("Clustering with segment = " + str(segment))
    # Rejection sampling: for each length bucket, keep drawing random
    # playlists until the bucket quota is met.
    # NOTE(review): if some bucket can never be satisfied by the available
    # playlists this inner loop spins forever — confirm dist is feasible.
    for key in tqdm(range(len(dist))):
        while dist[key] != 0:
            random_index = randint(0, len(available_playlists) - 1)
            playlist_id = available_playlists[random_index]
            # Bucket of the playlist *after* the 80/20 split.
            target_segment = int(0.8 * len(URM[playlist_id].data))
            if target_segment == key and playlist_id not in target_playlists:
                available_playlists = np.delete(
                    available_playlists,
                    np.where(available_playlists == playlist_id))
                selected_playlists = np.append(selected_playlists,
                                               playlist_id)
                dist[key] -= 1
    self.target_playlists = selected_playlists.astype(int)
    # playlist_id -> list of its track_ids
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    relevant_items = defaultdict(list)
    for playlist_id in selected_playlists:
        # Tracks = track list taken from the URM
        tracks = np.array(grouped[playlist_id])
        # Hold out 20% of the playlist uniformly at random.
        to_be_removed = int(len(tracks) * 0.2)
        for i in range(to_be_removed):
            index = randint(0, len(tracks) - 1)
            removed_track = tracks[index]
            relevant_items[playlist_id].append(removed_track)
            tracks = np.delete(tracks, index)
        grouped[playlist_id] = tracks
    # Re-encode the remaining interactions as a sparse binary URM.
    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr()
    self.URM_train = self.URM_train.astype(np.float64)
    self.dict_test = relevant_items
def split_randomic_all_playlists_longer_10000(self, URM, URM_df,
                                              threshold_length=10):
    """Random train/test split over 10000 non-target playlists longer than
    ``threshold_length``.

    Scans playlist ids in order, selecting the first 10000 that are not
    Kaggle targets and have more than ``threshold_length`` interactions;
    20% of each selected playlist is then held out at random.

    Side effects: sets ``self.URM_train`` (csr, float64), ``self.dict_test``
    and ``self.target_playlists``.

    :param URM: csr user-rating matrix (used for profile lengths)
    :param URM_df: dataframe with 'playlist_id' / 'track_id' columns
    :param threshold_length: minimum profile length for a playlist to qualify
    """
    available_playlists = np.arange(URM.shape[0])
    helper = Helper()
    # Set for O(1) membership tests (was a list: O(n) per playlist checked).
    target_playlists_kaggle = set(helper.get_target_playlists_list())
    # Collect ids in a Python list: repeated np.append is quadratic.
    selected = []
    for playlist_id in available_playlists:
        if len(selected) == 10000:
            break
        if playlist_id not in target_playlists_kaggle and len(
                URM[playlist_id].indices) > threshold_length:
            selected.append(playlist_id)
    self.target_playlists = np.array(selected).astype(int)
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    relevant_items = defaultdict(list)
    for playlist_id in selected:
        tracks = np.array(grouped[playlist_id])
        # Hold out 20% of the playlist uniformly at random.
        to_be_removed = int(len(tracks) * 0.2)
        for _ in range(to_be_removed):
            index = randint(0, len(tracks) - 1)
            relevant_items[playlist_id].append(tracks[index])
            tracks = np.delete(tracks, index)
        grouped[playlist_id] = tracks
    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr().astype(np.float64)
    self.dict_test = relevant_items
def split_cluster_randomic(self, URM, URM_df):
    """Random train/test split whose length distribution matches the full
    target distribution, also building an explicit ``URM_test``.

    Playlists are rejection-sampled until every length bucket of the target
    distribution is filled; 20% of each selected playlist is then held out
    at random.

    Side effects: sets ``self.URM_train`` and ``self.URM_test`` (csr,
    float64), ``self.dict_test`` and ``self.target_playlists``.
    """
    selected_playlists = np.array([])
    available_playlists = np.arange(URM.shape[0])
    target_analyzer = TargetAnalyzer()
    segment_size = 1
    min_playlist_len_after_split = 5
    dist = target_analyzer.get_distribution_array(
        segment_size=segment_size)  # in this way n_target = 10000
    n_target = np.sum(dist)
    while n_target > 0:
        # Rejection-sample until we draw a playlist that is long enough AND
        # whose post-split length bucket still needs members.
        # BUGFIX: the original checked `dist[target_segment]` before
        # `target_segment` was guaranteed to be assigned for the current
        # draw, so a too-short playlist could be selected against a stale
        # bucket (or raise NameError on the very first draw).
        while True:
            random_index = randint(0, len(available_playlists) - 1)
            playlist_id = available_playlists[random_index]
            # Expected length after removing the 20% test share.
            target_len = len(URM[playlist_id].data) * 0.8
            if target_len > min_playlist_len_after_split:
                target_segment = int(target_len / segment_size)
                if dist[target_segment] > 0:
                    break
        n_target -= 1
        dist[target_segment] -= 1
        selected_playlists = np.append(selected_playlists, playlist_id)
        available_playlists = np.delete(
            available_playlists,
            np.where(available_playlists == playlist_id))
    self.target_playlists = selected_playlists.astype(int)
    # playlist_id -> list of its track_ids; grouped_test will hold the
    # held-out tracks per playlist for building URM_test.
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    grouped_test = grouped.copy()
    relevant_items = defaultdict(list)
    for playlist_id in selected_playlists:
        tracks = np.array(grouped[playlist_id])
        # Hold out 20% of the playlist uniformly at random.
        to_be_removed = int(len(tracks) * 0.2)
        for _ in range(to_be_removed):
            index = randint(0, len(tracks) - 1)
            relevant_items[playlist_id].append(tracks[index])
            tracks = np.delete(tracks, index)
        grouped[playlist_id] = tracks
        grouped_test[playlist_id] = relevant_items[playlist_id]
    # Re-encode train and test interactions as sparse binary matrices.
    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr()
    self.dict_test = relevant_items
    self.URM_test = MultiLabelBinarizer(
        classes=all_tracks,
        sparse_output=True).fit_transform(grouped_test)
    self.URM_test = self.URM_test.tocsr()
    self.URM_test = self.URM_test.astype(np.float64)
    self.URM_train = self.URM_train.astype(np.float64)
def split_sequential(self, URM, URM_df):
    """Mixed train/test split: the 5000 sequential targets are split by
    tail (last tracks in order), plus randomly drawn playlists matching the
    target length distribution, split at random.

    Side effects: sets ``self.URM_train`` (csr, float64), ``self.dict_test``
    and ``self.target_playlists``.
    """
    segment = 1
    # splitting URM into test set and train set
    selected_playlists = np.array([])
    available_playlists = np.arange(URM.shape[0])
    target_analyzer = TargetAnalyzer()
    # Gets distribution of only last 5000 playlists
    dist = target_analyzer.get_distribution_array_only_last(segment)
    helper = Helper()
    target_playlists = helper.get_target_playlists_list()[:5000]
    #n_target = np.sum(dist) - len(target_playlists)
    # Removing from the cluster distribution the len of the sequential target
    # (all sequential targets are selected unconditionally first).
    for playlist_id in target_playlists:
        playlist_id = int(playlist_id)
        available_playlists = np.delete(
            available_playlists,
            np.where(available_playlists == playlist_id))
        selected_playlists = np.append(selected_playlists, playlist_id)
        #target_len = len(URM[playlist_id].data)
        #dist[target_len] -= 1
    print("Clustering with segment = " + str(segment))
    # Rejection sampling: fill each length bucket of the target distribution
    # with random non-sequential playlists.
    # NOTE(review): if a bucket cannot be satisfied this loop never ends —
    # confirm dist is feasible for this URM.
    for key in tqdm(range(len(dist))):
        while dist[key] != 0:
            random_index = randint(0, len(available_playlists) - 1)
            playlist_id = available_playlists[random_index]
            # Bucket of the playlist *after* the 80/20 split.
            target_segment = int(0.8 * len(URM[playlist_id].data))
            if target_segment == key:
                available_playlists = np.delete(
                    available_playlists,
                    np.where(available_playlists == playlist_id))
                selected_playlists = np.append(selected_playlists,
                                               playlist_id)
                dist[key] -= 1
    self.target_playlists = selected_playlists.astype(int)
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    relevant_items = defaultdict(list)
    for playlist_id in selected_playlists:
        # Tracks = track list taken from the URM
        tracks = np.array(grouped[playlist_id])
        if playlist_id in target_playlists:
            # Sequential target: hold out the *tail* of the playlist.
            to_be_removed = int(len(tracks) * 0.2)
            # Returns the #to_be_removed tracks in sequential order
            # and removes them from the track list.
            # NOTE(review): when to_be_removed == 0 the slice [-0:] returns
            # the WHOLE list, emptying short playlists — confirm intended.
            to_be_removed_tracks = helper.get_sorted_tracks_in_playlist(
                playlist_id)[-to_be_removed:]
            for track in to_be_removed_tracks:
                relevant_items[playlist_id].append(track)
                tracks = np.delete(tracks, np.where(tracks == track))
        else:
            # Non-target playlist: hold out 20% uniformly at random.
            to_be_removed = int(len(tracks) * 0.2)
            for i in range(to_be_removed):
                index = randint(0, len(tracks) - 1)
                removed_track = tracks[index]
                relevant_items[playlist_id].append(removed_track)
                tracks = np.delete(tracks, index)
        grouped[playlist_id] = tracks
    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr()
    self.URM_train = self.URM_train.astype(np.float64)
    self.dict_test = relevant_items
# Script prolog: make the sibling lib/ package importable, then instantiate
# the shared router/loader/helper/filewriter/reporter singletons.
currentDir = os.path.basename(os.getcwd())
currentFileName = os.path.basename(__file__)
libDir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
sys.path.append(libDir)
# BUGFIX: `print libDir` is Python-2-only statement syntax; the call form
# is valid in both Python 2 and 3.
print(libDir)
from lib.router import Router
router = Router()
ROUTES = router.getRoutes()
from lib.loader import Loader
# NOTE(review): `currentFullRoute` is not defined in this chunk — presumably
# assigned earlier in the file; confirm before running standalone.
loader = Loader(currentFullRoute, ROUTES)
from lib.helper import Helper
helper = Helper()
from lib.filewriter import FileWriter
filewriter = FileWriter()
from lib.reporter import Reporter
reporter = Reporter(ROUTES)
# return to current path
sys.path.append(currentFullRoute)
# ---------------------------------------------------------------------- CUSTOM LIBS
from math import *
from graphviz import Digraph
from lib.helper import Helper from lib.encoder import Encoder import sys if __name__ == "__main__": helper = Helper(sys.argv) helper.ParseConfig().ShowConfigDescription() encoder = Encoder(helper) encoder.append_vars(helper.GetConfig("vars")) template = helper.LoadFile(helper.GetConfig("template")) data = template for evasion in helper.GetConfig("evasion"): data += helper.LoadFile("templates/evasions/" + evasion + ".vba") data = encoder.replace_var(data, "offset", encoder.get_encoding_offset()) data = encoder.encode_user_vars(data) data = encoder.append_def_use_tag(data) data = encoder.rand_vars(data) data = encoder.rand_int(data) data = encoder.rand_smallint(data) encodedvars = helper.GetConfig("encodedvars") for var in encodedvars: data = encoder.replace_var(data, var, encodedvars[var], True) if "-s" in sys.argv or "--split_strings" in sys.argv: data = encoder.split_strings(data) if "-x" in sys.argv or "--strings_to_hex" in sys.argv: