def make_candidate_dataset(self, shuffle=True):
    """Build a candidate-evaluation dataset from ``self.dataset``.

    For every playlist, songs and tags missing from ``self.all_songs_set`` /
    ``self.all_tags_set`` are dropped, ``get_random_sampled_model_input``
    picks the songs/tags fed to the model, and whatever was not picked
    becomes the ground truth.  A playlist is skipped when it has no usable
    songs/tags/artists, when nothing was picked, or when either the
    held-out songs or the held-out tags are empty.

    Args:
        shuffle: When true, ``self.dataset`` is shuffled in place first.

    Returns:
        A dict with the keys ``model_input`` (int32 array of encoded inputs
        padded with ``parameters.pad_token``), ``input_size``,
        ``seen_songs_set``, ``seen_tags_set``, ``plylst_updt_date``, ``gt``
        and ``id_list``.  Every value except ``model_input`` is a list with
        one entry per kept playlist.
    """
    if shuffle:
        random.shuffle(self.dataset)

    # One accumulator per output column; assembled into the result at the end.
    encoded_inputs = []
    input_lengths = []
    seen_songs = []
    seen_tags = []
    update_dates = []
    ground_truths = []
    playlist_ids = []

    for playlist in tqdm(self.dataset, total=len(self.dataset)):
        known_songs = [
            song for song in playlist['songs'] if song in self.all_songs_set
        ]
        known_tags = [
            tag for tag in playlist['tags'] if tag in self.all_tags_set
        ]
        known_artists = util.get_artists(known_songs,
                                         self.label_info.song_artist_dict)
        if not (known_songs + known_tags + known_artists):
            continue

        picked_songs, picked_tags = self.get_random_sampled_model_input(
            known_songs, known_tags)
        if not (picked_songs or picked_tags):
            continue
        picked_artists = util.get_artists(picked_songs,
                                          self.label_info.song_artist_dict)

        raw_input = convert_model_input(picked_songs, picked_tags,
                                        picked_artists)
        pad_count = self.model_input_size - len(raw_input)
        encoded = self.label_info.label_encoder.transform(
            raw_input + [parameters.pad_token] * pad_count)

        playlist_id = playlist["id"]
        # Ground truth is what the model did NOT get to see.
        held_out = {
            'songs': list(set(known_songs) - set(picked_songs)),
            'tags': list(set(known_tags) - set(picked_tags)),
            'id': playlist_id
        }
        if not (held_out['songs'] and held_out['tags']):
            continue

        ground_truths.append(held_out)
        input_lengths.append(len(raw_input))
        playlist_ids.append(playlist["id"])
        encoded_inputs.append(encoded)
        seen_songs.append(set(picked_songs))
        seen_tags.append(set(picked_tags))
        update_dates.append(util.convert_updt_date(playlist["updt_date"]))

    return {
        "model_input": np.array(encoded_inputs, dtype=np.int32),
        'input_size': input_lengths,
        'seen_songs_set': seen_songs,
        'seen_tags_set': seen_tags,
        'plylst_updt_date': update_dates,
        'gt': ground_truths,
        'id_list': playlist_ids
    }
def make_loss_check_dataset(self, question):
    """Build a dataset for measuring loss on question/answer playlists.

    The model input of each playlist is built from the known songs, tags and
    artists of the question; the label is the encoded union of the question
    items and the matching answer items looked up in
    ``self.answer_plylst_id_songs_tags_dict`` by playlist id.  Playlists
    whose question has neither a known song nor a known tag are skipped.

    Args:
        question: Sized iterable of playlist dicts; the keys ``songs``,
            ``tags`` and ``id`` are read.

    Returns:
        A dict with ``model_input`` (int32 array of encoded inputs padded
        with ``parameters.pad_token``), ``label`` (list of encoded label
        sequences) and ``input_size`` (list of unpadded input lengths).

    Raises:
        KeyError: If a kept playlist id has no entry in
            ``self.answer_plylst_id_songs_tags_dict``.
    """
    result = {"model_input": [], 'label': [], 'input_size': []}

    for each in tqdm(question, total=len(question)):
        songs = [song for song in each['songs'] if song in self.all_songs_set]
        tags = [tag for tag in each['tags'] if tag in self.all_tags_set]
        artists = util.get_artists(songs, self.label_info.song_artist_dict)
        if not songs and not tags:
            continue

        # Look the answer entry up once instead of once per field.
        answer = self.answer_plylst_id_songs_tags_dict[each['id']]
        answer_songs = [
            song for song in answer['songs'] if song in self.all_songs_set
        ]
        answer_tags = [
            tag for tag in answer['tags'] if tag in self.all_tags_set
        ]
        answer_artists = util.get_artists(answer_songs,
                                          self.label_info.song_artist_dict)

        # The guard above guarantees songs or tags is non-empty, so the
        # label can never be empty here and needs no second check.
        label = self.label_info.label_encoder.transform(
            songs + answer_songs + tags + answer_tags + artists +
            answer_artists)

        model_input = convert_model_input(songs, tags, artists)
        pad_model_input = self.label_info.label_encoder.transform(
            model_input + [parameters.pad_token] *
            (self.model_input_size - len(model_input)))

        result['input_size'].append(len(model_input))
        result["model_input"].append(pad_model_input)
        result["label"].append(label)

    result['model_input'] = np.array(result['model_input'], dtype=np.int32)
    return result
def make_dataset(self, shuffle=True):
    """Build a training-style dataset from ``self.dataset``.

    For every playlist, songs and tags missing from ``self.all_songs_set`` /
    ``self.all_tags_set`` are dropped.  The label is the encoded list of all
    remaining songs, tags and their artists; the model input is built from
    the songs/tags chosen by ``get_random_sampled_model_input``.  A playlist
    is skipped when it has no usable items or when nothing was chosen.

    Args:
        shuffle: When true, ``self.dataset`` is shuffled in place first.

    Returns:
        A dict with ``model_input`` (int32 array of encoded inputs padded
        with ``parameters.pad_token``), ``label`` (list of encoded label
        sequences) and ``input_size`` (list of unpadded input lengths).
    """
    if shuffle:
        random.shuffle(self.dataset)

    encoded_inputs = []
    encoded_labels = []
    input_lengths = []

    for playlist in tqdm(self.dataset, total=len(self.dataset)):
        known_songs = [
            song for song in playlist['songs'] if song in self.all_songs_set
        ]
        known_tags = [
            tag for tag in playlist['tags'] if tag in self.all_tags_set
        ]
        known_artists = util.get_artists(known_songs,
                                         self.label_info.song_artist_dict)
        full_label = known_songs + known_tags + known_artists
        if not full_label:
            continue

        picked_songs, picked_tags = self.get_random_sampled_model_input(
            known_songs, known_tags)
        if not (picked_songs or picked_tags):
            continue
        picked_artists = util.get_artists(picked_songs,
                                          self.label_info.song_artist_dict)

        raw_input = convert_model_input(picked_songs, picked_tags,
                                        picked_artists)
        pad_count = self.model_input_size - len(raw_input)
        encoder = self.label_info.label_encoder
        # Input is encoded before the label, as in the original ordering.
        encoded_input = encoder.transform(
            raw_input + [parameters.pad_token] * pad_count)
        encoded_label = self.label_info.label_encoder.transform(full_label)

        input_lengths.append(len(raw_input))
        encoded_inputs.append(encoded_input)
        encoded_labels.append(encoded_label)

    return {
        "model_input": np.array(encoded_inputs, dtype=np.int32),
        'label': encoded_labels,
        'input_size': input_lengths
    }
def make_ndcg_check_dataset(self, question):
    """Build a dataset for ranking (nDCG) evaluation on question playlists.

    The model input of each playlist is built from the question's known
    songs, tags and artists; the ground truth is the matching entry of
    ``self.answer_plylst_id_songs_tags_dict`` with the playlist id added.
    Playlists with no known song, tag or artist are skipped.

    Args:
        question: Sized iterable of playlist dicts; the keys ``songs``,
            ``tags``, ``id`` and ``updt_date`` are read.

    Returns:
        A dict with ``model_input`` (int32 array of encoded inputs padded
        with ``parameters.pad_token``) plus the per-playlist lists
        ``id_list``, ``input_size``, ``seen_songs_set``, ``seen_tags_set``,
        ``plylst_updt_date`` and ``gt``.

    Raises:
        KeyError: If a kept playlist id has no entry in
            ``self.answer_plylst_id_songs_tags_dict``.
    """
    result = {
        'model_input': [],
        'id_list': [],
        'input_size': [],
        'seen_songs_set': [],
        'seen_tags_set': [],
        'plylst_updt_date': [],
        'gt': []
    }

    for each in tqdm(question, total=len(question)):
        songs = [song for song in each['songs'] if song in self.all_songs_set]
        tags = [tag for tag in each['tags'] if tag in self.all_tags_set]
        artists = util.get_artists(songs, self.label_info.song_artist_dict)
        if not songs and not tags and not artists:
            continue

        model_input = convert_model_input(songs, tags, artists)
        pad_model_input = self.label_info.label_encoder.transform(
            model_input + [parameters.pad_token] *
            (self.model_input_size - len(model_input)))

        plylst_id = each["id"]
        # Shallow-copy the answer entry before adding 'id' so the shared
        # answer dictionary on self is not modified by building a dataset.
        gt = dict(self.answer_plylst_id_songs_tags_dict[plylst_id])
        gt['id'] = plylst_id

        result['gt'].append(gt)
        result['model_input'].append(pad_model_input)
        result['input_size'].append(len(model_input))
        result['id_list'].append(plylst_id)
        result['seen_songs_set'].append(set(songs))
        result['seen_tags_set'].append(set(tags))
        result['plylst_updt_date'].append(
            util.convert_updt_date(each["updt_date"]))

    result['model_input'] = np.array(result['model_input'], dtype=np.int32)
    return result
# NOTE(review): this fragment uses Python 2 print statements and appears to be
# the start of a longer script; only the setup steps visible here are described.
import numpy as np
import sys
import util
import clique
import rec

# DATA FILES
# Relative paths to the input data files; f_friends is not used in the
# portion of the script visible here.
f_user_artists = "data/user_artists.dat"
f_artists = "data/artists.dat"
f_friends = "data/user_friends.dat"

# Progress messages are flushed immediately after printing.
print 'loading artists in {}'.format(f_artists)
sys.stdout.flush()
artists = util.get_artists(f_artists)
# presumably maps each artist to an integer index -- verify against util
a2i = util.convert_to_ind(artists)

print 'default ordering by popularity'
sys.stdout.flush()
# Per-artist counts from the user/artist file, passed through
# util.sort_dict_dec (presumably a descending sort -- TODO confirm).
artists_ordered = util.sort_dict_dec(util.artist_to_count(a2i, f_user_artists))

print 'loading all users in {}'.format(f_user_artists)
sys.stdout.flush()
users = np.array(util.get_users(f_user_artists))
# u2i = util.convert_to_ind(users)

print 'creating cross-validation splits'
def do_reco(self, question_path, batch_size=128, title_importance=0.85, title_tag_weight=0.8):
    """Recommend songs and tags for every playlist in a question file.

    Each playlist is sent to up to two models: the songs/tags/artists model
    when it has at least one known song or tag, and the playlist-title model
    when its cleaned title is non-empty.  Scores from both models are summed
    per playlist.  Playlists with no known song, no known tag and no title
    are handled by ``self.coldstart_do_reco`` instead.

    With ``n = len(songs) + len(tags)`` for a playlist, song scores are
    weighted as follows before being summed:

    * songs/tags/artists model: ``score * n / (title_importance + n)``
    * title model: ``score * title_importance / (title_importance + n)``

    Tag scores are added unchanged from the songs/tags/artists model and
    multiplied by ``title_tag_weight`` from the title model.

    Side effects: ``self.plylst_id_seen_songs_dict``,
    ``self.plylst_id_seen_tags_dict`` and
    ``self.plylst_id_plylst_updt_date_dict`` get an entry for every playlist
    in the file.

    Args:
        question_path: Path handed to ``util.load_json``; the loaded
            playlists must provide ``songs``, ``tags``, ``plylst_title``,
            ``id`` and ``updt_date``.
        batch_size: Number of playlists passed to a model per call.
        title_importance: Weight of the title model in the song-score blend
            (see the formulas above).
        title_tag_weight: Multiplier applied to the title model's tag scores.

    Returns:
        A list of ``{"id", "songs", "tags"}`` dicts holding at most 100
        songs and 10 tags each, ordered by descending total score.
        Model-scored playlists come first, cold-start playlists last.
    """
    answers = []
    songs_tags_artists_data = {'model_input': [], 'plylst_id_list': []}
    plylst_title_data = {'model_input': [], 'plylst_id_list': []}
    coldstart_plylst_id_list = []
    plylst_id_songs_tags_num = {}

    question = util.load_json(question_path)
    for each in tqdm(question, total=len(question), desc='Preprocess'):
        songs = [song for song in each['songs'] if song in self.all_songs_set]
        tags = [tag for tag in each['tags'] if tag in self.all_tags_set]
        artists = util.get_artists(songs, self.label_info.song_artist_dict)
        plylst_title = util.remove_special_char(each['plylst_title'])
        plylst_id = each['id']
        plylst_updt_date = each['updt_date']

        self.plylst_id_seen_songs_dict[plylst_id] = set(songs)
        self.plylst_id_seen_tags_dict[plylst_id] = set(tags)
        self.plylst_id_plylst_updt_date_dict[plylst_id] = util.convert_updt_date(plylst_updt_date)
        plylst_id_songs_tags_num[plylst_id] = len(songs) + len(tags)

        if songs or tags:
            model_input = songs_tags_artists_util.convert_model_input(
                songs, tags, artists, self.label_info.label_encoder)
            model_input += [self.songs_tags_artists_model_pad_idx] * (
                parameters.songs_tags_artists_model_max_sequence_length - len(model_input))
            songs_tags_artists_data['model_input'].append(model_input)
            songs_tags_artists_data['plylst_id_list'].append(plylst_id)

        if plylst_title:
            model_input = plylst_title_util.convert_model_input(
                plylst_title, self.sp, parameters.title_model_max_sequence_length)
            model_input += [self.plylst_title_model_pad_idx] * (
                parameters.title_model_max_sequence_length - len(model_input))
            plylst_title_data['model_input'].append(model_input)
            plylst_title_data['plylst_id_list'].append(plylst_id)

        if not songs and not tags and not plylst_title:
            coldstart_plylst_id_list.append(plylst_id)

    total_plylst_id_reco_song_score_dict = {}
    total_plylst_id_reco_tag_score_dict = {}

    def _batches(data, desc):
        """Yield (model_input, plylst_id_list) slices of at most batch_size."""
        inputs = data['model_input']
        plylst_ids = data['plylst_id_list']
        # Stepping the range avoids a float ceil and the name `iter`.
        for start in tqdm(range(0, len(inputs), batch_size), desc=desc):
            stop = start + batch_size
            yield inputs[start:stop], plylst_ids[start:stop]

    def _accumulate(totals, batch_scores, contribution):
        """Add contribution(plylst_id, score) of every batch item to totals."""
        for plylst_id in batch_scores:
            bucket = totals.setdefault(plylst_id, {})
            for item, score in batch_scores[plylst_id].items():
                bucket[item] = bucket.get(item, 0) + contribution(plylst_id, score)

    def _top_items(item_scores, limit):
        """Return the `limit` best-scoring items, best first."""
        ranked = sorted(item_scores.items(), key=lambda pair: pair[1], reverse=True)
        return [item for item, _ in ranked[:limit]]

    # The arithmetic below keeps the original operation order so the float
    # results (and therefore the rankings) are unchanged.
    def _songs_model_song_score(plylst_id, score):
        num = plylst_id_songs_tags_num[plylst_id]
        return score * num / (title_importance + num)

    def _title_model_song_score(plylst_id, score):
        num = plylst_id_songs_tags_num[plylst_id]
        return score * title_importance / (title_importance + num)

    def _songs_model_tag_score(plylst_id, score):
        return score

    def _title_model_tag_score(plylst_id, score):
        return score * title_tag_weight

    # do songs_tags_artists_model
    for batch_input, batch_ids in _batches(songs_tags_artists_data, 'songs_tags_artists_model'):
        plylst_id_reco_song_score_dict, plylst_id_reco_tag_score_dict = self.songs_tags_do_reco(
            self.songs_tags_artists_model,
            model_input=batch_input,
            plylst_id_list=batch_ids)
        _accumulate(total_plylst_id_reco_song_score_dict,
                    plylst_id_reco_song_score_dict, _songs_model_song_score)
        _accumulate(total_plylst_id_reco_tag_score_dict,
                    plylst_id_reco_tag_score_dict, _songs_model_tag_score)

    # do plylst_title_model
    for batch_input, batch_ids in _batches(plylst_title_data, 'plylst_title_model'):
        plylst_id_reco_song_score_dict, plylst_id_reco_tag_score_dict = self.songs_tags_do_reco(
            self.plylst_title_model,
            model_input=batch_input,
            plylst_id_list=batch_ids)
        _accumulate(total_plylst_id_reco_song_score_dict,
                    plylst_id_reco_song_score_dict, _title_model_song_score)
        _accumulate(total_plylst_id_reco_tag_score_dict,
                    plylst_id_reco_tag_score_dict, _title_model_tag_score)

    # Combine the scores of both models into the final recommendation.
    for plylst_id, song_scores in total_plylst_id_reco_song_score_dict.items():
        # A playlist with song scores but no tag scores gets an empty tag
        # list instead of raising KeyError.
        tag_scores = total_plylst_id_reco_tag_score_dict.get(plylst_id, {})
        answers.append({
            "id": plylst_id,
            "songs": _top_items(song_scores, 100),
            "tags": _top_items(tag_scores, 10),
        })

    # cold_start
    for plylst_id in tqdm(coldstart_plylst_id_list, total=len(coldstart_plylst_id_list), desc='coldstart_reco'):
        reco_songs, reco_tags = self.coldstart_do_reco(plylst_id)
        answers.append({
            "id": plylst_id,
            "songs": reco_songs,
            "tags": reco_tags,
        })

    return answers