def make_dataset(self, shuffle=True):
    """Build padded title inputs and encoded labels from ``self.dataset``.

    Playlists whose title is empty after ``util.remove_special_char`` are
    skipped. The label of a playlist is its songs followed by its tags,
    restricted to ``self.all_songs_set`` / ``self.all_tags_set`` and passed
    through ``self.label_info.label_encoder.transform``.

    Args:
        shuffle: When true, ``self.dataset`` is shuffled in place first.

    Returns:
        A dict with two parallel lists: ``"model_input"`` (title ids padded
        with the pad id up to ``self.model_input_size``) and ``"label"``.
    """
    pad_idx = self.sentencepiece.piece_to_id(parameters.pad_token)
    if shuffle:
        # In-place: the order of self.dataset itself changes.
        random.shuffle(self.dataset)

    model_inputs = []
    labels = []
    for playlist in tqdm(self.dataset, total=len(self.dataset)):
        title = util.remove_special_char(playlist['plylst_title'])
        if not title:
            continue

        known_songs = [song for song in playlist['songs'] if song in self.all_songs_set]
        known_tags = [tag for tag in playlist['tags'] if tag in self.all_tags_set]

        token_ids = convert_model_input(title, self.sentencepiece, self.model_input_size)
        padding = [pad_idx] * (self.model_input_size - len(token_ids))
        padded_ids = token_ids + padding

        encoded_label = self.label_info.label_encoder.transform(known_songs + known_tags)

        model_inputs.append(padded_ids)
        labels.append(encoded_label)

    return {"model_input": model_inputs, 'label': labels}
def make_ndcg_check_dataset(self, question):
    """Build the per-playlist inputs and ground truth used for an nDCG check.

    Playlists whose title is empty after ``util.remove_special_char`` are
    skipped.

    Args:
        question: Sized iterable of playlist dicts; the keys read here are
            ``'plylst_title'``, ``'id'``, ``'songs'``, ``'tags'`` and
            ``'updt_date'``.

    Returns:
        A dict of parallel lists keyed by ``'model_input'``, ``'id_list'``,
        ``'seen_songs_set'``, ``'seen_tags_set'``, ``'plylst_updt_date'``
        and ``'gt'``.
    """
    pad_idx = self.sentencepiece.piece_to_id(parameters.pad_token)
    result = {
        'model_input': [],
        'id_list': [],
        'seen_songs_set': [],
        'seen_tags_set': [],
        'plylst_updt_date': [],
        'gt': [],
    }

    for playlist in tqdm(question, total=len(question)):
        title = util.remove_special_char(playlist['plylst_title'])
        if not title:
            continue

        token_ids = convert_model_input(title, self.sentencepiece, self.model_input_size)
        padded_ids = token_ids + [pad_idx] * (self.model_input_size - len(token_ids))

        plylst_id = playlist["id"]
        # NOTE(review): no copy is made, so the 'id' key is written into the
        # entry stored in self.answer_plylst_id_songs_tags_dict itself.
        ground_truth = self.answer_plylst_id_songs_tags_dict[plylst_id]
        ground_truth['id'] = plylst_id

        seen_songs = {song for song in playlist['songs'] if song in self.all_songs_set}
        seen_tags = {tag for tag in playlist['tags'] if tag in self.all_tags_set}

        result['gt'].append(ground_truth)
        result['model_input'].append(padded_ids)
        result['id_list'].append(plylst_id)
        result['seen_songs_set'].append(seen_songs)
        result['seen_tags_set'].append(seen_tags)
        result['plylst_updt_date'].append(util.convert_updt_date(playlist["updt_date"]))

    return result
def make_loss_check_dataset(self, question):
    """Build padded title inputs and encoded labels for a loss check.

    The label of a playlist is, in this order: its own known songs, the
    known songs of its answer entry, its own known tags, and the known tags
    of its answer entry ("known" meaning present in ``self.all_songs_set`` /
    ``self.all_tags_set``). Playlists with an empty cleaned title or an
    empty label are skipped.

    Args:
        question: Sized iterable of playlist dicts; the keys read here are
            ``'plylst_title'``, ``'id'``, ``'songs'`` and ``'tags'``.

    Returns:
        A dict with two parallel lists, ``"model_input"`` and ``'label'``.
    """
    pad_idx = self.sentencepiece.piece_to_id(parameters.pad_token)
    model_inputs = []
    labels = []

    for playlist in tqdm(question, total=len(question)):
        title = util.remove_special_char(playlist['plylst_title'])
        if not title:
            continue

        seen_songs = [song for song in playlist['songs'] if song in self.all_songs_set]
        seen_tags = [tag for tag in playlist['tags'] if tag in self.all_tags_set]

        answer = self.answer_plylst_id_songs_tags_dict[playlist['id']]
        answer_songs = [song for song in answer['songs'] if song in self.all_songs_set]
        answer_tags = [tag for tag in answer['tags'] if tag in self.all_tags_set]

        raw_label = seen_songs + answer_songs + seen_tags + answer_tags
        if not raw_label:
            continue
        encoded_label = self.label_info.label_encoder.transform(raw_label)

        token_ids = convert_model_input(title, self.sentencepiece, self.model_input_size)
        padded_ids = token_ids + [pad_idx] * (self.model_input_size - len(token_ids))

        model_inputs.append(padded_ids)
        labels.append(encoded_label)

    return {"model_input": model_inputs, 'label': labels}
def dump_plylst_title(dataset, fout):
    """Write every non-empty cleaned playlist title to ``fout``, one per line.

    Titles are cleaned with ``util.remove_special_char``; titles that come
    back empty are not written. The file is opened for writing as UTF-8 with
    encoding errors ignored.

    Args:
        dataset: Iterable of playlist dicts with a ``'plylst_title'`` key.
        fout: Path of the output file.
    """
    with open(fout, 'w', encoding='utf-8', errors='ignore') as out_file:
        cleaned_titles = (
            util.remove_special_char(playlist['plylst_title']) for playlist in dataset
        )
        out_file.writelines(title + '\n' for title in cleaned_titles if title)
def make_pre_train_dataset(self, shuffle=True):
    """Build the masked-title dataset used for pre-training.

    Each cleaned title is converted with ``convert_model_input`` and then
    passed to ``make_mask_dataset``. Playlists with an empty cleaned title,
    or for which ``make_mask_dataset`` returns an empty model input, are
    skipped.

    Args:
        shuffle: When true, ``self.dataset`` is shuffled in place first.

    Returns:
        A dict of parallel lists: ``"model_input"`` (padded with the pad id
        to ``self.model_input_size``), ``'mask_label'`` (stored as returned
        by ``make_mask_dataset``) and ``'boolean_mask'`` (padded with
        ``False`` to ``self.model_input_size``).
    """
    pad_idx = self.sentencepiece.piece_to_id(parameters.pad_token)
    if shuffle:
        # In-place: the order of self.dataset itself changes.
        random.shuffle(self.dataset)

    result = {"model_input": [], 'mask_label': [], 'boolean_mask': []}
    for playlist in tqdm(self.dataset, total=len(self.dataset)):
        title = util.remove_special_char(playlist['plylst_title'])
        if not title:
            continue

        token_ids = convert_model_input(title, self.sentencepiece, self.model_input_size)
        masked_ids, mask_label, boolean_mask = make_mask_dataset(token_ids, self.sentencepiece)
        if not masked_ids:
            continue

        id_padding = [pad_idx] * (self.model_input_size - len(masked_ids))
        mask_padding = [False] * (self.model_input_size - len(boolean_mask))

        result["model_input"].append(masked_ids + id_padding)
        result["mask_label"].append(mask_label)
        result["boolean_mask"].append(boolean_mask + mask_padding)

    return result
def do_reco(self, question_path, batch_size=128, title_importance=0.85, title_tag_weight=0.8):
    """Recommend songs and tags for every playlist in a question file.

    Each playlist is routed to up to two models: the songs/tags/artists
    model when it has at least one known song or tag, and the title model
    when its cleaned title is non-empty. Playlists with neither are handled
    by ``self.coldstart_do_reco``. Scores from both models are summed per
    playlist and per item with these weights, where ``n`` is the number of
    known songs plus known tags of the playlist:

    * songs/tags/artists model, songs: ``score * n / (title_importance + n)``
    * songs/tags/artists model, tags:  ``score``
    * title model, songs: ``score * title_importance / (title_importance + n)``
    * title model, tags:  ``score * title_tag_weight``

    Side effects: fills ``self.plylst_id_seen_songs_dict``,
    ``self.plylst_id_seen_tags_dict`` and
    ``self.plylst_id_plylst_updt_date_dict`` for every playlist read.

    Args:
        question_path: Path handed to ``util.load_json``.
        batch_size: Number of playlists sent to a model per call.
        title_importance: Weight of the title model's song scores relative
            to the number of known songs/tags (see formulas above).
        title_tag_weight: Multiplier applied to the title model's tag scores.

    Returns:
        A list of ``{"id", "songs", "tags"}`` dicts, with at most 100 songs
        and 10 tags (highest score first) for model-scored playlists,
        followed by the cold-start playlists.
    """
    songs_tags_artists_data = {'model_input': [], 'plylst_id_list': []}
    plylst_title_data = {'model_input': [], 'plylst_id_list': []}
    coldstart_plylst_id_list = []
    plylst_id_songs_tags_num = {}

    question = util.load_json(question_path)
    for each in tqdm(question, total=len(question), desc='Preprocess'):
        songs = [song for song in each['songs'] if song in self.all_songs_set]
        tags = [tag for tag in each['tags'] if tag in self.all_tags_set]
        artists = util.get_artists(songs, self.label_info.song_artist_dict)
        plylst_title = util.remove_special_char(each['plylst_title'])
        plylst_id = each['id']

        self.plylst_id_seen_songs_dict[plylst_id] = set(songs)
        self.plylst_id_seen_tags_dict[plylst_id] = set(tags)
        self.plylst_id_plylst_updt_date_dict[plylst_id] = util.convert_updt_date(each['updt_date'])
        plylst_id_songs_tags_num[plylst_id] = len(songs) + len(tags)

        if songs or tags:
            model_input = songs_tags_artists_util.convert_model_input(
                songs, tags, artists, self.label_info.label_encoder)
            model_input += [self.songs_tags_artists_model_pad_idx] * (
                parameters.songs_tags_artists_model_max_sequence_length - len(model_input))
            songs_tags_artists_data['model_input'].append(model_input)
            songs_tags_artists_data['plylst_id_list'].append(plylst_id)

        if plylst_title:
            model_input = plylst_title_util.convert_model_input(
                plylst_title, self.sp, parameters.title_model_max_sequence_length)
            model_input += [self.plylst_title_model_pad_idx] * (
                parameters.title_model_max_sequence_length - len(model_input))
            plylst_title_data['model_input'].append(model_input)
            plylst_title_data['plylst_id_list'].append(plylst_id)

        if not songs and not tags and not plylst_title:
            coldstart_plylst_id_list.append(plylst_id)

    total_plylst_id_reco_song_score_dict = {}
    total_plylst_id_reco_tag_score_dict = {}

    def accumulate(total_scores, batch_scores, weigh):
        """Add ``weigh(plylst_id, score)`` of every batch score into ``total_scores``."""
        for plylst_id in batch_scores:
            playlist_totals = total_scores.setdefault(plylst_id, {})
            for item, score in batch_scores[plylst_id].items():
                playlist_totals[item] = playlist_totals.get(item, 0) + weigh(plylst_id, score)

    def run_model(model, data, desc, weigh_song, weigh_tag):
        """Score ``data`` with ``model`` in batches and merge into the totals."""
        for start in tqdm(range(0, len(data['model_input']), batch_size), desc=desc):
            stop = start + batch_size
            song_scores, tag_scores = self.songs_tags_do_reco(
                model,
                model_input=data['model_input'][start:stop],
                plylst_id_list=data['plylst_id_list'][start:stop])
            accumulate(total_plylst_id_reco_song_score_dict, song_scores, weigh_song)
            accumulate(total_plylst_id_reco_tag_score_dict, tag_scores, weigh_tag)

    def top_items(item_scores, limit):
        """Return up to ``limit`` items of ``item_scores``, highest score first."""
        ranked = sorted(item_scores.items(), key=lambda pair: pair[1], reverse=True)
        return [item for item, _ in ranked[:limit]]

    # The weight functions keep the original left-to-right arithmetic
    # (multiply, then divide) so the resulting floats are unchanged.
    def songs_model_song_weight(plylst_id, score):
        known = plylst_id_songs_tags_num[plylst_id]
        return score * known / (title_importance + known)

    def title_model_song_weight(plylst_id, score):
        known = plylst_id_songs_tags_num[plylst_id]
        return score * title_importance / (title_importance + known)

    # do songs_tags_artists_model
    run_model(
        self.songs_tags_artists_model,
        songs_tags_artists_data,
        'songs_tags_artists_model',
        songs_model_song_weight,
        lambda plylst_id, score: score)

    # do plylst_title_model
    run_model(
        self.plylst_title_model,
        plylst_title_data,
        'plylst_title_model',
        title_model_song_weight,
        lambda plylst_id, score: score * title_tag_weight)

    # Combine the two models' scores into the final recommendation.
    answers = []
    for plylst_id, song_scores in total_plylst_id_reco_song_score_dict.items():
        # A playlist with song scores but no tag scores gets an empty tag
        # list instead of raising KeyError.
        tag_scores = total_plylst_id_reco_tag_score_dict.get(plylst_id, {})
        answers.append({
            "id": plylst_id,
            "songs": top_items(song_scores, 100),
            "tags": top_items(tag_scores, 10),
        })

    # cold_start
    for plylst_id in tqdm(coldstart_plylst_id_list, total=len(coldstart_plylst_id_list),
                          desc='coldstart_reco'):
        reco_songs, reco_tags = self.coldstart_do_reco(plylst_id)
        answers.append({
            "id": plylst_id,
            "songs": reco_songs,
            "tags": reco_tags,
        })

    return answers