def make_candidate_dataset(self, shuffle=True):
        """Build model inputs plus held-out ground truth from ``self.dataset``.

        For every playlist only the songs/tags contained in
        ``self.all_songs_set`` / ``self.all_tags_set`` are considered.
        ``self.get_random_sampled_model_input`` picks the songs and tags that
        are fed to the model; the known songs/tags that were not picked
        become the ground truth.

        A playlist is skipped when it has no known songs, tags or artists,
        when nothing was picked as input, or when the ground truth is missing
        either songs or tags.

        Args:
            shuffle: When true, ``self.dataset`` is shuffled in place first.

        Returns:
            dict with per-playlist lists under ``input_size``,
            ``seen_songs_set``, ``seen_tags_set``, ``plylst_updt_date``,
            ``gt`` and ``id_list``; ``model_input`` holds the padded,
            label-encoded inputs as an ``np.int32`` array.
        """
        keys = ('model_input', 'input_size', 'seen_songs_set', 'seen_tags_set',
                'plylst_updt_date', 'gt', 'id_list')
        result = {key: [] for key in keys}

        if shuffle:
            random.shuffle(self.dataset)

        for playlist in tqdm(self.dataset, total=len(self.dataset)):
            known_songs = [
                song for song in playlist['songs']
                if song in self.all_songs_set
            ]
            known_tags = [
                tag for tag in playlist['tags'] if tag in self.all_tags_set
            ]
            known_artists = util.get_artists(
                known_songs, self.label_info.song_artist_dict)

            # Nothing usable in this playlist.
            if not known_songs + known_tags + known_artists:
                continue

            input_songs, input_tags = self.get_random_sampled_model_input(
                known_songs, known_tags)
            if not (input_songs or input_tags):
                continue
            input_artists = util.get_artists(
                input_songs, self.label_info.song_artist_dict)

            tokens = convert_model_input(input_songs, input_tags,
                                         input_artists)
            # Right-pad with the pad token up to the fixed model input size.
            pad_count = self.model_input_size - len(tokens)
            encoded_input = self.label_info.label_encoder.transform(
                tokens + [parameters.pad_token] * pad_count)

            seen_songs = set(input_songs)
            seen_tags = set(input_tags)

            # Ground truth = known items that were not given to the model.
            gt = {
                'songs': list(set(known_songs) - seen_songs),
                'tags': list(set(known_tags) - seen_tags),
                'id': playlist["id"]
            }
            if not (gt['songs'] and gt['tags']):
                continue

            result['gt'].append(gt)
            result['input_size'].append(len(tokens))
            result['id_list'].append(playlist["id"])
            result["model_input"].append(encoded_input)
            result['seen_songs_set'].append(seen_songs)
            result['seen_tags_set'].append(seen_tags)
            result['plylst_updt_date'].append(
                util.convert_updt_date(playlist["updt_date"]))

        result['model_input'] = np.array(result['model_input'], dtype=np.int32)
        return result
    def make_loss_check_dataset(self, question):
        """Build model inputs and labels for loss evaluation on ``question``.

        The model input is built from the question playlist's known songs,
        tags and their artists. The label is the concatenation of the
        question items and the matching answer items taken from
        ``self.answer_plylst_id_songs_tags_dict`` (keyed by playlist id),
        label-encoded. Only items contained in ``self.all_songs_set`` /
        ``self.all_tags_set`` are used.

        Playlists whose question part has neither known songs nor known tags
        are skipped.

        Args:
            question: Iterable of playlist dicts with ``songs``, ``tags`` and
                ``id`` entries.

        Returns:
            dict with ``model_input`` (``np.int32`` array of padded,
            label-encoded inputs), ``label`` (list of encoded labels) and
            ``input_size`` (list of unpadded input lengths).

        Raises:
            KeyError: If a playlist id has no entry in
                ``self.answer_plylst_id_songs_tags_dict``.
        """
        result = {"model_input": [], 'label': [], 'input_size': []}

        for each in tqdm(question, total=len(question)):
            songs = [
                song for song in each['songs'] if song in self.all_songs_set
            ]
            tags = [tag for tag in each['tags'] if tag in self.all_tags_set]
            artists = util.get_artists(songs, self.label_info.song_artist_dict)
            if not songs and not tags:
                continue

            # Fetch the answer entry once instead of once per field.
            answer = self.answer_plylst_id_songs_tags_dict[each['id']]
            answer_songs = [
                song for song in answer['songs']
                if song in self.all_songs_set
            ]
            answer_tags = [
                tag for tag in answer['tags'] if tag in self.all_tags_set
            ]
            answer_artists = util.get_artists(answer_songs,
                                              self.label_info.song_artist_dict)

            # songs or tags is non-empty at this point, so the label cannot
            # be empty and needs no separate emptiness check.
            label = self.label_info.label_encoder.transform(
                songs + answer_songs + tags + answer_tags + artists +
                answer_artists)

            model_input = convert_model_input(songs, tags, artists)
            # Right-pad with the pad token up to the fixed model input size.
            pad_model_input = self.label_info.label_encoder.transform(
                model_input + [parameters.pad_token] *
                (self.model_input_size - len(model_input)))

            result['input_size'].append(len(model_input))
            result["model_input"].append(pad_model_input)
            result["label"].append(label)

        result['model_input'] = np.array(result['model_input'], dtype=np.int32)
        return result
    def make_dataset(self, shuffle=True):
        """Build training inputs and labels from ``self.dataset``.

        The label of a playlist is the label-encoded concatenation of its
        known songs, tags and artists. The model input is built from the
        songs/tags returned by ``self.get_random_sampled_model_input`` plus
        the artists of those songs, padded to ``self.model_input_size``.

        Playlists with an empty label, or for which nothing was sampled, are
        skipped.

        Args:
            shuffle: When true, ``self.dataset`` is shuffled in place first.

        Returns:
            dict with ``model_input`` (``np.int32`` array), ``label`` (list of
            encoded labels) and ``input_size`` (list of unpadded lengths).
        """
        result = {key: [] for key in ('model_input', 'label', 'input_size')}

        if shuffle:
            random.shuffle(self.dataset)

        for playlist in tqdm(self.dataset, total=len(self.dataset)):
            known_songs = [
                song for song in playlist['songs']
                if song in self.all_songs_set
            ]
            known_tags = [
                tag for tag in playlist['tags'] if tag in self.all_tags_set
            ]
            known_artists = util.get_artists(
                known_songs, self.label_info.song_artist_dict)

            raw_label = known_songs + known_tags + known_artists
            if not raw_label:
                continue

            input_songs, input_tags = self.get_random_sampled_model_input(
                known_songs, known_tags)
            if not (input_songs or input_tags):
                continue
            input_artists = util.get_artists(
                input_songs, self.label_info.song_artist_dict)

            tokens = convert_model_input(input_songs, input_tags,
                                         input_artists)
            # Right-pad with the pad token up to the fixed model input size.
            pad_count = self.model_input_size - len(tokens)
            encoded_input = self.label_info.label_encoder.transform(
                tokens + [parameters.pad_token] * pad_count)
            encoded_label = self.label_info.label_encoder.transform(raw_label)

            result['input_size'].append(len(tokens))
            result["model_input"].append(encoded_input)
            result["label"].append(encoded_label)

        result['model_input'] = np.array(result['model_input'], dtype=np.int32)
        return result
    def make_ndcg_check_dataset(self, question):
        """Build model inputs and ground truth for nDCG evaluation.

        The model input of each question playlist is built from its known
        songs, tags and the artists of those songs. The ground truth is the
        playlist's entry in ``self.answer_plylst_id_songs_tags_dict``.
        Playlists with no known songs, tags or artists are skipped.

        Args:
            question: Iterable of playlist dicts with ``songs``, ``tags``,
                ``id`` and ``updt_date`` entries.

        Returns:
            dict with per-playlist lists under ``id_list``, ``input_size``,
            ``seen_songs_set``, ``seen_tags_set``, ``plylst_updt_date`` and
            ``gt``; ``model_input`` holds the padded, label-encoded inputs as
            an ``np.int32`` array.
        """
        keys = ('model_input', 'id_list', 'input_size', 'seen_songs_set',
                'seen_tags_set', 'plylst_updt_date', 'gt')
        result = {key: [] for key in keys}

        for playlist in tqdm(question, total=len(question)):
            known_songs = [
                song for song in playlist['songs']
                if song in self.all_songs_set
            ]
            known_tags = [
                tag for tag in playlist['tags'] if tag in self.all_tags_set
            ]
            known_artists = util.get_artists(
                known_songs, self.label_info.song_artist_dict)

            if not (known_songs or known_tags or known_artists):
                continue

            tokens = convert_model_input(known_songs, known_tags,
                                         known_artists)
            # Right-pad with the pad token up to the fixed model input size.
            pad_count = self.model_input_size - len(tokens)
            encoded_input = self.label_info.label_encoder.transform(
                tokens + [parameters.pad_token] * pad_count)

            # NOTE: this is the dict object stored in
            # self.answer_plylst_id_songs_tags_dict, so the 'id' key is
            # written into that stored entry as well.
            gt = self.answer_plylst_id_songs_tags_dict[playlist["id"]]
            gt['id'] = playlist["id"]

            result['gt'].append(gt)
            result['model_input'].append(encoded_input)
            result['input_size'].append(len(tokens))
            result['id_list'].append(playlist["id"])
            result['seen_songs_set'].append(set(known_songs))
            result['seen_tags_set'].append(set(known_tags))
            result['plylst_updt_date'].append(
                util.convert_updt_date(playlist["updt_date"]))

        result['model_input'] = np.array(result['model_input'], dtype=np.int32)
        return result
Example #5
0
import numpy as np
import sys
import util
import clique
import rec


# DATA FILES
f_user_artists = "data/user_artists.dat"
f_artists = "data/artists.dat"
f_friends = "data/user_friends.dat"


# Progress messages use print(...) with a single argument, which prints the
# same text on Python 2 and Python 3; stdout is flushed after each message.
print('loading artists in {}'.format(f_artists))
sys.stdout.flush()
artists = util.get_artists(f_artists)
a2i = util.convert_to_ind(artists)


print('default ordering by popularity')
sys.stdout.flush()
artists_ordered = util.sort_dict_dec(util.artist_to_count(a2i, f_user_artists))


print('loading all users in {}'.format(f_user_artists))
sys.stdout.flush()
users = np.array(util.get_users(f_user_artists))
# u2i = util.convert_to_ind(users)


print('creating cross-validation splits')
    def do_reco(self, question_path, batch_size=128, title_importance=0.85, title_tag_weight=0.8):
        """Recommend songs and tags for every playlist in a question file.

        Each playlist is routed by what it contains after filtering to known
        songs/tags and cleaning the title:

        * known songs or tags -> scored by ``self.songs_tags_artists_model``
        * non-empty title     -> scored by ``self.plylst_title_model``
        * none of the above   -> handled by ``self.coldstart_do_reco``

        A playlist may be scored by both models; the scores are then summed.
        With ``n`` the number of known songs plus tags of a playlist, song
        scores are weighted by ``n / (title_importance + n)`` for the
        songs/tags/artists model and by
        ``title_importance / (title_importance + n)`` for the title model.
        Tag scores are taken as-is from the songs/tags/artists model and
        multiplied by ``title_tag_weight`` for the title model.

        As a side effect ``self.plylst_id_seen_songs_dict``,
        ``self.plylst_id_seen_tags_dict`` and
        ``self.plylst_id_plylst_updt_date_dict`` are filled for every
        playlist in the file.

        Args:
            question_path: Path passed to ``util.load_json``.
            batch_size: Number of playlists per model call.
            title_importance: Weight of the title model relative to the
                number of known songs and tags (see above).
            title_tag_weight: Multiplier for tag scores of the title model.

        Returns:
            List of dicts with ``id``, ``songs`` (at most 100, best first)
            and ``tags`` (at most 10, best first); model-scored playlists
            come first, cold-start playlists last.
        """
        songs_tags_artists_data = {'model_input': [], 'plylst_id_list': []}
        plylst_title_data = {'model_input': [], 'plylst_id_list': []}
        coldstart_plylst_id_list = []

        plylst_id_songs_tags_num = {}

        question = util.load_json(question_path)
        for each in tqdm(question, total=len(question), desc='Preprocess'):
            songs = [song for song in each['songs'] if song in self.all_songs_set]
            tags = [tag for tag in each['tags'] if tag in self.all_tags_set]
            artists = util.get_artists(songs, self.label_info.song_artist_dict)
            plylst_title = util.remove_special_char(each['plylst_title'])
            plylst_id = each['id']
            plylst_updt_date = each['updt_date']

            self.plylst_id_seen_songs_dict[plylst_id] = set(songs)
            self.plylst_id_seen_tags_dict[plylst_id] = set(tags)
            self.plylst_id_plylst_updt_date_dict[plylst_id] = util.convert_updt_date(plylst_updt_date)

            plylst_id_songs_tags_num[plylst_id] = len(songs) + len(tags)
            if songs or tags:
                model_input = songs_tags_artists_util.convert_model_input(songs, tags, artists,
                                                                          self.label_info.label_encoder)
                model_input += [self.songs_tags_artists_model_pad_idx] * (
                        parameters.songs_tags_artists_model_max_sequence_length - len(model_input))
                songs_tags_artists_data['model_input'].append(model_input)
                songs_tags_artists_data['plylst_id_list'].append(plylst_id)

            if plylst_title:
                model_input = plylst_title_util.convert_model_input(plylst_title, self.sp,
                                                                    parameters.title_model_max_sequence_length)
                model_input += [self.plylst_title_model_pad_idx] * (
                        parameters.title_model_max_sequence_length - len(model_input))
                plylst_title_data['model_input'].append(model_input)
                plylst_title_data['plylst_id_list'].append(plylst_id)

            if not songs and not tags and not plylst_title:
                coldstart_plylst_id_list.append(plylst_id)

        total_plylst_id_reco_song_score_dict = {}
        total_plylst_id_reco_tag_score_dict = {}

        def accumulate(total, batch_scores, weigh):
            """Add ``weigh(plylst_id, score)`` of every batch score into ``total``."""
            for plylst_id, item_scores in batch_scores.items():
                merged = total.setdefault(plylst_id, {})
                for item, score in item_scores.items():
                    merged[item] = merged.get(item, 0) + weigh(plylst_id, score)

        def run_model(model, data, desc, weigh_song, weigh_tag):
            """Score ``data`` with ``model`` batch by batch and accumulate the results."""
            num_batches = int(np.ceil(len(data['model_input']) / batch_size))
            for i in tqdm(range(num_batches), desc=desc):
                batch = slice(i * batch_size, (i + 1) * batch_size)
                song_scores, tag_scores = self.songs_tags_do_reco(
                    model,
                    model_input=data['model_input'][batch],
                    plylst_id_list=data['plylst_id_list'][batch])
                accumulate(total_plylst_id_reco_song_score_dict, song_scores, weigh_song)
                accumulate(total_plylst_id_reco_tag_score_dict, tag_scores, weigh_tag)

        # do songs_tags_artists_model
        run_model(
            self.songs_tags_artists_model,
            songs_tags_artists_data,
            'songs_tags_artists_model',
            weigh_song=lambda plylst_id, score: score * plylst_id_songs_tags_num[plylst_id] / (
                    title_importance + plylst_id_songs_tags_num[plylst_id]),
            weigh_tag=lambda plylst_id, score: score)

        # do plylst_title_model
        run_model(
            self.plylst_title_model,
            plylst_title_data,
            'plylst_title_model',
            weigh_song=lambda plylst_id, score: score * title_importance / (
                    title_importance + plylst_id_songs_tags_num[plylst_id]),
            weigh_tag=lambda plylst_id, score: score * title_tag_weight)

        # Recommend by combining the scores of the two models.
        answers = []
        for plylst_id, song_scores in total_plylst_id_reco_song_score_dict.items():
            # Fall back to no tags if a playlist received song scores only.
            tag_scores = total_plylst_id_reco_tag_score_dict.get(plylst_id, {})
            ranked_songs = sorted(song_scores.items(), key=lambda x: x[1], reverse=True)[:100]
            ranked_tags = sorted(tag_scores.items(), key=lambda x: x[1], reverse=True)[:10]
            answers.append({
                "id": plylst_id,
                "songs": [song for song, _ in ranked_songs],
                "tags": [tag for tag, _ in ranked_tags],
            })

        # cold_start
        for plylst_id in tqdm(coldstart_plylst_id_list, total=len(coldstart_plylst_id_list), desc='coldstart_reco'):
            reco_songs, reco_tags = self.coldstart_do_reco(plylst_id)
            answers.append({
                "id": plylst_id,
                "songs": reco_songs,
                "tags": reco_tags,
            })

        return answers