Example 1
 def fit_title2rec(self, titles, ID):
     keys = [i + " " + t for t, i in zip(titles, ID)]
     print('Fit title2rec...')
     vectors = list(map(self.fasttext.wv.get_vector, titles))
     self.t2r = WordEmbeddingsKeyedVectors(vector_size=100)
     self.t2r.add(keys, vectors)
     print('done.')
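
A minimal, self-contained sketch of the same pattern, assuming gensim 3.x (where WordEmbeddingsKeyedVectors is still available); the keys and vectors below are made up for illustration:

import numpy as np
from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors

keys = ["(1) study beats", "(2) workout mix"]   # hypothetical "(id) title" keys
vectors = [np.random.rand(100).astype(np.float32) for _ in keys]

t2r = WordEmbeddingsKeyedVectors(vector_size=100)
t2r.add(keys, vectors)                          # register each key with its vector
print(t2r.most_similar("(1) study beats", topn=1))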
Example 2
    def song_based(self, mode='s2v', by='mean', keyedvector=True):
        if mode == 's2v':
            if not self.s2v:
                print("Song2Vec does not exist.\nRun make_s2v first.")
                return
        elif mode == 'd2v':
            if not self.d2v:
                print("Doc2Vec does not exist.\nRun make_d2v first.")
                return
        else:
            print("mode must be 's2v' or 'd2v'")
            return

        if by not in ['mean', 'sum']:
            raise RuntimeError("'by' must be 'mean' or 'sum'")

        ply_id = []
        ply_vec = []

        for p in tqdm(self.data):
            if by == 'mean':
                tmp = []
            else:
                tmp = 0
            for song in p['songs']:
                try:
                    if by == 'mean':
                        if mode == 's2v':
                            tmp.append(self.s2v.wv.get_vector(str(song)))
                        else:
                            tmp.append(self.d2v.wv.get_vector(str(song)))
                    else:
                        if mode == 's2v':
                            tmp += self.s2v.wv.get_vector(str(song))
                        else:
                            tmp += self.d2v.wv.get_vector(str(song))
                except KeyError:
                    pass
            if by == 'mean':
                if tmp != []:
                    ply_id.append('(' + str(p['id']) + ') ' +
                                  p['plylst_title'])
                    ply_vec.append(np.mean(tmp, axis=0))
            else:
                if type(tmp) != int:
                    ply_id.append('(' + str(p['id']) + ') ' +
                                  p['plylst_title'])
                    ply_vec.append(tmp)

        print("Original data length: ", len(self.data))
        print("Embedded data length: ", len(ply_id))

        if not keyedvector:
            return ply_id, ply_vec

        out = WordEmbeddingsKeyedVectors(vector_size=100)
        out.add(ply_id, ply_vec)

        return out
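
The returned object is keyed by "(id) playlist title", so nearby playlists can be queried directly; a hedged usage sketch (the instance name and the key are hypothetical):

out = recommender.song_based(mode='s2v', by='mean', keyedvector=True)
print(out.most_similar('(61281) my playlist', topn=10))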
Example 3
    def __init__(self, FILE_PATH):
        self.FILE_PATH = FILE_PATH

        # Word2Vec hyperparameters
        # Only tokens that appear at least min_count times are learned.
        self.min_count = 2
        # Use 150-dimensional vectors to encode meaning.
        self.size = 150
        # Train with a context window of 210 tokens around the center word.
        self.window = 210
        # sg = 1 means skip-gram, otherwise CBOW
        self.sg = 1

        # Stores keys + vectors.
        # KeyedVectors does not support further training, but is smaller and uses less RAM.
        self.p2v_model = WordEmbeddingsKeyedVectors(self.size)

        # Unicode Hangul syllables: start 44032, end 55199
        self.BASE_CODE, self.CHOSUNG, self.JUNGSUNG = 44032, 588, 28

        # Initial consonant (choseong) list, indices 0-18
        self.CHOSUNG_LIST = [
            'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ',
            'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
        ]

        # Medial vowel (jungseong) list, indices 0-20
        self.JUNGSUNG_LIST = [
            'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ',
            'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ'
        ]

        # Final consonant (jongseong) list, indices 0-27
        self.JONGSUNG_LIST = [
            '', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ',
            'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ',
            'ㅍ', 'ㅎ'
        ]

        # Holds the jamo-decomposed playlist titles from train.
        self.title_list_detach = []

        # The location FILE_PATH points to must contain train, test, most_popular_res.json, and song_meta.json.
        with open(FILE_PATH + '/train.json', encoding="utf-8") as f:
            self.train = json.load(f)
            self.train = random.sample(self.train, 30000)
        with open(FILE_PATH + '/val2.json', encoding="utf-8") as f:
            self.val = json.load(f)
        with open(FILE_PATH + '/most_popular_res.json', encoding="utf-8") as f:
            self.most_results = json.load(f)
        # Load the song_meta data.
        with open(FILE_PATH + '/song_meta.json', encoding="utf-8") as f:
            self.song_meta = json.load(f)
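
The three constants implement standard Hangul decomposition: subtract BASE_CODE from a syllable's code point, then divide by 588 and 28 to get the initial/medial/final jamo indices. A standalone worked example for the syllable '한', which decomposes into 'ㅎ', 'ㅏ', 'ㄴ':

BASE_CODE, CHOSUNG, JUNGSUNG = 44032, 588, 28
code = ord('한') - BASE_CODE                   # 54620 - 44032 = 10588
cho = code // CHOSUNG                          # 18 -> CHOSUNG_LIST[18] == 'ㅎ'
jung = (code - CHOSUNG * cho) // JUNGSUNG      # 0  -> JUNGSUNG_LIST[0]  == 'ㅏ'
jong = code - CHOSUNG * cho - JUNGSUNG * jung  # 4  -> JONGSUNG_LIST[4]  == 'ㄴ'
print(cho, jung, jong)                         # 18 0 4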
Example 4
def train(args):
    # Output during training
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # use the text8 corpus as training data; haikus don't provide sufficient context
    training_data = api.load('text8')

    # use the phrase model to recognize bigrams like "White House" or "Climate Change"
    bigram_model = Phrases(training_data)
    # Export the trained model to a Phraser: uses less RAM and is faster; further model updates are no longer possible.
    bigrams = Phraser(bigram_model)

    # create and train the model
    model = Word2Vec(bigrams[training_data], size=args.embedding_dim)

    word_list = list(model.wv.vocab.keys())
    vector_list = [model[word] for word in word_list]

    # the basic model doesn't seem to support item assignment,
    # but WordEmbeddingsKeyedVectors does
    kv = WordEmbeddingsKeyedVectors(args.embedding_dim)
    kv.add(word_list, vector_list)

    kv.add(["<eos>", "<n>", "<unk>"], np.random.rand(3, args.embedding_dim))

    # just to be safe, clear the cache of normalized vectors,
    # as I had an issue similar to https://github.com/RaRe-Technologies/gensim/issues/2532
    del kv.vectors_norm

    # save the new models
    bigrams.save(f"{args.model_path}/bigram.model")
    kv.save(f"{args.model_path}/word2vec.model")
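
A hedged sketch of loading what train() saves and querying it; the path prefix assumes args.model_path was "models", and the query token is only illustrative (Phraser joins bigram parts with "_"):

from gensim.models.phrases import Phraser
from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors

bigrams = Phraser.load("models/bigram.model")
kv = WordEmbeddingsKeyedVectors.load("models/word2vec.model")
print(kv.most_similar("new_york", topn=5))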
Example 5
    def to_keyed_vectors(self, embd_matrix, dim, delete_unknown=True):
        """
        Transform to gensim's keyed vectors structure for further usage.
        https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/keyedvectors.py
        """
        vectors = WordEmbeddingsKeyedVectors(vector_size=dim)
        tokens = self.corpus.vocab.tokens
        if delete_unknown:
            # delete last row (for <UNK> token)
            embd_matrix = np.delete(embd_matrix, (-1), axis=0)
        else:
            # the last token is the UNK token so append it
            tokens.append("<UNK>")

        vectors.add(tokens, embd_matrix)
        return vectors
Example 6
def item2vec(dataset, min_count=3, size=300, sg=5):
    window = max(list(map(len, dataset)))
    p2v_model = WordEmbeddingsKeyedVectors(size)

    w2v_model = Word2Vec(dataset, min_count=min_count, size=size, window=window, sg=sg, seed=1025)
    w2v_model.save('item2vec.model')
    return w2v_model
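
A toy usage sketch of item2vec (the item baskets are hypothetical, and sg is passed as 1 explicitly since gensim's Word2Vec expects sg to be 0 or 1):

dataset = [["item_1", "item_2", "item_3"], ["item_2", "item_4", "item_1"]]
model = item2vec(dataset, min_count=1, size=32, sg=1)
print(model.wv.most_similar("item_1", topn=2))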
Example 7
def compute_similar_nn(early, later):
    M_early = WordEmbeddingsKeyedVectors(300)
    M_early.add(words, early.to_numpy())
    M_later = WordEmbeddingsKeyedVectors(300)
    M_later.add(words, later.to_numpy())

    scores = list()
    for word in words:
        early_similar = get_similar_set(word, M_early)
        later_similar = get_similar_set(word, M_later)
        count = len(early_similar.intersection(later_similar))
        scores.append(count)
    return scores
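
get_similar_set and words are defined elsewhere in that project; a plausible (hypothetical) helper consistent with how it is called here would return the set of a word's top-n neighbours:

def get_similar_set(word, model, topn=20):
    return {w for w, _ in model.most_similar(word, topn=topn)}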
Example 8
def getRecos(liked, disliked, watched, threshold=1000):

    liked = [str(l) for l in liked]
    disliked = [str(l) for l in disliked]

    watched = [str(w) for w in watched if str(
        w) not in liked and str(w) not in disliked]

    df_restr = df_movies[~df_movies["movieId"].isin(watched)].sort_values(
        by="count", ascending=False)

    kv = WordEmbeddingsKeyedVectors(movie_embedding_size)
    kv.add(
        df_restr['movieId'].apply(str).values,
        w[df_restr.movieId]
    )

    idlist = [int(x[0])
              for x in kv.most_similar(positive=liked, negative=disliked, restrict_vocab=4000, topn=12)]

    return getAll(idlist=idlist)
Example 9
def compute_scaled_average_sim(early, later):
    M_early = WordEmbeddingsKeyedVectors(300)
    M_early.add(words, early.to_numpy())
    M_later = WordEmbeddingsKeyedVectors(300)
    M_later.add(words, later.to_numpy())

    scores = list()
    for word in words:
        score = 0
        early_values = M_early.most_similar(word, topn=20)
        later_values = M_later.most_similar(word, topn=20)
        early_dict = {v[0]:v[1] for v in early_values}
        later_dict = {v[0]:v[1] for v in later_values}
        overlap = set([w for w in early_dict.keys() if w in later_dict])
        early_avg = 0
        later_avg = 0
        for entry in overlap:
            early_avg += early_dict[entry]
            later_avg += later_dict[entry]
        early_avg = early_avg / len(overlap) if len(overlap) else 0
        later_avg = later_avg / len(overlap) if len(overlap) else 0
        scores.append(len(overlap) + (1 - abs(later_avg - early_avg)))
    return scores
Example 10
def make_prediction(favorite_movie):
    """
    Input:
    favorite_movie: the key of the movie to look up in the embedding space.
    Builds a keyed-vector index over the embeddings of mainstream movies
    (those with at least `threshold` ratings) and queries it.
    Output:
    Returns the keys of the movies most similar to the input movie.
    """

    movie = favorite_movie

    threshold = 100
    mainstream_movies = movies_df[
        movies_df.n_ratings >= threshold].reset_index(drop=True)

    movie_embedding_size = w.shape[1]
    kv = WordEmbeddingsKeyedVectors(movie_embedding_size)
    kv.add(mainstream_movies['key'].values, w[mainstream_movies.movieId])

    results = kv.most_similar(movie)
    return [result[0] for result in results]
Example 11
def createKVs(DFFile, COFIle, type):
    #createWordandVectorList() # to create  word and vector list for wiki 50

    # wordList - list of words
    # vectorList - list of the vector corresponding to the words

    wordListW2V, vectorListW2V = loadWordANdVectorsW2V()
    wordListPCA, vectorListPCA = loadWordANdVectorsPCA(DFFile, COFIle)

    w2v_len = 50
    PCA_len = 10
    kv1 = WordEmbeddingsKeyedVectors(w2v_len)
    kv2 = WordEmbeddingsKeyedVectors(PCA_len)

    kv1.add(wordListW2V, list(vectorListW2V))
    kv2.add(wordListPCA, vectorListPCA)
    filename = 'KV' + type + '.obj'
    with open(filename, "wb") as f:
        pickle.dump(kv1, f)
        pickle.dump(kv2, f)
    print(kv1.most_similar('love'))  # gives the list of words similar to word1
    return filename
Example 12
def get_p2v_model(train, val, w2v_model):
    p2v_model = WordEmbeddingsKeyedVectors(100)
    ID = []
    vec = []
    data = pd.concat([train, val], axis=0)
    for id_, songs, tags in zip(data['id'], data['songs'], data['tags']):
        tmp_vec = 0
        for token in songs + tags:
            try:
                tmp_vec += w2v_model.wv.get_vector(str(token))
            except KeyError:
                pass
        if type(tmp_vec) != int:
            ID.append(str(id_))
            vec.append(tmp_vec)
    p2v_model.add(ID, vec)

    file_name = "./manual_emb/p2v_mdl_" + get_time() + ".model"
    p2v_model.save(file_name)

    return p2v_model
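
A hedged usage sketch of the returned model, assuming the train/val DataFrames and a trained w2v_model are already in scope (the playlist id queried is hypothetical):

p2v_model = get_p2v_model(train, val, w2v_model)
print(p2v_model.most_similar("61281", topn=10))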
Example 13
def main():
    args = parse_args()
    if not args.input.is_file():
        raise FileNotFoundError('%r is not a file' % args.input)
    if not args.outputdir.is_dir():
        raise FileNotFoundError('%r is not a directory' % args.outputdir)
    if args.outputname:
        outfile = args.outputdir / args.outputname
    else:
        name = args.input.stem + '-small.pkl'
        outfile = args.outputdir / name

    kv = load_fasttext_embeddings(args.input)

    vector_size = kv.vector_size
    token_iter = chain.from_iterable(iterate_tokens(s) for s in args.splits)
    words = list(set(token_iter)) + ["__UNK__", "__PAD__"]
    embeddings = np.zeros((len(words), vector_size))
    for row, word in tqdm.tqdm(enumerate(words)):
        embeddings[row, :] = kv.word_vec(word)

    new_kv = WordEmbeddingsKeyedVectors(vector_size)
    new_kv.add(words, embeddings)
    new_kv.save(str(outfile))
Example 14
class PlaylistEmbedding:
    # __init__ is like a constructor in Java
    def __init__(self, FILE_PATH):
        self.FILE_PATH = FILE_PATH

        # Word2Vec hyperparameters
        # Only tokens that appear at least min_count times are learned.
        self.min_count = 2
        # Use 150-dimensional vectors to encode meaning.
        self.size = 150
        # Train with a context window of 210 tokens around the center word.
        self.window = 210
        # sg = 1 means skip-gram, otherwise CBOW
        self.sg = 1

        # Stores keys + vectors.
        # KeyedVectors does not support further training, but is smaller and uses less RAM.
        self.p2v_model = WordEmbeddingsKeyedVectors(self.size)

        # Unicode Hangul syllables: start 44032, end 55199
        self.BASE_CODE, self.CHOSUNG, self.JUNGSUNG = 44032, 588, 28

        # Initial consonant (choseong) list, indices 0-18
        self.CHOSUNG_LIST = [
            'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ',
            'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
        ]

        # Medial vowel (jungseong) list, indices 0-20
        self.JUNGSUNG_LIST = [
            'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ',
            'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ'
        ]

        # Final consonant (jongseong) list, indices 0-27
        self.JONGSUNG_LIST = [
            '', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ',
            'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ',
            'ㅍ', 'ㅎ'
        ]

        # Holds the jamo-decomposed playlist titles from train.
        self.title_list_detach = []

        # The location FILE_PATH points to must contain train, test, most_popular_res.json, and song_meta.json.
        with open(FILE_PATH + '/train.json', encoding="utf-8") as f:
            self.train = json.load(f)
            self.train = random.sample(self.train, 30000)
        with open(FILE_PATH + '/val2.json', encoding="utf-8") as f:
            self.val = json.load(f)
        with open(FILE_PATH + '/most_popular_res.json', encoding="utf-8") as f:
            self.most_results = json.load(f)
        # Load the song_meta data.
        with open(FILE_PATH + '/song_meta.json', encoding="utf-8") as f:
            self.song_meta = json.load(f)

    def write_json(self, data, fname):
        def _conv(o):
            if isinstance(o, (np.int64, np.int32)):
                return int(o)
            raise TypeError

        parent = os.path.dirname(fname)
        distutils.dir_util.mkpath(
            "C:/Users/hwang in beom/Desktop/final/full/" + parent)
        with io.open("C:/Users/hwang in beom/Desktop/final/full/" + fname,
                     "w",
                     encoding="utf-8") as f:
            json_str = json.dumps(data, ensure_ascii=False, default=_conv)
            f.write(json_str)

    def remove_seen(self, seen, l):
        seen = set(seen)
        return [x for x in l if not (x in seen)]

    # Store the songs and tags from train and val in dictionaries keyed by playlist id
    def get_dic(self, train, val):
        song_dic = {}
        tag_dic = {}
        data = train + val
        for q in tqdm(data):
            song_dic[str(q['id'])] = q['songs']
            tag_dic[str(q['id'])] = q['tags']
        print()
        self.song_dic = song_dic
        self.tag_dic = tag_dic

        # Because only train's songs and tags are sent here as total, the model is trained on train only
        total = list(
            map(lambda x: list(map(str, x['songs'])) + list(x['tags']), data))
        total = [x for x in total if len(x) > 1]
        self.total = total

    def get_w2v(self, total, min_count, size, window, sg):
        try:
            print("get_w2v 실행")
            if not (os.path.isfile(
                    "C:/Users/hwang in beom/Desktop/final/full/w2v_model.model"
            )):
                print("get_w2v 모델 학습 시작")
                # window가 210인 이유는 태그 10개와 곡 200개 꽉차있는 플레이리스트도 존재하기 때문이다. . iter는 반복횟수
                w2v_model = Word2Vec(total,
                                     min_count=min_count,
                                     size=size,
                                     window=window,
                                     sg=sg,
                                     iter=25)
                print("get_w2v 모델 학습 완료")
                self.w2v_model = w2v_model
                w2v_model.save(
                    "C:/Users/hwang in beom/Desktop/final/full/w2v_model.model"
                )
            print("w2v_model 모델 로드")
            self.w2v_model = Word2Vec.load(
                "C:/Users/hwang in beom/Desktop/final/full/w2v_model.model")
        except OSError as e:
            print("failed to create directory!")
            raise

    def update_p2v(self, train, val, w2v_model):
        ID = []
        vec = []
        # Counters to check how many of val's songs/tags are missing from train and fall into the except branch
        # This part can be removed later
        self.yes_songs_count = 0
        self.yes_tags_count = 0
        self.no_songs_count = 0
        self.no_tags_count = 0
        # Combine the two (train + val)
        for q in tqdm(train + val):
            tmp_vec = 0
            songs_vec = 0
            tags_vec = 0
            # Proceed if there is at least one song or one tag
            if len(q['songs']) >= 1 or len(q['tags']) >= 1:
                # Loop over the songs
                for x in q['songs']:
                    # Get the vector from word2vec; add each song x one by one and count successes and failures
                    try:
                        songs_vec += w2v_model.wv.get_vector(str(x))
                        self.yes_songs_count += 1
                    except:
                        self.no_songs_count += 1
                    # Do the same as for the songs.
                for y in q['tags']:
                    try:
                        tags_vec += w2v_model.wv.get_vector(str(y))
                        self.yes_tags_count += 1
                    except:
                        self.no_tags_count += 1
                # Add the two vectors together.
                tmp_vec = songs_vec + tags_vec
            # If the type is not int (i.e. at least one vector was added), append the ID and vec
            if type(tmp_vec) != int:
                ID.append(str(q['id']))
                vec.append(tmp_vec)
        # Store the vector for each train/val playlist id
        self.p2v_model.add(ID, vec)

        # FastText

    def get_title(self, train):
        title_list = []
        for q in train:
            title_list.append(q['plylst_title'])
        self.title_list = title_list

    def jamo_str(self, text, BASE_CODE, CHOSUNG, JUNGSUNG, CHOSUNG_LIST,
                 JUNGSUNG_LIST, JONGSUNG_LIST):

        # Text cleaning
        def clean_str(text):
            pattern = '([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'  # remove e-mail addresses
            text = re.sub(pattern=pattern, repl='', string=text)
            pattern = '(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'  # remove URLs
            text = re.sub(pattern=pattern, repl='', string=text)
            pattern = '([ㄱ-ㅎㅏ-ㅣ]+)'  # remove stand-alone Hangul consonants and vowels
            text = re.sub(pattern=pattern, repl=' ', string=text)
            pattern = '<[^>]*>'  # remove HTML tags
            text = re.sub(pattern=pattern, repl=' ', string=text)
            pattern = '[^\w\s]'  # remove special characters
            text = re.sub(pattern=pattern, repl=' ', string=text)
            return text

        string = text
        string = clean_str(string)
        # print(string)
        # Convert the string to a list of characters
        sp_list = list(string)
        # print(sp_list)

        result = []
        for keyword in sp_list:
            # Check whether the character is Hangul, then decompose it (if the match is not None)
            if re.match('.*[ㄱ-ㅎㅏ-ㅣ가-힣]+.*', keyword) is not None:
                # If the keyword is ' ', just append it to result.
                if keyword == ' ':
                    result.append(' ')

                # If the keyword is itself a choseong/jungseong/jongseong character, append ''.
                if keyword in CHOSUNG_LIST or keyword in JUNGSUNG_LIST or keyword in JONGSUNG_LIST:
                    result.append('')

                else:
                    # Initial consonant: ord() gives the character's code point
                    # Subtract BASE_CODE from the keyword's code point.
                    char_code = ord(keyword) - BASE_CODE
                    # char_code -> initial consonant index
                    char1 = int(char_code / CHOSUNG)
                    # Append the value at index char1 of the choseong list.
                    result.append(CHOSUNG_LIST[char1])

                    # Medial vowel
                    char2 = int((char_code - (CHOSUNG * char1)) / JUNGSUNG)
                    result.append(JUNGSUNG_LIST[char2])

                    # Final consonant
                    char3 = int(
                        (char_code - (CHOSUNG * char1) - (JUNGSUNG * char2)))
                    if char3 == 0:
                        result.append('-')
                    result.append(JONGSUNG_LIST[char3])
            # Otherwise just append the character as is.
            else:
                result.append(keyword)
        results_all = []
        # Convert the list back into a string
        results_all = ("".join(result))
        # Save
        self.results_all = results_all

    def get_title_list(self, results_all):
        # print("".join(result)) #자모 분리 결과 출력?
        title_list_detach = []
        title_list_detach.append(results_all)
        self.title_list_detach.append(title_list_detach)

    def make_title_model(self, title_list_detach):
        try:
            print("make_title_model 실행")
            if not (os.path.isfile(
                    "C:/Users/hwang in beom/Desktop/final/full/FT_title_model.model"
            )):
                print("make_title_model 모델 학습 시작")
                FT_title_model = FT_gensim(title_list_detach,
                                           size=300,
                                           window=100,
                                           min_count=1,
                                           sg=1,
                                           iter=2000)
                print("make_title_model2 모델 학습 완료")
                self.FT_title_model = FT_title_model
                FT_title_model.save(
                    "C:/Users/hwang in beom/Desktop/final/full/FT_title_model.model"
                )
            self.FT_title_model = FT_gensim.load(
                "C:/Users/hwang in beom/Desktop/final/full/FT_title_model.model"
            )
            print("make_title_model 모델 로드됨")
        except OSError as e:
            print("failed to create directory!")
            raise

    # End of FastText

    def get_result(self, p2v_model, song_dic, tag_dic, most_results, val,
                   train, FT_title_model, song_meta):
        title_sentence_train = []
        # Clean the Hangul in train, applied to the plylst_title field.
        for x in train:
            self.jamo_str(x['plylst_title'], self.BASE_CODE, self.CHOSUNG,
                          self.JUNGSUNG, self.CHOSUNG_LIST, self.JUNGSUNG_LIST,
                          self.JONGSUNG_LIST)
            # The call above produces results_all; append that value to title_sentence_train.
            title_sentence_train.append(self.results_all)

        answers = []
        # Counters to check that things are proceeding correctly
        # Is most_id being picked properly?
        self.most_id = []
        # How many playlists got a proper ply_embedding recommendation
        self.p2v_count = 0
        # How many playlists fell into the except branch
        self.except_count = 0
        # A counter to move around if you want to know exactly where processing stops
        self.when_stop = 0

        # Counters for how many playlists fall into each problem type
        self.TNSN = 0
        self.TYSN = 0
        self.TNSY = 0
        self.TYSY = 0

        # Counters for filling in songs/tags when they fall short of 100/10
        self.update_song_count = 0
        self.update_tag_count = 0

        # Show progress with tqdm; iterate over val, with enumerate putting the iteration number into n.
        for n, q in tqdm(enumerate(val), total=len(val)):
            # Check whether title, songs, and tags are present, and count them
            songs = q['songs']
            tags = q['tags']
            songs_count = len(songs)
            tags_count = len(tags)
            try:
                # Playlist-embedding algorithm (used to recommend songs from songs)
                def ply_em(q):
                    # Given the id of a test/val playlist, fetch the most similar playlists into most_id.
                    most_id = [
                        x[0]
                        for x in p2v_model.most_similar(str(q['id']), topn=15)
                    ]
                    # most_vec = [x[1] for x in p2v_model.most_similar(str(q['id']), topn=15)]

                    # Originals
                    get_song = []
                    get_tag = []

                    # Look up each id in most_id in song_dic and tag_dic to get its songs and tags.
                    for ID in most_id:
                        get_song += song_dic[ID]
                        get_tag += tag_dic[ID]

                    # Count repeated songs, incrementing the count each time one recurs
                    count = {}
                    for i in get_song:
                        try:
                            count[i] += 1
                        except:
                            count[i] = 1
                    count = sorted(count.items(),
                                   key=lambda x: x[1],
                                   reverse=True)

                    # Count repeated tags, incrementing the count each time one recurs
                    count2 = {}
                    for i in get_tag:
                        try:
                            count2[i] += 1
                        except:
                            count2[i] = 1
                    count2 = sorted(count2.items(),
                                    key=lambda x: x[1],
                                    reverse=True)

                    # This seems to be used to check at which iteration the loop stops
                    self.when_stop += 1

                    real_get_song = []
                    real_get_tag = []

                    for song in count:
                        real_get_song.append(song[0])

                    for tag in count2:
                        real_get_tag.append(tag[0])

                    # get_song = list(pd.value_counts(get_song)[:500].index)
                    # get_tag = list(pd.value_counts(get_tag)[:20].index)

                    def to_integer(dt_time):
                        return 10000 * dt_time.year + 100 * dt_time.month + dt_time.day

                    # Seems to do something with the update timestamp
                    utc_time = datetime.strptime(q['updt_date'][:26],
                                                 '%Y-%m-%d %H:%M:%S.%f')
                    updt = int(to_integer(utc_time))
                    true_get_song = []

                    # Take each song_id from the real_get_song obtained above,
                    for song_id in real_get_song:
                        # look up the song_id in song_meta and pull out its issue_date.
                        issue = int(song_meta[song_id]['issue_date'])
                        # Keep the song if updt - issue >= 0, otherwise drop it; probably filtering out songs issued after the playlist update.
                        if updt - issue >= 0:
                            true_get_song.append(song_id)
                        else:
                            pass

                    answers.append({
                        "id":
                        q["id"],
                        "songs":
                        self.remove_seen(q["songs"], true_get_song)[:100],
                        "tags":
                        self.remove_seen(q["tags"], real_get_tag)[:10],
                    })
                    # If we get here, increment the counter
                    self.p2v_count += 1

                    # FastText algorithm (used here when there is no song information, relying on everything else)

                def fasttext_title(q):

                    train_ids = []
                    get_song = []
                    get_tag = []

                    # Clean the Hangul (consonants/vowels)
                    self.jamo_str(q['plylst_title'], self.BASE_CODE,
                                  self.CHOSUNG, self.JUNGSUNG,
                                  self.CHOSUNG_LIST, self.JUNGSUNG_LIST,
                                  self.JONGSUNG_LIST)
                    # Store the result in title
                    title = self.results_all

                    # Use FT_title_model: pass in the title and pull out the most similar titles
                    F_list = FT_title_model.wv.most_similar(title, topn=60)

                    # For each of the returned values,
                    for x in F_list:
                        # find the index of x[0] (from F_list) in title_sentence_train and store it in number,
                        number = title_sentence_train.index(x[0])
                        # then take the id at that index in train and append it to train_ids.
                        train_ids.append(train[number]['id'])

                    # For each id in train_ids,
                    for ids in train_ids:
                        # look up the id in song_dic to build get_song,
                        get_song += song_dic[str(ids)]
                        # and look up the id in tag_dic to build get_tag.
                        get_tag += tag_dic[str(ids)]

                    # Keep adding +1 for values that appear multiple times; others just get 1.
                    count = {}
                    for i in get_song:
                        try:
                            count[i] += 1
                        except:
                            count[i] = 1
                    # Sort so the most frequent come first.
                    count = sorted(count.items(),
                                   key=lambda x: x[1],
                                   reverse=True)

                    # Likewise keep adding +1 for each repeated tag i.
                    count2 = {}
                    for i in get_tag:
                        try:
                            count2[i] += 1
                        except:
                            count2[i] = 1
                    count2 = sorted(count2.items(),
                                    key=lambda x: x[1],
                                    reverse=True)

                    real_get_song = []
                    real_get_tag = []

                    # For each counted song, append its key (index 0) to real_get_song.
                    for song in count:
                        real_get_song.append(song[0])

                    # For each counted tag, append its key (index 0) to real_get_tag.
                    for tag in count2:
                        real_get_tag.append(tag[0])

                    # get_song = list(pd.value_counts(real_get_song)[:200].index)
                    # get_tag = list(pd.value_counts(real_get_tag)[:20].index)

                    # Filtering step: compare the year the playlist was created with the year each song was issued, and drop songs issued after the playlist.
                    def to_integer(dt_time):
                        return 10000 * dt_time.year + 100 * dt_time.month + dt_time.day

                    utc_time = datetime.strptime(q['updt_date'][:26],
                                                 '%Y-%m-%d %H:%M:%S.%f')
                    updt = int(to_integer(utc_time))
                    true_get_song = []
                    for song_id in real_get_song:
                        issue = int(song_meta[song_id]['issue_date'])
                        if updt - issue >= 0:
                            true_get_song.append(song_id)
                        else:
                            pass

                    answers.append({
                        "id":
                        q["id"],
                        "songs":
                        self.remove_seen(q["songs"], true_get_song)[:100],
                        "tags":
                        self.remove_seen(q["tags"], real_get_tag)[:10],
                    })

                # Prediction is split into 4 cases: fasttext_title when there are no songs, ply_em when song information exists.
                # Tags X, Songs X, Title O
                if tags_count == 0 and songs_count == 0:
                    self.TNSN += 1
                    fasttext_title(q)

                # Tags O, Songs X, Title X
                elif tags_count > 0 and songs_count == 0:
                    self.TYSN += 1
                    fasttext_title(q)

                # Tags X, Songs O
                elif tags_count == 0 and songs_count > 0:
                    self.TNSY += 1
                    ply_em(q)

                # Tags O, Songs O
                elif tags_count > 0 and songs_count > 0:
                    self.TYSY += 1
                    ply_em(q)

            except:
                # If an exception occurred, increment the counter
                self.except_count += 1
                answers.append({
                    "id": q["id"],
                    "songs": most_results[n]['songs'],
                    "tags": most_results[n]["tags"],
                })

        # check and update answer
        for n, q in enumerate(answers):
            if len(q['songs']) != 100:
                answers[n]['songs'] += self.remove_seen(
                    q['songs'],
                    self.most_results[n]['songs'])[:100 - len(q['songs'])]
                self.update_song_count += 1
            if len(q['tags']) != 10:
                answers[n]['tags'] += self.remove_seen(
                    q['tags'],
                    self.most_results[n]['tags'])[:10 - len(q['tags'])]
                self.update_tag_count += 1
        self.answers = answers

    def run(self):
        # Word2Vec ply_embedding - represent playlists as dense vectors via Word2Vec

        # Store the songs and tags from train and val in dictionaries keyed by playlist id
        self.get_dic(self.train, self.val)

        # Run Word2Vec with the hyperparameters above.
        # total - train and val combined, with each playlist's songs and tags pulled into one list.
        # min_count - minimum number of occurrences required to be learned; size - 150-dimensional vectors.
        # window - context window of 210 tokens around the center word; sg - 1 for skip-gram, otherwise CBOW.
        # CBOW predicts the center word from its surrounding words.
        # Skip-gram predicts the surrounding words from the center word.
        # These values are fed to Word2Vec; the model is trained, saved, then loaded and kept on self.
        self.get_w2v(self.total, self.min_count, self.size, self.window,
                     self.sg)

        # Add vectors to p2v_model
        self.update_p2v(self.train, self.val, self.w2v_model)

        # FastText ply_title - use Facebook's FastText on the playlist titles

        # Collect the playlist titles of train into title_list.
        self.get_title(self.train)

        # Clean the data using the title list and the choseong/jungseong/jongseong constants.
        for string in self.title_list:
            self.jamo_str(string, self.BASE_CODE, self.CHOSUNG, self.JUNGSUNG,
                          self.CHOSUNG_LIST, self.JUNGSUNG_LIST,
                          self.JONGSUNG_LIST)
            # The call above produces results_all; append that value to title_list_detach.
            self.get_title_list(self.results_all)
        self.make_title_model(self.title_list_detach)

        # Function that fills in the songs and tags
        # Pass the WordEmbeddingsKeyedVectors / song_dic / tag_dic / most_popular_res data / val data / train data / the FT_gensim result / song_meta
        self.get_result(self.p2v_model, self.song_dic, self.tag_dic,
                        self.most_results, self.val, self.train,
                        self.FT_title_model, self.song_meta)

        # self.write_json(self.answers, '/content/drive/MyDrive/Colab Notebooks/final/test10/results2.json')
        # self.write_json(self.answers, 'results50000.json')

        print("results 작성 완료")

    def train_model(self):
        # Word2Vec ply_embedding
        self.get_dic(self.train, self.val)
        self.get_w2v(self.total, self.min_count, self.size, self.window,
                     self.sg)
        self.update_p2v(self.train, self.val, self.w2v_model)

        # FastText ply_title
        self.get_title(self.train)
        for string in self.title_list:
            self.jamo_str(string, self.BASE_CODE, self.CHOSUNG, self.JUNGSUNG,
                          self.CHOSUNG_LIST, self.JUNGSUNG_LIST,
                          self.JONGSUNG_LIST)
            self.get_title_list(self.results_all)
        self.make_title_model(self.title_list_detach)
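
A hedged sketch of driving the class above end to end; "./data" is an assumed FILE_PATH that must contain the JSON files listed in __init__:

pe = PlaylistEmbedding("./data")
pe.run()
pe.write_json(pe.answers, "results.json")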
Example 15
def run(song_meta_data, train_data, test_data):
    train_data['updt_year'] = train_data['updt_date'].str.slice(start=0,
                                                                stop=4)
    test_data['updt_year'] = test_data['updt_date'].str.slice(start=0, stop=4)
    song_meta_data['issue_year'] = song_meta_data['issue_date'].str.slice(
        start=0, stop=4)
    song_meta_data['id'] = song_meta_data['id'].astype(str)

    print("Tokenize...")
    tokenize(train_data, test_data)

    train_data = train_data.sort_values(by='updt_date').reset_index(drop=True)
    test_data = test_data.sort_values(by='updt_date').reset_index(drop=True)
    print("Total Dict Loading")
    if os.path.exists(
            BASE_DIR + 'model/total_data_final.pickle') and os.path.exists(
                BASE_DIR + 'model/song_dict_final.pickle') and os.path.exists(
                    BASE_DIR +
                    'model/tag_dict_final.pickle') and os.path.exists(
                        BASE_DIR + 'model/title_dict_final.pickle'):
        with open(BASE_DIR + 'model/total_data_final.pickle', 'rb') as handle:
            total_data = pickle.load(handle)
        with open(BASE_DIR + 'model/song_dict_final.pickle', 'rb') as handle:
            song_dict = pickle.load(handle)
        with open(BASE_DIR + 'model/tag_dict_final.pickle', 'rb') as handle:
            tag_dict = pickle.load(handle)
        with open(BASE_DIR + 'model/title_dict_final.pickle', 'rb') as handle:
            title_dict = pickle.load(handle)
    else:
        print("Total Dict Not Existing... Calculating")
        total_data, song_dict, tag_dict, title_dict = getTotalDict(
            train_data, test_data)
        with open(BASE_DIR + 'model/total_data_final.pickle', 'wb') as handle:
            pickle.dump(total_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open(BASE_DIR + 'model/song_dict_final.pickle', 'wb') as handle:
            pickle.dump(song_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open(BASE_DIR + 'model/tag_dict_final.pickle', 'wb') as handle:
            pickle.dump(tag_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open(BASE_DIR + 'model/title_dict_final.pickle', 'wb') as handle:
            pickle.dump(title_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print("Frequency Loading...")
    if os.path.exists(BASE_DIR +
                      'model/tag_freq_by_song.pickle') and os.path.exists(
                          BASE_DIR + 'model/song_freq_by_tag.pickle'):
        with open(BASE_DIR + 'model/tag_freq_by_song.pickle', 'rb') as handle:
            tag_freq_by_song = pickle.load(handle)
        with open(BASE_DIR + 'model/song_freq_by_tag.pickle', 'rb') as handle:
            song_freq_by_tag = pickle.load(handle)
    else:
        print("Frequency Not Existing... Calculating")
        tag_freq_by_song, song_freq_by_tag = getFreqDict(train_data)
        with open(BASE_DIR + 'model/tag_freq_by_song.pickle', 'wb') as handle:
            pickle.dump(tag_freq_by_song,
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)
        with open(BASE_DIR + 'model/song_freq_by_tag.pickle', 'wb') as handle:
            pickle.dump(song_freq_by_tag,
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)

    print("Update Date Loading...")
    if os.path.exists(BASE_DIR + 'model/updt_dict.pickle'):
        with open(BASE_DIR + 'model/updt_dict.pickle', 'rb') as handle:
            updt_dict = pickle.load(handle)
    else:
        print("Update Date Not Existing... Calculating")
        updt_dict = getUpdtDict(song_meta_data)
        with open(BASE_DIR + 'model/updt_dict.pickle', 'wb') as handle:
            pickle.dump(updt_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print("Song Popularity Loading...")
    if os.path.exists(BASE_DIR +
                      'model/popular_song_by_year.pickle') and os.path.exists(
                          BASE_DIR + 'model/popular_tag_by_year.pickle'):
        with open(BASE_DIR + 'model/popular_tag_by_year.pickle',
                  'rb') as handle:
            popular_tag_by_year = pickle.load(handle)
        with open(BASE_DIR + 'model/popular_song_by_year.pickle',
                  'rb') as handle:
            popular_song_by_year = pickle.load(handle)
    else:
        print("Song Popularity Not Existing... Calculating")
        popular_song_by_year, popular_tag_by_year = getPopularDict(train_data)
        with open(BASE_DIR + 'model/popular_tag_by_year.pickle',
                  'wb') as handle:
            pickle.dump(popular_tag_by_year,
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)
        with open(BASE_DIR + 'model/popular_song_by_year.pickle',
                  'wb') as handle:
            pickle.dump(popular_song_by_year,
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)

    print("Word2Vec Model Loading...")
    if os.path.exists(BASE_DIR + 'model/w2v_model_sg_title.model'):
        w2v_model = Word2Vec.load(BASE_DIR + 'model/w2v_model_sg_title.model')
    else:
        print("Word2Vec Model Not Found !")
        print("Training...")
        w2v_model = Word2Vec(total_data,
                             min_count=3,
                             size=100,
                             window=210,
                             sg=1)
        w2v_model.save(BASE_DIR + 'model/w2v_model_sg_title.model')

    print("Training...")
    p2v_model = WordEmbeddingsKeyedVectors(100)
    updateP2V(train_data, test_data, w2v_model, p2v_model, song_dict, tag_dict,
              title_dict)

    print("Word2Vec Second Model Loading...")
    if os.path.exists(BASE_DIR +
                      'model/w2v_tag_final.model') and os.path.exists(
                          BASE_DIR + 'model/w2v_song_final.model'):
        tag_model = Word2Vec.load(BASE_DIR + 'model/w2v_tag_final.model')
        song_model = Word2Vec.load(BASE_DIR + 'model/w2v_song_final.model')
        mt = W2VModel(tag_model, "tags")
        ms = W2VModel(song_model, "songs")
    else:
        print("Word2Vec Second Model Not Found !")
        print("Tag Training...")
        mt = W2VModel(pd.concat([train_data, test_data]), "tags")
        mt.model.save(BASE_DIR + 'model/w2v_tag_final.model')
        print("Song Training...")
        ms = W2VModel(pd.concat([train_data, test_data]), "songs")
        ms.model.save(BASE_DIR + 'model/w2v_song_final.model')

    print("start")
    answer = []
    for i, row in tqdm(test_data.iterrows()):
        year = str(row['updt_year'])
        id = str(row['id'])
        songs = []
        tags = []
        try:
            most_id_list = [x[0] for x in p2v_model.most_similar(id, topn=200)]
            fillAnswer(getItemById(most_id_list, song_dict, 200), songs, 100,
                       song_dict, id, updt_dict, year)
            fillAnswer(getItemById(most_id_list, tag_dict, 20), tags, 10,
                       tag_dict, id)
        except:
            pass

        if len(songs) < 100:
            fillAnswer(ms.recommand(test_data, int(row['id']), 200), songs,
                       100, song_dict, id, updt_dict, year)

        if len(tags) < 10:
            fillAnswer(mt.recommand(test_data, int(row['id']), 20), tags, 10,
                       tag_dict, id)

        if len(songs) < 100:
            fillAnswer(getSongByTagFreq(song_freq_by_tag, row['tags'], 200),
                       songs, 100, song_dict, id, updt_dict, year)
        if len(tags) < 10:
            fillAnswer(getTagBySongFreq(tag_freq_by_song, row['songs'], 20),
                       tags, 10, tag_dict, id)

        if len(songs) < 100:
            fillAnswer(getSongByYear(popular_song_by_year, year, 200), songs,
                       100, song_dict, id, updt_dict, year)
        if len(tags) < 10:
            fillAnswer(getTagByYear(popular_tag_by_year, year, 20), tags, 10,
                       tag_dict, id)

        if len(songs) < 100:
            try:
                fillAnswer(
                    getSongByYear(popular_song_by_year, str(int(year) - 1),
                                  20), songs, 100, song_dict, id, updt_dict,
                    year)
            except:
                fillAnswer(
                    getSongByYear(popular_song_by_year, str(int(year) + 1),
                                  200), songs, 100, song_dict, id, updt_dict,
                    year)
        if len(tags) < 10:
            try:
                fillAnswer(
                    getTagByYear(popular_tag_by_year, str(int(year) - 1), 20),
                    tags, 10, tag_dict, id)
            except:
                fillAnswer(
                    getTagByYear(popular_tag_by_year, str(int(year) + 1), 200),
                    tags, 10, tag_dict, id)

        if len(songs) < 100:
            print("Too few songs. id : ", str(row['id']), str(year))
        if len(tags) < 10:
            print("Too few tags. id : ", str(row['id']), str(year))

        answer.append({"id": row["id"], "songs": songs, "tags": tags})

    write_json(answer, "results.json")
Example 16
weights = net.emb.weight.detach().cpu().numpy()

# embedding = WordEmbeddingsKeyedVectors(vector_size=300)
# for i, n in enumerate(word2index.keys()):
#     embedding.add(entities=n, weights=net.word_embeddings(n).cpu().detach())
#     if not i % 100:
#         print(f'{i}, {n}')
#
# embedding.save(os.path.join(data_path, 'keyed_values.dir'))

# =====================================================================================
def analogy(x1, x2, y1):
    result = embedding.most_similar(positive=[y1, x2], negative=[x1])
    return result[0][0]

embedding = WordEmbeddingsKeyedVectors.load(os.path.join(data_path, 'keyed_values.dir'))
print(analogy('estimate', 'estimates', 'find'))

accuracy, result = embedding.evaluate_word_analogies(os.path.join(data_path, 'intrinsic_test.txt'))
print(accuracy)
for r in result:
    correct_len = len(r['correct'])
    incorrect_len = len(r['incorrect'])
    print(f'{r["section"]}: {correct_len} / {(correct_len + incorrect_len)}')

# =====================================================================================

from gensim.test.utils import datapath

print(embedding.n_similarity(["king"], ["duke"]))
Example 17
class Title2Rec:
    def __init__(self):
        super().__init__()
        self.cluster_model = None
        self.fasttext = None
        self.t2r = None
        self.good_tags = ['NNG', 'NNP', 'NNB', 'NP', 'NR',
        'VA', 'MAG', 'SN', 'SL']
        self.khaiii = KhaiiiApi()
    
    ## fit clustering
    def fit_clustering(self, vectors,
                   n_clusters, verbose=0, max_iter=50):
        self.cluster_model = KMeans(n_clusters=n_clusters, verbose=verbose,
                            max_iter=max_iter)
        print("Data length: ", len(vectors))
        print("Fit KMeans...")
        self.cluster_model.fit(vectors)
        print("done.")

    ## preprocess for clustering
    def preprocess_clustering(self, titles, vectors, ID=True, khaiii=True, verbose=False):
        ## t: title / v: vectors / i : plylst id
        if ID:
            id_list = list(map(lambda x: x.split(' ')[0][1:-1], titles))
            titles = list(map(lambda x: ' '.join(x.split(' ')[1:]), titles))
        else:
            id_list = list(range(len(titles)))
        t_v = list(zip(titles, vectors, id_list))
        stable = [(t, v, i) for t, v, i in t_v if re.findall('[가-힣a-zA-Z&]+', t) != []]
        stable = [(' '.join(re.findall('[가-힣a-zA-Z&]+|90|80|70', t)), v, i) for t, v, i in stable]
        stable = [(t, v, i) for t, v, i in stable if t != '']

        ## title morph analysis by Khaiii
        def tag_process(title, khaiii, good_tags):
            token = khaiii.analyze(title)
            ## join: separate the list elements with a space
            return ' '.join([morph.lex for to in token for morph in to.morphs if morph.tag in good_tags])

        if khaiii:
            if verbose:
                stable = [(tag_process(t, self.khaiii, self.good_tags), v, i) for t, v, i in tqdm(stable)]
                stable = [(t, v, i) for t, v, i in stable if t != '']
            else:
                stable = [(tag_process(t, self.khaiii, self.good_tags), v, i) for t, v, i in stable]
                stable = [(t, v, i) for t, v, i in stable if t != '']

        titles = [t for t, v, i in stable]
        vectors = [v for t, v, i in stable]
        id_list = [i for t, v, i in stable]
        if verbose:
            print("Original lenght: ", len(t_v))
            print("Processed length: ", len(titles))
        
        return titles, vectors, id_list

    ## cleansing text before Khaiii
    @staticmethod
    def text_process(titles, ID=True):
        if ID:
            titles = list(map(lambda x: ' '.join(x.split(' ')[1:]), titles))
        stable = [x for x in titles if re.findall('[가-힣a-zA-Z&]+', x) != []]
        stable = [' '.join(re.findall('[가-힣a-zA-Z&]+|90|80|70', x)) for x in stable]
        stable = [x for x in stable if x != '']
        print("Only hangul & alpha & and sign.")
        print("Original lenght: ", len(titles))
        print("Processed length: ", len(stable))
        
        return stable

    ## predict cluster with cluster model, return clusters sorted by distance
    def pre_fasttext(self, titles, vectors):
        if not self.cluster_model:
            raise RuntimeError("Please fit clustering model.")
        cluster_out = self.cluster_model.predict(vectors)
        transform = self.cluster_model.transform(vectors)
        dist = [distance[cluster] for cluster, distance in zip(cluster_out, transform)]
        data = pd.DataFrame({'title': titles,
                             'cluster': cluster_out,
                             'distance': dist})
        return data.sort_values(['cluster', 'distance'])

    ## make a FastText model over the clusters (500)
    def fit_fasttext(self, data):
        sentence = data.groupby('cluster')['title'].apply(list).tolist()
        print("Fit fasttext...")
        self.fasttext = FastText(sentence)
        print('done.')

    ## make the title2rec model
    def fit_title2rec(self, titles, ID):
        keys = [i + " " + t for t, i in zip(titles, ID)]
        print('Fit title2rec...')
        vectors = list(map(self.fasttext.wv.get_vector, titles))
        self.t2r = WordEmbeddingsKeyedVectors(vector_size=100)
        self.t2r.add(keys, vectors)
        print('done.')

    ## get title vectors from the fasttext model (most similar 10 by default)
    def forward(self, titles, topn=10):
        ft = list(map(self.fasttext.wv.get_vector, titles))
        out = [self.t2r.wv.similar_by_vector(t, topn=topn) for t in ft]
        return out

    ## load cluster model
    def load_cluster(self, fname):
        self.cluster_model = joblib.load(fname)
        print("load complete")

    ## load fasttext model
    def load_fasttext(self, path):
        self.fasttext = gensim.models.FastText.load(path)

    ## load title to songs model
    def load_t2r(self, path):
        self.t2r = gensim.models.KeyedVectors.load(path)

    def title2rec(self, ply, song_n, tag_n, song_const, tag_const, khaiii=True):
        title, _, _ = self.preprocess_clustering([ply['plylst_title']], [None], ID=False, khaiii=khaiii, verbose=False)
        if title == []:
            if ply['tags'] != []:
                return ply['songs'], ply['tags'], 1, 0
            else:
                return ply['songs'], ply['tags'], 1, 1

        title = title[0]
        similars = self.forward([title], topn=200)[0]

        ID = [int(sim[0].split(" ")[0]) for sim in similars]
        similar = [sim[1] for sim in similars]

        tmp_df = pd.DataFrame({'id':ID, 'similar':similar})
        tmp_df = pd.merge(tmp_df, train_df[['id', 'songs', 'tags']], how='left', on='id')
        tmp_df['song_len'] = tmp_df['songs'].apply(len)
        tmp_df['song_len'] = tmp_df['song_len'].cumsum().shift(1).fillna(0)
        song_df = tmp_df[tmp_df['song_len'] < 2000]

        score_dict = {}
        for sim, songs in zip(song_df['similar'], song_df['songs']):
            for i, song in enumerate(songs):
                score = (-math.log(i+1, 2) + song_const) * sim
                try:
                    score_dict[song] += score
                except KeyError:
                    score_dict[song] = score

        pick = sorted(score_dict.items(), key=lambda x: x[1], reverse=True)
        pick = [p[0] for p in pick]
        song_res = pick[:song_n]
        # date = pd.to_datetime(ply['updt_date'])
        # pick = [p for p in pick if song_date[p] <= date]
        # song_res = pick[:song_n]

        if len(song_res) < song_n:
            song_df = tmp_df[tmp_df['song_len'] >= 2000]
            for sim, songs in zip(song_df['similar'], song_df['songs']):
                for i, song in enumerate(songs):
                    score = (-math.log(i+1, 2) + song_const) * sim
                    try:
                        score_dict[song] += score
                    except KeyError:
                        score_dict[song] = score
            pick = sorted(score_dict.items(), key=lambda x: x[1], reverse=True)
            pick = [p[0] for p in pick]
            # pick = [p for p in pick if song_date[p] <= date]
            song_res = pick[:song_n]
        
        # assert len(song_res) == song_n

        # song_res = [p[0] for p in pick]
        
        if ply['tags'] != []:
            return song_res, ply['tags'], 1, 0

        tmp_df['tag_len'] = tmp_df['tags'].apply(len)
        tmp_df['tag_len'] = tmp_df['tag_len'].cumsum().shift(1).fillna(0)
        tag_df = tmp_df[tmp_df['tag_len'] < 150]

        score_dict = {}
        for sim, tags in zip(tag_df['similar'], tag_df['tags']):
            for i, tag in enumerate(tags):
                score = (-math.log(i+1, 2) + tag_const) * sim
                try:
                    score_dict[tag] += score
                except KeyError:
                    score_dict[tag] = score

        pick = sorted(score_dict.items(), key=lambda x: x[1], reverse=True)[:tag_n]
        tag_res = [p[0] for p in pick]

        return song_res, tag_res, 1, 1
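
A hedged sketch of using Title2Rec at inference time; the file names are assumptions, title2rec also expects a global train_df DataFrame with 'id', 'songs', and 'tags' columns, and ply stands for one playlist dict with 'plylst_title', 'songs', and 'tags':

t2r = Title2Rec()
t2r.load_cluster("cluster_model.pkl")
t2r.load_fasttext("fasttext_title.model")
t2r.load_t2r("title2rec.kv")
songs, tags, song_flag, tag_flag = t2r.title2rec(
    ply, song_n=100, tag_n=10, song_const=10, tag_const=5)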
Example 18
if __name__ == "__main__":
    _, words, freq = extract_words(5000)
    _, _, finder = build_word_context_model(words)
    M1_plus = compute_ppmi(finder, words)
    M2_10 = apply_pca(M1_plus, 10, words)
    M2_100 = apply_pca(M1_plus, 100, words)
    M2_300 = apply_pca(M1_plus, 300, words)
    W2V = KeyedVectors.load_word2vec_format(
        './GoogleNews-vectors-negative300.bin', binary=True)

    SM2_300 = evaluate_cosine(M2_300)
    print("pca_300 correlation {}".format(calc_pearson(SM2_300)[0]))
    SW2V = evaluate_cosine(W2V, False)
    print("word2vec correlation {}".format(calc_pearson(SW2V)[0]))

    SM_keyed = WordEmbeddingsKeyedVectors(300)
    SM_keyed.add(words, M2_300.to_numpy())

    M10_keyed = WordEmbeddingsKeyedVectors(10)
    M10_keyed.add(words, M2_10.to_numpy())

    M100_keyed = WordEmbeddingsKeyedVectors(100)
    M100_keyed.add(words, M2_100.to_numpy())

    tests = [(W2V, './word-test.v1.txt'), (W2V, './filtered-test.txt'),
             (M10_keyed, './word-test.v1.txt'),
             (M10_keyed, './filtered-test.txt'),
             (M100_keyed, './word-test.v1.txt'),
             (M100_keyed, './filtered-test.txt'),
             (SM_keyed, './word-test.v1.txt'),
             (SM_keyed, './filtered-test.txt')]
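
The tests list pairs each keyed-vector model with an analogy file; a hedged sketch of how it would presumably be consumed, using the same evaluate_word_analogies call shown in Example 16:

for model, test_file in tests:
    accuracy, _ = model.evaluate_word_analogies(test_file)
    print(test_file, accuracy)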
Example 19
def infer(MODE="Test"):
    mode_opt = {
        "Valid": {
            "train_path": "arena_data/orig/train.json",
            "test_path": "arena_data/questions/val.json",
            "results_path": "cf2/val/results.json",
            "eval": True
        },
        "Dev": {
            "train_path": "res/train.json",
            "test_path": "res/val.json",
            "results_path": "cf2/dev/results.json",
            "eval": False
        },
        "Test": {
            "train_path": "res/train.json",
            "test_path": "res/test.json",
            "results_path": "cf2/test/results.json",
            "eval": False
        }
    }
    opt = mode_opt[MODE]

    train = pd.read_json(opt["train_path"])
    test = pd.read_json(opt["test_path"])

    if MODE != "Dev":
        dev = pd.read_json("res/val.json")

    if MODE != "Test":
        test_res = pd.read_json("res/test.json")

    print("Preprocessing dates")
    test_date = {}
    for i in tqdm(test.index):
        test_date[test.at[i, 'id']] = test.at[i, 'updt_date']

    song_meta = pd.read_json("res/song_meta.json")

    song_date = {}
    for i in tqdm(song_meta.index):
        song_date[song_meta.at[i, "id"]] = str(song_meta.at[i, "issue_date"])

    del song_meta

    song_update_date = []
    for i in train.index:
        updt_date = train.loc[i, 'updt_date'][:4] + train.loc[
            i, 'updt_date'][5:7] + train.loc[i, 'updt_date'][8:10]
        for t in train.loc[i, 'songs']:
            if song_date[t] > updt_date:
                song_date[t] = updt_date
                song_update_date.append(t)
    for i in test.index:
        updt_date = test.loc[i, 'updt_date'][:4] + test.loc[
            i, 'updt_date'][5:7] + test.loc[i, 'updt_date'][8:10]
        for t in test.loc[i, 'songs']:
            if song_date[t] > updt_date:
                song_date[t] = updt_date
                song_update_date.append(t)
    if MODE != "Dev":
        for i in dev.index:
            updt_date = dev.loc[i, 'updt_date'][:4] + dev.loc[
                i, 'updt_date'][5:7] + dev.loc[i, 'updt_date'][8:10]
            for t in dev.loc[i, 'songs']:
                if song_date[t] > updt_date:
                    song_date[t] = updt_date
                    song_update_date.append(t)
    if MODE != "Test":
        for i in test_res.index:
            updt_date = test_res.loc[i, 'updt_date'][:4] + test_res.loc[
                i, 'updt_date'][5:7] + test_res.loc[i, 'updt_date'][8:10]
            for t in test_res.loc[i, 'songs']:
                if song_date[t] > updt_date:
                    song_date[t] = updt_date
                    song_update_date.append(t)
    print("The number of processed songs :", len(set(song_update_date)))

    # Loading tags extracted from title
    pred_tag = load_json("arena_data/model/pred_tag.json")

    dic_pred_tag = {}
    for p_t in pred_tag:
        dic_pred_tag[p_t['id']] = p_t['predict_tag']

    train['tags_org'] = train['tags']
    for i in train.index:
        train.at[i,
                 'tags'] = train.at[i, 'tags'] + dic_pred_tag[train.at[i,
                                                                       'id']]

    test['tags_org'] = test['tags']
    for i in test.index:
        test.at[i,
                'tags'] = test.at[i, 'tags'] + dic_pred_tag[test.at[i, 'id']]

    if MODE != "Dev":
        dev['tags_org'] = dev['tags']
        for i in dev.index:
            dev.at[i,
                   'tags'] = dev.at[i, 'tags'] + dic_pred_tag[dev.at[i, 'id']]

    if MODE != "Test":
        test_res['tags_org'] = test_res['tags']
        for i in test_res.index:
            test_res.at[i, 'tags'] = test_res.at[i, 'tags'] + dic_pred_tag[
                test_res.at[i, 'id']]

    # Calculating IDF
    inv_doc_freq = {}
    for d in train['songs'] + train['tags']:
        for i in d:
            if i in inv_doc_freq:
                inv_doc_freq[i] += 1
            else:
                inv_doc_freq[i] = 1

    for d in test['songs'] + test['tags']:
        for i in d:
            if i in inv_doc_freq:
                inv_doc_freq[i] += 1
            else:
                inv_doc_freq[i] = 1

    if MODE != "Dev":
        for d in dev['songs'] + dev['tags']:
            for i in d:
                if i in inv_doc_freq:
                    inv_doc_freq[i] += 1
                else:
                    inv_doc_freq[i] = 1

    if MODE != "Test":
        for d in test_res['songs'] + test_res['tags']:
            for i in d:
                if i in inv_doc_freq:
                    inv_doc_freq[i] += 1
                else:
                    inv_doc_freq[i] = 1

    for k in inv_doc_freq:
        if MODE == "Valid":
            inv_doc_freq[k] = math.log10(
                (len(train) + len(test) + len(dev) + len(test_res)) /
                inv_doc_freq[k])
        elif MODE == "Dev":
            inv_doc_freq[k] = math.log10(
                (len(train) + len(test) + len(test_res)) / inv_doc_freq[k])
        else:
            inv_doc_freq[k] = math.log10(
                (len(train) + len(test) + len(dev)) / inv_doc_freq[k])
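    # Each song/tag now carries an IDF weight of log10(N / doc_freq), where N is the
    # number of playlists pooled for this MODE; e.g. an item appearing in 100 of
    # 1,000,000 playlists gets log10(1000000 / 100) = 4.0.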

    # Preprocessing data for CF matrix
    if MODE == "Valid":
        n_train = len(train) + len(dev) + len(test_res)
    elif MODE == "Dev":
        n_train = len(train) + len(test_res)
    else:
        n_train = len(train) + len(dev)
    n_test = len(test)

    # train + test
    if MODE == "Valid":
        plylst = pd.concat([train, dev, test_res, test], ignore_index=True)
    elif MODE == "Dev":
        plylst = pd.concat([train, test_res, test], ignore_index=True)
    else:
        plylst = pd.concat([train, dev, test], ignore_index=True)

    # playlist id
    plylst["nid"] = range(n_train + n_test)

    # nid -> id
    plylst_nid_id = dict(zip(plylst["nid"], plylst["id"]))

    plylst_tag = plylst['tags']
    tag_counter = Counter([tg for tgs in plylst_tag for tg in tgs])
    tag_dict = {x: tag_counter[x] for x in tag_counter}

    id_type = dict()

    tag_id_tid = dict()
    tag_tid_id = dict()
    for i, t in enumerate(tag_dict):
        tag_id_tid[t] = i
        tag_tid_id[i] = t
        id_type[t] = 1

    n_tags = len(tag_dict)

    plylst_song = plylst['songs']
    song_counter = Counter([sg for sgs in plylst_song for sg in sgs])
    song_dict = {x: song_counter[x] for x in song_counter}

    song_id_sid = dict()
    song_sid_id = dict()
    for i, t in enumerate(song_dict):
        song_id_sid[t] = i
        song_sid_id[i] = t
        id_type[t] = 1

    n_songs = len(song_dict)

    plylst_st = plylst['songs'] + plylst['tags']
    st_counter = Counter([st for sts in plylst_st for st in sts])
    st_dict = {x: st_counter[x] for x in st_counter}

    st_id_tid = dict()
    st_tid_id = dict()
    for i, t in enumerate(st_dict):
        st_id_tid[t] = i
        st_tid_id[i] = t

    n_sts = len(st_dict)

    print("Tags : ", n_tags, ", Songs : ", n_songs, ", Total : ", n_sts)

    plylst['songs_id'] = plylst['songs'].map(
        lambda x:
        [song_id_sid.get(s) for s in x if song_id_sid.get(s) != None])
    plylst['tags_id'] = plylst['tags_org'].map(
        lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None])
    plylst['sts_id'] = (plylst['songs'] + plylst['tags']).map(
        lambda x: [st_id_tid.get(st) for st in x if st_id_tid.get(st) != None])

    plylst_use = plylst[['nid', 'updt_date', 'songs_id', 'tags_id', 'sts_id']]
    plylst_use.loc[:, 'num_songs'] = plylst_use['songs_id'].map(len)
    plylst_use.loc[:, 'num_tags'] = plylst_use['tags_id'].map(len)
    plylst_use.loc[:, 'num_sts'] = plylst_use['sts_id'].map(len)
    plylst_use = plylst_use.set_index('nid')

    plylst_train = plylst_use.iloc[:, :]
    plylst_test = plylst_use.iloc[n_train:, :]

    n_train = len(plylst_train)

    np.random.seed(33)
    test_set = plylst_test
    print("The number of test samples : ", len(test_set))

    # Building CF matrices
    avg_len_songs = 0
    for songs in plylst_train['songs_id']:
        avg_len_songs += len(songs)
    avg_len_songs /= len(plylst_train['songs_id'])

    avg_len_tags = 0
    for tags in plylst_train['tags_id']:
        avg_len_tags += len(tags)
    avg_len_tags /= len(plylst_train['tags_id'])

    avg_len_sts = 0
    for sts in plylst_train['sts_id']:
        avg_len_sts += len(sts)
    avg_len_sts /= len(plylst_train['sts_id'])

    row = np.repeat(range(n_train), plylst_train['num_songs'])
    col = [song for songs in plylst_train['songs_id'] for song in songs]
    dat = [1 for songs in plylst_train['songs_id'] for song in songs]
    train_songs_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_songs))

    row = np.repeat(range(n_train), plylst_train['num_tags'])
    col = [tag for tags in plylst_train['tags_id'] for tag in tags]
    dat = [1 for tags in plylst_train['tags_id'] for tag in tags]
    train_tags_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_tags))

    row = np.repeat(range(n_train), plylst_train['num_sts'])
    col = [st for sts in plylst_train['sts_id'] for st in sts]
    dat = [
        inv_doc_freq[st_tid_id[st]] / (len(sts) + 50)
        for sts in plylst_train['sts_id'] for st in sts
    ]
    train_sts_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_sts))
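    # In train_sts_A each playlist row holds its songs+tags weighted by IDF and divided
    # by (playlist length + 50); the +50 smoothing keeps very short playlists from
    # getting disproportionately large weights, similar in spirit to BM25 length
    # normalisation.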

    train_songs_A_T = train_songs_A.T.tocsr()
    train_tags_A_T = train_tags_A.T.tocsr()

    # Building map playlist id to songs or tags for playlist2vec
    if MODE == "Valid":
        p2v_targets = [train, test, dev, test_res]
    elif MODE == "Dev":
        p2v_targets = [train, test, test_res]
    else:
        p2v_targets = [train, test, dev]

    song_dic = {}
    tag_dic = {}
    for i, q in tqdm(pd.concat(p2v_targets).iterrows()):
        song_dic[str(q['id'])] = q['songs']
        tag_dic[str(q['id'])] = q['tags_org']

    # Loading playlist embedding vectors
    p2v_song = WordEmbeddingsKeyedVectors.load(
        "arena_data/model/p2v_song.model")
    p2v_tag = WordEmbeddingsKeyedVectors.load("arena_data/model/p2v_tag.model")

    print("Predicting")
    res = []
    filtered_lot_song = []
    filtered_lot_tag = []
    for pid in tqdm(test_set.index):
        songs_already = test_set.loc[pid, "songs_id"]
        tags_already = test_set.loc[pid, "tags_id"]

        # Song prediction - 1. Query vector to predict songs
        p = np.zeros((n_sts, 1))
        if len(test_set.loc[pid, 'sts_id']) > 0:
            for st in test_set.loc[pid, 'sts_id']:
                if st_tid_id[st] in inv_doc_freq:
                    p[st] = inv_doc_freq[st_tid_id[st]] / (
                        len(test_set.loc[pid, 'sts_id']) + 50)

        # Song prediction - 2. K-nn playlists
        val = train_sts_A.dot(p).reshape(-1)

        val_idx = val.reshape(-1).argsort()[-250:][::-1]

        val_knn = np.zeros((n_train))
        val_knn[val_idx] = val[val_idx]

        val = val_knn**2
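        # Keep only the 250 most similar training playlists and square their similarity
        # scores so that closer neighbours dominate the weighted vote over candidates.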

        # Song prediction - 3. Candidates
        cand_song = train_songs_A_T.dot(val)

        # Song prediction - 4. Rescoring using playlist2vec
        dic_song_score = {}
        if str(plylst_nid_id[pid]) in p2v_song.wv.vocab:
            most_id = [
                x for x in p2v_song.most_similar(str(plylst_nid_id[pid]),
                                                 topn=50)
            ]
            for ID in most_id:
                for s in song_dic[ID[0]]:
                    if s in dic_song_score:
                        dic_song_score[s] += ID[1]
                    else:
                        dic_song_score[s] = ID[1]

        for k in dic_song_score:
            cand_song[song_id_sid[k]] *= dic_song_score[k]**0.2

        cand_song_idx = cand_song.reshape(-1).argsort()[-5000:][::-1]

        # Song prediction - 5. Filtering by score and date
        cand_song_idx_filtered = []
        for cand in cand_song_idx:
            if cand_song[cand] > 0 and song_date[song_sid_id[
                    cand]] <= test_date[plylst_nid_id[pid]][:4] + test_date[
                        plylst_nid_id[pid]][5:7] + test_date[
                            plylst_nid_id[pid]][8:10]:
                cand_song_idx_filtered.append(cand)
        if len(cand_song_idx_filtered) < 400:
            filtered_lot_song.append(len(cand_song_idx_filtered))
        cand_song_idx = np.array(cand_song_idx_filtered)

        # Song prediction - 6. Rescoring using heuristics
        dict_score = {}
        for idx in cand_song_idx:
            dict_score[idx] = cand_song[idx]

        mean_doc_freq = 0
        std_doc_freq = 0
        list_doc_freq = []
        mean_song_date = 0
        list_song_date = []
        if len(test_set.loc[pid, "songs_id"]) > 0:
            for t in test_set.loc[pid, "songs_id"]:
                if song_sid_id[t] in inv_doc_freq:
                    list_doc_freq.append(inv_doc_freq[song_sid_id[t]])
                song_d = int(song_date[song_sid_id[t]])
                if song_d > 19000000 and song_d < 20210000:
                    list_song_date.append(song_d)
            if len(list_doc_freq) > 0:
                mean_doc_freq = np.mean(list_doc_freq)
                std_doc_freq = np.std(list_doc_freq)
            if len(list_song_date) > 0:
                mean_song_date = np.mean(list_song_date)

        # Song prediction - 6-1. Rescoring by IDF comparison
        if len(list_doc_freq) > 0:
            for c in dict_score:
                if song_sid_id[c] in inv_doc_freq:
                    dict_score[c] = 1 / (
                        len(list_doc_freq)**0.5) * dict_score[c] + (
                            1 - 1 /
                            (len(list_doc_freq)**0.5)) * dict_score[c] * 2 / (
                                np.abs(inv_doc_freq[song_sid_id[c]] -
                                       mean_doc_freq) / (std_doc_freq + 1) + 2)
                else:
                    dict_score[c] = 1 / (len(list_doc_freq)**
                                         0.5) * dict_score[c]

        # Song prediction - 6-2. Rescoring by Date comparison
        if len(list_song_date) > 0:
            for c in dict_score:
                song_d = int(song_date[song_sid_id[c]])
                if song_d > 19000000 and song_d < 20210000:
                    dict_score[c] = 1 / (
                        len(list_song_date)**0.5) * dict_score[c] + (
                            1 - 1 /
                            (len(list_song_date)**0.5)) * dict_score[c] / (
                                np.abs(song_d - mean_song_date) / 500000 + 1)
                else:
                    dict_score[c] = 1 / (len(list_song_date)**
                                         0.5) * dict_score[c]
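        # Both rescoring steps blend the raw CF score with a penalised one: with n seed
        # songs the new score is (1/sqrt(n)) * score + (1 - 1/sqrt(n)) * score * penalty,
        # where the penalty (<= 1) shrinks candidates whose IDF or issue date is far from
        # the mean of the seed songs; more seeds shift weight toward the heuristic term.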

        score_sorted = sorted(dict_score.items(),
                              key=lambda x: x[1],
                              reverse=True)

        cand_song_idx = []
        for t in score_sorted:
            cand_song_idx.append(t[0])
        cand_song_idx = np.array(cand_song_idx)

        cand_song_idx = cand_song_idx[np.isin(cand_song_idx, songs_already) ==
                                      False][:300]
        rec_song_idx = [song_sid_id[i] for i in cand_song_idx]

        # Tag prediction - 1. Query vector to predict tags
        p = np.zeros((n_sts, 1))
        p[test_set.loc[pid, 'sts_id']] = 1

        # Tag prediction - 2. K-nn playlists
        val = train_sts_A.dot(p).reshape(-1)

        val_idx = val.reshape(-1).argsort()[-250:][::-1]

        val_knn = np.zeros((n_train))
        val_knn[val_idx] = val[val_idx]

        val = val_knn**2

        # Tag prediction - 3. Candidates
        cand_tag = train_tags_A_T.dot(val)

        # Tag prediction - 4. Rescoring using playlist2vec
        dic_tag_score = {}
        if str(plylst_nid_id[pid]) in p2v_tag.wv.vocab:
            most_id = [
                x
                for x in p2v_tag.most_similar(str(plylst_nid_id[pid]), topn=50)
            ]
            for ID in most_id:
                for t in tag_dic[ID[0]]:
                    if t in dic_tag_score:
                        dic_tag_score[t] += ID[1]
                    else:
                        dic_tag_score[t] = ID[1]

        for k in dic_tag_score:
            cand_tag[tag_id_tid[k]] *= dic_tag_score[k]**0.5

        cand_tag_idx = cand_tag.reshape(-1).argsort()[-35:][::-1]

        # Tag prediction - 5. Filtering by score
        cand_tag_idx_filtered = []
        for cand in cand_tag_idx:
            if cand_tag[cand] > 0:
                cand_tag_idx_filtered.append(cand)
        if len(cand_tag_idx_filtered) != 35:
            filtered_lot_tag.append(len(cand_tag_idx_filtered))
        cand_tag_idx = np.array(cand_tag_idx_filtered)

        cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) ==
                                    False][:30]
        rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

        res.append({
            "id": plylst_nid_id[pid],
            "songs": rec_song_idx,
            "tags": rec_tag_idx
        })

    print(len(filtered_lot_song), filtered_lot_song)
    print(len(filtered_lot_tag), filtered_lot_tag)

    write_json(res, "results/" + opt["results_path"])

    if opt["eval"]:
        evaluator = CustomEvaluator()
        evaluator.evaluate("arena_data/answers/val.json",
                           "arena_data/results/" + opt["results_path"])
Esempio n. 20
0
    def __call__(self, docs, summaries):
        tfs = []
        df = OrderedDict()
        weights = []
        entities = []
        for doc in docs:
            tf = OrderedDict()
            token_found = set()
            doc_token = []
            for sent in doc[1]:
                sent = sent2tokens_wostop(sent, set(stopwords.words(LANGUAGE)), LANGUAGE)
                # Accumulate the sentence embedding over all of its tokens; initialising
                # it inside the token loop would keep only the last token's vector.
                embedding = np.zeros(300)
                for token in sent:
                    if token in tf:
                        tf[token] += 1
                    else:
                        tf[token] = 1
                    if token not in token_found:
                        token_found.add(token)
                        if token in df:
                            df[token] += 1
                        else:
                            df[token] = 1
                    try:
                        embedding += self.word_embedding[token]
                    except KeyError:
                        pass
                # Note: this normalises by the vector dimension (300) rather than the token
                # count; the constant scale does not affect the cosine similarities used later.
                embedding /= len(embedding)
                weights.append(embedding)
                entities.append(str(len(entities)))
            tfs.append(tf)

        id2word = {i:word for i, word in enumerate(df.keys())}
        word2id = {id2word[id]:id for id in id2word.keys()}
        corpora = [[(word2id[token], tf[token]) for token in tf.keys()] for tf in tfs]
        self.doc_entities = []
        for i, tf in enumerate(tfs):
            divisor = sum([tf[token]/df[token] for token in tf.keys()])
            embedding = []
            for token in tf.keys():
                try:
                    embedding.append(self.word_embedding[token]*tf[token]/df[token])
                except KeyError:
                    pass
            embedding = np.sum(np.array(embedding), 0)/(len(embedding)*divisor)
            weights.append(embedding)
            entities.append('d'+str(i))
            self.doc_entities.append('d'+str(i))

        self.lda = LdaModel(corpus=corpora, num_topics=10, id2word=id2word, passes=10)

        self.topic_entities = []
        for i in range(10):
            topic_words = self.lda.show_topic(i, topn=30)
            embedding = []
            divisor = sum([w_p_pair[1] for w_p_pair in topic_words])
            for w_p_pair in topic_words:
                try:
                    embedding.append(self.word_embedding[w_p_pair[0]]*w_p_pair[1]/divisor)
                except KeyError:
                    pass
            embedding = np.sum(np.array(embedding), 0)/len(embedding)
            weights.append(embedding)
            entities.append('t'+str(i))
            self.topic_entities.append('t'+str(i))
        self.sent_embedding = WordEmbeddingsKeyedVectors(300)
        self.sent_embedding.add(entities, np.array(weights), replace=True)
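        # sent_embedding now holds three kinds of entities in one vector space: plain
        # numeric ids for sentences, 'd<i>' for document centroids and 't<i>' for LDA
        # topic centroids, so cosine similarities between any of them can be queried.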

        return self.distributional_semantic_similarity(summaries), self.topic_relevance(summaries), self.coherence(summaries)
Esempio n. 21
0
def run(total_concat, apply_data):
    total_concat['id'] = total_concat['id'].astype(str)
    c = Counter()
    for i in total_concat['tags']:
        c.update(i)
    tag_list = list(
        map(lambda y: y[0], (filter(lambda x: x[1] > 5, c.items()))))

    # Escape each tag so regex metacharacters inside tag text cannot corrupt the pattern.
    p = re.compile('|'.join(map(re.escape, tag_list)))

    total_concat['tag_in_title'] = total_concat['plylst_title'].apply(
        lambda x: p.findall(x))

    data = []
    for i in total_concat.index:
        temp = total_concat.loc[i]
        data.append({
            'id': temp['id'],
            'songs': temp['songs'],
            'tags': temp['tags'],
            'tag_in_title': temp['tag_in_title']
        })
    song_dic = {}
    tag_dic = {}
    for q in data:
        song_dic[q['id']] = q['songs']
        tag_dic[q['id']] = q['tags']
    total = list(
        map(lambda x: list(map(str, x['songs'])) + x['tags'] + x['tag_in_title'],
            data))
    total = [x for x in total if len(x) > 1]

    print("start training item2Vec")
    size = 300
    if 'item2vec.model' in os.listdir():
        w2v_model = Word2Vec.load('item2vec.model')
    else:
        w2v_model = train.item2vec(total, size=size)
    print("done. \n")
    p2v_model = WordEmbeddingsKeyedVectors(size)
    ID = []
    vec = []
    for q in data:
        tmp_vec = 0
        for song in list(map(str, q['songs'])) + q['tags'] + q['tag_in_title']:
            try:
                tmp_vec += w2v_model.wv.get_vector(song)
            except KeyError:
                pass
        if type(tmp_vec) != int:
            ID.append(str(q['id']))
            vec.append(tmp_vec)
    p2v_model.add(ID, vec)
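    # Each playlist vector is the plain sum of the Word2Vec vectors of its songs, tags
    # and title-derived tags; the missing normalisation does not change the cosine
    # similarities used by most_similar below.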

    with open("./arena_data/pre_tag.json", encoding="utf-8") as f:
        our_best = json.load(f)

    not_in = 0
    answers = []
    for i, q in enumerate(apply_data.index):
        q = apply_data.loc[q]
        try:
            most_id = [
                x[0] for x in p2v_model.most_similar(str(q['id']), topn=200)
            ]
            get_song = []
            get_tag = []
            for ID in most_id:
                get_song += song_dic[ID]
                get_tag += tag_dic[ID]
            get_song = list(pd.value_counts(get_song)[:300].index)
            get_tag = list(pd.value_counts(get_tag)[:30].index)

            output_song = remove_seen(q["songs"], get_song)[:100]
            output_tag = remove_seen(q["tags"], get_tag)[:10]

            answers.append({
                "id": q["id"],
                "songs": output_song,
                "tags": output_tag,
            })
        except KeyError:
            not_in += 1
            answers.append({
                "id": our_best[i]["id"],
                "songs": our_best[i]['songs'],
                "tags": our_best[i]["tags"],
            })

    for n, q in enumerate(answers):
        if len(q['songs']) != 100:
            answers[n]['songs'] += remove_seen(
                q['songs'], our_best[n]['songs'])[:100 - len(q['songs'])]
        if len(q['tags']) != 10:
            answers[n]['tags'] += remove_seen(
                q['tags'], our_best[n]['tags'])[:10 - len(q['tags'])]
    write_json(answers, 'final_tags.json')
    return answers
Esempio n. 22
0
class PlaylistEmbedding:
    # __init__, which plays the role of a constructor as in Java
    def __init__(self, FILE_PATH):
        self.FILE_PATH = FILE_PATH

        # Word2Vec hyperparameters
        # An item must appear at least min_count times to be learned.
        self.min_count = 2
        # Use 150-dimensional vectors to hold the semantics.
        self.size = 150
        # Train with a context window of 210 items on either side of the centre item.
        self.window = 210
        # sg = 1 means skip-gram, otherwise CBOW
        self.sg = 1

        # Stores keys + vectors
        # KeyedVectors does not support further training, but is smaller and uses less RAM.
        self.p2v_model = WordEmbeddingsKeyedVectors(self.size)

        # Unicode Hangul syllable block: start 44032, end 55199
        self.BASE_CODE, self.CHOSUNG, self.JUNGSUNG = 44032, 588, 28

        # Initial consonants (choseong), indices 0-18
        self.CHOSUNG_LIST = [
            'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ',
            'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
        ]

        # Medial vowels (jungseong), indices 0-20
        self.JUNGSUNG_LIST = [
            'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ',
            'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ'
        ]

        # Final consonants (jongseong), indices 0-27
        self.JONGSUNG_LIST = [
            '', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ',
            'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ',
            'ㅍ', 'ㅎ'
        ]

        # Holds the jamo-decomposed playlist titles from train.
        self.title_list_detach = []

        # The directory at FILE_PATH must contain train.json, test.json, most_popular_res.json and song_meta.json.
        with open(FILE_PATH + '/train.json', encoding="utf-8") as f:
            self.train = json.load(f)
        with open(FILE_PATH + '/test.json', encoding="utf-8") as f:
            self.val = json.load(f)
        with open(FILE_PATH + '/most_popular_res.json', encoding="utf-8") as f:
            self.most_results = json.load(f)
        # Load the song_meta data.
        with open(FILE_PATH + '/song_meta.json', encoding="utf-8") as f:
            self.song_meta = json.load(f)

    def write_json(self, data, fname):
        def _conv(o):
            if isinstance(o, (np.int64, np.int32)):
                return int(o)
            raise TypeError

        parent = os.path.dirname(fname)
        distutils.dir_util.mkpath("./arena_data/" + parent)
        with io.open("./arena_data/" + fname, "w", encoding="utf-8") as f:
            json_str = json.dumps(data, ensure_ascii=False, default=_conv)
            f.write(json_str)

    def remove_seen(self, seen, l):
        seen = set(seen)
        return [x for x in l if not (x in seen)]

    # Store the songs and tags of train and val in dictionaries keyed by playlist id
    def get_dic(self, train, val):
        song_dic = {}
        tag_dic = {}
        data = train + val
        for q in tqdm(data):
            song_dic[str(q['id'])] = q['songs']
            tag_dic[str(q['id'])] = q['tags']
        self.song_dic = song_dic
        self.tag_dic = tag_dic

        # total is built from data (train + val), so the Word2Vec model sees both train and val items.
        total = list(
            map(lambda x: list(map(str, x['songs'])) + list(x['tags']), data))
        total = [x for x in total if len(x) > 1]
        self.total = total

    def get_w2v(self, total, min_count, size, window, sg):
        try:
            print("get_w2v 실행")
            if not (os.path.isfile("./w2v_model.model")):
                print("get_w2v 모델 학습 시작")
                # window가 210인 이유는 태그 10개와 곡 200개 꽉차있는 플레이리스트도 존재하기 때문이다.
                w2v_model = Word2Vec(total,
                                     min_count=min_count,
                                     size=size,
                                     window=window,
                                     sg=sg,
                                     iter=25)
                print("get_w2v 모델 학습 완료")
                self.w2v_model = w2v_model
                w2v_model.save("w2v_model.model")
            print("w2v_model 모델 로드")
            self.w2v_model = Word2Vec.load("./w2v_model.model")
        except OSError as e:
            print("failed to create directory!")
            raise

    def update_p2v(self, train, val, w2v_model):
        ID = []
        vec = []
        # Counters to check how many songs/tags from val hit the except branch because
        # they do not exist in the trained vocabulary
        # This part can be removed later
        self.yes_songs_count = 0
        self.yes_tags_count = 0
        self.no_songs_count = 0
        self.no_tags_count = 0
        for q in tqdm(train + val):
            tmp_vec = 0
            songs_vec = 0
            tags_vec = 0
            if len(q['songs']) >= 1 or len(q['tags']) >= 1:
                for x in q['songs']:
                    try:
                        songs_vec += w2v_model.wv.get_vector(str(x))
                        self.yes_songs_count += 1
                    except:
                        self.no_songs_count += 1
                for y in q['tags']:
                    try:
                        tags_vec += w2v_model.wv.get_vector(str(y))
                        self.yes_tags_count += 1
                    except:
                        self.no_tags_count += 1
                tmp_vec = songs_vec + tags_vec
            if type(tmp_vec) != int:
                ID.append(str(q['id']))
                vec.append(tmp_vec)
        # Register the vector computed for each playlist id of train and val
        self.p2v_model.add(ID, vec)

        # FastText

    def get_title(self, train):
        title_list = []
        for q in train:
            title_list.append(q['plylst_title'])
        self.title_list = title_list

    def jamo_str(self, text, BASE_CODE, CHOSUNG, JUNGSUNG, CHOSUNG_LIST,
                 JUNGSUNG_LIST, JONGSUNG_LIST):
        def clean_str(text):
            pattern = '([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'  # remove e-mail addresses
            text = re.sub(pattern=pattern, repl='', string=text)
            pattern = '(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'  # remove URLs
            text = re.sub(pattern=pattern, repl='', string=text)
            pattern = '([ㄱ-ㅎㅏ-ㅣ]+)'  # remove isolated Hangul consonants and vowels
            text = re.sub(pattern=pattern, repl=' ', string=text)
            pattern = '<[^>]*>'  # remove HTML tags
            text = re.sub(pattern=pattern, repl=' ', string=text)
            pattern = '[^\w\s]'  # remove special characters
            text = re.sub(pattern=pattern, repl=' ', string=text)
            return text

        string = text
        string = clean_str(string)
        # print(string)
        sp_list = list(string)
        # print(sp_list)

        result = []
        for keyword in sp_list:
            # check whether the character is Hangul and, if so, decompose it
            if re.match('.*[ㄱ-ㅎㅏ-ㅣ가-힣]+.*', keyword) is not None:

                if keyword == ' ':
                    result.append(' ')

                if keyword in CHOSUNG_LIST or keyword in JUNGSUNG_LIST or keyword in JONGSUNG_LIST:
                    result.append('')

                else:
                    # initial consonant: ord() gives the character's code point
                    char_code = ord(keyword) - BASE_CODE
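                    # e.g. for '한': ord('한') - 44032 = 10588; 10588 // 588 = 18 -> 'ㅎ',
                    # (10588 - 18*588) // 28 = 0 -> 'ㅏ', and the remainder 4 -> 'ㄴ'.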
                    char1 = int(char_code / CHOSUNG)
                    result.append(CHOSUNG_LIST[char1])

                    # medial vowel
                    char2 = int((char_code - (CHOSUNG * char1)) / JUNGSUNG)
                    result.append(JUNGSUNG_LIST[char2])

                    # final consonant
                    char3 = int(
                        (char_code - (CHOSUNG * char1) - (JUNGSUNG * char2)))
                    if char3 == 0:
                        result.append('-')
                    result.append(JONGSUNG_LIST[char3])
            else:
                result.append(keyword)
        results_all = []
        results_all = ("".join(result))
        self.results_all = results_all

    def get_title_list(self, results_all):
        # print("".join(result)) #자모 분리 결과 출력
        title_list_detach = []
        title_list_detach.append(results_all)
        self.title_list_detach.append(title_list_detach)

    def make_title_model(self, title_list_detach):
        try:
            print("make_title_model 실행")
            if not (os.path.isfile("./FT_title_model.model")):
                print("make_title_model 모델 학습 시작")
                FT_title_model = FT_gensim(title_list_detach,
                                           size=300,
                                           window=100,
                                           min_count=1,
                                           sg=1,
                                           iter=2000)
                print("make_title_model2 모델 학습 완료")
                self.FT_title_model = FT_title_model
                FT_title_model.save("FT_title_model.model")
            self.FT_title_model = FT_gensim.load("./FT_title_model.model")
            print("make_title_model 모델 로드됨")
        except OSError as e:
            print("failed to create directory!")
            raise

    # End of FastText

    def get_result(self, p2v_model, song_dic, tag_dic, most_results, val,
                   train, FT_title_model, song_meta):
        title_sentence_train = []
        for x in train:
            self.jamo_str(x['plylst_title'], self.BASE_CODE, self.CHOSUNG,
                          self.JUNGSUNG, self.CHOSUNG_LIST, self.JUNGSUNG_LIST,
                          self.JONGSUNG_LIST)
            title_sentence_train.append(self.results_all)
        answers = []
        # Counters to check that the process is running as intended
        # Is most_id being picked correctly?
        self.most_id = []
        # How many playlists were recommended properly by ply_embedding
        self.p2v_count = 0
        # How many playlists fell into the except branch
        self.except_count = 0
        # Move this counter around if you want to know exactly where processing stops
        self.when_stop = 0

        # Counters for how many playlists fall into each question type
        self.TNSN = 0
        self.TYSN = 0
        self.TNSY = 0
        self.TYSY = 0

        # Counters for playlists whose songs/tags had to be topped up to 100/10
        self.update_song_count = 0
        self.update_tag_count = 0

        for n, q in tqdm(enumerate(val), total=len(val)):
            # Check which of title, songs and tags are present and count them
            songs = q['songs']
            tags = q['tags']
            songs_count = len(songs)
            tags_count = len(tags)
            try:
                # Playlist-embedding algorithm (used to recommend songs from seed songs)
                def ply_em(q):
                    most_id = [
                        x[0]
                        for x in p2v_model.most_similar(str(q['id']), topn=15)
                    ]
                    # most_vec = [x[1] for x in p2v_model.most_similar(str(q['id']), topn=15)]

                    # original
                    get_song = []
                    get_tag = []

                    for ID in most_id:
                        get_song += song_dic[ID]
                        get_tag += tag_dic[ID]

                    count = {}
                    for i in get_song:
                        try:
                            count[i] += 1
                        except:
                            count[i] = 1
                    count = sorted(count.items(),
                                   key=lambda x: x[1],
                                   reverse=True)

                    count2 = {}
                    for i in get_tag:
                        try:
                            count2[i] += 1
                        except:
                            count2[i] = 1
                    count2 = sorted(count2.items(),
                                    key=lambda x: x[1],
                                    reverse=True)

                    self.when_stop += 1

                    real_get_song = []
                    real_get_tag = []

                    for song in count:
                        real_get_song.append(song[0])

                    for tag in count2:
                        real_get_tag.append(tag[0])

                    # get_song = list(pd.value_counts(get_song)[:500].index)
                    # get_tag = list(pd.value_counts(get_tag)[:20].index)

                    def to_integer(dt_time):
                        return 10000 * dt_time.year + 100 * dt_time.month + dt_time.day

                    utc_time = datetime.strptime(q['updt_date'][:26],
                                                 '%Y-%m-%d %H:%M:%S.%f')
                    updt = int(to_integer(utc_time))
                    true_get_song = []
                    for song_id in real_get_song:
                        issue = int(song_meta[song_id]['issue_date'])
                        if updt - issue >= 0:
                            true_get_song.append(song_id)
                        else:
                            pass

                    answers.append({
                        "id":
                        q["id"],
                        "songs":
                        self.remove_seen(q["songs"], true_get_song)[:100],
                        "tags":
                        self.remove_seen(q["tags"], real_get_tag)[:10],
                    })
                    # If we get this far, increment the counter
                    self.p2v_count += 1

                    # FastText algorithm

                def fasttext_title(q):

                    train_ids = []
                    get_song = []
                    get_tag = []

                    self.jamo_str(q['plylst_title'], self.BASE_CODE,
                                  self.CHOSUNG, self.JUNGSUNG,
                                  self.CHOSUNG_LIST, self.JUNGSUNG_LIST,
                                  self.JONGSUNG_LIST)
                    title = self.results_all

                    F_list = FT_title_model.wv.most_similar(title, topn=60)
                    for x in F_list:
                        number = title_sentence_train.index(x[0])
                        train_ids.append(train[number]['id'])

                    for ids in train_ids:
                        get_song += song_dic[str(ids)]
                        get_tag += tag_dic[str(ids)]

                    count = {}
                    for i in get_song:
                        try:
                            count[i] += 1
                        except:
                            count[i] = 1
                    count = sorted(count.items(),
                                   key=lambda x: x[1],
                                   reverse=True)

                    count2 = {}
                    for i in get_tag:
                        try:
                            count2[i] += 1
                        except:
                            count2[i] = 1
                    count2 = sorted(count2.items(),
                                    key=lambda x: x[1],
                                    reverse=True)

                    real_get_song = []
                    real_get_tag = []

                    for song in count:
                        real_get_song.append(song[0])

                    for tag in count2:
                        real_get_tag.append(tag[0])

                    # get_song = list(pd.value_counts(real_get_song)[:200].index)
                    # get_tag = list(pd.value_counts(real_get_tag)[:20].index)

                    def to_integer(dt_time):
                        return 10000 * dt_time.year + 100 * dt_time.month + dt_time.day

                    utc_time = datetime.strptime(q['updt_date'][:26],
                                                 '%Y-%m-%d %H:%M:%S.%f')
                    updt = int(to_integer(utc_time))
                    true_get_song = []
                    for song_id in real_get_song:
                        issue = int(song_meta[song_id]['issue_date'])
                        if updt - issue >= 0:
                            true_get_song.append(song_id)
                        else:
                            pass

                    answers.append({
                        "id":
                        q["id"],
                        "songs":
                        self.remove_seen(q["songs"], true_get_song)[:100],
                        "tags":
                        self.remove_seen(q["tags"], real_get_tag)[:10],
                    })

                # Tags: no, Songs: no, Title: yes
                if tags_count == 0 and songs_count == 0:
                    self.TNSN += 1
                    fasttext_title(q)

                # Tags: yes, Songs: no, Title: no
                elif tags_count > 0 and songs_count == 0:
                    self.TYSN += 1
                    fasttext_title(q)

                # Tags: no, Songs: yes
                elif tags_count == 0 and songs_count > 0:
                    self.TNSY += 1
                    ply_em(q)

                # Tags: yes, Songs: yes
                elif tags_count > 0 and songs_count > 0:
                    self.TYSY += 1
                    ply_em(q)

            except:
                # If an exception occurs, increment the counter
                self.except_count += 1
                answers.append({
                    "id": q["id"],
                    "songs": most_results[n]['songs'],
                    "tags": most_results[n]["tags"],
                })

        # check and update answer
        for n, q in enumerate(answers):
            if len(q['songs']) != 100:
                answers[n]['songs'] += self.remove_seen(
                    q['songs'],
                    self.most_results[n]['songs'])[:100 - len(q['songs'])]
                self.update_song_count += 1
            if len(q['tags']) != 10:
                answers[n]['tags'] += self.remove_seen(
                    q['tags'],
                    self.most_results[n]['tags'])[:10 - len(q['tags'])]
                self.update_tag_count += 1
        self.answers = answers

    def run(self):
        # Word2Vec ply_embedding
        self.get_dic(self.train, self.val)
        self.get_w2v(self.total, self.min_count, self.size, self.window,
                     self.sg)
        self.update_p2v(self.train, self.val, self.w2v_model)

        # FastText ply_title
        self.get_title(self.train)
        for string in self.title_list:
            self.jamo_str(string, self.BASE_CODE, self.CHOSUNG, self.JUNGSUNG,
                          self.CHOSUNG_LIST, self.JUNGSUNG_LIST,
                          self.JONGSUNG_LIST)
            self.get_title_list(self.results_all)
        self.make_title_model(self.title_list_detach)

        # Fill in the songs and tags
        self.get_result(self.p2v_model, self.song_dic, self.tag_dic,
                        self.most_results, self.val, self.train,
                        self.FT_title_model, self.song_meta)

        self.write_json(self.answers, 'results.json')
        print("results 작성 완료")

    def train_model(self):
        # Word2Vec ply_embedding
        self.get_dic(self.train, self.val)
        self.get_w2v(self.total, self.min_count, self.size, self.window,
                     self.sg)
        self.update_p2v(self.train, self.val, self.w2v_model)

        # FastText ply_title
        self.get_title(self.train)
        for string in self.title_list:
            self.jamo_str(string, self.BASE_CODE, self.CHOSUNG, self.JUNGSUNG,
                          self.CHOSUNG_LIST, self.JUNGSUNG_LIST,
                          self.JONGSUNG_LIST)
            self.get_title_list(self.results_all)
        self.make_title_model(self.title_list_detach)
Esempio n. 23
0
def train():
    MODE = "Test"
    if MODE == "Valid":
        train = load_json("arena_data/orig/train.json") + load_json(
            "arena_data/questions/val.json")
        dev = load_json("res/val.json")
        test = load_json("res/test.json")
    else:
        train = load_json("res/train.json")
        dev = load_json("res/val.json")
        test = load_json("res/test.json")

    pred_tag = load_json("arena_data/model/pred_tag.json")
    dic_pred_tag = {}
    for p_t in pred_tag:
        dic_pred_tag[p_t['id']] = p_t['predict_tag']

    for doc in train:
        doc['tags_org'] = doc['tags'].copy()
        doc['tags'] += dic_pred_tag[doc['id']]

    for doc in dev:
        doc['tags_org'] = doc['tags'].copy()
        doc['tags'] += dic_pred_tag[doc['id']]

    for doc in test:
        doc['tags_org'] = doc['tags'].copy()
        doc['tags'] += dic_pred_tag[doc['id']]

    item_list = []
    len_item = []

    for doc in train + dev + test:
        song_list = []
        for i in doc['songs']:
            song_list.append(str(i))
        item_list.append(song_list + doc['tags'])
        len_item.append(len(song_list + doc['tags']))
    print("Max length of item list :", max(len_item), ", Min :", min(len_item))
    item_list = [x for x in item_list if len(x) > 1]
    print("Train set :", len(item_list))

    print("Training Item2Vec model")
    SIZE = 100
    model = Word2Vec(sentences=item_list,
                     size=SIZE,
                     window=240,
                     min_count=2,
                     sg=1,
                     workers=8,
                     iter=10,
                     negative=7,
                     compute_loss=True,
                     callbacks=[LossPrinter()])
    model.save("arena_data/model/word2vec.model")
    print("Vocab : ", len(model.wv.vocab))

    print("Building and saving playlist embeddings")
    song_dic = {}
    tag_dic = {}
    for q in tqdm(train + test + dev):
        song_dic[str(q['id'])] = q['songs']
        tag_dic[str(q['id'])] = q['tags_org']

    p2v_song = WordEmbeddingsKeyedVectors(SIZE)
    ID = []
    vec = []
    for q in tqdm(train + test + dev):
        tmp_vec = 0
        cnt_vocab = 0
        if len(q['songs']) >= 1:
            for item in q['songs']:
                try:
                    tmp_vec += model.wv.get_vector(str(item)) * 2
                    cnt_vocab += 1
                except KeyError:
                    pass
        if len(q['tags']) >= 1:
            for item in q['tags']:
                try:
                    tmp_vec += model.wv.get_vector(str(item))
                    cnt_vocab += 1
                except KeyError:
                    pass
        if type(tmp_vec) != int:
            ID.append(str(q['id']))
            vec.append(tmp_vec)
    p2v_song.add(ID, vec)
    p2v_song.save("arena_data/model/p2v_song.model")

    p2v_tag = WordEmbeddingsKeyedVectors(SIZE)
    ID = []
    vec = []
    for q in tqdm(train + test + dev):
        tmp_vec = 0
        cnt_vocab = 0
        if len(q['songs']) >= 1:
            for item in q['songs']:
                try:
                    tmp_vec += model.wv.get_vector(str(item))
                    cnt_vocab += 1
                except KeyError:
                    pass
        if len(q['tags']) >= 1:
            for item in q['tags']:
                try:
                    tmp_vec += model.wv.get_vector(str(item)) * 2
                    cnt_vocab += 1
                except KeyError:
                    pass
        if type(tmp_vec) != int:
            ID.append(str(q['id']))
            vec.append(tmp_vec)
    p2v_tag.add(ID, vec)
    p2v_tag.save("arena_data/model/p2v_tag.model")

    if MODE == "Valid":
        print("Testing")
        questions = load_json("arena_data/questions/val.json")
        cnt_wv_song = 0
        cnt_wv_tag = 0
        res = []
        for q in tqdm(questions):
            dic_song_score = {}
            dic_tag_score = {}

            song_result = []
            tag_result = []

            if str(q['id']) in p2v_song.wv.vocab:
                most_id = [
                    x for x in p2v_song.most_similar(str(q['id']), topn=50)
                ]
                for ID in most_id:
                    for s in song_dic[ID[0]]:
                        if s in dic_song_score:
                            dic_song_score[s] += ID[1]
                        else:
                            dic_song_score[s] = ID[1]

            if str(q['id']) in p2v_tag.wv.vocab:
                most_id = [
                    x for x in p2v_tag.most_similar(str(q['id']), topn=50)
                ]
                # Loop over the retrieved neighbours; without this loop the code would
                # reuse whatever ID was left over from the song block above.
                for ID in most_id:
                    for t in tag_dic[ID[0]]:
                        if t in dic_tag_score:
                            dic_tag_score[t] += ID[1]
                        else:
                            dic_tag_score[t] = ID[1]

            if len(dic_song_score) > 0:
                sort_song_score = sorted(dic_song_score.items(),
                                         key=lambda x: x[1],
                                         reverse=True)

                for s in sort_song_score:
                    song_result.append(s[0])
                cnt_wv_song += 1

            if len(dic_tag_score) > 0:
                sort_tag_score = sorted(dic_tag_score.items(),
                                        key=lambda x: x[1],
                                        reverse=True)

                for s in sort_tag_score:
                    tag_result.append(s[0])
                cnt_wv_tag += 1

            res.append({
                "id": q["id"],
                "songs": remove_seen(q["songs"], song_result)[:100],
                "tags": remove_seen(q["tags"], tag_result)[:10],
            })

        print(len(questions), cnt_wv_song, cnt_wv_tag)

        ans = load_json("arena_data/answers/val.json")
        evaluator = CustomEvaluator()
        evaluator._evaluate(ans, res)
Esempio n. 24
0
w2v_model = Word2Vec(tot_repr,
                     min_count=min_count,
                     size=size,
                     window=window,
                     sg=sg,
                     workers=8,
                     hashfxn=hash)

#%% save the w2v model
with open(os.path.join(MODEL_PATH, '0007ky.w2v'), 'wb') as f:
    pickle.dump(w2v_model, f)

# In[ ]:

# make p2v model
p2v_model = WordEmbeddingsKeyedVectors(size)
#%%
tot = train + test + val
song_dic = {}
tag_dic = {}
for q in tqdm(tot):
    song_dic[q['id']] = q['songs']
    tag_dic[q['id']] = q['tags']

#%%
ID = []
vec = []
for q in tqdm(tot, leave=True, position=0):
    tmp_vec = 0
    if len(q['repr']) >= 1:
        for word in q['repr']: