Example #1
def compute_scaled_average_sim(early, later):
    # `words` is expected to be a module-level list aligned with the rows of
    # the `early` and `later` DataFrames of 300-d embeddings.
    M_early = WordEmbeddingsKeyedVectors(300)
    M_early.add(words, early.to_numpy())
    M_later = WordEmbeddingsKeyedVectors(300)
    M_later.add(words, later.to_numpy())

    scores = list()
    for word in words:
        score = 0
        early_values = M_early.most_similar(word, topn=20)
        later_values = M_later.most_similar(word, topn=20)
        early_dict = {v[0]:v[1] for v in early_values}
        later_dict = {v[0]:v[1] for v in later_values}
        overlap = early_dict.keys() & later_dict.keys()
        early_avg = 0
        later_avg = 0
        for entry in overlap:
            early_avg += early_dict[entry]
            later_avg += later_dict[entry]
        early_avg = early_avg / len(overlap) if len(overlap) else 0
        later_avg = later_avg / len(overlap) if len(overlap) else 0
        scores.append(len(overlap) + (1 - abs(later_avg - early_avg)))
    return scores
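A minimal usage sketch for the function above, assuming gensim 3.x (where WordEmbeddingsKeyedVectors still exists) and toy random embeddings in place of the real early/later data; the module-level `words` list is the one the function reads:

import numpy as np
import pandas as pd
from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors

# Toy stand-ins: a shared vocabulary and two random 300-d embedding matrices.
words = ["w%d" % i for i in range(50)]
early = pd.DataFrame(np.random.rand(50, 300))
later = pd.DataFrame(np.random.rand(50, 300))

scores = compute_scaled_average_sim(early, later)
print(scores[:5])  # one score per word: neighbour overlap plus similarity stability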
Example #2
def getRecos(liked, disliked, watched, threshold=1000):
    # NOTE: `threshold` is accepted but unused in this snippet; `df_movies`,
    # `w` (the embedding matrix) and `movie_embedding_size` are module-level.

    liked = [str(l) for l in liked]
    disliked = [str(l) for l in disliked]

    watched = [str(w) for w in watched
               if str(w) not in liked and str(w) not in disliked]

    df_restr = df_movies[~df_movies["movieId"].isin(watched)].sort_values(
        by="count", ascending=False)

    kv = WordEmbeddingsKeyedVectors(movie_embedding_size)
    kv.add(
        df_restr['movieId'].apply(str).values,
        w[df_restr.movieId]
    )

    idlist = [int(x[0]) for x in kv.most_similar(
        positive=liked, negative=disliked, restrict_vocab=4000, topn=12)]

    return getAll(idlist=idlist)
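A hedged sketch of the core call above, with toy data: `most_similar(positive=..., negative=...)` pulls results toward the liked vectors and away from the disliked ones, and `restrict_vocab` only searches the first N entries in insertion order, which is why the snippet sorts by popularity before kv.add:

import numpy as np
from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors

kv = WordEmbeddingsKeyedVectors(8)
kv.add([str(i) for i in range(100)], np.random.rand(100, 8))
# Recommend items near "1" and "2" but away from "3", searching only the
# first 50 entries (the most popular ones, if sorted that way before add).
print(kv.most_similar(positive=["1", "2"], negative=["3"], restrict_vocab=50, topn=5))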
Example #3
def createKVs(DFFile, COFIle, type):
    # createWordandVectorList()  # run once to create the word and vector lists for wiki 50

    # wordList - list of words
    # vectorList - list of the vector corresponding to the words

    wordListW2V, vectorListW2V = loadWordANdVectorsW2V()
    wordListPCA, vectorListPCA = loadWordANdVectorsPCA(DFFile, COFIle)

    w2v_len = 50
    PCA_len = 10
    kv1 = WordEmbeddingsKeyedVectors(w2v_len)
    kv2 = WordEmbeddingsKeyedVectors(PCA_len)

    kv1.add(wordListW2V, list(vectorListW2V))
    kv2.add(wordListPCA, vectorListPCA)
    filename = 'KV' + type + '.obj'
    with open(filename, "wb") as f:
        pickle.dump(kv1, f)
        pickle.dump(kv2, f)
    print(kv1.most_similar('love'))  # sanity check: words most similar to 'love'
    return filename
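Since createKVs pickles two objects into a single file, reading it back needs two sequential pickle.load calls in the same order; a hypothetical loader sketch:

import pickle

def loadKVs(filename):
    # Objects come back in the order they were dumped: the W2V keyed vectors
    # first, then the PCA ones.
    with open(filename, "rb") as f:
        kv_w2v = pickle.load(f)
        kv_pca = pickle.load(f)
    return kv_w2v, kv_pca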
Example #4
def make_prediction(favorite_movie):
    """
    Input:
    favorite_movie: the key (title string) of a movie the user likes.
    Builds keyed vectors over movies with at least `threshold` ratings and
    looks up the given movie's nearest neighbours in embedding space.
    Output:
    Returns a list of movie keys, most similar first.
    """

    movie = favorite_movie

    threshold = 100
    mainstream_movies = movies_df[
        movies_df.n_ratings >= threshold].reset_index(drop=True)

    # `w` is a module-level (n_movies, d) embedding matrix indexed by movieId.
    movie_embedding_size = w.shape[1]
    kv = WordEmbeddingsKeyedVectors(movie_embedding_size)
    kv.add(mainstream_movies['key'].values, w[mainstream_movies.movieId])

    results = kv.most_similar(movie)
    return [result[0] for result in results]
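A sketch of the module-level data make_prediction relies on, with toy shapes so the call is runnable (and assuming the gensim import used throughout these examples); `w` is assumed to be an (n_movies, d) embedding matrix whose rows line up with movieId:

import numpy as np
import pandas as pd

movies_df = pd.DataFrame({
    "movieId": range(5),
    "key": ["Movie A", "Movie B", "Movie C", "Movie D", "Movie E"],
    "n_ratings": [500, 200, 300, 150, 120],  # all pass the threshold of 100
})
w = np.random.rand(5, 16)  # row i is the embedding of movieId i

print(make_prediction("Movie A"))  # keys of the nearest movies in embedding space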
Example #5
def run(song_meta_data, train_data, test_data):
    train_data['updt_year'] = train_data['updt_date'].str.slice(start=0,
                                                                stop=4)
    test_data['updt_year'] = test_data['updt_date'].str.slice(start=0, stop=4)
    song_meta_data['issue_year'] = song_meta_data['issue_date'].str.slice(
        start=0, stop=4)
    song_meta_data['id'] = song_meta_data['id'].astype(str)

    print("Tokenize...")
    tokenize(train_data, test_data)

    train_data = train_data.sort_values(by='updt_date').reset_index(drop=True)
    test_data = test_data.sort_values(by='updt_date').reset_index(drop=True)
    print("Total Dict Loading")
    total_dict_paths = [
        BASE_DIR + 'model/' + name for name in
        ('total_data_final.pickle', 'song_dict_final.pickle',
         'tag_dict_final.pickle', 'title_dict_final.pickle')
    ]
    if all(os.path.exists(path) for path in total_dict_paths):
        with open(BASE_DIR + 'model/total_data_final.pickle', 'rb') as handle:
            total_data = pickle.load(handle)
        with open(BASE_DIR + 'model/song_dict_final.pickle', 'rb') as handle:
            song_dict = pickle.load(handle)
        with open(BASE_DIR + 'model/tag_dict_final.pickle', 'rb') as handle:
            tag_dict = pickle.load(handle)
        with open(BASE_DIR + 'model/title_dict_final.pickle', 'rb') as handle:
            title_dict = pickle.load(handle)
    else:
        print("Total Dict Not Existing... Calculating")
        total_data, song_dict, tag_dict, title_dict = getTotalDict(
            train_data, test_data)
        with open(BASE_DIR + 'model/total_data_final.pickle', 'wb') as handle:
            pickle.dump(total_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open(BASE_DIR + 'model/song_dict_final.pickle', 'wb') as handle:
            pickle.dump(song_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open(BASE_DIR + 'model/tag_dict_final.pickle', 'wb') as handle:
            pickle.dump(tag_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open(BASE_DIR + 'model/title_dict_final.pickle', 'wb') as handle:
            pickle.dump(title_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print("Frequency Loading...")
    if all(os.path.exists(BASE_DIR + path) for path in
           ('model/tag_freq_by_song.pickle', 'model/song_freq_by_tag.pickle')):
        with open(BASE_DIR + 'model/tag_freq_by_song.pickle', 'rb') as handle:
            tag_freq_by_song = pickle.load(handle)
        with open(BASE_DIR + 'model/song_freq_by_tag.pickle', 'rb') as handle:
            song_freq_by_tag = pickle.load(handle)
    else:
        print("Frequency Not Existing... Calculating")
        tag_freq_by_song, song_freq_by_tag = getFreqDict(train_data)
        with open(BASE_DIR + 'model/tag_freq_by_song.pickle', 'wb') as handle:
            pickle.dump(tag_freq_by_song,
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)
        with open(BASE_DIR + 'model/song_freq_by_tag.pickle', 'wb') as handle:
            pickle.dump(song_freq_by_tag,
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)

    print("Update Date Loading...")
    if os.path.exists(BASE_DIR + 'model/updt_dict.pickle'):
        with open(BASE_DIR + 'model/updt_dict.pickle', 'rb') as handle:
            updt_dict = pickle.load(handle)
    else:
        print("Update Date Not Existing... Calculating")
        updt_dict = getUpdtDict(song_meta_data)
        with open(BASE_DIR + 'model/updt_dict.pickle', 'wb') as handle:
            pickle.dump(updt_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print("Song Popularity Loading...")
    if all(os.path.exists(BASE_DIR + path) for path in
           ('model/popular_song_by_year.pickle',
            'model/popular_tag_by_year.pickle')):
        with open(BASE_DIR + 'model/popular_tag_by_year.pickle',
                  'rb') as handle:
            popular_tag_by_year = pickle.load(handle)
        with open(BASE_DIR + 'model/popular_song_by_year.pickle',
                  'rb') as handle:
            popular_song_by_year = pickle.load(handle)
    else:
        print("Song Popularity Not Existing... Calculating")
        popular_song_by_year, popular_tag_by_year = getPopularDict(train_data)
        with open(BASE_DIR + 'model/popular_tag_by_year.pickle',
                  'wb') as handle:
            pickle.dump(popular_tag_by_year,
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)
        with open(BASE_DIR + 'model/popular_song_by_year.pickle',
                  'wb') as handle:
            pickle.dump(popular_song_by_year,
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)

    print("Word2Vec Model Loading...")
    if os.path.exists(BASE_DIR + 'model/w2v_model_sg_title.model'):
        w2v_model = Word2Vec.load(BASE_DIR + 'model/w2v_model_sg_title.model')
    else:
        print("Word2Vec Model Not Found !")
        print("Training...")
        w2v_model = Word2Vec(total_data,
                             min_count=3,
                             size=100,
                             window=210,
                             sg=1)
        w2v_model.save(BASE_DIR + 'model/w2v_model_sg_title.model')

    print("Training...")
    p2v_model = WordEmbeddingsKeyedVectors(100)
    updateP2V(train_data, test_data, w2v_model, p2v_model, song_dict, tag_dict,
              title_dict)

    print("Word2Vec Second Model Loading...")
    if all(os.path.exists(BASE_DIR + path) for path in
           ('model/w2v_tag_final.model', 'model/w2v_song_final.model')):
        tag_model = Word2Vec.load(BASE_DIR + 'model/w2v_tag_final.model')
        song_model = Word2Vec.load(BASE_DIR + 'model/w2v_song_final.model')
        mt = W2VModel(tag_model, "tags")
        ms = W2VModel(song_model, "songs")
    else:
        print("Word2Vec Second Model Not Found !")
        print("Tag Training...")
        mt = W2VModel(pd.concat([train_data, test_data]), "tags")
        mt.model.save(BASE_DIR + 'model/w2v_tag_final.model')
        print("Song Training...")
        ms = W2VModel(pd.concat([train_data, test_data]), "songs")
        ms.model.save(BASE_DIR + 'model/w2v_song_final.model')

    print("start")
    answer = []
    for i, row in tqdm(test_data.iterrows()):
        year = str(row['updt_year'])
        id = str(row['id'])
        songs = []
        tags = []
        try:
            most_id_list = [x[0] for x in p2v_model.most_similar(id, topn=200)]
            fillAnswer(getItemById(most_id_list, song_dict, 200), songs, 100,
                       song_dict, id, updt_dict, year)
            fillAnswer(getItemById(most_id_list, tag_dict, 20), tags, 10,
                       tag_dict, id)
        except KeyError:  # playlist id missing from the p2v model's vocabulary
            pass

        if len(songs) < 100:
            fillAnswer(ms.recommand(test_data, int(row['id']), 200), songs,
                       100, song_dict, id, updt_dict, year)

        if len(tags) < 10:
            fillAnswer(mt.recommand(test_data, int(row['id']), 20), tags, 10,
                       tag_dict, id)

        if len(songs) < 100:
            fillAnswer(getSongByTagFreq(song_freq_by_tag, row['tags'], 200),
                       songs, 100, song_dict, id, updt_dict, year)
        if len(tags) < 10:
            fillAnswer(getTagBySongFreq(tag_freq_by_song, row['songs'], 20),
                       tags, 10, tag_dict, id)

        if len(songs) < 100:
            fillAnswer(getSongByYear(popular_song_by_year, year, 200), songs,
                       100, song_dict, id, updt_dict, year)
        if len(tags) < 10:
            fillAnswer(getTagByYear(popular_tag_by_year, year, 20), tags, 10,
                       tag_dict, id)

        if len(songs) < 100:
            try:
                fillAnswer(
                    getSongByYear(popular_song_by_year, str(int(year) - 1),
                                  20), songs, 100, song_dict, id, updt_dict,
                    year)
            except KeyError:  # no chart for the previous year; try the next one
                fillAnswer(
                    getSongByYear(popular_song_by_year, str(int(year) + 1),
                                  200), songs, 100, song_dict, id, updt_dict,
                    year)
        if len(tags) < 10:
            try:
                fillAnswer(
                    getTagByYear(popular_tag_by_year, str(int(year) - 1), 20),
                    tags, 10, tag_dict, id)
            except KeyError:  # no tag chart for the previous year; try the next one
                fillAnswer(
                    getTagByYear(popular_tag_by_year, str(int(year) + 1), 200),
                    tags, 10, tag_dict, id)

        if len(songs) < 100:
            print("Not enough songs. id:", str(row['id']), str(year))
        if len(tags) < 10:
            print("Not enough tags. id:", str(row['id']), str(year))

        answer.append({"id": row["id"], "songs": songs, "tags": tags})

    write_json(answer, "results.json")
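The pipeline above repeats one pattern several times: load a pickle if it exists, otherwise compute and cache it. A hypothetical helper that would collapse each of those blocks to a single call:

import os
import pickle

def load_or_compute(path, compute):
    # Return the pickled object at `path`, computing and caching it if absent.
    if os.path.exists(path):
        with open(path, "rb") as handle:
            return pickle.load(handle)
    result = compute()
    with open(path, "wb") as handle:
        pickle.dump(result, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return result

# e.g. updt_dict = load_or_compute(BASE_DIR + 'model/updt_dict.pickle',
#                                  lambda: getUpdtDict(song_meta_data))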
Example #6
def train():
    MODE = "Test"
    if MODE == "Valid":
        train = load_json("arena_data/orig/train.json") + load_json(
            "arena_data/questions/val.json")
        dev = load_json("res/val.json")
        test = load_json("res/test.json")
    else:
        train = load_json("res/train.json")
        dev = load_json("res/val.json")
        test = load_json("res/test.json")

    pred_tag = load_json("arena_data/model/pred_tag.json")
    dic_pred_tag = {}
    for p_t in pred_tag:
        dic_pred_tag[p_t['id']] = p_t['predict_tag']

    for doc in train:
        doc['tags_org'] = doc['tags'].copy()
        doc['tags'] += dic_pred_tag[doc['id']]

    for doc in dev:
        doc['tags_org'] = doc['tags'].copy()
        doc['tags'] += dic_pred_tag[doc['id']]

    for doc in test:
        doc['tags_org'] = doc['tags'].copy()
        doc['tags'] += dic_pred_tag[doc['id']]

    item_list = []
    len_item = []

    for doc in train + dev + test:
        song_list = []
        for i in doc['songs']:
            song_list.append(str(i))
        item_list.append(song_list + doc['tags'])
        len_item.append(len(song_list + doc['tags']))
    print("Max length of item list :", max(len_item), ", Min :", min(len_item))
    item_list = [x for x in item_list if len(x) > 1]
    print("Train set :", len(item_list))

    print("Training Item2Vec model")
    SIZE = 100
    model = Word2Vec(sentences=item_list,
                     size=SIZE,
                     window=240,
                     min_count=2,
                     sg=1,
                     workers=8,
                     iter=10,
                     negative=7,
                     compute_loss=True,
                     callbacks=[LossPrinter()])
    model.save("arena_data/model/word2vec.model")
    print("Vocab : ", len(model.wv.vocab))

    print("Building and saving playlist embeddings")
    song_dic = {}
    tag_dic = {}
    for q in tqdm(train + test + dev):
        song_dic[str(q['id'])] = q['songs']
        tag_dic[str(q['id'])] = q['tags_org']

    p2v_song = WordEmbeddingsKeyedVectors(SIZE)
    ID = []
    vec = []
    for q in tqdm(train + test + dev):
        tmp_vec = 0
        cnt_vocab = 0
        if len(q['songs']) >= 1:
            for item in q['songs']:
                try:
                    tmp_vec += model.wv.get_vector(str(item)) * 2
                    cnt_vocab += 1
                except KeyError:
                    pass
        if len(q['tags']) >= 1:
            for item in q['tags']:
                try:
                    tmp_vec += model.wv.get_vector(str(item))
                    cnt_vocab += 1
                except KeyError:
                    pass
        if not isinstance(tmp_vec, int):  # still the int 0 only if nothing was in the vocab
            ID.append(str(q['id']))
            vec.append(tmp_vec)
    p2v_song.add(ID, vec)
    p2v_song.save("arena_data/model/p2v_song.model")

    p2v_tag = WordEmbeddingsKeyedVectors(SIZE)
    ID = []
    vec = []
    for q in tqdm(train + test + dev):
        tmp_vec = 0
        cnt_vocab = 0
        if len(q['songs']) >= 1:
            for item in q['songs']:
                try:
                    tmp_vec += model.wv.get_vector(str(item))
                    cnt_vocab += 1
                except KeyError:
                    pass
        if len(q['tags']) >= 1:
            for item in q['tags']:
                try:
                    tmp_vec += model.wv.get_vector(str(item)) * 2
                    cnt_vocab += 1
                except KeyError:
                    pass
        if not isinstance(tmp_vec, int):  # still the int 0 only if nothing was in the vocab
            ID.append(str(q['id']))
            vec.append(tmp_vec)
    p2v_tag.add(ID, vec)
    p2v_tag.save("arena_data/model/p2v_tag.model")

    if MODE == "Valid":
        print("Testing")
        questions = load_json("arena_data/questions/val.json")
        cnt_wv_song = 0
        cnt_wv_tag = 0
        res = []
        for q in tqdm(questions):
            dic_song_score = {}
            dic_tag_score = {}

            song_result = []
            tag_result = []

            if str(q['id']) in p2v_song.vocab:
                most_id = p2v_song.most_similar(str(q['id']), topn=50)
                # Accumulate song scores over the neighbouring playlists.
                for ID in most_id:
                    for s in song_dic[ID[0]]:
                        if s in dic_song_score:
                            dic_song_score[s] += ID[1]
                        else:
                            dic_song_score[s] = ID[1]

            if str(q['id']) in p2v_tag.vocab:
                most_id = p2v_tag.most_similar(str(q['id']), topn=50)
                # Accumulate tag scores over the neighbouring playlists.
                for ID in most_id:
                    for t in tag_dic[ID[0]]:
                        if t in dic_tag_score:
                            dic_tag_score[t] += ID[1]
                        else:
                            dic_tag_score[t] = ID[1]

            if len(dic_song_score) > 0:
                sort_song_score = sorted(dic_song_score.items(),
                                         key=lambda x: x[1],
                                         reverse=True)

                for s in sort_song_score:
                    song_result.append(s[0])
                cnt_wv_song += 1

            if len(dic_tag_score) > 0:
                sort_tag_score = sorted(dic_tag_score.items(),
                                        key=lambda x: x[1],
                                        reverse=True)

                for s in sort_tag_score:
                    tag_result.append(s[0])
                cnt_wv_tag += 1

            res.append({
                "id": q["id"],
                "songs": remove_seen(q["songs"], song_result)[:100],
                "tags": remove_seen(q["tags"], tag_result)[:10],
            })

        print(len(questions), cnt_wv_song, cnt_wv_tag)

        ans = load_json("arena_data/answers/val.json")
        evaluator = CustomEvaluator()
        evaluator._evaluate(ans, res)
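The two embedding-building loops above differ only in which modality gets the 2x weight; a compact sketch of that shared logic (gensim 3.x API, illustrative names):

import numpy as np

def playlist_vector(w2v_model, songs, tags, song_weight, tag_weight):
    # Weighted sum of the item vectors; returns None if nothing hit the vocab.
    vec = np.zeros(w2v_model.wv.vector_size, dtype=np.float32)
    hit = False
    for item, weight in [(s, song_weight) for s in songs] + \
                        [(t, tag_weight) for t in tags]:
        try:
            vec += w2v_model.wv.get_vector(str(item)) * weight
            hit = True
        except KeyError:
            pass
    return vec if hit else None

# p2v_song would use song_weight=2, tag_weight=1; p2v_tag the reverse.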
Example #7
def run(total_concat, apply_data):
    total_concat['id'] = total_concat['id'].astype(str)
    c = Counter()
    for i in total_concat['tags']:
        c.update(i)
    tag_list = [tag for tag, cnt in c.items() if cnt > 5]

    # Escape the tags so regex metacharacters in tag names cannot break the pattern.
    p = re.compile('|'.join(map(re.escape, tag_list)))

    total_concat['tag_in_title'] = total_concat['plylst_title'].apply(
        lambda x: p.findall(x))

    data = []
    for i in total_concat.index:
        temp = total_concat.loc[i]
        data.append({
            'id': temp['id'],
            'songs': temp['songs'],
            'tags': temp['tags'],
            'tag_in_title': temp['tag_in_title']
        })
    song_dic = {}
    tag_dic = {}
    for q in data:
        song_dic[q['id']] = q['songs']
        tag_dic[q['id']] = q['tags']
    total = [list(map(str, x['songs'])) + x['tags'] + x['tag_in_title']
             for x in data]
    total = [x for x in total if len(x) > 1]

    print("start training item2Vec")
    size = 300
    if 'item2vec.model' in os.listdir():
        w2v_model = Word2Vec.load('item2vec.model')
    else:
        w2v_model = train.item2vec(total, size=size)
    print("done. \n")
    p2v_model = WordEmbeddingsKeyedVectors(size)
    ID = []
    vec = []
    for q in data:
        tmp_vec = 0
        for song in list(map(str, q['songs'])) + q['tags'] + q['tag_in_title']:
            try:
                tmp_vec += w2v_model.wv.get_vector(song)
            except KeyError:
                pass
        if not isinstance(tmp_vec, int):  # still the int 0 only if nothing was in the vocab
            ID.append(str(q['id']))
            vec.append(tmp_vec)
    p2v_model.add(ID, vec)

    with open("./arena_data/pre_tag.json", encoding="utf-8") as f:
        our_best = json.load(f)

    not_in = 0
    answers = []
    for i, q in enumerate(apply_data.index):
        q = apply_data.loc[q]
        try:
            most_id = [
                x[0] for x in p2v_model.most_similar(str(q['id']), topn=200)
            ]
            get_song = []
            get_tag = []
            for ID in most_id:
                get_song += song_dic[ID]
                get_tag += tag_dic[ID]
            get_song = list(pd.value_counts(get_song)[:300].index)
            get_tag = list(pd.value_counts(get_tag)[:30].index)

            output_song = remove_seen(q["songs"], get_song)[:100]
            output_tag = remove_seen(q["tags"], get_tag)[:10]

            answers.append({
                "id": q["id"],
                "songs": output_song,
                "tags": output_tag,
            })
        except KeyError:
            not_in += 1
            answers.append({
                "id": our_best[i]["id"],
                "songs": our_best[i]['songs'],
                "tags": our_best[i]["tags"],
            })

    for n, q in enumerate(answers):
        if len(q['songs']) != 100:
            answers[n]['songs'] += remove_seen(
                q['songs'], our_best[n]['songs'])[:100 - len(q['songs'])]
        if len(q['tags']) != 10:
            answers[n]['tags'] += remove_seen(
                q['tags'], our_best[n]['tags'])[:10 - len(q['tags'])]
    write_json(answers, 'final_tags.json')
    return answers
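A small sketch of the neighbour-voting step used above: pool the songs (or tags) of the most similar playlists, rank by frequency, and drop what the user already has; names are illustrative:

import pandas as pd

def rank_pool(pool, seen, topn):
    # pd.value_counts sorts by descending frequency, as in the snippet above.
    ranked = list(pd.value_counts(pool).index)
    return [x for x in ranked if x not in seen][:topn]

print(rank_pool(["a", "b", "a", "c", "a", "b"], seen={"b"}, topn=2))  # ['a', 'c']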