Example #1
def get_X(track_id,
          use_mp3,
          use_lyrics,
          use_artist,
          lyrics_d2v_model,
          d_artist_vec,
          music_datatype="mfcc"):
    conn = MyConn()

    rawmusic_path, vggish_embed_path, lyrics_path, artist = conn.query(
        table="sub_tracks",
        conditions={"track_id": track_id},
        fetchall=False,
        targets=[
            "rawmusic_path", "vggish_embed_path", "lyrics_path", "artist"
        ])

    vecs = []
    if use_mp3:
        if music_datatype == "mfcc":
            music_vec = get_mfcc(rawmusic_path).ravel()
        elif music_datatype == "vggish":
            with open(vggish_embed_path, "rb") as f:
                music_vec = pickle.load(f).detach().numpy()
        else:
            raise ValueError("unknown music_datatype: {}".format(music_datatype))
        vecs.append(music_vec)
    if use_lyrics:
        lyrics_vec = get_d2v_vector(lyrics_path, lyrics_d2v_model)
        vecs.append(lyrics_vec)
    if use_artist:
        artist_vec = d_artist_vec[artist.lower().strip()]
        vecs.append(artist_vec)

    features_vec = concatenate_features(vecs)

    return features_vec
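
A minimal usage sketch for get_X; the model and pickle paths mirror build_dataset in Example #24 below, and the track id is a placeholder:

# Hedged usage sketch; paths follow Example #24, the track id is hypothetical.
lyrics_d2v_model = Doc2Vec.load("../models/d2v/d2v_b1.mod")
with open("../data/artists_vec_dict_r_minmax.pkl", "rb") as f:
    d_artist_vec = pickle.load(f)

features = get_X(track_id=12345,
                 use_mp3=True,
                 use_lyrics=True,
                 use_artist=True,
                 lyrics_d2v_model=lyrics_d2v_model,
                 d_artist_vec=d_artist_vec,
                 music_datatype="vggish")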
Example #2
def get_reviews_topk_words(track_id, is_breakout, key):
    conn = MyConn()
    if key == "w_fake":
        col = "feature_words"
    elif key == "wo_fake":
        col = "feature_words_wo_fake"
    elif key == "tfidf":
        col = "feature_words_tfidf"
    elif key == "candidates":
        col = "feature_words_candidates"
    else:
        raise ValueError("unknown key: {}".format(key))

    feature_words = []  # stays empty if no breakout row yields words
    if is_breakout == 1:
        bids = [
            r[0] for r in conn.query(
                sql="SELECT id FROM breakouts WHERE is_valid=1 and simi_score>=0.5 and track_id={}"
                .format(track_id))
        ]
        for bid in bids:
            feature_words = conn.query(
                sql="SELECT {} FROM breakouts_feature_words WHERE id='{}'".format(col, bid))
            if feature_words and feature_words[0][0]:
                break
    else:
        feature_words = conn.query(
            sql="SELECT {} FROM no_breakouts_feature_words WHERE track_id={}".format(col, track_id))

    if len(feature_words) > 0:
        feature_words = feature_words[0][0].split()
    return feature_words
Example #3
def rubbish_tags():
    '''
    + Count how many rubbish_tags appear among each breakout's keywords.
    + Filter out samples with a high share of rubbish tags as noise.
    + Delete the rubbish tags from the keywords and upload the result to the database.
    '''

    rubbish = set(open("../resources/rubbish_tags.txt").read().splitlines())
    conn = MyConn()

    records = []
    for res in conn.query(targets=["id", "feature_words"],
                          table="breakouts_feature_words_c3"):
        # if conn.query(table="breakouts", targets=["release_drive"], fetchall=False, conditions={"id": res[0]})[0] == 1:
        #     continue
        feature_words = res[1].split()
        rubbish_count = sum(1 for w in feature_words if w in rubbish)
        records.append([res[0], rubbish_count, feature_words])

    records.sort(key=lambda x: x[1], reverse=True)
    for r in records:
        print(r)
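
The docstring's third step (deleting rubbish tags and uploading the cleaned keywords) is not implemented above. A minimal sketch of how it could continue inside rubbish_tags(), assuming the cleaned words go back into the same feature_words column via the MyConn.update pattern used in the other examples:

    # Hedged sketch: drop the rubbish tags and write the cleaned keyword
    # list back. The target column (feature_words) is an assumption.
    for id_, rubbish_count, feature_words in records:
        cleaned = [w for w in feature_words if w not in rubbish]
        conn.update(table="breakouts_feature_words_c3",
                    settings={"feature_words": " ".join(cleaned)},
                    conditions={"id": id_})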
Example #4
def create_artists_table():
    '''
    Create the table `artists` in the database, with columns id, nid, name.
    '''
    read_path = "/Volumes/nmusic/NetEase2020/data/simple_proxied_tracks_details"
    artists_set = set()
    conn = MyConn()

    for root, dirs, files in os.walk(read_path):
        for file in files:
            if "DS" in file: continue
            filepath = os.path.join(root, file)
            with open(filepath) as f:
                content = json.load(f)
            try:
                for ar in content["songs"][0]["ar"]:
                    artists_set.add((ar["id"], ar["name"]))
            except KeyboardInterrupt:
                print("interrupted by keyboard.")
                sys.exit(0)
            except Exception as e:
                print(filepath, e)


    print(len(artists_set))
    for ar in artists_set:
        conn.insert(table="artists", settings={"nid":ar[0], "name":ar[1]})
Example #5
def check_breakouts():
    conn = MyConn()
    w2v_model = Word2Vec.load("../models/w2v/c3.mod")
    tracks = conn.query(
        sql="SELECT track_id, json_path FROM sub_tracks WHERE bnum>0")

    for track_id, filepath in tracks[70:]:
        d_reviews_partitions = get_reviews_partitions(filepath,
                                                      w2v_model,
                                                      merge_num=2)
        # print(d_reviews_partitions)
        breakouts = conn.query(table="breakouts",
                               targets=["date", "reviews_num"],
                               conditions={
                                   "track_id": track_id,
                                   "release_drive": 0,
                                   "fake": 0,
                                   "capital_drive": 0
                               })
        if not breakouts: continue

        d_bcount = dict(
            zip(d_reviews_partitions.keys(), [0] * len(d_reviews_partitions)))
        for dt, reviews_num in breakouts:
            date = datetime.strftime(dt, '%Y-%m-%d')
            for k in d_reviews_partitions:
                if k[0] <= date <= k[1]:
                    d_bcount[k] += 1
                    break
        print(track_id)
        for k, v in d_bcount.items():
            if v > 0:
                print("{} - {}: {} [count: {}]".format(k[0], k[1],
                                                       d_reviews_partitions[k],
                                                       d_bcount[k]))
Example #6
def check_feature_words():
    conn = MyConn()
    breakouts_feature_words = Counter()
    res = [
        r[0].split() for r in conn.query(targets=["feature_words"],
                                         table="breakouts_feature_words_1")
    ]
    for r in res:
        breakouts_feature_words.update(r)

    valid_breakouts_feature_words = [
        p[0] for p in filter(lambda x: x[1] >= 30,
                             breakouts_feature_words.most_common())
    ]

    no_breakouts_feature_words = Counter()
    res = [
        r[0].split() for r in conn.query(targets=["feature_words"],
                                         table="no_breakouts_feature_words_1")
    ]
    for r in res:
        no_breakouts_feature_words.update(r)

    valid_no_breakouts_feature_words = [
        p[0] for p in filter(lambda x: x[1] >= 30,
                             no_breakouts_feature_words.most_common())
    ]

    intersection = set(valid_breakouts_feature_words).intersection(
        set(valid_no_breakouts_feature_words))
    print("intersection:\n", intersection)
    print("breakouts_unique:\n",
          set(valid_breakouts_feature_words) - intersection)
    print("no_breakouts_unique:\n",
          set(valid_no_breakouts_feature_words) - intersection)
Example #7
def in_tags_analysis(breakouts_set, no_breakouts_set):
    '''
    Analyze the built-in tags of the given track sets.
    '''
    tags = open("../data/metadata/自带tags.txt").read().splitlines()
    breakouts_tags_d = {}
    no_breakouts_tags_d = {}
    for t in tags:
        breakouts_tags_d[t] = []
        no_breakouts_tags_d[t] = []

    conn = MyConn()
    # table assumed to be `tracks` (other examples read tags from it)
    for tid in breakouts_set:
        res = conn.query(table="tracks", targets=["tags"], conditions={"track_id": tid})[0]
        for t in res[0].split():
            breakouts_tags_d[t].append(tid)
    for tid in no_breakouts_set:
        res = conn.query(table="tracks", targets=["tags"], conditions={"track_id": tid})[0]
        for t in res[0].split():
            no_breakouts_tags_d[t].append(tid)

    # 1748 and 10 are hard-coded normalizers for the two track-set sizes
    tags_count = []
    for k in breakouts_tags_d:
        tags_count.append((k, (round(len(breakouts_tags_d[k]) / 1748 * 100, 2),
                               round(len(no_breakouts_tags_d[k]) / 10, 2))))

    tags_count = sorted(tags_count, key=lambda x: x[1][0], reverse=False)
    draw_bar(dict(tags_count), "../data/main_tagged_tracks/tags_count.html")
Example #8
def update_subtracks_music_words():
    conn = MyConn()
    valid_tracks_db = [
        r[0] for r in conn.query(
            sql="SELECT track_id FROM sub_tracks WHERE is_valid=1")
    ]
    with open("../data/reviews_feature_words_with_freqs/breakouts_wo_simi.json"
              ) as f:
        data = json.load(f)
        valid_tracks_pos = list(
            set([bid.split('-')[0] for bid in data if data[bid]["len"] >= 5]))
    with open(
            "../data/reviews_feature_words_with_freqs/no_breakouts_wo_simi.json"
    ) as f:
        data = json.load(f)
        valid_tracks_neg = [str(tid) for tid in data if data[tid]["len"] >= 5]
    valid_tracks = valid_tracks_pos + valid_tracks_neg
    print(len(valid_tracks_db))
    print(len(valid_tracks), len(valid_tracks_pos), len(valid_tracks_neg))
    for tid in valid_tracks_db:
        if tid not in valid_tracks:
            conn.update(table="sub_tracks",
                        settings={"is_valid": 0},
                        conditions={"track_id": tid})
            print(tid)
Example #9
def artist_vec_from_tags(min_tags_num=2):
    conn = MyConn()
    artists = conn.query(table="artists", targets=["name", "nid"])
    tracks_artists = conn.query(table="details", targets=["track_id", "artists"])
    d_artist_tracks = {}  # maps each artist to their set of tracks
    for ar, nid in artists:
        if nid == "0": continue
        d_artist_tracks[ar.lower().strip()] = []

    tracks = set()
    for tid, t_artists in tracks_artists:
        tracks.add(tid)
        t_artists = t_artists.lower().strip().split(",")
        for ar in t_artists:
            if ar in d_artist_tracks:
                d_artist_tracks[ar].append(tid)

    tracks_tags = conn.query(sql="SELECT track_id, tags FROM tracks")
    tags = open("../data_related/自带tags.txt").read().splitlines()
    d_tag_index = dict([(t, i) for i, t in enumerate(tags)])
    d_track_tags_count = {}  # maps each track to its tag-count vector
    for tid, t_tags in tracks_tags:
        if tid not in tracks: continue
        t_vec = np.zeros((len(tags),))
        t_tags = t_tags.split()
        for t in t_tags:
            t_vec[d_tag_index[t]] += 1
        d_track_tags_count[tid] = t_vec

    d_artist_tags_count = {}  # maps each artist to the summed tag vector of their tracks
    for ar, ar_tracks in d_artist_tracks.items():
        if len(ar_tracks) == 0: continue
        ar_vec = np.sum(np.array([d_track_tags_count[tid] for tid in ar_tracks]), axis=0)
        if np.sum(ar_vec, axis=None) >= min_tags_num:
            d_artist_tags_count[ar] = ar_vec

    artists = list(d_artist_tags_count.keys())
    ar_vecs = list(d_artist_tags_count.values())

    # stats: per-tag counts across artists (computed before transposing)
    tags_count = np.sum(np.array(ar_vecs), axis=0)
    # for i in range(len(tags)):
    #     print(tags[i], tags_count[i])
    print(len(artists))

    ar_vecs = np.mat(ar_vecs).T

    # scaled_ar_vecs = StandardScaler().fit_transform(ar_vecs) # mean=0, std=1
    scaled_ar_vecs = MinMaxScaler().fit_transform(ar_vecs)  # scale each artist's vector to [0,1]
    scaled_ar_vecs = np.mat(scaled_ar_vecs).T

    d_artist_vec = {}
    for i in range(len(artists)):
        d_artist_vec[artists[i]] = np.array(scaled_ar_vecs[i]).ravel()

    # d_artist_vec = dict(zip(artists, scaled_ar_vecs))
    with open("../data/r_minmax_artists_vec_dict.pkl", "wb") as f:
        pickle.dump(d_artist_vec, f)
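
A minimal sketch of consuming the saved dictionary; the lookup key is lower-cased and stripped to match how get_X in Example #1 indexes d_artist_vec, and the artist name is a placeholder:

# Hedged usage sketch: load the artist-vector dict written above and
# look up one (placeholder) artist name.
with open("../data/r_minmax_artists_vec_dict.pkl", "rb") as f:
    d_artist_vec = pickle.load(f)
vec = d_artist_vec.get("some artist".lower().strip())  # None if unseen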
Example #10
def update_subtracks_havesimis():
    conn = MyConn()
    valid_tracks = set([
        r[0] for r in conn.query(
            sql="SELECT track_id FROM breakouts WHERE simi_score>=0.5")
    ])
    for tid in valid_tracks:
        conn.update(table="sub_tracks",
                    settings={"have_simis": 1},
                    conditions={"track_id": tid})
Example #11
def chorus_duration_distribution():
    conn = MyConn()
    sql = "SELECT chorus_start, chorus_end FROM tracks WHERE chorus_start IS NOT NULL A ND chorus_end IS NOT NULL"
    res = conn.query(sql=sql)
    res = list(filter(lambda x: x[0] != 0, res))
    print(len(res))
    durations = [p[1] - p[0] for p in res]

    sns.displot(durations)
    plt.show()
Example #12
def get_feature_words_counter(table):
    conn = MyConn()
    counter = Counter()
    res = [
        r[0].split()
        for r in conn.query(targets=["feature_words"], table=table)
    ]
    for r in res:
        counter.update(r)
    return counter
Example #13
def get_breakouts_num():
    conn = MyConn()
    breakouts = conn.query(targets=["id", "track_id"], table="breakouts")
    track_2_bnum = {}
    for id_, track_id in breakouts:
        if track_id in track_2_bnum:
            track_2_bnum[track_id] += 1
        else:
            track_2_bnum[track_id] = 1
    for k, v in track_2_bnum.items():
        conn.update(table="sub_tracks",
                    settings={"bnum": v},
                    conditions={"track_id": k})
Example #14
def get_tracks_set_db(sql, conditions):
    '''
    Fetch the set of tracks matching the given conditions from the database.
    params:
        + sql: e.g. 'SELECT track_id FROM tracks WHERE have_lyrics=%s'
        + conditions: e.g. {"have_lyrics": 1}
    return: tracks_set
    '''
    conn = MyConn()
    res = conn.query(sql=sql, conditions=conditions)
    res = set([str(r[0]) for r in res])

    return res
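
A usage sketch taken directly from the docstring's example arguments:

# Usage sketch, reusing the docstring's own example arguments.
tracks_with_lyrics = get_tracks_set_db(
    sql="SELECT track_id FROM tracks WHERE have_lyrics=%s",
    conditions={"have_lyrics": 1})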
Example #15
def get_specific_reviews(track_id, date):
    conn = MyConn()
    w2v_model = Word2Vec.load("../models/w2v/b1.mod")
    filepath = "/Volumes/nmusic/NetEase2020/data" + conn.query(
        targets=["reviews_path"],
        conditions={"track_id": track_id},
        fetchall=False)[0]
    df = get_reviews_df(filepath)
    reviews = df[df["date"] == date]["content"].values
    reviews = "\n".join(reviews)
    # print(reviews)
    top_words = tags_extractor(reviews, topk=30, w2v_model=w2v_model)
    print(top_words)
Example #16
def add_no_breakouts_feature_words_to_db():
    '''
    Insert rows into the table no_breakouts_feature_words.
    '''
    conn = MyConn()
    w2v_model = Word2Vec.load("../models/w2v/c4.mod")
    rubbish_words_fake = open(
        "../resources/rubbish_words_fake.txt").read().splitlines()
    candidates = open("../resources/music_words_cbm.txt").read().splitlines()
    # tfidf_model = models.TfidfModel.load("../models/bow/corpora_tfidf.model")
    # dictionary = corpora.Dictionary.load("../models/bow/corpora_dict.dict")
    # stoi = dictionary.token2id
    # itos = dict(zip(stoi.values(), stoi.keys()))
    data = conn.query(sql="SELECT id, track_id, text_path FROM no_breakouts")
    d_data = {}
    for id_, track_id, text_path in data:
        if track_id in d_data:
            d_data[track_id].append((id_, text_path))
        else:
            d_data[track_id] = [(id_, text_path)]
    print(len(d_data))

    for track_id, v in d_data.items():
        try:
            text = ""
            for id_, text_path in v:
                text += open(text_path).read()
            feature_words_mode = "candidates"  # raw, stop, tfidf
            col = "feature_words_candidates"  # feature_words, feature_words_wo_fake, feature_words_tfidf
            feature_words = get_feature_words(text,
                                              topk=10,
                                              mode=feature_words_mode,
                                              w2v_model=w2v_model,
                                              candidates=candidates,
                                              return_freq=True)
            for p in feature_words:
                print("{}:{:.3f}".format(p[0], p[1] * 100), end=" ")
            print()
            if len(feature_words) < 5:
                print(track_id, "not enough words.")
                continue
            # feature_words = " ".join(feature_words)
            # conn.insert(table="no_breakouts_feature_words", settings={"id":id_, "track_id":track_id, col:feature_words})
            # conn.update(table="no_breakouts_feature_words", settings={col:feature_words}, conditions={"track_id":track_id})
        except KeyboardInterrupt:
            break
        except Exception:
            print(track_id)
            print(traceback.format_exc())
Example #17
def mark_language():
	'''
	Mark the language of every track in the lyrics library.
	'''
	conn = MyConn()
	enchant_dict = enchant.Dict("en_US")
	for track_id, lyrics_path in conn.query(sql="SELECT track_id, lyrics_path FROM tracks WHERE lyrics_path is not null"):
		with open(lyrics_path) as f:
			content = json.load(f)
		lyrics = replace_noise(content["lrc"]["lyric"])
		lyrics = re.sub(r"( )*[作词|作曲|编曲|制作人|录音|混母带|监制].*\n", "", lyrics)
		if len(lyrics) < 10:  # effectively empty lyrics
			language = "empty"
		else:
			language = _mark_language(lyrics, enchant_dict)
		conn.update(table="tracks", settings={"language":language}, conditions={"track_id":track_id})
Example #18
def build_tfidf_model(tracks_set):
	'''
	Data sources: breakout_tracks_set, no_breakout_tracks_set
	Method: sample at most 1000 reviews per track (at random), take topk=20 words to build each doc
	'''

	conn = MyConn()
	w2v_model = models.Word2Vec.load("/Users/inkding/Desktop/partial_netease/models/word2vec/b1.mod")
	files = []
	for track_id in tracks_set:
		files.append(conn.query(targets=["text_path"], conditions={"track_id": track_id}, fetchall=False)[0])

	docs = []
	for i, file in enumerate(files):
		print(i)
		# content = open(file).read()[:1000]
		content = open(file).read().splitlines()
		content = random.sample(content, min(100, len(content)))
		content = "\n".join(content)
		docs.append(tags_extractor(content, topk=20, w2v_model=w2v_model))
		if i == 50:
			for d in docs:
				print(d)
			break

	dictionary = corpora.Dictionary(docs)
	bows = [dictionary.doc2bow(doc) for doc in docs]
	tfidf_model = models.TfidfModel(bows)

	dictionary.save('../models/bow/1/corpora_dict.dict')  # reload with corpora.Dictionary.load(path)
	tfidf_model.save('../models/bow/1/corpora_tfidf.model')  # reload with models.TfidfModel.load(path)

	# token maps
	stoi = dictionary.token2id
	print("words num:", len(stoi))
	itos = dict(zip(stoi.values(), stoi.keys()))

	# test
	for i in range(20):
		test_doc = docs[i]
		test_bow = dictionary.doc2bow(test_doc)
		# get the tf-idf representation
		test_tfidf = sorted(tfidf_model[test_bow], key=lambda x: x[1], reverse=True)
		print(test_doc)
		for item in test_tfidf[:5]:
			print(itos[item[0]], item[1])
		print()
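
Reloading the saved artifacts follows the inline comments above; a short sketch with a placeholder document:

# Reload the saved dictionary and tf-idf model (paths as saved above).
dictionary = corpora.Dictionary.load('../models/bow/1/corpora_dict.dict')
tfidf_model = models.TfidfModel.load('../models/bow/1/corpora_tfidf.model')
bow = dictionary.doc2bow(["placeholder", "tokens"])  # hypothetical doc
print(tfidf_model[bow])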
Example #19
def basic_analysis(tracks_set):
    '''
    Basic analysis of the given track set: number of reviews, time span...
    '''
    conn = MyConn()
    # prepare the data
    data = []
    targets = ["track_id", "tags", "reviews_num", "first_review", "last_review"]
    for tid in tracks_set:
        res = conn.query(targets=targets, conditions={"track_id": int(tid)})
        data.append(res[0])
    
    df = pd.DataFrame(data, columns=targets)
    # df.to_csv("../results/main_tagged_tracks/basic_info.csv", encoding="utf_8_sig", index=False)

    draw_hist(df["reviews_num"].values, log_scale=True, color="tab:orange")
Example #20
def get_reviews_vec_with_freq(track_id,
                              breakout,
                              w2v_model,
                              d_breakouts,
                              d_no_breakouts,
                              d_pos_track_breakout,
                              with_freq=True):
    conn = MyConn()
    if breakout:
        bid = d_pos_track_breakout[track_id]
        feature_words = d_breakouts[bid]["words"]
        freqs = d_breakouts[bid]["freqs"]
    else:
        feature_words = d_no_breakouts[track_id]["words"]
        freqs = d_no_breakouts[track_id]["freqs"]

    if len(feature_words) < 5:
        print(track_id, breakout)

    reviews_vec = []
    for i, w in enumerate(feature_words):
        vec = get_w2v_vector(w, w2v_model)
        if with_freq:
            vec = np.concatenate((vec, np.array([freqs[i] * 100])))
        reviews_vec.append(vec)
    return reviews_vec
Example #21
def build_dataset_embed(w_path):
    # ts1 = open("../data/main_tagged_tracks/tracks.txt").read().splitlines()[:1000]
    ts1 = list(
        pd.read_json("../data/breakouts-u2.json")["track_id"].unique())[:1000]
    ts2 = open("../data/no_breakouts_tracks.txt").read().splitlines()[:1000]
    print(len(ts1), len(ts2))

    # label: 1 = breakout, 0 = no breakout
    tracks_set = [(tid, 1) for tid in ts1]
    tracks_set += [(tid, 0) for tid in ts2]

    # load models
    conn = MyConn()
    d2v_model = Doc2Vec.load("../models/d2v/d2v_a1.mod")

    config = Config()
    mf_path = "MyModel/models/3/mf_extractor-e3.pkl"
    if_path = "MyModel/models/3/if_embed-e3.pkl"
    music_feature_extractor = MusicFeatureExtractor(config)
    music_feature_extractor.load_state_dict(torch.load(mf_path))
    intrinsic_feature_embed = IntrinsicFeatureEmbed(config)
    intrinsic_feature_embed.load_state_dict(torch.load(if_path))
    music_feature_extractor.eval()
    intrinsic_feature_embed.eval()

    X, y = get_X_y_embed(dict(tracks_set), conn, d2v_model,
                         music_feature_extractor, intrinsic_feature_embed)

    with open(w_path, 'wb') as f:
        pickle.dump([X, y], f)
Example #22
def add_no_breakouts_feature_words_to_json():
    conn = MyConn()
    w2v_model = Word2Vec.load("../models/w2v/c4.mod")
    rubbish_words_fake = open(
        "../resources/rubbish_words_fake.txt").read().splitlines()
    candidates = open(
        "../resources/music_words/music_words_cls_pos_pred.txt").read().splitlines()
    # remove = open("../resources/music_words/music_words_similar.txt").read().splitlines()
    # candidates = [w for w in candidates if w not in remove]
    data = conn.query(sql="SELECT id, track_id, text_path FROM no_breakouts")
    d_data = {}
    for id_, track_id, text_path in data:
        if track_id in d_data:
            d_data[track_id].append((id_, text_path))
        else:
            d_data[track_id] = [(id_, text_path)]
    print(len(d_data))

    json_data = {}
    for track_id, v in list(d_data.items()):
        try:
            text = ""
            for id_, text_path in v:
                text += open(text_path).read()
            feature_words_mode = "candidates"  # raw, stop, tfidf
            feature_words = get_feature_words(text,
                                              topk=10,
                                              mode=feature_words_mode,
                                              w2v_model=w2v_model,
                                              candidates=candidates,
                                              return_freq=True)
            words, freqs = zip(*feature_words)
            json_data[track_id] = {
                "words": words,
                "freqs": freqs,
                "len": len(words)
            }
            if len(feature_words) < 5:
                print(track_id, "not enough words.")
        except KeyboardInterrupt:
            break
        except Exception:
            print(track_id)
            print(traceback.format_exc())
    with open("../data/reviews_feature_words_with_freqs/no_breakouts_cls.json",
              'w') as f:
        json.dump(json_data, f, indent=2, ensure_ascii=False)
Example #23
def view_reviews_num_curve(track_id, min_reviews=200, save_path=None):
    '''
    Plot the reviews-count curve for the given track id, annotating breakout points.
    '''
    conn = MyConn()

    json_path = conn.query(targets=["reviews_path"],
                           conditions={"track_id": track_id})
    if len(json_path) > 0:
        json_path = "/Volumes/nmusic/NetEase2020/data" + json_path[0][0]
    else:
        return None

    df = get_reviews_df(json_path)
    reviews_count, dates = get_reviews_count(df["date"].values)
    breakouts_group = get_breakouts(reviews_count, min_reviews=min_reviews)

    fig, ax = plt.subplots()
    x = list(range(len(reviews_count)))
    ax.plot(x, reviews_count)
    ax.xaxis.set_major_formatter(plt.NullFormatter())

    palette = plt.get_cmap('Paired')(np.linspace(0, 1, 10))
    y_head, beta_head = [], []
    for i in range(min(len(breakouts_group), 10)):
        x = list(zip(*breakouts_group[i]))[0]
        y = [reviews_count[i] for i in x]
        y_head.append(y[0])
        beta_head.append(breakouts_group[i][0][1])
        ax.scatter(x=x, y=y, color=palette[i])
        ax.xaxis.set_major_formatter(plt.NullFormatter())
    ax.set_xlabel("time")
    ax.set_ylabel("reviews_num")

    # text = '\n'.join(["count:{}, beta:{}".format(y_head[i], beta_head[i])
    #                      for i in range(len(y_head))])
    # ax.text(0, 1, text, verticalalignment="top", horizontalalignment="left", transform=ax.transAxes)

    if save_path:
        if not os.path.exists(os.path.dirname(save_path)):
            os.makedirs(os.path.dirname(save_path))
        plt.savefig(save_path)
    else:
        plt.show()

    plt.close()
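
Example #27 (breakouts_curve) drives this function in bulk; calling it for a single track looks like the sketch below (track id and output path are placeholders):

# Usage sketch with a hypothetical track id and output path.
view_reviews_num_curve(12345, min_reviews=200,
                       save_path="../data/breakouts_curve_clusters/12345.png")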
Example #24
def build_dataset():
    conn = MyConn()
    dataset_size = 1500
    # conditional_sql = "rawmusic_path IS NOT NULL AND language in ('ch', 'en')"
    pos_tracks = conn.query(
        sql=
        "SELECT track_id FROM sub_tracks WHERE valid_bnum>0 AND is_valid=1 LIMIT {}"
        .format(dataset_size))
    neg_tracks = conn.query(
        sql=
        "SELECT track_id FROM sub_tracks WHERE valid_bnum=0 AND is_valid=1 LIMIT {}"
        .format(dataset_size))
    lyrics_d2v_model = Doc2Vec.load("../models/d2v/d2v_b1.mod")  # lyrics d2v model
    with open("../data/artists_vec_dict_r_minmax.pkl", "rb") as f:
        d_artist_vec = pickle.load(f)

    X, y = [], []
    args = {
        "lyrics_d2v_model": lyrics_d2v_model,
        "d_artist_vec": d_artist_vec,
        "use_mp3": True,
        "use_lyrics": True,
        "use_artist": True,
        "music_datatype": "vggish"
    }

    def add_data(tracks, label):
        for t in tracks:
            try:
                X.append(get_X(track_id=t, **args))
                y.append(label)
            except KeyboardInterrupt:
                print("KeyboardInterrupt")
                break
            except:
                print(label, t)
                print(traceback.format_exc())

    add_data(pos_tracks, 1)
    add_data(neg_tracks, 0)

    dataset_index = "0317_vggish"
    dataset_name = "m"*args["use_mp3"] + "l"*args["use_lyrics"] + "a"*args["use_artist"]\
                    + str(len(pos_tracks)) +'_'+ str(dataset_index)
    with open("../data/dataset/{}.pkl".format(dataset_name), 'wb') as f:
        pickle.dump([X, y], f)
Example #25
def update_path(table, key_col, col, root_dir, offset, overwrite=False):
    '''
    Update file paths in the database.
    '''
    conn = MyConn()
    count_update = 0
    for root, dirs, files in os.walk(root_dir):
        for file in files:
            if "DS" in file: continue  # skip .DS_Store ("OS" in the original looks like a typo)
            filepath = os.path.join(root, file)
            key = file.split('/')[-1][:-offset]  # strip the extension; offset = len(".ext")
            res = conn.query(table=table,
                             targets=[col],
                             conditions={key_col: key},
                             fetchall=False)
            if overwrite:
                conn.update(table=table,
                            settings={col: filepath},
                            conditions={key_col: key})
                count_update += 1
            else:
                if res and res[0] is None:
                    conn.update(table=table,
                                settings={col: filepath},
                                conditions={key_col: key})
                    count_update += 1
    print(count_update)
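
A hedged usage sketch; the table/column pairing and the root_dir are assumptions patterned on the other examples, and offset=5 strips a ".json" suffix:

# Hypothetical call: register lyrics file paths keyed by track_id.
# offset=5 removes the 5-character ".json" extension from each filename.
update_path(table="tracks", key_col="track_id", col="lyrics_path",
            root_dir="/Volumes/nmusic/NetEase2020/data/lyrics", offset=5)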
Example #26
def copy_columns(t1, t2, col, key_col="track_id"):
    '''
    Copy one column's values from one database table to another.
    params:
        t1: source table
        t2: destination table
        col: column name
        key_col: key column
    '''
    conn = MyConn()
    data = conn.query(table=t1, targets=[key_col, col])
    for key_v, v in data:
        try:
            conn.update(table=t2,
                        settings={col: v},
                        conditions={key_col: key_v})
        except Exception:
            print("ERROR {}: {}".format(key_col, key_v))
Example #27
def breakouts_curve():
    '''
    Plot the breakout curves.
    '''
    conn = MyConn()

    for i in range(6):
        save_dir = "../data/breakouts_curve_clusters/{}".format(i)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        tracks = [
            r[0] for r in conn.query(targets=["track_id"],
                                     table="breakouts_complements",
                                     conditions={"label6": i})
        ]
        for tid in tracks[:100]:
            save_path = os.path.join(save_dir, "{}.png".format(tid))
            view_reviews_num_curve(tid, save_path=save_path)
Example #28
def get_description_by_api():
    conn = MyConn()
    res = conn.query(targets=["name", "nid"], table="artists")
    name_2_id = dict([(r[0].lower().strip(), r[1]) for r in res])
    artists = open("../data_related/query_artists.txt").read().splitlines()
    print(len(artists))

    url_base = 'http://127.0.0.1:3000'
    # proxy server
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"

    # proxy tunnel credentials (masked)
    proxyUser = "******"
    proxyPass = "******"

    proxy = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
      "host" : proxyHost,
      "port" : proxyPort,
      "user" : proxyUser,
      "pass" : proxyPass,
    }

    data = []
    for ar in artists:
        try:
            id_ = name_2_id[ar]
            if id_=="0":
                continue
            url = url_base + "/artist/desc?id={}&proxy={}".format(id_, proxy)
            res = requests.get(url, timeout=10).json()
            res["id"] = id_
            res["artist"] = ar
            data.append(res)
        except KeyboardInterrupt:
            print("interrupted by keyboard.")
            sys.exit(0)
        except Exception as e:
            print(ar, e)

    with open("../data/sup_artists_desc.json", 'w') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
Example #29
def upload_details():
    '''
    Upload each track's basic details to the database (track name, artist name, album name...).
    '''
    def extract_details(filename):
        with open(filename) as f:
            content = json.load(f)
        details = {
            "name":
            content["songs"][0]["name"],
            "artist":
            ",".join([item["name"] for item in content["songs"][0]["ar"]]),
            "pop":
            content["songs"][0]["pop"],
            "album":
            content["songs"][0]["al"]["name"]
        }
        return details

    read_path = "/Volumes/nmusic/NetEase2020/data/simple_proxied_tracks_details"
    conn = MyConn()

    for root, dirs, files in os.walk(read_path):
        for file in files:
            if "DS" in file: continue
            filepath = os.path.join(root, file)
            track_id = file[:-5]
            try:
                details = extract_details(filepath)
            except Exception as e:
                print(filepath)
                # print(traceback.format_exc())
                print(e)
                continue  # skip files whose details could not be parsed

            # print(details)
            conn.insert_or_update(table="details",
                                  settings={
                                      "track_id": track_id,
                                      "name": details["name"],
                                      "artist": details["artist"],
                                      "album": details["album"],
                                      "pop": details["pop"]
                                  })
Example #30
def test_d2v_with_source(text, model, topn=5):
    conn = MyConn()
    source_tracks = open(
        "../data_related/lyrics_valid_tracks.txt").read().splitlines()
    text = replace_noise(text)
    text = re.sub(r"( )*[作词|作曲|编曲|制作人|录音|混母带|监制].*\n", "", text)
    words = cut(text, join_en=False)
    vec = model.infer_vector(words)
    s = model.docvecs.most_similar([vec], topn=topn)  # use the topn parameter (was hard-coded to 10)
    print(s)
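
A usage sketch; the model path mirrors the lyrics d2v model loaded in Example #24 and is an assumption, and the lyrics string is a placeholder:

# Hypothetical call: query the d2v model with a raw lyrics string.
model = Doc2Vec.load("../models/d2v/d2v_b1.mod")
test_d2v_with_source("placeholder lyrics text\n", model, topn=5)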