def get_reviews_topk_words(track_id, is_breakout, key):
    conn = MyConn()
    if key == "w_fake":
        col = "feature_words"
    elif key == "wo_fake":
        col = "feature_words_wo_fake"
    elif key == "tfidf":
        col = "feature_words_tfidf"
    elif key == "candidates":
        col = "feature_words_candidates"
    else:
        raise ValueError("unknown key: {}".format(key))

    feature_words = None
    if is_breakout == 1:
        bids = [r[0] for r in conn.query(
            sql="SELECT id FROM breakouts WHERE is_valid=1 and simi_score>=0.5 and track_id={}".format(track_id))]
        for bid in bids:
            feature_words = conn.query(
                sql="SELECT {} FROM breakouts_feature_words WHERE id='{}'".format(col, bid))
            if feature_words and feature_words[0][0]:
                break
    else:
        feature_words = conn.query(
            sql="SELECT {} FROM no_breakouts_feature_words WHERE track_id={}".format(col, track_id))

    # Guard against an empty result set or a NULL column (the original code
    # could hit a NameError / AttributeError here) and always return a list.
    if feature_words and feature_words[0][0]:
        return feature_words[0][0].split()
    return []

def check_breakouts():
    conn = MyConn()
    w2v_model = Word2Vec.load("../models/w2v/c3.mod")
    tracks = conn.query(sql="SELECT track_id, json_path FROM sub_tracks WHERE bnum>0")
    for track_id, filepath in tracks[70:]:
        d_reviews_partitions = get_reviews_partitions(filepath, w2v_model, merge_num=2)
        # print(d_reviews_partitions)
        breakouts = conn.query(table="breakouts",
                               targets=["date", "reviews_num"],
                               conditions={"track_id": track_id,
                                           "release_drive": 0,
                                           "fake": 0,
                                           "capital_drive": 0})
        if not breakouts:
            continue
        d_bcount = dict(zip(d_reviews_partitions.keys(), [0] * len(d_reviews_partitions)))
        for dt, reviews_num in breakouts:
            date = datetime.strftime(dt, '%Y-%m-%d')
            for k in d_reviews_partitions:
                if k[0] <= date <= k[1]:
                    d_bcount[k] += 1
                    break
        print(track_id)
        for k, v in d_bcount.items():
            if v > 0:
                print("{} - {}: {} [count: {}]".format(k[0], k[1], d_reviews_partitions[k], d_bcount[k]))

def in_tags_analysis(breakouts_set, no_breakouts_set):
    '''
    Analyze the built-in tag distribution of the given track sets.
    '''
    tags = open("../data/metadata/自带tags.txt").read().splitlines()
    breakouts_tags_d = {}
    no_breakouts_tags_d = {}
    for t in tags:
        breakouts_tags_d[t] = []
        no_breakouts_tags_d[t] = []

    conn = MyConn()
    for tid in breakouts_set:
        res = conn.query(targets=["tags"], conditions={"track_id": tid})[0]
        for t in res[0].split():
            breakouts_tags_d[t].append(tid)
    for tid in no_breakouts_set:
        res = conn.query(targets=["tags"], conditions={"track_id": tid})[0]
        for t in res[0].split():
            no_breakouts_tags_d[t].append(tid)

    # Hardcoded set sizes: 1748 breakout tracks; the /10 presumably converts a
    # 1000-track non-breakout set to a percentage (len/1000*100 == len/10).
    tags_count = []
    for k in breakouts_tags_d:
        tags_count.append((k, (float(format(len(breakouts_tags_d[k]) / 1748 * 100, '.2f')),
                               float(format(len(no_breakouts_tags_d[k]) / 10, '.2f')))))
    tags_count = sorted(tags_count, key=lambda x: x[1][0], reverse=False)
    draw_bar(dict(tags_count), "../data/main_tagged_tracks/tags_count.html")

def identify_release_drive_breakouts():
    '''
    Find the breakout samples caused by a new song release (at the very head of the timeline).
    '''
    conn = MyConn()
    breakouts = conn.query(targets=["id", "track_id", "date"], table="breakouts")
    release_breakouts_count = 0
    release_breakouts_tracks_set = set()
    more_breakouts_tracks_set = set()
    for b in breakouts:
        track_first_review = conn.query(targets=["first_review"],
                                        conditions={"track_id": b[1]},
                                        fetchall=False)[0]
        if b[2] - track_first_review < datetime.timedelta(days=15):
            release_breakouts_count += 1
            release_breakouts_tracks_set.add(b[1])
            conn.update(table="breakouts",
                        settings={"release_drive": 1},
                        conditions={"id": b[0]})
        else:
            more_breakouts_tracks_set.add(b[1])
    print(release_breakouts_count)
    print(len(release_breakouts_tracks_set))
    print(len(more_breakouts_tracks_set))

def refine_subtracks():
    '''
    Further refine the sub_tracks table:
    + remove tracks whose only breakouts are invalid (release_drive / capital_drive / fake)
    + require at least one breakout with beta>=50 and reviews_num>=100
    '''
    conn = MyConn()
    targets = ("id", "track_id", "beta", "reviews_num",
               "release_drive", "capital_drive", "fake")
    breakouts = conn.query(table="breakouts", targets=targets)
    d_track_valid_bnum = {}
    for b in breakouts:
        d_tmp = dict(zip(targets, b))
        if d_tmp["beta"] >= 50 and d_tmp["reviews_num"] >= 100 and \
                d_tmp["release_drive"] + d_tmp["capital_drive"] + d_tmp["fake"] == 0:
            tid = d_tmp["track_id"]
            if tid in d_track_valid_bnum:
                d_track_valid_bnum[tid] += 1
            else:
                d_track_valid_bnum[tid] = 1

    subtracks = [r[0] for r in conn.query(sql="SELECT track_id FROM sub_tracks WHERE bnum>0")]
    count_valid = 0
    for tid in subtracks:
        if tid in d_track_valid_bnum:
            count_valid += 1
        else:
            # print(tid, end=", ")
            conn.delete(table="sub_tracks", conditions={"track_id": tid})
    print("\n", count_valid)

def artist_vec_from_tags(min_tags_num=2):
    conn = MyConn()
    artists = conn.query(table="artists", targets=["name", "nid"])
    tracks_artists = conn.query(table="details", targets=["track_id", "artists"])

    d_artist_tracks = {}  # map each artist to their tracks
    for ar, nid in artists:
        if nid == "0":
            continue
        d_artist_tracks[ar.lower().strip()] = []
    tracks = set()
    for tid, t_artists in tracks_artists:
        tracks.add(tid)
        t_artists = t_artists.lower().strip().split(",")
        for ar in t_artists:
            if ar in d_artist_tracks:
                d_artist_tracks[ar].append(tid)

    tracks_tags = conn.query(sql="SELECT track_id, tags FROM tracks")
    tags = open("../data_related/自带tags.txt").read().splitlines()
    d_tag_index = dict([(t, i) for i, t in enumerate(tags)])
    d_track_tags_count = {}  # map each track to its tag-count vector
    for tid, t_tags in tracks_tags:
        if tid not in tracks:
            continue
        t_vec = np.zeros((len(tags),))
        t_tags = t_tags.split()
        for t in t_tags:
            t_vec[d_tag_index[t]] += 1
        d_track_tags_count[tid] = t_vec

    d_artist_tags_count = {}  # map each artist to the summed tag counts of their tracks
    for ar, ar_tracks in d_artist_tracks.items():
        if len(ar_tracks) == 0:
            continue
        ar_vec = np.sum(np.array([d_track_tags_count[tid] for tid in ar_tracks]), axis=0)
        if np.sum(ar_vec, axis=None) >= min_tags_num:
            d_artist_tags_count[ar] = ar_vec

    artists = list(d_artist_tags_count.keys())
    ar_vecs = list(d_artist_tags_count.values())
    ar_vecs = np.mat(ar_vecs).T
    # scaled_ar_vecs = StandardScaler().fit_transform(ar_vecs)  # mean=0, std=1
    scaled_ar_vecs = MinMaxScaler().fit_transform(ar_vecs)  # [0,1]
    scaled_ar_vecs = np.mat(scaled_ar_vecs).T

    # stats
    tags_count = np.sum(np.array(ar_vecs), axis=0)
    # for i in range(len(tags)):
    #     print(tags[i], tags_count[i])
    print(len(artists))

    d_artist_vec = {}
    for i in range(len(artists)):
        d_artist_vec[artists[i]] = np.array(scaled_ar_vecs[i]).ravel()
    # d_artist_vec = dict(zip(artists, scaled_ar_vecs))
    with open("../data/r_minmax_artists_vec_dict.pkl", "wb") as f:
        pickle.dump(d_artist_vec, f)

def breakouts_complements():
    '''
    Supplement breakout records with extra statistics (used to classify breakouts).
    '''
    conn = MyConn()
    logspace = [(0, 100), (100, 180), (180, 326), (326, 589), (589, 1066),
                (1066, 3494), (3494, 30000)]
    blevel_num = len(logspace)
    logspace_count = dict(zip(logspace, blevel_num * [0]))
    breakout_tracks = [r[0] for r in conn.query(targets=["DISTINCT(track_id)"],
                                                table="breakouts",
                                                conditions={"release_drive": 0})]
    for track_id in breakout_tracks:
        reviews_num, first_review, last_review = conn.query(
            targets=["reviews_num", "first_review", "last_review"],
            conditions={"track_id": track_id},
            fetchall=False)
        breakouts = conn.query(targets=["flag", "reviews_num", "beta", "release_drive"],
                               table="breakouts",
                               conditions={"track_id": track_id})
        days_num = (last_review - first_review).days
        # average daily reviews, excluding the breakout days
        avg_normal = float((reviews_num - np.sum([b[1] for b in breakouts]))
                           / (days_num - len(breakouts)))

        blevel_vec = blevel_num * [0]
        for b in breakouts:
            if b[3] == 1:
                continue  # skip release-driven breakouts
            for i in range(blevel_num):
                if logspace[i][0] <= b[2] < logspace[i][1]:  # locate the beta interval
                    blevel_vec[i] += 1
                    logspace_count[logspace[i]] += 1
                    break

        # blevel is the interval index averaged over the breakouts,
        # e.g. blevel_vec=[2, 1, 0, ...] -> blevel = (0*2 + 1*1) / 3 ≈ 0.33
        breakouts_num = int(np.sum(blevel_vec))
        blevel = 0
        for i in range(len(blevel_vec)):
            blevel += i * blevel_vec[i]
        blevel = blevel * 1.0 / breakouts_num

        settings = {
            "track_id": track_id,
            "average_reviews_num": avg_normal,
            "blevel_vec": ' '.join(map(str, blevel_vec)),
            "breakouts_num": breakouts_num,
            "blevel": blevel
        }
        conn.insert_or_update(table="breakouts_complements", settings=settings)
        # print(settings)
        print(track_id)

def get_X(track_id, use_mp3, use_lyrics, use_artist,
          lyrics_d2v_model, d_artist_vec, music_datatype="mfcc"):
    conn = MyConn()
    rawmusic_path, vggish_embed_path, lyrics_path, artist = conn.query(
        table="sub_tracks",
        conditions={"track_id": track_id},
        fetchall=False,
        targets=["rawmusic_path", "vggish_embed_path", "lyrics_path", "artist"])

    vecs = []
    if use_mp3:
        if music_datatype == "mfcc":
            music_vec = get_mfcc(rawmusic_path).ravel()
        elif music_datatype == "vggish":
            with open(vggish_embed_path, "rb") as f:
                music_vec = pickle.load(f).detach().numpy()
        vecs.append(music_vec)
    if use_lyrics:
        lyrics_vec = get_d2v_vector(lyrics_path, lyrics_d2v_model)
        vecs.append(lyrics_vec)
    if use_artist:
        artist_vec = d_artist_vec[artist.lower().strip()]
        vecs.append(artist_vec)

    features_vec = concatenate_features(vecs)
    return features_vec

def update_subtracks_music_words():
    conn = MyConn()
    valid_tracks_db = [r[0] for r in conn.query(
        sql="SELECT track_id FROM sub_tracks WHERE is_valid=1")]

    with open("../data/reviews_feature_words_with_freqs/breakouts_wo_simi.json") as f:
        data = json.load(f)
    valid_tracks_pos = list(set([bid.split('-')[0] for bid in data if data[bid]["len"] >= 5]))
    with open("../data/reviews_feature_words_with_freqs/no_breakouts_wo_simi.json") as f:
        data = json.load(f)
    valid_tracks_neg = [str(tid) for tid in data if data[tid]["len"] >= 5]
    valid_tracks = valid_tracks_pos + valid_tracks_neg

    print(len(valid_tracks_db))
    print(len(valid_tracks), len(valid_tracks_pos), len(valid_tracks_neg))

    for tid in valid_tracks_db:
        if tid not in valid_tracks:
            conn.update(table="sub_tracks",
                        settings={"is_valid": 0},
                        conditions={"track_id": tid})
            print(tid)

def check_feature_words():
    conn = MyConn()
    breakouts_feature_words = Counter()
    res = [r[0].split() for r in conn.query(targets=["feature_words"],
                                            table="breakouts_feature_words_1")]
    for r in res:
        breakouts_feature_words.update(r)
    valid_breakouts_feature_words = [
        p[0] for p in filter(lambda x: x[1] >= 30,
                             breakouts_feature_words.most_common())
    ]

    # This block was commented out, which left valid_no_breakouts_feature_words
    # undefined (NameError below); restored as written.
    no_breakouts_feature_words = Counter()
    res = [r[0].split() for r in conn.query(targets=["feature_words"],
                                            table="no_breakouts_feature_words_1")]
    for r in res:
        no_breakouts_feature_words.update(r)
    valid_no_breakouts_feature_words = [
        p[0] for p in filter(lambda x: x[1] >= 30,
                             no_breakouts_feature_words.most_common())
    ]

    intersection = set(valid_breakouts_feature_words).intersection(
        set(valid_no_breakouts_feature_words))
    print("intersection:\n", intersection)
    print("breakouts_unique:\n", set(valid_breakouts_feature_words) - intersection)
    print("no_breakouts_unique:\n", set(valid_no_breakouts_feature_words) - intersection)

def rubbish_tags():
    '''
    + Count the rubbish tags among each breakout's feature words.
    + Filter out samples dominated by rubbish tags as noise.
    + Remove the rubbish tags from the feature words and upload the result to the database.
    '''
    rubbish = set(open("../resources/rubbish_tags.txt").read().splitlines())  # set for O(1) lookup
    conn = MyConn()
    records = []
    for res in conn.query(targets=["id", "feature_words"],
                          table="breakouts_feature_words_c3"):
        # if conn.query(table="breakouts", targets=["release_drive"], fetchall=False,
        #               conditions={"id": res[0]})[0] == 1:
        #     continue
        feature_words = res[1].split()
        rubbish_count = 0
        for w in feature_words:
            if w in rubbish:
                rubbish_count += 1
        records.append([res[0], rubbish_count, feature_words])

    records.sort(key=lambda x: x[1], reverse=True)
    for r in records:
        print(r)

def update_path(table, key_col, col, root_dir, offset, overwrite=False):
    '''
    Update file paths in the database. `offset` is the number of trailing
    characters (e.g. the file extension) stripped from a filename to recover its key.
    '''
    conn = MyConn()
    count_update = 0
    for root, dirs, files in os.walk(root_dir):
        for file in files:
            if "OS" in file:
                continue
            filepath = os.path.join(root, file)
            key = file.split('/')[-1][:-offset]
            res = conn.query(table=table, targets=[col],
                             conditions={key_col: key}, fetchall=False)
            if overwrite:
                conn.update(table=table, settings={col: filepath},
                            conditions={key_col: key})
                count_update += 1
            else:
                if res and res[0] is None:
                    conn.update(table=table, settings={col: filepath},
                                conditions={key_col: key})
                    count_update += 1
    print(count_update)

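# Hedged usage sketch for update_path. The call below is illustrative only:
# the table/column names and the offset are hypothetical, not taken from the
# repo. offset=5 would strip a ".json" extension to recover the key.
#
# update_path(table="tracks", key_col="track_id", col="lyrics_path",
#             root_dir="../data/lyrics", offset=5, overwrite=False)
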
def build_dataset():
    conn = MyConn()
    dataset_size = 1500
    # conditional_sql = "rawmusic_path IS NOT NULL AND language in ('ch', 'en')"
    # Unpack single-column rows into plain ids, as done elsewhere in this file.
    pos_tracks = [r[0] for r in conn.query(
        sql="SELECT track_id FROM sub_tracks WHERE valid_bnum>0 AND is_valid=1 LIMIT {}".format(dataset_size))]
    neg_tracks = [r[0] for r in conn.query(
        sql="SELECT track_id FROM sub_tracks WHERE valid_bnum=0 AND is_valid=1 LIMIT {}".format(dataset_size))]

    lyrics_d2v_model = Doc2Vec.load("../models/d2v/d2v_b1.mod")  # lyrics d2v model
    with open("../data/artists_vec_dict_r_minmax.pkl", "rb") as f:
        d_artist_vec = pickle.load(f)

    X, y = [], []
    args = {
        "lyrics_d2v_model": lyrics_d2v_model,
        "d_artist_vec": d_artist_vec,
        "use_mp3": True,
        "use_lyrics": True,
        "use_artist": True,
        "music_datatype": "vggish"
    }

    def add_data(tracks, label):
        for t in tracks:
            try:
                X.append(get_X(track_id=t, **args))
                y.append(label)
            except KeyboardInterrupt:
                print("KeyboardInterrupt")
                break
            except:
                print(label, t)
                print(traceback.format_exc())

    add_data(pos_tracks, 1)
    add_data(neg_tracks, 0)

    dataset_index = "0317_vggish"
    dataset_name = "m" * args["use_mp3"] + "l" * args["use_lyrics"] + "a" * args["use_artist"] \
        + str(len(pos_tracks)) + '_' + str(dataset_index)
    with open("../data/dataset/{}.pkl".format(dataset_name), 'wb') as f:
        pickle.dump([X, y], f)

def chorus_duration_distribution():
    conn = MyConn()
    sql = "SELECT chorus_start, chorus_end FROM tracks WHERE chorus_start IS NOT NULL AND chorus_end IS NOT NULL"
    res = conn.query(sql=sql)
    res = list(filter(lambda x: x[0] != 0, res))
    print(len(res))
    durations = [p[1] - p[0] for p in res]
    sns.displot(durations)
    plt.show()

def get_feature_words_counter(table):
    conn = MyConn()
    counter = Counter()
    res = [r[0].split() for r in conn.query(targets=["feature_words"], table=table)]
    for r in res:
        counter.update(r)
    return counter

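# Example use, mirroring check_feature_words above: build a counter over one of
# the feature-words tables already referenced in this file and keep the words
# that occur at least 30 times (the same threshold check_feature_words uses).
#
# counter = get_feature_words_counter("breakouts_feature_words_1")
# frequent = [w for w, c in counter.most_common() if c >= 30]
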
def update_subtracks_havesimis():
    conn = MyConn()
    valid_tracks = set([r[0] for r in conn.query(
        sql="SELECT track_id FROM breakouts WHERE simi_score>=0.5")])
    for tid in valid_tracks:
        conn.update(table="sub_tracks",
                    settings={"have_simis": 1},
                    conditions={"track_id": tid})

def view_reviews_num_curve_html(track_id, save_dir, min_reviews=200):
    '''
    Plot the review-count curve of the given track id with pyecharts, marking:
    + each breakout's date
    + each breakout's feature_words
    '''
    conn = MyConn()
    json_path = conn.query(targets=["reviews_path"], conditions={"track_id": track_id})
    if len(json_path) > 0:
        json_path = "/Volumes/nmusic/NetEase2020/data" + json_path[0][0]
    else:
        return None
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    df = get_reviews_df(json_path)
    reviews_count, dates = get_reviews_count(df["date"].values)
    breakouts_group = get_breakouts(reviews_count, min_reviews=min_reviews)
    breakouts = [g[0] for g in breakouts_group]

    x, y = dates, reviews_count
    mark_points = []
    for flag, breakout in enumerate(breakouts):
        feature_words = conn.query(
            table="breakouts_feature_words_c3",
            targets=["filtered_feature_words"],
            conditions={"id": '-'.join([track_id, str(flag)])},
            fetchall=False)[0]
        px, beta = breakout
        mark_points.append(
            opts.MarkPointItem(name="{}{}".format(dates[px], feature_words),
                               coord=[dates[px], reviews_count[px]],
                               value=beta))

    c = (Line()
         .add_xaxis(x)
         .add_yaxis("评论曲线",  # series label: "reviews curve"
                    y,
                    markpoint_opts=opts.MarkPointOpts(data=mark_points))
         .set_global_opts(title_opts=opts.TitleOpts(title="{}".format(track_id)))
         .render(os.path.join(save_dir, "{}.html".format(track_id))))

def get_reviews_vec(track_id, breakout, w2v_model, key="wo_fake"):
    '''
    Fetch the stored feature words of the given track and turn them into w2v vectors.
    '''
    conn = MyConn()
    if key == "w_fake":
        col = "feature_words"
    elif key == "wo_fake":
        col = "feature_words_wo_fake"
    elif key == "tfidf":
        col = "feature_words_tfidf"
    elif key == "candidates":
        col = "feature_words_candidates"
    else:
        raise ValueError("unknown key: {}".format(key))

    feature_words = None
    if breakout == 1:
        bids = [r[0] for r in conn.query(
            sql="SELECT id FROM breakouts WHERE is_valid=1 and simi_score>=0.5 and track_id={}".format(track_id))]
        for bid in bids:
            feature_words = conn.query(
                sql="SELECT {} FROM breakouts_feature_words WHERE id='{}'".format(col, bid))
            if feature_words and feature_words[0][0]:
                break
    else:
        feature_words = conn.query(
            sql="SELECT {} FROM no_breakouts_feature_words WHERE track_id={}".format(col, track_id))

    # Same guard as in get_reviews_topk_words: avoid splitting a NULL column.
    if feature_words and feature_words[0][0]:
        feature_words = feature_words[0][0].split()
    else:
        feature_words = []
    # print(breakout, feature_words)

    reviews_vec = []
    for w in feature_words:
        vec = get_w2v_vector(w, w2v_model)
        if vec is not None:
            reviews_vec.append(vec)
    return reviews_vec

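# get_reviews_topk_words and get_reviews_vec duplicate the column selection and
# feature-word lookup above. A minimal refactor sketch, assuming only the
# MyConn.query API already used in this file; the helper name is hypothetical.
# Both functions could delegate to it.

def _fetch_feature_words(conn, track_id, is_breakout, col):
    """Return the stored feature words for a track, or [] if none are found."""
    rows = None
    if is_breakout == 1:
        bids = [r[0] for r in conn.query(
            sql="SELECT id FROM breakouts WHERE is_valid=1 and simi_score>=0.5 and track_id={}".format(track_id))]
        for bid in bids:
            rows = conn.query(
                sql="SELECT {} FROM breakouts_feature_words WHERE id='{}'".format(col, bid))
            if rows and rows[0][0]:
                break
    else:
        rows = conn.query(
            sql="SELECT {} FROM no_breakouts_feature_words WHERE track_id={}".format(col, track_id))
    if rows and rows[0][0]:
        return rows[0][0].split()
    return []
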
def get_specific_reviews(track_id, date):
    conn = MyConn()
    w2v_model = Word2Vec.load("../models/w2v/b1.mod")
    filepath = "/Volumes/nmusic/NetEase2020/data" + conn.query(
        targets=["reviews_path"],
        conditions={"track_id": track_id},
        fetchall=False)[0]
    df = get_reviews_df(filepath)
    reviews = df[df["date"] == date]["content"].values
    reviews = "\n".join(reviews)
    # print(reviews)
    top_words = tags_extractor(reviews, topk=30, w2v_model=w2v_model)
    print(top_words)

def get_breakouts_num():
    conn = MyConn()
    breakouts = conn.query(targets=["id", "track_id"], table="breakouts")
    track_2_bnum = {}
    for id_, track_id in breakouts:
        if track_id in track_2_bnum:
            track_2_bnum[track_id] += 1
        else:
            track_2_bnum[track_id] = 1
    for k, v in track_2_bnum.items():
        conn.update(table="sub_tracks",
                    settings={"bnum": v},
                    conditions={"track_id": k})

def get_tracks_set_db(sql, conditions):
    '''
    Fetch the set of tracks matching the given conditions from the database.
    params:
    + sql: e.g. 'SELECT track_id FROM tracks WHERE have_lyrics=%s'
    + conditions: e.g. {"have_lyrics": 1}
    return: tracks_set
    '''
    conn = MyConn()
    res = conn.query(sql=sql, conditions=conditions)
    res = set([str(r[0]) for r in res])
    return res

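# Usage, taken from the docstring above (this assumes MyConn.query binds
# `conditions` into the %s placeholder):
#
# tracks = get_tracks_set_db(
#     sql="SELECT track_id FROM tracks WHERE have_lyrics=%s",
#     conditions={"have_lyrics": 1})
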
def divide_artists():
    '''
    Extract the artists involved in breakout and non-breakout tracks.
    save: "breakouts_artists.txt": artists of breakout tracks
    save: "no_breakouts_artists.txt": artists of non-breakout tracks
    '''
    conn = MyConn()
    conditions = {"release_drive": 0, "capital_drive": 0, "fake": 0}
    b_tracks = [r[0] for r in conn.query(targets=["distinct(track_id)"],
                                         table="breakouts", conditions=conditions)]
    nb_tracks = [r[0] for r in conn.query(targets=["distinct(track_id)"],
                                          table="no_breakouts")]

    b_arts, nb_arts = set(), set()
    for t in b_tracks:
        arts = conn.query(targets=["artist"], table="details",
                          conditions={"track_id": t}, fetchall=False)[0].split(",")
        b_arts.update(arts)
    for t in nb_tracks:
        arts = conn.query(targets=["artist"], table="details",
                          conditions={"track_id": t}, fetchall=False)[0].split(",")
        nb_arts.update(arts)

    with open("../data_related/breakouts_artists.txt", 'w') as f:
        f.write("\n".join(b_arts))
    with open("../data_related/no_breakouts_artists.txt", 'w') as f:
        f.write("\n".join(nb_arts))

def build_train_test_dataset():
    conn = MyConn()
    random.seed(21)
    train_size, test_size = 3000, 1000
    size = train_size + test_size

    breakouts = random.sample([r[0] for r in conn.query(
        targets=["id"],
        conditions={"have_words": 1, "have_rawmusic": 1},
        table="breakouts")], size)
    breakouts_train, breakouts_test = breakouts[:train_size], breakouts[train_size:]

    no_breakouts = random.sample([r[0] for r in conn.query(
        targets=["id"],
        conditions={"have_words": 1, "have_rawmusic": 1},
        table="no_breakouts")], size)
    no_breakouts_train, no_breakouts_test = no_breakouts[:train_size], no_breakouts[train_size:]

    with open("../data/dataset/breakouts_id_train_2.txt", 'w') as f:
        f.write('\n'.join(breakouts_train))
    with open("../data/dataset/breakouts_id_test_2.txt", 'w') as f:
        f.write('\n'.join(breakouts_test))
    with open("../data/dataset/no_breakouts_id_train_2.txt", 'w') as f:
        f.write('\n'.join(no_breakouts_train))
    with open("../data/dataset/no_breakouts_id_test_2.txt", 'w') as f:
        f.write('\n'.join(no_breakouts_test))

def test_my_cluster():
    conn = MyConn()
    w2v_path = "../models/w2v/c4.mod"
    rubbish_tags = open("../resources/rubbish_words_for_weather.txt").read().splitlines()
    w2v_model = Word2Vec.load(w2v_path)

    valid_breakouts = conn.query(
        sql="SELECT id, date, reviews_num FROM breakouts WHERE release_drive=0 AND capital_drive=0 AND fake=0")
    valid_breakouts_info_d = dict(zip([p[0] for p in valid_breakouts],
                                      [(p[1], p[2]) for p in valid_breakouts]))
    breakouts_id_tags_p = conn.query(table="breakouts_feature_words_c3",
                                     targets=["id", "clean_feature_words"])

    tags_pool = []
    for id_, tags in breakouts_id_tags_p:
        if id_ in valid_breakouts_info_d:
            b_date, b_size = valid_breakouts_info_d[id_]
            for t in tags.split():
                if t not in rubbish_tags and t in w2v_model.wv:
                    tags_pool.append(Tag(t, b_date, b_size))
    print(len(tags_pool))  # 24796

    cluster_index = "weather"
    my_cluster = ClustersSet(w2v_path=w2v_path, affinity=0.55)
    my_cluster.grow(tags_pool)
    my_cluster.save(
        model_path="../models/my_cluster/my_cluster_{}.pkl".format(cluster_index),
        txt_path="../results/my_cluster_{}.txt".format(cluster_index),
        csv_path="../results/my_cluster_{}.csv".format(cluster_index),
        bsizes_csv_path="../results/my_clusters_{}_bsizes.csv".format(cluster_index))

def add_no_breakouts_feature_words_to_db():
    '''
    Add rows to the no_breakouts_feature_words table.
    '''
    conn = MyConn()
    w2v_model = Word2Vec.load("../models/w2v/c4.mod")
    rubbish_words_fake = open("../resources/rubbish_words_fake.txt").read().splitlines()
    candidates = open("../resources/music_words_cbm.txt").read().splitlines()
    # tfidf_model = models.TfidfModel.load("../models/bow/corpora_tfidf.model")
    # dictionary = corpora.Dictionary.load("../models/bow/corpora_dict.dict")
    # stoi = dictionary.token2id
    # itos = dict(zip(stoi.values(), stoi.keys()))

    data = conn.query(sql="SELECT id, track_id, text_path FROM no_breakouts")
    d_data = {}
    for id_, track_id, text_path in data:
        if track_id in d_data:
            d_data[track_id].append((id_, text_path))
        else:
            d_data[track_id] = [(id_, text_path)]
    print(len(d_data))

    for track_id, v in d_data.items():
        try:
            text = ""
            for id_, text_path in v:
                text += open(text_path).read()
            feature_words_mode = "candidates"  # raw, stop, tfidf
            col = "feature_words_candidates"  # feature_words, feature_words_wo_fake, feature_words_tfidf
            feature_words = get_feature_words(text,
                                              topk=10,
                                              mode=feature_words_mode,
                                              w2v_model=w2v_model,
                                              candidates=candidates,
                                              return_freq=True)
            for p in feature_words:
                print("{}:{:.3f}".format(p[0], p[1] * 100), end=" ")
            print()
            if len(feature_words) < 5:
                print(track_id, "not enough words.")
                continue
            # feature_words = " ".join(feature_words)
            # conn.insert(table="no_breakouts_feature_words",
            #             settings={"id": id_, "track_id": track_id, col: feature_words})
            # conn.update(table="no_breakouts_feature_words",
            #             settings={col: feature_words}, conditions={"track_id": track_id})
        except KeyboardInterrupt:
            break
        except:
            print(track_id)
            print(traceback.format_exc())

def mark_language():
    '''
    Mark the language of every track in the lyrics library.
    '''
    conn = MyConn()
    enchant_dict = enchant.Dict("en_US")
    for track_id, lyrics_path in conn.query(
            sql="SELECT track_id, lyrics_path FROM tracks WHERE lyrics_path is not null"):
        with open(lyrics_path) as f:
            content = json.load(f)
        lyrics = replace_noise(content["lrc"]["lyric"])
        # Strip credit lines (lyricist / composer / arranger / producer /
        # recording / mixing & mastering / executive producer). The original
        # pattern used [...], a character class, where a group was intended.
        lyrics = re.sub(r"( )*(作词|作曲|编曲|制作人|录音|混母带|监制).*\n", "", lyrics)
        if len(lyrics) < 10:  # effectively empty
            language = "empty"
        else:
            # The original always overwrote `language` here, so "empty" never
            # survived; the else branch fixes that.
            language = _mark_language(lyrics, enchant_dict)
        conn.update(table="tracks", settings={"language": language},
                    conditions={"track_id": track_id})

def build_tfidf_model(tracks_set):
    '''
    Data: breakout_tracks_set, no_breakout_tracks_set.
    Method: randomly sample at most 1000 reviews per track (the code below
    currently samples 100) and take its topk=20 tags as one doc.
    '''
    conn = MyConn()
    w2v_model = models.Word2Vec.load("/Users/inkding/Desktop/partial_netease/models/word2vec/b1.mod")
    files = []
    for track_id in tracks_set:
        files.append(conn.query(targets=["text_path"],
                                conditions={"track_id": track_id},
                                fetchall=False)[0])

    docs = []
    for i, file in enumerate(files):
        print(i)
        # content = open(file).read()[:1000]
        content = open(file).read().splitlines()
        content = random.sample(content, min(100, len(content)))
        content = "\n".join(content)
        docs.append(tags_extractor(content, topk=20, w2v_model=w2v_model))
        if i == 50:  # debug preview: stop early and inspect the docs
            for d in docs:
                print(d)
            break

    dictionary = corpora.Dictionary(docs)
    bows = [dictionary.doc2bow(doc) for doc in docs]
    tfidf_model = models.TfidfModel(bows)
    dictionary.save('../models/bow/1/corpora_dict.dict')  # reload with corpora.Dictionary.load(path)
    tfidf_model.save('../models/bow/1/corpora_tfidf.model')  # reload with models.TfidfModel.load(path)

    # vocab maps
    stoi = dictionary.token2id
    print("words num:", len(stoi))
    itos = dict(zip(stoi.values(), stoi.keys()))

    # test
    for i in range(20):
        test_doc = docs[i]
        test_bow = dictionary.doc2bow(test_doc)
        # tf-idf representation
        test_tfidf = sorted(tfidf_model[test_bow], key=lambda x: x[1], reverse=True)
        print(test_doc)
        for item in test_tfidf[:5]:
            print(itos[item[0]], item[1])
        print()

def basic_analysis(tracks_set):
    '''
    Basic analysis of the given track set: review counts, time span, ...
    '''
    conn = MyConn()
    # prepare data
    data = []
    targets = ["track_id", "tags", "reviews_num", "first_review", "last_review"]
    for tid in tracks_set:
        res = conn.query(targets=targets, conditions={"track_id": int(tid)})
        data.append(res[0])
    df = pd.DataFrame(data, columns=targets)
    # df.to_csv("../results/main_tagged_tracks/basic_info.csv", encoding="utf_8_sig", index=False)

    draw_hist(df["reviews_num"].values, log_scale=True, color="tab:orange")

def add_no_breakouts_feature_words_to_json():
    conn = MyConn()
    w2v_model = Word2Vec.load("../models/w2v/c4.mod")
    rubbish_words_fake = open("../resources/rubbish_words_fake.txt").read().splitlines()
    candidates = open("../resources/music_words/music_words_cls_pos_pred.txt").read().splitlines()
    # remove = open("../resources/music_words/music_words_similar.txt").read().splitlines()
    # candidates = [w for w in candidates if w not in remove]

    data = conn.query(sql="SELECT id, track_id, text_path FROM no_breakouts")
    d_data = {}
    for id_, track_id, text_path in data:
        if track_id in d_data:
            d_data[track_id].append((id_, text_path))
        else:
            d_data[track_id] = [(id_, text_path)]
    print(len(d_data))

    json_data = {}
    for track_id, v in list(d_data.items()):
        try:
            text = ""
            for id_, text_path in v:
                text += open(text_path).read()
            feature_words_mode = "candidates"  # raw, stop, tfidf
            feature_words = get_feature_words(text,
                                              topk=10,
                                              mode=feature_words_mode,
                                              w2v_model=w2v_model,
                                              candidates=candidates,
                                              return_freq=True)
            words, freqs = zip(*feature_words)
            json_data[track_id] = {
                "words": words,
                "freqs": freqs,
                "len": len(words)
            }
            if len(feature_words) < 5:
                print(track_id, "not enough words.")
        except KeyboardInterrupt:
            break
        except:
            print(track_id)
            print(traceback.format_exc())

    with open("../data/reviews_feature_words_with_freqs/no_breakouts_cls.json", 'w') as f:
        json.dump(json_data, f, indent=2, ensure_ascii=False)

def view_reviews_num_curve(track_id, min_reviews=200, save_path=None):
    '''
    Plot the review-count curve of the given track id (breakouts highlighted).
    '''
    conn = MyConn()
    json_path = conn.query(targets=["reviews_path"], conditions={"track_id": track_id})
    if len(json_path) > 0:
        json_path = "/Volumes/nmusic/NetEase2020/data" + json_path[0][0]
    else:
        return None

    df = get_reviews_df(json_path)
    reviews_count, dates = get_reviews_count(df["date"].values)
    breakouts_group = get_breakouts(reviews_count, min_reviews=min_reviews)

    fig, ax = plt.subplots()
    x = list(range(len(reviews_count)))
    ax.plot(x, reviews_count)

    palette = plt.get_cmap('Paired')(np.linspace(0, 1, 10))
    y_head, beta_head = [], []
    for i in range(min(len(breakouts_group), 10)):
        x = list(zip(*breakouts_group[i]))[0]
        y = [reviews_count[j] for j in x]
        y_head.append(y[0])
        beta_head.append(breakouts_group[i][0][1])
        ax.scatter(x=x, y=y, color=palette[i])

    # The NullFormatter call was duplicated in the original; once is enough.
    ax.xaxis.set_major_formatter(plt.NullFormatter())
    ax.set_xlabel("time")
    ax.set_ylabel("reviews_num")
    # text = '\n'.join(["count:{}, beta:{}".format(y_head[i], beta_head[i])
    #                   for i in range(len(y_head))])
    # ax.text(0, 1, text, verticalalignment="top", horizontalalignment="left",
    #         transform=ax.transAxes)

    if save_path:
        if not os.path.exists(os.path.dirname(save_path)):
            os.makedirs(os.path.dirname(save_path))
        plt.savefig(save_path)
    else:
        plt.show()
    plt.close()

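# Hedged usage sketch: render the matplotlib curve for a single track. The
# track id and output path are placeholders, not real values from the dataset.
#
# if __name__ == "__main__":
#     view_reviews_num_curve("12345678", min_reviews=200,
#                            save_path="../results/curves/12345678.png")
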