import datetime
import json
import os
import pickle
import re
import traceback

import enchant
import numpy as np
from gensim.models import Word2Vec

# Project-local helpers (MyConn, replace_noise, _mark_language, get_chorus,
# tags_extractor) are assumed to be imported elsewhere in this repo.


def update_path(table, key_col, col, root_dir, offset, overwrite=False):
    ''' Update file paths in the database. '''
    conn = MyConn()
    count_update = 0
    for root, dirs, files in os.walk(root_dir):
        for file in files:
            if "OS" in file:
                continue
            filepath = os.path.join(root, file)
            # Strip the trailing `offset` characters (the extension) to get the key.
            key = file[:-offset]
            res = conn.query(table=table, targets=[col],
                             conditions={key_col: key}, fetchall=False)
            if overwrite:
                conn.update(table=table, settings={col: filepath},
                            conditions={key_col: key})
                count_update += 1
            else:
                # Only fill in rows whose path column is still empty.
                if res and res[0] is None:
                    conn.update(table=table, settings={col: filepath},
                                conditions={key_col: key})
                    count_update += 1
    print(count_update)
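# Example invocation, a sketch only (table/column names and the root path are
# hypothetical; offset=4 would strip a ".mp3" suffix so the filename stem
# serves as the key):
# update_path(table="tracks", key_col="track_id", col="mp3_path",
#             root_dir="../data/mp3", offset=4)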
def identify_release_drive_breakouts():
    ''' Identify breakout points driven by a new release
        (they sit at the very start of a track's timeline). '''
    conn = MyConn()
    breakouts = conn.query(targets=["id", "track_id", "date"], table="breakouts")
    release_breakouts_count = 0
    release_breakouts_tracks_set = set()
    more_breakouts_tracks_set = set()
    for b in breakouts:
        track_first_review = conn.query(targets=["first_review"],
                                        conditions={"track_id": b[1]},
                                        fetchall=False)[0]
        # A breakout within 15 days of the first review counts as release-driven.
        if b[2] - track_first_review < datetime.timedelta(days=15):
            release_breakouts_count += 1
            release_breakouts_tracks_set.add(b[1])
            conn.update(table="breakouts", settings={"release_drive": 1},
                        conditions={"id": b[0]})
        else:
            more_breakouts_tracks_set.add(b[1])
    print(release_breakouts_count)
    print(len(release_breakouts_tracks_set))
    print(len(more_breakouts_tracks_set))
def update_subtracks_music_words():
    ''' Mark sub_tracks rows as invalid when they lack enough feature words. '''
    conn = MyConn()
    valid_tracks_db = [
        r[0] for r in conn.query(
            sql="SELECT track_id FROM sub_tracks WHERE is_valid=1")
    ]
    with open("../data/reviews_feature_words_with_freqs/breakouts_wo_simi.json") as f:
        data = json.load(f)
    valid_tracks_pos = list(
        set(bid.split('-')[0] for bid in data if data[bid]["len"] >= 5))
    with open("../data/reviews_feature_words_with_freqs/no_breakouts_wo_simi.json") as f:
        data = json.load(f)
    valid_tracks_neg = [str(tid) for tid in data if data[tid]["len"] >= 5]
    valid_tracks = valid_tracks_pos + valid_tracks_neg
    # Set for O(1) membership checks in the loop below.
    valid_tracks_set = set(valid_tracks)
    print(len(valid_tracks_db))
    print(len(valid_tracks), len(valid_tracks_pos), len(valid_tracks_neg))
    for tid in valid_tracks_db:
        if tid not in valid_tracks_set:
            conn.update(table="sub_tracks", settings={"is_valid": 0},
                        conditions={"track_id": tid})
            print(tid)
def update_subtracks_havesimis():
    ''' Flag sub_tracks whose breakouts have a similarity score >= 0.5. '''
    conn = MyConn()
    valid_tracks = set(
        r[0] for r in conn.query(
            sql="SELECT track_id FROM breakouts WHERE simi_score>=0.5"))
    for tid in valid_tracks:
        conn.update(table="sub_tracks", settings={"have_simis": 1},
                    conditions={"track_id": tid})
def get_breakouts_num():
    ''' Count breakouts per track and store the count in sub_tracks.bnum. '''
    conn = MyConn()
    breakouts = conn.query(targets=["id", "track_id"], table="breakouts")
    track_2_bnum = {}
    for id_, track_id in breakouts:
        track_2_bnum[track_id] = track_2_bnum.get(track_id, 0) + 1
    for k, v in track_2_bnum.items():
        conn.update(table="sub_tracks", settings={"bnum": v},
                    conditions={"track_id": k})
def mark_language():
    ''' Tag the language of every song in the lyrics library. '''
    conn = MyConn()
    enchant_dict = enchant.Dict("en_US")
    for track_id, lyrics_path in conn.query(
            sql="SELECT track_id, lyrics_path FROM tracks WHERE lyrics_path is not null"):
        with open(lyrics_path) as f:
            content = json.load(f)
        lyrics = replace_noise(content["lrc"]["lyric"])
        # Strip credit lines (lyricist, composer, arranger, producer, etc.).
        lyrics = re.sub(r"( )*(作词|作曲|编曲|制作人|录音|混母带|监制).*\n", "", lyrics)
        if len(lyrics) < 10:
            # Nothing meaningful left after cleaning.
            language = "empty"
        else:
            language = _mark_language(lyrics, enchant_dict)
        conn.update(table="tracks", settings={"language": language},
                    conditions={"track_id": track_id})
def copy_columns(t1, t2, col, key_col="track_id"):
    ''' Copy one column of a database table into another table.
        params:
            t1: source table
            t2: destination table
            col: column name
            key_col: key column used to match rows
    '''
    conn = MyConn()
    data = conn.query(table=t1, targets=[key_col, col])
    for key_v, v in data:
        try:
            conn.update(table=t2, settings={col: v},
                        conditions={key_col: key_v})
        except Exception:
            print("ERROR {}: {}".format(key_col, key_v))
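# Example invocation, a sketch only (the table/column combination is a
# hypothetical one; it would mirror chorus_start values computed on
# sub_tracks over to the tracks table):
# copy_columns(t1="sub_tracks", t2="tracks", col="chorus_start")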
def add_desc_source_2_database():
    ''' Write the source of each artist's description text into the database. '''
    conn = MyConn()
    with open("../data/name_2_KB_ents.pkl", "rb") as f:
        name_2_KB_ents = pickle.load(f)
    ents = set(name_2_KB_ents.keys())
    with open("../data/sup_artists_desc_2.json") as f:
        content = json.load(f)
    sup_arts = set(item["artist"].lower() for item in content)
    # print("KB: {}, Netease: {}, intersection: {}".format(
    #     len(ents), len(sup_arts), len(sup_arts.intersection(ents))))
    # source=1: knowledge-base entities; source=2: supplementary descriptions.
    for key in ents:
        conn.update(table="artists", settings={"source": 1},
                    conditions={"lower_name": key})
    for name in sup_arts:
        conn.update(table="artists", settings={"source": 2},
                    conditions={"lower_name": name})
def update_special_tag1():
    ''' Update special_tag in sub_tracks from the previously generated
        special_words table. '''
    conn = MyConn()
    sql = ("SELECT id, special_words FROM breakouts_feature_words_c3 "
           "WHERE LENGTH(special_words)>0")
    data = conn.query(sql=sql)
    special_words = ["高考", "翻唱", "节日", "抖音"]
    for bid, text in data:
        words = text.split()
        for w in words:
            if w in special_words:
                tid = bid.split('-')[0]
                try:
                    conn.update(table="sub_tracks",
                                settings={"special_tag": w},
                                conditions={"track_id": tid})
                except Exception:
                    print(tid)
                # Keep only the first matching special word per breakout.
                break
def task(thread_id, task_args):
    ''' Detect chorus_start and write it to the database (tracks & sub_tracks). '''
    conn = MyConn()
    while 1:
        sql = ("SELECT track_id, mp3_path from sub_tracks "
               "WHERE chorus_start is NULL AND valid_bnum=0")
        task_args["lock"].acquire()
        res = conn.query(sql=sql, fetchall=False)
        if res is None:
            # No songs left in sub_tracks with chorus_start IS NULL.
            task_args["lock"].release()
            return
        tid, mp3_path = res
        # Claim the row with a placeholder value before releasing the lock,
        # so other workers do not pick up the same track.
        conn.update(table="sub_tracks", settings={"chorus_start": 0},
                    conditions={"track_id": tid})
        task_args["lock"].release()
        try:
            # Detect and clip the chorus section of the song; clip length 20s.
            duration = 20
            chorus = get_chorus(mp3_path, clip_length=duration)
            if chorus is None:
                conn.update(table="sub_tracks", settings={"chorus_start": -1},
                            conditions={"track_id": tid})
            else:
                chorus_start, chorus_end = chorus
                conn.update(table="sub_tracks",
                            settings={"chorus_start": chorus_start},
                            conditions={"track_id": tid})
                conn.update(table="tracks",
                            settings={"chorus_start": chorus_start},
                            conditions={"track_id": tid})
        except KeyboardInterrupt:
            # Reset the claimed row so the track can be processed later.
            conn.update(sql="UPDATE sub_tracks SET chorus_start=NULL "
                            "WHERE track_id={}".format(tid))
            return
        except Exception:
            print(tid, traceback.format_exc())
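# A minimal sketch of how this worker might be launched; the thread count
# and the exact task_args structure are assumptions, not from the original
# code (only the "lock" key is used above):
# import threading
# task_args = {"lock": threading.Lock()}
# threads = [threading.Thread(target=task, args=(i, task_args)) for i in range(4)]
# for t in threads:
#     t.start()
# for t in threads:
#     t.join()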
def task(pid, task_args):
    ''' Extract feature words from each breakout's review text and store them. '''
    conn = MyConn()
    w2v_model = Word2Vec.load("../models/w2v/c4.mod")
    while 1:
        task_args["lock"].acquire()
        res = conn.query(targets=["id", "text_path"],
                         conditions={"have_words": 0},
                         table="breakouts", fetchall=False)
        if res is not None:
            id_, text_path = res
            # Claim the row before releasing the lock so other workers skip it.
            conn.update(table="breakouts", settings={"have_words": 1},
                        conditions={"id": id_})
            task_args["lock"].release()
            try:
                feature_words = tags_extractor(open(text_path).read(),
                                               topk=10, w2v_model=w2v_model)
                conn.insert(table="breakouts_feature_words_c3",
                            settings={"id": id_,
                                      "feature_words": " ".join(feature_words)})
                # print("[Process-{}] id: {}, feature_words: {}".format(pid, id_, feature_words))
            except Exception:
                # Release the claim so the row can be retried later.
                conn.update(table="breakouts", settings={"have_words": 0},
                            conditions={"id": id_})
                print(id_)
                print(traceback.format_exc())
                break
        else:
            task_args["lock"].release()
            break
def filtered_feature_words():
    ''' Clean the extracted feature words: drop rubbish words, then merge
        near-synonyms using the w2v model. '''
    conn = MyConn()
    w2v_model = Word2Vec.load("../models/w2v/c3.mod")
    rubbish = open("../resources/rubbish_words.txt").read().splitlines()
    with open("../resources/feature_words_counter_d.pkl", "rb") as f:
        feature_words_counter_d = pickle.load(f)
    for id_, text in conn.query(targets=["id", "feature_words"],
                                table="breakouts_feature_words_c3"):
        # Filter out rubbish words.
        feature_words = text.split()
        filtered_feature_words = [w for w in feature_words if w not in rubbish]
        # conn.update(table="breakouts_feature_words_c3",
        #             settings={"filtered_feature_words": " ".join(filtered_feature_words)},
        #             conditions={"id": id_})

        # Merge words with similar meanings, based on the w2v model.
        simi_matrix = []
        size = len(filtered_feature_words)
        for i in range(size):
            tmp = []
            for j in range(size):
                w1, w2 = filtered_feature_words[i], filtered_feature_words[j]
                tmp.append(w2v_model.wv.similarity(w1, w2))
            simi_matrix.append(tmp)

        # Greedy single-link grouping: a word joins a group as soon as its
        # similarity to any existing member reaches 0.55.
        selected = [0] * size
        groups = []
        for i in range(size):
            if selected[i] == 1:
                continue
            selected[i] = 1
            group_wi = [i]
            for j in range(size):
                if selected[j] != 1:
                    for wi in group_wi:
                        if simi_matrix[wi][j] >= 0.55:
                            group_wi.append(j)
                            selected[j] = 1
                            break
            group = [filtered_feature_words[wi] for wi in group_wi]
            groups.append(group)
        # print(groups)

        # Keep the most frequent word of each group as its representative.
        clean_feature_words = []
        for g in groups:
            if len(g) > 1:
                clean_feature_words.append(
                    g[np.argmax([feature_words_counter_d[w] for w in g])])
            else:
                clean_feature_words.append(g[0])
        # print(clean_feature_words)
        conn.update(
            table="breakouts_feature_words_c3",
            settings={"clean_feature_words": " ".join(clean_feature_words)},
            conditions={"id": id_})
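# Toy illustration of the grouping step above (words and scores are made up
# for the example): with filtered_feature_words = ["高考", "考试", "翻唱"] and
# pairwise similarities sim(高考, 考试)=0.7, sim(高考, 翻唱)=0.1,
# sim(考试, 翻唱)=0.2, the greedy pass yields [["高考", "考试"], ["翻唱"]],
# and the more frequent of 高考/考试 (per feature_words_counter_d) is kept.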