import os
import sys
import json
import pickle
import traceback

from gensim.models import Word2Vec

# MyConn (database wrapper) and tags_extractor (keyword extractor) are
# project-local helpers and are assumed to be importable in this module.


def create_artists_table():
    '''
    Create the artists table in the database, containing id, nid, name.
    '''
    read_path = "/Volumes/nmusic/NetEase2020/data/simple_proxied_tracks_details"
    artists_set = set()
    conn = MyConn()

    # Walk the crawled track-detail JSON files and collect (artist_id, artist_name) pairs.
    for root, dirs, files in os.walk(read_path):
        for file in files:
            if "DS" in file:  # skip macOS .DS_Store files
                continue
            filepath = os.path.join(root, file)
            with open(filepath) as f:
                content = json.load(f)
            try:
                for ar in content["songs"][0]["ar"]:
                    artists_set.add((ar["id"], ar["name"]))
            except KeyboardInterrupt:
                print("interrupted by keyboard.")
                sys.exit(0)
            except Exception as e:
                print(filepath, e)

    print(len(artists_set))
    # Insert the de-duplicated artists; nid is the NetEase artist id.
    for ar in artists_set:
        conn.insert(table="artists", settings={"nid": ar[0], "name": ar[1]})
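# For reference, each crawled detail file is expected to look roughly like the
# sketch below. Only the fields the loop above actually reads
# ("songs" -> first entry -> "ar" -> "id"/"name") are shown; real NetEase
# responses contain many more fields, and the concrete values here are made up.
EXAMPLE_TRACK_DETAIL = {
    "songs": [
        {
            "ar": [
                {"id": 12345, "name": "Some Artist"},
                {"id": 67890, "name": "Another Artist"},
            ]
        }
    ]
}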
def task(pid, task_args):
    '''
    Worker process: repeatedly claim an unprocessed breakout from the breakouts
    table, extract its feature words, and store them in breakouts_feature_words_c3.
    '''
    conn = MyConn()
    w2v_model = Word2Vec.load("../models/w2v/c4.mod")

    while True:
        # The lock guards the claim step (query + have_words update) so that
        # two workers never pick up the same row.
        task_args["lock"].acquire()
        res = conn.query(targets=["id", "text_path"], conditions={"have_words": 0},
                         table="breakouts", fetchall=False)
        if res is None:
            task_args["lock"].release()
            break

        id_, text_path = res
        conn.update(table="breakouts", settings={"have_words": 1}, conditions={"id": id_})
        task_args["lock"].release()

        try:
            with open(text_path) as f:
                text = f.read()
            feature_words = tags_extractor(text, topk=10, w2v_model=w2v_model)
            conn.insert(table="breakouts_feature_words_c3",
                        settings={"id": id_, "feature_words": " ".join(feature_words)})
            # print("[Process-{}] id: {}, feature_words: {}".format(pid, id_, feature_words))
        except Exception:
            # Roll back the claim so another worker can retry this row.
            conn.update(table="breakouts", settings={"have_words": 0}, conditions={"id": id_})
            print(id_)
            print(traceback.format_exc())
            break
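# The pid argument and the shared lock suggest task() is meant to run in
# several worker processes at once. A minimal launcher sketch follows;
# run_workers and its default pool size are assumptions, not part of the
# original pipeline.
from multiprocessing import Process, Lock


def run_workers(n_workers=4):
    # Spawn n_workers processes that each run task() until the breakouts
    # table has no rows left with have_words = 0.
    lock = Lock()
    task_args = {"lock": lock}
    workers = [Process(target=task, args=(pid, task_args)) for pid in range(n_workers)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()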
def create_subtracks_table():
    '''
    Create the sub_tracks table.
    Track selection criteria:
      + has mp3_path, lyrics_path and json_path (reviews)
      + for tracks with breakouts, the breakout must not be fake, capital_drive
        or release_drive, and must have reviews_num >= 100 and beta >= 50
      + has an artist_vec
    '''
    # Load the artists' vector representations.
    with open("../data/artists_vec_dict.pkl", "rb") as f:
        d_artist_vec = pickle.load(f)

    conn = MyConn()

    # Tracks that have mp3_path, lyrics_path and json_path (reviews).
    data = conn.query(
        sql="SELECT track_id, bnum, mp3_path, lyrics_path, json_path FROM tracks WHERE\
            bnum IS NOT NULL AND mp3_path IS NOT NULL AND lyrics_path IS NOT NULL AND json_path IS NOT NULL"
    )

    # Breakouts must not be fake, capital_drive or release_drive, and need
    # reviews_num >= 100 and beta >= 50.
    # d_track_valid_bnum records the number of valid breakouts per track.
    targets = ("id", "track_id", "beta", "reviews_num", "release_drive", "capital_drive", "fake")
    breakouts = conn.query(table="breakouts", targets=targets)
    d_track_valid_bnum = {}
    for b in breakouts:
        d_tmp = dict(zip(targets, b))
        if d_tmp["beta"] >= 50 and d_tmp["reviews_num"] >= 100 and \
                d_tmp["release_drive"] + d_tmp["capital_drive"] + d_tmp["fake"] == 0:
            tid = d_tmp["track_id"]
            if tid in d_track_valid_bnum:
                d_track_valid_bnum[tid] += 1
            else:
                d_track_valid_bnum[tid] = 1

    new_data = []
    for item in data:
        track_id, bnum = item[0], item[1]

        # Filter by valid breakouts.
        valid_bnum = 0
        if bnum > 0:
            if track_id not in d_track_valid_bnum:
                continue
            valid_bnum = d_track_valid_bnum[track_id]

        # Filter by artist_vec: keep the first listed artist that has a vector.
        valid_artist = None
        artists = conn.query(table="details", targets=["artists"],
                             conditions={"track_id": track_id}, fetchall=False)
        if artists:
            artists = artists[0].split(',')
            for ar in artists:
                if ar.lower().strip() in d_artist_vec:
                    valid_artist = ar.lower().strip()
                    break
        if not valid_artist:
            continue

        new_data.append([track_id, valid_bnum, valid_artist, item[2], item[3], item[4]])

    # Write the filtered tracks to the database.
    columns = ("track_id", "valid_bnum", "artist", "mp3_path", "lyrics_path", "json_path")
    for item in new_data:
        conn.insert(table="sub_tracks", settings=dict(zip(columns, item)))
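# Hypothetical entry point (not in the original source; the call order is an
# assumption): collect artists first, then build the filtered sub_tracks table.
# Feature-word extraction can be launched separately via run_workers().
if __name__ == "__main__":
    create_artists_table()
    create_subtracks_table()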