import logging
import gensim
import bz2

# Timestamped INFO-level logging for everything that follows.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO
)

from gensim import corpora, models, similarities
import sys
import numpy as np
import MySQLdb as mysql
import json

# The search path has to be extended before ImportPath can be imported.
sys.path.append("/Users/Fukuball/localhost/lyrics-match/p-library/model")
import ImportPath
# NOTE(review): presumably registers further project paths that db_stage
# needs -- confirm against ImportPath.
ImportPath.Import()
import db_stage

CONST = db_stage._Const()

# Open the database connection and switch the session to UTF-8.
db = mysql.connect(
    host=CONST.DBHOST,
    user=CONST.DBUSER,
    passwd=CONST.DBPASS,
    db=CONST.DBNAME,
    charset='UTF8'
)
cur = db.cursor()
cur.execute("SET NAMES UTF8")
cur.execute("SET CHARACTER_SET_CLIENT=UTF8")
cur.execute("SET CHARACTER_SET_RESULTS=UTF8")
db.commit()

# The song to work on is the first command-line argument (kept as a string).
song_id = sys.argv[1]

# Load the id -> word mapping (the dictionary) from its text export.
id2word = gensim.corpora.Dictionary.load_from_text(
    '20120924_lyrics_wordids_ch.txt'
)
def process(self, songId):
    """Collect per-line lyric features for one song from the lyrics tables.

    For every row of ``lyrics_line`` belonging to ``songId`` (ordered by
    offset) one entry is appended to each list of the returned dict:

    * ``"word_count"``: for each sentence in the line, the number of rows
      found in ``lyrics_word`` followed by ``''`` as a separator.
    * ``"pinyin"``: one ``(consonant id, vowel id)`` tuple per word, with
      ``''`` appended after each sentence as a separator.
    * ``"tone"``: one tone id per word, without separators.
    * ``"pos"``: the ``pos`` of every ``lyrics_term`` row whose value also
      appears in ``lyrics_pos_mapping``, with ``''`` after each sentence.

    Ids are taken from the first matching row of the corresponding
    ``lyrics_*_mapping`` table.

    Args:
        songId: value compared against the ``song_id`` column; it is passed
            to the driver as a query parameter.

    Returns:
        dict with the keys ``"word_count"``, ``"pinyin"``, ``"pos"`` and
        ``"tone"``, each mapping to a list with one element per lyric line.

    Raises:
        IndexError: if a word's consonant, vowel or tone has no row in the
            matching mapping table.
    """
    from contextlib import closing

    import MySQLdb as mysql
    import MySQLdb.cursors as cursors
    import sys
    from InputProcess import Tone2Pitch

    # Only extend sys.path once; appending unconditionally grows it on
    # every call.
    configPath = "www/html/lyrics-match/p-config"
    if configPath not in sys.path:
        sys.path.append(configPath)
    import db_stage

    def firstIdBy(rows, key):
        # Map row[key] -> row["id"]; the first row wins on duplicates.
        lookup = {}
        for row in rows:
            lookup.setdefault(row[key], row["id"])
        return lookup

    def idFor(lookup, value, kind):
        # Missing mappings keep raising IndexError, but say what is missing.
        try:
            return lookup[value]
        except KeyError:
            raise IndexError("no %s mapping found for %r" % (kind, value))

    lineSql = ("SELECT line, offset, length FROM lyrics_line "
               "WHERE song_id = %s ORDER BY offset ASC")
    sentenceSql = ("SELECT offset, length FROM lyrics_sentence "
                   "WHERE song_id = %s AND offset >= %s AND offset < %s "
                   "ORDER BY offset ASC")
    wordSql = ("SELECT word, consonant, vowel, tone FROM lyrics_word "
               "WHERE song_id = %s AND offset >= %s AND offset < %s "
               "ORDER BY offset ASC")
    termSql = ("SELECT pos FROM lyrics_term "
               "WHERE song_id = %s AND offset >= %s AND offset < %s "
               "ORDER BY offset ASC")

    feature = {"word_count": [], "pinyin": [], "pos": [], "tone": []}

    # DB connection settings come from the project's db_stage constants.
    CONST = db_stage._Const()
    connection = mysql.connect(host=CONST.DBHOST, user=CONST.DBUSER,
                               passwd=CONST.DBPASS, db=CONST.DBNAME,
                               charset='UTF8')

    # closing() guarantees cursor and connection are released even when a
    # query or a mapping lookup fails.
    with closing(connection) as conn:
        with closing(conn.cursor(cursorclass=cursors.DictCursor)) as dictCursor:

            def fetch(sql, args=None):
                dictCursor.execute(sql, args)
                return dictCursor.fetchall()

            conIds = firstIdBy(
                fetch("SELECT * FROM lyrics_consonant_mapping"), "consonant")
            vowelIds = firstIdBy(
                fetch("SELECT * FROM lyrics_vowel_mapping"), "vowel")
            toneIds = firstIdBy(
                fetch("SELECT * FROM lyrics_tone_mapping"), "tone")
            # A set stays reusable for every membership test (a lazy map()
            # result would be exhausted after the first one).
            knownPos = {row["pos"]
                        for row in fetch("SELECT pos FROM lyrics_pos_mapping")}

            for line in fetch(lineSql, (songId,)):
                lineEnd = line["offset"] + line["length"]
                wordCountLine = []
                pinyinLine = []
                toneLine = []
                posLine = []

                for sentence in fetch(sentenceSql,
                                      (songId, line["offset"], lineEnd)):
                    span = (songId, sentence["offset"],
                            sentence["offset"] + sentence["length"])

                    wordList = fetch(wordSql, span)

                    # Number of words in this sentence; '' is a separator.
                    wordCountLine.append(len(wordList))
                    wordCountLine.append('')

                    # Pinyin as (consonant id, vowel id); '' is a separator.
                    pinyinLine.extend(
                        (idFor(conIds, word["consonant"], "consonant"),
                         idFor(vowelIds, word["vowel"], "vowel"))
                        for word in wordList)
                    pinyinLine.append('')

                    # Tones need no separator.
                    toneLine.extend(idFor(toneIds, word["tone"], "tone")
                                    for word in wordList)

                    # Part of speech: keep only values present in
                    # lyrics_pos_mapping; '' is a separator.
                    posLine.extend(term["pos"]
                                   for term in fetch(termSql, span)
                                   if term["pos"] in knownPos)
                    posLine.append('')

                feature["word_count"].append(wordCountLine)
                feature["pinyin"].append(pinyinLine)
                feature["tone"].append(toneLine)
                feature["pos"].append(posLine)

            # NOTE(review): the return value is discarded, so this
            # presumably rewrites feature["tone"] in place -- confirm
            # against Tone2Pitch.
            Tone2Pitch().process(feature["tone"])

    return feature