def operater(pageQueue,resultQueue):
    """Worker loop: parse queued pages and publish the extracted records.

    Runs forever. Every truthy item taken from ``pageQueue`` is indexed as
    ``item[0]`` (handed to ``tool.getplayerid``) and ``item[1]`` (the HTML
    text). The text is passed through ``formatHTML`` and fed to a
    ``MyHTMLParser``; when player info comes back non-empty, the tuple
    ``(playerinfo, career)`` is put on ``resultQueue``.

    NOTE(review): the original source had lost its indentation, so the
    nesting of the trailing prints is reconstructed -- confirm.
    """
    while True:
        item = pageQueue.get(block=True)
        if item:
            # Join the id components returned by the helper with '/'.
            player_key = '/'.join(tool.getplayerid(item[0]))
            raw_page = item[1]

            parser = MyHTMLParser()
            parser.feed(formatHTML(raw_page))
            info = parser.get_playerinfo()
            history = parser.get_career()

            # Tag whichever results are non-empty with the player id.
            if info:
                info = tool.addkeytodict(info, 'player_id', player_key)
            if history:
                history = tool.addkeytodict(history, 'player_id', player_key)

            # Re-check info: it was reassigned by addkeytodict above.
            if info:
                record = (info, history)
                resultQueue.put(record, block=True)
                print("resultQueue have more record!")
                print(info)
                print(history)
            print("I am working ,boss.")
def data_to_node(dom,data):
    """Parse one HTML page and build an XML tree from the extracted data.

    ``data`` is run through ``formatHTML`` and fed to
    ``html_parser.MyHTMLParser``; the player info and career it yields are
    handed, together with ``dom``, to ``xmltree``.

    NOTE(review): the resulting ``xmlt`` is neither used nor returned in the
    visible code -- confirm whether the function continues or should
    return it.
    """
    # Format the HTML page
    data = formatHTML(data)
    print('data to node')

    # Parse html page and get information
    hc = html_parser.MyHTMLParser()
    hc.feed(data)
    # Fix: the original wrote `hc.close` (no parentheses), which only looks
    # the method up and never calls it, so the parser was never closed.
    hc.close()
    playerinfo = hc.get_playerinfo()
    career = hc.get_career()

    xmlt = xmltree(dom, playerinfo, career).dom
def operater(pageQueue,resultQueue):
    """Parse a single queued page and put its record on ``resultQueue``.

    Takes one item from ``pageQueue`` (blocking), indexes it as ``data[0]``
    (handed to ``tool.getplayerid``) and ``data[1]`` (the HTML text),
    formats and parses the text, tags both results with ``player_id`` and
    puts ``(playerinfo, career)`` on ``resultQueue`` (blocking).
    """
    data = pageQueue.get(block=True)
    playerid = '/'.join(tool.getplayerid(data[0]))
    htmltext = data[1]

    # Fix: the original called formatHTML(html), reading the local `html`
    # before it was assigned (UnboundLocalError on every call). The text
    # to format is the page text just taken from the queue.
    html = formatHTML(htmltext)

    hc = MyHTMLParser()
    hc.feed(html)
    playerinfo = hc.get_playerinfo()
    career = hc.get_career()

    playerinfo = tool.addkeytodict(playerinfo, 'player_id', playerid)
    career = tool.addkeytodict(career, 'player_id', playerid)
    resultQueue.put((playerinfo, career), block=True)
"yellow card 2rd":"-1",\ "red card":"-1"\ } return car_record if __name__ == "__main__": from guolv import formatHTML #处理html文件,格式化 path = r"C:\Documents and Settings\qi.he.BJ-850INTER290\My Documents\test.html" html_code=open(path,'r') data = html_code.read() data = formatHTML(data) print data #解析,提取数据 hc = MyHTMLParser() hc.feed(data) hc.close print hc.get_career() print hc.get_playerinfo() # from convertPlayerinfo import convertPlayerinfo # print convertPlayerinfo(hc.playerinfo) print 'end'
# Import every dumped page file: split it into pages, parse each player page
# and insert the extracted player info and career rows into the database.
# NOTE(review): the original indentation was lost; the commit is placed once
# per player page here -- confirm against the intended transaction scope.
for name in filelist:
    print(name)
    fd = open(name, 'r')
    try:
        # data is measured with len() and sliced below, so it is presumably
        # fully built once filesplit returns and the file can be closed.
        data = filesplit(fd, '@newpage@')
    finally:
        # Fix: the original leaked one open file handle per input file.
        fd.close()
    print(len(data))

    # The first chunk is skipped, as in the original (data[1:]).
    for page in data[1:]:
        print("%s %s" % (type(page), len(page)))
        if not tool.isplayerpage(page[0]):
            continue

        # page[0] identifies the page; the remaining elements are its HTML.
        playerid = '/'.join(tool.getplayerid(page[0]))
        html = formatHTML(''.join(page[1:]))

        hc = MyHTMLParser()
        hc.feed(html)
        playerinfo = hc.get_playerinfo()
        career = hc.get_career()
        playerinfo = tool.addkeytodict(playerinfo, 'player_id', playerid)
        career = tool.addkeytodict(career, 'player_id', playerid)
        print(playerinfo)

        # insert record
        tool.insertplayerinfo(qiud_cur, playerinfo)
        for car_record in career:
            tool.insertcareer(qiud_cur, car_record)
        qiud_conn.commit()