Ejemplo n.º 1
0
def operater(pageQueue,resultQueue):
    """Worker loop that turns fetched pages into parsed player records.

    Runs forever and never returns. Each iteration blocks on ``pageQueue``
    for one item; falsy items are skipped. An item is indexed as ``item[0]``
    (handed to ``tool.getplayerid``, whose parts are joined with ``/`` to
    form the player id) and ``item[1]`` (the raw HTML text).

    The HTML goes through ``formatHTML`` and is fed to ``MyHTMLParser``.
    Non-empty player info and career results are each passed through
    ``tool.addkeytodict`` with the key ``'player_id'``. When player info is
    still truthy afterwards, ``(playerinfo, career)`` is put on
    ``resultQueue`` and echoed to stdout.
    """
    while True:
        job = pageQueue.get(block=True)
        if not job:
            # Nothing usable in this item; go back to waiting.
            continue

        player_key = '/'.join(tool.getplayerid(job[0]))
        raw_markup = job[1]

        cleaned = formatHTML(raw_markup)

        parser = MyHTMLParser()
        parser.feed(cleaned)

        info = parser.get_playerinfo()
        history = parser.get_career()

        if info:
            info = tool.addkeytodict(info, 'player_id', player_key)
        if history:
            history = tool.addkeytodict(history, 'player_id', player_key)

        # Checked again on purpose: this tests the value returned by
        # addkeytodict, not the parser's original result.
        if info:
            resultQueue.put((info, history), block=True)
            print("resultQueue have more record!")
            print(info)
            print(history)

        print("I am working ,boss.")
Ejemplo n.º 2
0
def data_to_node(dom,data):
    """Parse one HTML page and hand the extracted data to ``xmltree``.

    ``data`` is the raw HTML text. It is normalised with ``formatHTML``,
    fed to ``html_parser.MyHTMLParser``, and the resulting player info and
    career data are passed to ``xmltree`` together with ``dom``.

    Returns ``None``.
    """
    # Format the HTML page
    data = formatHTML(data)
    print('data to node')

    # Parse html page and get information
    hc = html_parser.MyHTMLParser()
    hc.feed(data)
    # The original wrote ``hc.close`` without parentheses, which only looks
    # the method up and never runs it. Calling it lets the parser process
    # whatever input it is still buffering before the results are read.
    # NOTE(review): assumes MyHTMLParser.close follows the standard
    # HTMLParser.close contract - confirm against the class definition.
    hc.close()
    playerinfo = hc.get_playerinfo()
    career = hc.get_career()

    # NOTE(review): the resulting dom is neither returned nor used in the
    # visible code - confirm whether callers rely on xmltree updating
    # ``dom`` in place or whether a ``return`` is missing here.
    xmlt = xmltree(dom,playerinfo,career).dom
Ejemplo n.º 3
0
def operater(pageQueue,resultQueue):
    """Process a single page from ``pageQueue`` into ``resultQueue``.

    Blocks until one item is available. The item is indexed as ``data[0]``
    (handed to ``tool.getplayerid``, whose parts are joined with ``/`` to
    form the player id) and ``data[1]`` (the raw HTML text).

    The HTML goes through ``formatHTML`` and is fed to ``MyHTMLParser``.
    The player info and career results are each passed through
    ``tool.addkeytodict`` with the key ``'player_id'``, then put on
    ``resultQueue`` as a ``(playerinfo, career)`` tuple.
    """
    data = pageQueue.get(block=True)
    playerid = '/'.join(tool.getplayerid(data[0]))
    htmltext = data[1]

    # The original passed the not-yet-assigned local ``html`` here, which
    # raised UnboundLocalError on every call; the raw text is the input.
    html = formatHTML(htmltext)

    hc = MyHTMLParser()
    hc.feed(html)

    playerinfo = hc.get_playerinfo()
    career = hc.get_career()

    playerinfo = tool.addkeytodict(playerinfo, 'player_id', playerid)
    career = tool.addkeytodict(career, 'player_id', playerid)

    resultQueue.put((playerinfo, career), block=True)
Ejemplo n.º 4
0
                        "yellow card 2rd":"-1",\
                        "red card":"-1"\
                    }
        return car_record


if __name__ == "__main__":
    # Manual check: format a local HTML file, run it through MyHTMLParser
    # and print what was extracted.

    from guolv import formatHTML

    # Read the HTML file and format it
    path = r"C:\Documents and Settings\qi.he.BJ-850INTER290\My Documents\test.html"
    html_code = open(path, 'r')
    try:
        data = html_code.read()
    finally:
        # The original never closed this handle. try/finally is used
        # instead of ``with`` so older Python 2 interpreters still work.
        html_code.close()
    data = formatHTML(data)
    print(data)

    # Parse and extract the data
    hc = MyHTMLParser()
    hc.feed(data)
    # The original wrote ``hc.close`` without parentheses, so the method
    # was never run. Calling it lets the parser process any input it is
    # still buffering before the results are read.
    # NOTE(review): assumes MyHTMLParser.close follows the standard
    # HTMLParser.close contract - confirm against the class definition.
    hc.close()

    print(hc.get_career())
    print(hc.get_playerinfo())

    # from convertPlayerinfo  import convertPlayerinfo
    # print convertPlayerinfo(hc.playerinfo)
    print('end')

Ejemplo n.º 5
0
# For every file in ``filelist``: split it on the '@newpage@' marker, parse
# each player page and insert its player info and career records through
# ``qiud_cur``, committing on ``qiud_conn`` after each page.
for name in filelist:
    print(name)
    fd = open(name, 'r')
    try:
        data = filesplit(fd, '@newpage@')
        print(len(data))

        # The first element of ``data`` is skipped, as in the original.
        for page in data[1:]:
            # Formatted as one string so the output matches the Python 2
            # statement ``print type(page),len(page)``.
            print("%s %s" % (type(page), len(page)))
            if not tool.isplayerpage(page[0]):
                continue

            # page[0] identifies the player; the rest is the HTML body.
            playerid = '/'.join(tool.getplayerid(page[0]))

            html = ''.join(page[1:])
            html = formatHTML(html)

            hc = MyHTMLParser()
            hc.feed(html)
            playerinfo = hc.get_playerinfo()
            career = hc.get_career()
            playerinfo = tool.addkeytodict(playerinfo, 'player_id', playerid)
            career = tool.addkeytodict(career, 'player_id', playerid)
            print(playerinfo)

            # insert record
            tool.insertplayerinfo(qiud_cur, playerinfo)

            for car_record in career:
                tool.insertcareer(qiud_cur, car_record)
            qiud_conn.commit()
    finally:
        # The original never closed the input file. The handle stays open
        # for the whole per-file pass so this holds even if ``filesplit``
        # reads lazily. try/finally is used instead of ``with`` so older
        # Python 2 interpreters still work.
        fd.close()