def getListPages(pageNo): ''' 起始页包含了若干需要抓取里面内容的列表项 每个列表项打开后是若干url列表,每个url指向的页面是抓取目标 ''' requestUrl = 'http://www.zjcredit.gov.cn:8000/ListPrompt.aspx' head = ['Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Charset:GBK,utf-8;q=0.7,*;q=0.3', 'Accept-Encoding:gzip,deflate,sdch', 'Accept-Language:zh-CN,zh;q=0.8', 'Cache-Control:max-age=0', 'Connection:keep-alive', 'Cookie:ASP.NET_SessionId=t3isah45gu5kb4454qyxkhzy; lzstat_uv=6061202253430616218|2529639; lzstat_ss=953382219_1_1373621147_2529639; _gscu_374314293=7359234405sddy11; _gscs_374314293=73592344d74zfy11|pv:3; _gscbrs_374314293=1; ECStaticSession=ECS81', 'Host:www.zjcredit.gov.cn:8000', 'Pragma:no-cache', 'Origin:http://www.zjcredit.gov.cn:8000', 'Referer:http://www.zjcredit.gov.cn:8000/ListPrompt.aspx', 'User-Agent:Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36' ] sectionID = '01' post_data_dic = {'sectionID': sectionID, 'pageNo': pageNo, 'pageLength': 20, 'organizationCode': '', 'searchInfo': ''} proxy = '218.108.170.173:82' proxy = '218.108.170.170:80' # content = jTool.fetchUrlProxy2(proxy, requestUrl, post_data_dic, 'post', head) content = open('11.html').read() if content: exp = '/body/div/table/tr[1]/td[1]/table[1]' aList = jTool.getNodes(content, exp) print aList
def getEntBase(content): ''' 根据企业信用记录网页内容获取企业的基本信息 以字段list方式返回 ''' exp = "//td[@id='xyddj']" fields = None try: tdNode = jTool.getNodes(content, exp) tbNode = tdNode[0].xpath('parent::tr/parent::table') trNode = tbNode[0].xpath('child::tr') fields = getEntBaseFields(trNode) except Exception, e: print 'Parse ent base info error:', __name__, e
def getPageFields(content): ''' 携带参数发起请求,获得最终页面信息 截取需要的内容,存入数据库 ''' if not content: return None result = {} exp = '//table[1]/tr[3]/td[1]/div[1]/text()' exp2 = '//table[1]/tr[3]/td[1]/table[1]' try: result['corpName'] = jTool.getNodes(content, exp)[0].strip() except Exception, e: print '得到页面非企业信息页面', e return None
def getEntBase(content):
    '''Extract a company's basic information from its credit-record page.

    The td element with id 'xyddj' anchors the data table; the rows of
    its enclosing table are handed to getEntBaseFields().  Any parse
    error (e.g. the anchor td is missing) propagates to the caller.
    '''
    anchor_cells = jTool.getNodes(content, "//td[@id='xyddj']")
    owning_table = anchor_cells[0].xpath('parent::tr/parent::table')
    data_rows = owning_table[0].xpath('child::tr')
    return getEntBaseFields(data_rows)
def getDetailListPages(pageNo, conn):
    '''Fetch one page of the final target list and store its rows.

    Each list item opens a page holding a list of URLs; every URL points
    at a page to be scraped.  For each of the 20 rows on the page a
    record (corpName, table, rowID, pageNo) is inserted into the
    base_page_list table.

    pageNo -- page number posted to the server.
    conn   -- open DB-API connection; NOTE it is committed AND closed
              here, so the caller must not reuse it afterwards.
    '''
    # Full GET form of the request, kept for reference:
    # requestUrl = 'http://218.108.28.28:8000/ListPrompts.aspx?sectionID=01&tableID=CourtNotCarryOut&associateID=00000000000000000&hasPromptHistroy=False'
    requestUrl = 'http://218.108.28.28:8000/ListPrompts.aspx'
    # Headers replayed from a captured browser session; the hard-coded
    # Cookie carries the ASP.NET session id.
    head = ['Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset:GBK,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding:gzip,deflate,sdch',
            'Accept-Language:zh-CN,zh;q=0.8',
            'Cache-Control:max-age=0',
            'Connection:keep-alive',
            'Cookie:ASP.NET_SessionId=t3isah45gu5kb4454qyxkhzy; lzstat_uv=6061202253430616218|2529639; lzstat_ss=953382219_1_1373621147_2529639; _gscu_374314293=7359234405sddy11; _gscs_374314293=73592344d74zfy11|pv:3; _gscbrs_374314293=1; ECStaticSession=ECS81',
            'Host:www.zjcredit.gov.cn:8000',
            'Pragma:no-cache',
            'Origin:http://www.zjcredit.gov.cn:8000',
            'Referer:http://218.108.28.28:8000/ListPrompts.aspx?sectionID=01&tableID=CourtNotCarryOut&associateID=00000000000000000&hasPromptHistroy=False',
            'User-Agent:Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36']
    # Individual form fields, kept for reference:
    # associateID = '00000000000000000'
    # field_CorporationName = ''
    # field_OrganizationCode = ''
    # isIntermediary = 'False'
    # pageLength = 20
    # recordTotal = 46358
    # sectionID = '01'
    # tableID = 'CourtNotCarryOut'
    post_data_dic = {'recordTotal': 46358,
                     'tableID': 'CourtNotCarryOut',
                     'associateID': '00000000000000000',
                     'field_CorporationName': '',
                     'sectionID': '01',
                     'field_OrganizationCode': '',
                     'isIntermediary': 'False',
                     'pageNo': pageNo,
                     'pageLength': 20}
    proxy = '218.108.170.173:82'
    proxy = '218.108.170.170:80'  # NOTE(review): overrides the line above
    content = jTool.fetchUrlProxy2(proxy, requestUrl, post_data_dic, 'post', head)
    # jTool.logit(content, '22.html')
    # content = open('22.html').read()
    if content:
        # Anchor links of the result table; titles carry the company
        # names, onclick attributes carry the detail-page parameters.
        exp = '//table[1]/tr[2]/td[1]/table[1]/tr/td/a'
        aList = jTool.getNodes(content, exp)
        # import lxml.etree as ETree
        # print aList[1].xpath('//@title')[0]
        titles = aList[1].xpath('//@title')
        corpRecords = {}
        # print aList[1].xpath('//@onclick')[20]
        aStr = aList[1].xpath('//@onclick')
        cursor = conn.cursor()
        from urllib import unquote
        # 20 rows per page (matches pageLength above); the dict keys are
        # backtick-quoted column names used verbatim by jTool.insertData.
        for i in range(20):
            # corpRecords['`corpName`'] = (unquote(titles[i])).decode('utf-8')
            corpRecords['`corpName`'] = titles[i].decode('utf-8').encode('utf-8')
            # print corpRecords['`corpName`']
            # print unquote(corpRecords['`corpName`'])
            # print aStr[i+1]
            # NOTE(review): the onclick list is read at offset i+1, one
            # ahead of the title list -- presumably aStr[0] belongs to a
            # non-data link; confirm against a saved response page.
            tmp = aStr[i+1].split(',')
            # print i+1
            # onclick arguments are comma-separated, single-quoted:
            # slot 0 holds the table name, slot 5 the row id.
            corpRecords['`table`'] = tmp[0].split("'")[1]
            corpRecords['`rowID`'] = tmp[5].split("'")[1]
            # print corpRecords['rowID']
            # print corpRecords['corpName']
            corpRecords['`pageNo`'] = str(pageNo)
            result = jTool.insertData(cursor, 'base_page_list', corpRecords)
        conn.commit()
        cursor.close()
        conn.close()
def getPageFields(content): ''' 携带参数发起请求,获得最终页面信息 截取需要的内容,存入数据库 ''' if not content: return None result = {} exp = '//table[1]/tr[3]/td[1]/div[1]/text()' exp2 = '//table[1]/tr[3]/td[1]/table[1]' try: result['corpName'] = jTool.getNodes(content, exp)[0].strip() except Exception, e: print '得到页面非企业信息页面', e return None corpTable = jTool.getNodes(content, exp2)[0] is_15 = len(corpTable.xpath('//tr[15]')) is_13 = len(corpTable.xpath('//tr[13]')) is_12 = len(corpTable.xpath('//tr[12]')) print is_15, is_13, is_12 # print result['corpName'] # result['corpNameTable'] = corpTable.xpath('//tr[1]/td[2]/text()')[0] # print result['corpNameTable'] result['orgCode'] = corpTable.xpath('//tr[2]/td[2]//text()')[0] # print result['orgCode'] result['legacyPerson'] = corpTable.xpath('//tr[3]/td[2]//text()')[0] # print result['legacyPerson'] if is_13>0: result['address'] = corpTable.xpath('//tr[4]/td[2]//text()')[0] # print result['address']