def getListPages(pageNo): ''' 起始页包含了若干需要抓取里面内容的列表项 每个列表项打开后是若干url列表,每个url指向的页面是抓取目标 ''' requestUrl = 'http://www.zjcredit.gov.cn:8000/ListPrompt.aspx' head = ['Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Charset:GBK,utf-8;q=0.7,*;q=0.3', 'Accept-Encoding:gzip,deflate,sdch', 'Accept-Language:zh-CN,zh;q=0.8', 'Cache-Control:max-age=0', 'Connection:keep-alive', 'Cookie:ASP.NET_SessionId=t3isah45gu5kb4454qyxkhzy; lzstat_uv=6061202253430616218|2529639; lzstat_ss=953382219_1_1373621147_2529639; _gscu_374314293=7359234405sddy11; _gscs_374314293=73592344d74zfy11|pv:3; _gscbrs_374314293=1; ECStaticSession=ECS81', 'Host:www.zjcredit.gov.cn:8000', 'Pragma:no-cache', 'Origin:http://www.zjcredit.gov.cn:8000', 'Referer:http://www.zjcredit.gov.cn:8000/ListPrompt.aspx', 'User-Agent:Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36' ] sectionID = '01' post_data_dic = {'sectionID': sectionID, 'pageNo': pageNo, 'pageLength': 20, 'organizationCode': '', 'searchInfo': ''} proxy = '218.108.170.173:82' proxy = '218.108.170.170:80' # content = jTool.fetchUrlProxy2(proxy, requestUrl, post_data_dic, 'post', head) content = open('11.html').read() if content: exp = '/body/div/table/tr[1]/td[1]/table[1]' aList = jTool.getNodes(content, exp) print aList
def getEntBase(content): ''' 根据企业信用记录网页内容获取企业的基本信息 以字段list方式返回 ''' exp = "//td[@id='xyddj']" fields = None try: tdNode = jTool.getNodes(content, exp) tbNode = tdNode[0].xpath('parent::tr/parent::table') trNode = tbNode[0].xpath('child::tr') fields = getEntBaseFields(trNode) except Exception, e: print 'Parse ent base info error:', __name__, e
def getPageFields(content): ''' 携带参数发起请求,获得最终页面信息 截取需要的内容,存入数据库 ''' if not content: return None result = {} exp = '//table[1]/tr[3]/td[1]/div[1]/text()' exp2 = '//table[1]/tr[3]/td[1]/table[1]' try: result['corpName'] = jTool.getNodes(content, exp)[0].strip() except Exception, e: print '得到页面非企业信息页面', e return None
def getEntBase(content):
    '''Extract a company's basic information from its credit-record page.

    The td element with id 'xyddj' anchors the data table; the rows of
    its enclosing table are handed to getEntBaseFields().  Any parse
    error (e.g. the anchor td is missing) propagates to the caller.
    '''
    anchor_cells = jTool.getNodes(content, "//td[@id='xyddj']")
    owning_table = anchor_cells[0].xpath('parent::tr/parent::table')
    data_rows = owning_table[0].xpath('child::tr')
    return getEntBaseFields(data_rows)
def getDetailListPages(pageNo, conn):
    '''Fetch one page of the final target list and store its rows.

    Each list item opens a page holding a list of URLs; every URL points
    at a page to be scraped.  For each of the 20 rows on the page a
    record (corpName, table, rowID, pageNo) is inserted into the
    base_page_list table.

    pageNo -- page number posted to the server.
    conn   -- open DB-API connection; NOTE it is committed AND closed
              here, so the caller must not reuse it afterwards.
    '''
    # Full GET form of the request, kept for reference:
    # requestUrl = 'http://218.108.28.28:8000/ListPrompts.aspx?sectionID=01&tableID=CourtNotCarryOut&associateID=00000000000000000&hasPromptHistroy=False'
    requestUrl = 'http://218.108.28.28:8000/ListPrompts.aspx'
    # Headers replayed from a captured browser session; the hard-coded
    # Cookie carries the ASP.NET session id.
    head = ['Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset:GBK,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding:gzip,deflate,sdch',
            'Accept-Language:zh-CN,zh;q=0.8',
            'Cache-Control:max-age=0',
            'Connection:keep-alive',
            'Cookie:ASP.NET_SessionId=t3isah45gu5kb4454qyxkhzy; lzstat_uv=6061202253430616218|2529639; lzstat_ss=953382219_1_1373621147_2529639; _gscu_374314293=7359234405sddy11; _gscs_374314293=73592344d74zfy11|pv:3; _gscbrs_374314293=1; ECStaticSession=ECS81',
            'Host:www.zjcredit.gov.cn:8000',
            'Pragma:no-cache',
            'Origin:http://www.zjcredit.gov.cn:8000',
            'Referer:http://218.108.28.28:8000/ListPrompts.aspx?sectionID=01&tableID=CourtNotCarryOut&associateID=00000000000000000&hasPromptHistroy=False',
            'User-Agent:Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36']
    # Individual form fields, kept for reference:
    # associateID = '00000000000000000'
    # field_CorporationName = ''
    # field_OrganizationCode = ''
    # isIntermediary = 'False'
    # pageLength = 20
    # recordTotal = 46358
    # sectionID = '01'
    # tableID = 'CourtNotCarryOut'
    post_data_dic = {'recordTotal': 46358,
                     'tableID': 'CourtNotCarryOut',
                     'associateID': '00000000000000000',
                     'field_CorporationName': '',
                     'sectionID': '01',
                     'field_OrganizationCode': '',
                     'isIntermediary': 'False',
                     'pageNo': pageNo,
                     'pageLength': 20}
    proxy = '218.108.170.173:82'
    proxy = '218.108.170.170:80'  # NOTE(review): overrides the line above
    content = jTool.fetchUrlProxy2(proxy, requestUrl, post_data_dic, 'post', head)
    # jTool.logit(content, '22.html')
    # content = open('22.html').read()
    if content:
        # Anchor links of the result table; titles carry the company
        # names, onclick attributes carry the detail-page parameters.
        exp = '//table[1]/tr[2]/td[1]/table[1]/tr/td/a'
        aList = jTool.getNodes(content, exp)
        # import lxml.etree as ETree
        # print aList[1].xpath('//@title')[0]
        titles = aList[1].xpath('//@title')
        corpRecords = {}
        # print aList[1].xpath('//@onclick')[20]
        aStr = aList[1].xpath('//@onclick')
        cursor = conn.cursor()
        from urllib import unquote
        # 20 rows per page (matches pageLength above); the dict keys are
        # backtick-quoted column names used verbatim by jTool.insertData.
        for i in range(20):
            # corpRecords['`corpName`'] = (unquote(titles[i])).decode('utf-8')
            corpRecords['`corpName`'] = titles[i].decode('utf-8').encode('utf-8')
            # print corpRecords['`corpName`']
            # print unquote(corpRecords['`corpName`'])
            # print aStr[i+1]
            # NOTE(review): the onclick list is read at offset i+1, one
            # ahead of the title list -- presumably aStr[0] belongs to a
            # non-data link; confirm against a saved response page.
            tmp = aStr[i+1].split(',')
            # print i+1
            # onclick arguments are comma-separated, single-quoted:
            # slot 0 holds the table name, slot 5 the row id.
            corpRecords['`table`'] = tmp[0].split("'")[1]
            corpRecords['`rowID`'] = tmp[5].split("'")[1]
            # print corpRecords['rowID']
            # print corpRecords['corpName']
            corpRecords['`pageNo`'] = str(pageNo)
            result = jTool.insertData(cursor, 'base_page_list', corpRecords)
        conn.commit()
        cursor.close()
        conn.close()
def getPageFields(content): ''' 携带参数发起请求,获得最终页面信息 截取需要的内容,存入数据库 ''' if not content: return None result = {} exp = '//table[1]/tr[3]/td[1]/div[1]/text()' exp2 = '//table[1]/tr[3]/td[1]/table[1]' try: result['corpName'] = jTool.getNodes(content, exp)[0].strip() except Exception, e: print '得到页面非企业信息页面', e return None corpTable = jTool.getNodes(content, exp2)[0] is_15 = len(corpTable.xpath('//tr[15]')) is_13 = len(corpTable.xpath('//tr[13]')) is_12 = len(corpTable.xpath('//tr[12]')) print is_15, is_13, is_12 # print result['corpName'] # result['corpNameTable'] = corpTable.xpath('//tr[1]/td[2]/text()')[0] # print result['corpNameTable'] result['orgCode'] = corpTable.xpath('//tr[2]/td[2]//text()')[0] # print result['orgCode'] result['legacyPerson'] = corpTable.xpath('//tr[3]/td[2]//text()')[0] # print result['legacyPerson'] if is_13>0: result['address'] = corpTable.xpath('//tr[4]/td[2]//text()')[0] # print result['address']