def crawlUrl(conn, param, proxyList): ''' 访问url并获得企业基本信息和信用记录并入库 ''' cursor2 = conn.cursor() cursor3 = conn.cursor() pcount = len(proxyList)-1 proxy = proxyList[random.randint(0,pcount)] content = jTool.getContentByProxy(proxy, param['url']) if not content: jTool.logError('Fail to get page content, url:'+ param['url']) return False dataSupplier = "法院记录/工商记录/国税记录/质监记录/经信记录/安监记录/统计记录/环保记录/民政记录/司法记录/劳动记录/建设记录/国土记录/交通记录/发改记录/信息产业/科技记录/农业记录/林业记录/海洋渔业/物价记录/食品药品/文化记录/出版记录/广电记录/公安记录/外贸记录/外汇记录/海关记录/检验检疫/人防记录/证监记录/银监记录/保监记录/金融记录/其他记录/行业协会/机构评级/社会中介/阿里巴巴/企业自报/投诉记录/异议记录" post_data_dic = {'corpName': param['ename'], 'creditID': param['eid'], 'dataSupplier': dataSupplier, 'isAllInfo': 'False', 'organizeCode': '', 'returnFunction': 'parent.putDatasAndLoad'} recEntBaseDic = getEntBase(content) if not recEntBaseDic: return False recEntBaseDic['url'] = param['url'] recEntBaseDic['eid'] = param['eid'] recEntBaseDic['enterprise_name'] = param['ename'].strip('') recEntBaseDic['ctime'] = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) try: jTool.insertData(cursor2, 'enterprise_raw', recEntBaseDic) conn.commit() print ' Insert enterprise baseinfo successfully in ' + str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) except: print 'Fail to insert record '+ ', eid is '+str(param['eid']) jTool.logError('Fail to insert record on '+ 'url is '+param['url']+', id is '+str(id)) return False recEntDetailDic = {} recEntDetailDic['url'] = param['url'] recEntDetailDic['eid'] = param['eid'] recEntDetailDic['enterprise_name'] = param['ename'].strip() recEntDetailDic['records'] = ' ' recEntDetailDic['ctime'] = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) # detailRecUrl = param['basePostUrl'] detailRecUrl = param['basePostUrlip'] contentRecord = jTool.getContentByProxy(proxy, detailRecUrl, post_data_dic) if not contentRecord: return True recList = getEntDetail(contentRecord) scount = 0 for rec in recList: scount += 1 rec = jTool.clearX(["'"], rec) recEntDetailDic['content'] 
= str(rec) jTool.insertData(cursor3, 'enterprise_record_raw', recEntDetailDic) conn.commit() print ' Insert enterprise detail info No. '+str(scount)+' successfully in ' + str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) print 'Fetch and insert successfully in ' + str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) return True
def getPageField(conn, proxy, head, id, rowID, corpName):
    '''Fetch one court-detail page, parse its fields and insert them into
    the courtDetail table.

    id: primary key of the base_page_list row this detail belongs to,
        stored as 'iid' (note: the parameter shadows the builtin id()).
    rowID, corpName: passed through to getDetailPageContent to locate the page.
    Returns the insertData result on success, None when the page could not
    be fetched or parsed.
    '''
    cursor = conn.cursor()
    cursor2 = conn.cursor()
    content = getDetailPageContent(proxy, head, rowID, corpName)
    if not content:
        cursor.close()
        cursor2.close()
        return None
    jTool.logit(content, '4.txt')  # keep a raw copy of the page for debugging
    resultDic = getPageFields(content)
    if not resultDic:
        cursor.close()
        cursor2.close()
        return None
    resultDic['ctime'] = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    resultDic['iid'] = str(id)
    rt = jTool.insertData(cursor, 'courtDetail', resultDic)
    # FIX: commit and cursor cleanup used to sit AFTER the return and never
    # ran, so the insert was never committed here and cursors leaked.
    conn.commit()
    cursor.close()
    cursor2.close()
    # NOTE(review): the old dead code also called conn.close(); the
    # connection is left open here because callers pass it in and may keep
    # using it -- confirm ownership.
    return rt
def crawlGetUrl(conn, param, proxy): ''' 以get方式获得企业基本信息并添加到enterprise_raw表 ''' cursor = conn.cursor() cursor2 = conn.cursor() cursor4 = conn.cursor() recordExists = jTool.notExsitsRecord(cursor4, 'enterprise_raw', 'eid', param['id']) cursor4.close() if recordExists: # print param['ename']+'基本信息已存在,eid'+str(param['eid']) jTool.updateData(cursor, ' where id = '+str(param['iid'])+' limit 1', 'item_url_task', {'status': '1'}) cursor.close() conn.commit() return {'logic': True, 'rtData': recordExists} if not recordExists: # print 'get url:'+param['url']+', '+param['ename'].strip()+',id:'+param['id']+', proxy:'+proxy content = jTool.getContentByProxy(proxy, param['url']) if not content: print '获取基本信息页面内容为空或失败' return {'logic': False, 'rtData': {'type': 'getPageError', 'postPage': ' '}} recEntBaseDic = None if content: try: recEntBaseDic = getEntBase(content) except Exception, e: print '解析企业基本信息页面失败', __name__, e return {'logic': False, 'rtData': {'type': 'parsePageBaseError', 'postPage': ' '}} if not recEntBaseDic: return {'logic': False, 'rtData': {'type': 'parsePageBaseError', 'postPage': ' '}} recEntBaseDic['url'] = param['url'] recEntBaseDic['eid'] = param['id'] recEntBaseDic['postContent'] = ' ' recEntBaseDic['enterprise_name'] = param['ename'].strip('') recEntBaseDic['ctime'] = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) insertId = None try: jTool.insertData(cursor2, 'enterprise_raw', recEntBaseDic) insertId = conn.insert_id() conn.commit() print str(param['id'])+' Insert enterprise baseinfo successfully in ' + str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) jTool.updateData(cursor, ' where id = '+str(param['iid'])+' limit 1', 'item_url_task', {'status': '1'}) return {'logic': True, 'rtData': insertId} except Exception, e: print 'Fail to insert baseinfo record '+ ', id is '+str(param['id'])+', proxy:'+proxy, __name__, e print recEntBaseDic return {'logic': False, 'rtData': {'type': 'insertPageError', 'postPage': ' '}}
def crawlUrl(conn, param, proxy):
    '''Visit the company URL, parse the base info and insert it into
    enterprise_raw.

    (The original Chinese docstring also promised fetching credit records,
    but this variant stops after the base-info insert; a fuller version of
    the same function exists elsewhere in this file.)

    Returns {'logic': False, 'rtData': {...}} on failure.
    NOTE(review): on success, and when the record already exists, control
    falls off the end and returns None -- confirm callers expect that.
    '''
    cursor2 = conn.cursor()
    cursor4 = conn.cursor()
    cursor5 = conn.cursor()  # NOTE(review): opened but never used in this variant
    cursor6 = conn.cursor()  # NOTE(review): opened but never used in this variant
    # Skip the fetch entirely when enterprise_raw already holds this eid.
    recordExists = jTool.notExsitsRecord(cursor4, 'enterprise_raw', 'eid', param['id'])
    cursor4.close()
    if recordExists:
        print param['ename']+'基本信息已存在,尝试获取记录信息 \npage url:'+param['url']+', proxy:'+proxy
    if not recordExists:
        print 'get url:'+param['url']+', '+param['ename'].strip()+',id:'+param['id']+', proxy:'+proxy
        content = jTool.getContentByProxy(proxy, param['url'])
        if not content:
            print '获取基本信息页面内容为空或失败'
            return {'logic': False, 'rtData': {'type': 'getPageError', 'postContent': ' '}}
        recEntBaseDic = None
        if content:  # always true here: empty content already returned above
            try:
                recEntBaseDic = getEntBase(content)
            except Exception, e:
                print '解析企业基本信息页面失败', __name__, e
                return {'logic': False, 'rtData': {'type': 'parsePageBaseError', 'postContent': ' '}}
        if not recEntBaseDic:
            return {'logic': False, 'rtData': {'type': 'parsePageBaseError', 'postContent': ' '}}
        recEntBaseDic['url'] = param['url']
        recEntBaseDic['eid'] = param['id']
        # Placeholder; a later pass stores the POSTed record page here.
        recEntBaseDic['postContent'] = ' '
        # NOTE(review): .strip('') strips nothing -- probably meant .strip()
        recEntBaseDic['enterprise_name'] = param['ename'].strip('')
        recEntBaseDic['ctime'] = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        try:
            jTool.insertData(cursor2, 'enterprise_raw', recEntBaseDic)
            conn.commit()
            print ' Insert enterprise baseinfo successfully in ' + str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        except Exception, e:
            print 'insert error ', __name__, e
            print 'Fail to insert record '+ ', id is '+str(param['id'])+', proxy:'+proxy
            return {'logic': False, 'rtData': {'type': 'insertPageError', 'postContent': ' '}}
def extractPostContent(conn, table, start, end): ''' 提取enterprise_raw表中的postContent字段到enterprse_record_raw中 每条包括多种信用记录,每种信用记录包括多条 ''' cursor = conn.cursor() cursor2 = conn.cursor() id = int(start) while id>=int(start) and id<=int(end): print '*'*30 sql = 'select eid, enterprise_name, url, postContent, id from '+str(table)+' where id ='+str(id) id += 1 cursor.execute(sql) record = cursor.fetchone() if not record: continue recList = rp.getEntDetail(record[3]) scount = 0 recEntDetailDic = {} recEntDetailDic['url'] = record[2] recEntDetailDic['eid'] = record[0] recEntDetailDic['enterprise_name'] = record[1].strip() print 'id:'+str(record[4])+', '+recEntDetailDic['enterprise_name'] for rec in recList: scount += 1 rDic = contentToRecords(str(rec).strip()) if not rDic: continue rec = jTool.clearX(["'"], rec) recEntDetailDic['content'] = (str(rec)).strip() recEntDetailDic['records'] = ' ' recEntDetailDic['ctime'] = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) fDic = dict(recEntDetailDic, **rDic) jTool.insertData(cursor2, 'enterprise_record_raw', fDic) conn.commit() cursor.close() cursor2.close() conn.close()
def crawlUrl(conn, param, proxy): ''' 访问url并获得企业基本信息和信用记录并入库 ''' cursor2 = conn.cursor() cursor4 = conn.cursor() cursor5 = conn.cursor() cursor6 = conn.cursor() recordExists = jTool.notExsitsRecord(cursor4, 'enterprise_raw', 'eid', param['id']) cursor4.close() if recordExists: print '企业'+param['ename']+'基本信息已存在,直接尝试获取记录信息 \npage url:'+param['url'] if not recordExists: print 'get url:'+param['url']+', '+param['ename'].strip()+',id:'+param['id'] content = jTool.getContentByProxy(proxy, param['url']) # jTool.logit(str(content), 'errorPage.txt') if not content: print '获取基本信息页面内容为空或失败' jTool.logError('\nGet page content fail, method:get, ename:'+param['ename']+',id:'+param['id']+',url:'+ param['url']) return False recEntBaseDic = None if content: try: recEntBaseDic = getEntBase(content) except: print '解析企业基本信息页面失败' jTool.logError('\nParse page content fail, method:get, ename:'+param['ename']+',id:'+param['id']+',url:'+ param['url']) return False if not recEntBaseDic: return False recEntBaseDic['url'] = param['url'] recEntBaseDic['eid'] = param['id'] recEntBaseDic['postContent'] = ' ' recEntBaseDic['enterprise_name'] = param['ename'].strip('') recEntBaseDic['ctime'] = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) try: jTool.insertData(cursor2, 'enterprise_raw', recEntBaseDic) conn.commit() print ' Insert enterprise baseinfo successfully in ' + str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) except: print 'Fail to insert record '+ ', id is '+str(param['id']) jTool.logError('Fail to insert record on '+ 'url is '+param['url']+', id is '+str(id)) return False dataSupplier = "法院记录/工商记录/国税记录/质监记录/经信记录/安监记录/统计记录/环保记录/民政记录/司法记录/劳动记录/建设记录/国土记录/交通记录/发改记录/信息产业/科技记录/农业记录/林业记录/海洋渔业/物价记录/食品药品/文化记录/出版记录/广电记录/公安记录/外贸记录/外汇记录/海关记录/检验检疫/人防记录/证监记录/银监记录/保监记录/金融记录/其他记录/行业协会/机构评级/社会中介/阿里巴巴/企业自报/投诉记录/异议记录" post_data_dic = {'corpName': param['ename'], 'creditID': param['id'], 'dataSupplier': dataSupplier, 'isAllInfo': 'False', 'organizeCode': '', 'returnFunction': 
'parent.putDatasAndLoad'} recEntDetailDic = {} recEntDetailDic['url'] = param['url'] recEntDetailDic['eid'] = param['id'] recEntDetailDic['enterprise_name'] = param['ename'].strip() recEntDetailDic['records'] = ' ' recEntDetailDic['ctime'] = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) contentRecord = None jTool.getField(cursor6, 'enterprise_raw', 'postContent', ' where eid = '+str(param['id'])) postContent =cursor6.fetchone() cursor6.close() if len(postContent[0])>10: print '企业细节记录已存在,跳过' return True print 'post url:'+str(param['basePostUrlip'])+', '+param['ename'].strip()+',id:'+param['id'] try: contentRecord = jTool.getContentByProxy(proxy, str(param['basePostUrlip']), post_data_dic) except: print '页面未包含详细记录或获取失败' jTool.logError('\nPost get None, ename:'+param['ename'].strip()+',id:'+param['id']+',url:'+ param['url']) return True try: contentRecord = cutContent(contentRecord) print 'Get post content OK' jTool.logit(str(contentRecord), 'errorContent.log') if contentRecord: jTool.updateData(cursor5, ' where eid = '+param['id'], 'enterprise_raw', {'postContent': str(contentRecord).decode('utf-8', 'ignore')}) cursor5.close() else: print 'POST得到页面非企业详细记录页面' jTool.logError('\nPage content error, method:post, ename:'+param['ename']+', id:'+param['id']+',url:'+ param['url']+', proxy:'+proxy) return True except: print '解析页面详细记录失败' jTool.logError('\nParse page content fail, method:post, ename:'+param['ename']+', id:'+param['id']+',url:'+ param['url']+', proxy:'+proxy) return False print 'Fetch and insert successfully in ' + str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) return True
def getDetailListPages(pageNo, conn, pageLength=20):
    '''Fetch one page of the CourtNotCarryOut listing and insert each list
    item (corpName, table, rowID, pageNo) into base_page_list.

    pageNo: page index sent in the POST body.
    pageLength: items per page; default 20 keeps the previous hard-coded
        behaviour and now also drives the parsing loop below.
    Closes the connection when done.
    '''
    requestUrl = 'http://218.108.28.28:8000/ListPrompts.aspx'
    # Headers captured from a real browser session; the Cookie carries the
    # ASP.NET session this scrape relies on.
    head = ['Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset:GBK,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding:gzip,deflate,sdch',
            'Accept-Language:zh-CN,zh;q=0.8',
            'Cache-Control:max-age=0',
            'Connection:keep-alive',
            'Cookie:ASP.NET_SessionId=t3isah45gu5kb4454qyxkhzy; lzstat_uv=6061202253430616218|2529639; lzstat_ss=953382219_1_1373621147_2529639; _gscu_374314293=7359234405sddy11; _gscs_374314293=73592344d74zfy11|pv:3; _gscbrs_374314293=1; ECStaticSession=ECS81',
            'Host:www.zjcredit.gov.cn:8000',
            'Pragma:no-cache',
            'Origin:http://www.zjcredit.gov.cn:8000',
            'Referer:http://218.108.28.28:8000/ListPrompts.aspx?sectionID=01&tableID=CourtNotCarryOut&associateID=00000000000000000&hasPromptHistroy=False',
            'User-Agent:Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36'
            ]
    post_data_dic = {'recordTotal': 46358,
                     'tableID': 'CourtNotCarryOut',
                     'associateID': '00000000000000000',
                     'field_CorporationName': '',
                     'sectionID': '01',
                     'field_OrganizationCode': '',
                     'isIntermediary': 'False',
                     'pageNo': pageNo,
                     'pageLength': pageLength}
    # FIX: a first proxy assignment ('218.108.170.173:82') was immediately
    # overwritten and has been dropped; an unused `from urllib import
    # unquote` was removed as well.
    proxy = '218.108.170.170:80'
    content = jTool.fetchUrlProxy2(proxy, requestUrl, post_data_dic, 'post', head)
    if content:
        exp = '//table[1]/tr[2]/td[1]/table[1]/tr/td/a'
        aList = jTool.getNodes(content, exp)
        titles = aList[1].xpath('//@title')
        aStr = aList[1].xpath('//@onclick')
        cursor = conn.cursor()
        corpRecords = {}
        for i in range(pageLength):
            # decode/encode round-trip validates the title is well-formed utf-8
            corpRecords['`corpName`'] = titles[i].decode('utf-8').encode('utf-8')
            # onclick arguments: index 0 -> table name, index 5 -> row id;
            # aStr is offset by one relative to titles.
            tmp = aStr[i+1].split(',')
            corpRecords['`table`'] = tmp[0].split("'")[1]
            corpRecords['`rowID`'] = tmp[5].split("'")[1]
            corpRecords['`pageNo`'] = str(pageNo)
            jTool.insertData(cursor, 'base_page_list', corpRecords)
        conn.commit()
        cursor.close()
    conn.close()