def doTask(task, param):
    """Crawl one task's URL with ``rp.crawlGetUrl``, trying up to three proxies.

    ``task`` is read positionally: ``task[0]`` is stored (as a string) in
    ``param["id"]``, ``task[1]`` in ``param["url"]``, ``task[2]`` in
    ``param["ename"]``, and ``task[3]`` is handed to ``completeTask`` when
    the crawl reports success through ``result["logic"]``.

    A fresh proxy is drawn at random from ``proxy.txt`` for every attempt.
    When all attempts fail, the record id is printed.  The connection in
    ``param["conn"]`` is committed before returning.  Nothing is returned.
    """
    proxies = jTool.getProxy("proxy.txt")
    lastIndex = len(proxies) - 1

    param["id"] = str(task[0])
    param["url"] = task[1]
    param["ename"] = task[2]

    cursor = param["conn"].cursor()

    # One initial attempt plus at most two retries, each with a new proxy.
    for _attempt in range(3):
        proxy = str(proxies[random.randint(0, lastIndex)]).strip()
        outcome = rp.crawlGetUrl(param["conn"], param, proxy)
        if outcome["logic"]:
            break

    if outcome["logic"]:
        completeTask(param, task[3])
    else:
        print("error record:" + str(param["id"]))

    param["conn"].commit()
    cursor.close()
def mainLoop(num, start, end):
    """Run ``doTask`` over the rows of ``item_url_task`` with ids in a range.

    Builds the shared parameter dict with ``makeParam``, selects
    ``id, url, enterprise_name`` for ``start <= id <= end`` and feeds the
    rows one by one to ``doTask`` together with a single proxy picked at
    random from ``proxy.txt``.  A truthy ``doTask`` result triggers
    ``completeTask``; anything else triggers ``rollbackTask``.

    :param num: stored as a string in ``param['num']``.
    :param start: first id of the range; also drives the loop counter.
    :param end: last id used in the SQL filter.

    The cursor and the connection are closed even when a task raises.
    """
    param = makeParam()
    param['taskTable'] = 'item_url_task'
    param['num'] = str(num)

    querySql = ('select id, url, enterprise_name from ' + param['taskTable']
                + ' where id >= ' + str(start) + ' and id <= ' + str(end))

    conn = param['conn']
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(querySql)
            proxyList = jTool.getProxy('proxy.txt')
            proxy = proxyList[random.randint(0, len(proxyList) - 1)]

            count = 0
            # NOTE(review): the SQL range is inclusive of ``end`` but this
            # loop runs ``end - start`` times, so the final row of a dense
            # range is never fetched.  Kept as in the original -- confirm.
            while start < end:
                record = cursor.fetchone()
                if record is None:
                    # Result set exhausted (e.g. gaps in the id sequence);
                    # previously this crashed on ``list(None)``.
                    break
                task = list(record)
                print('Get new task on '
                      + str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
                result = doTask(task, param, proxy)
                count += 1
                print(count)
                if result:
                    completeTask(param, task[0])
                else:
                    rollbackTask(param, task[0])
                start += 1
        finally:
            cursor.close()
    finally:
        conn.close()
def doTask(task, param):
    """Crawl one task with ``rp.crawlPostUrl``, trying up to three proxies.

    ``task`` is read positionally: ``task[0]`` -> ``param['id']`` (as a
    string), ``task[1]`` is appended to ``param['preUrlip']`` to form
    ``param['url']``, ``task[2]`` -> ``param['ename']`` and ``task[3]`` ->
    ``param['iid']``.

    Each attempt uses a proxy drawn at random from ``proxy.txt``.  If every
    attempt reports failure through ``result['logic']``, the failure
    details are written into ``result['rtData']`` and the record id is
    printed.  The connection in ``param['conn']`` is committed before
    returning.  Nothing is returned.
    """
    proxies = jTool.getProxy('proxy.txt')
    lastIndex = len(proxies) - 1

    param['id'] = str(task[0])
    param['url'] = param['preUrlip'] + task[1]
    param['ename'] = task[2]
    param['iid'] = task[3]

    cursor = param['conn'].cursor()

    # One initial attempt plus at most two retries, each with a new proxy.
    for _attempt in range(3):
        proxy = str(proxies[random.randint(0, lastIndex)]).strip()
        outcome = rp.crawlPostUrl(param['conn'], param, proxy)
        if outcome['logic']:
            break

    if not outcome['logic']:
        # Annotate the returned payload with what was being fetched and the
        # last proxy used; the payload is not persisted here.
        failure = outcome['rtData']
        failure['eid'] = param['id']
        failure['url'] = param['url']
        failure['ename'] = param['ename']
        failure['proxy'] = proxy.strip()
        print('error record:' + str(param['id']))

    param['conn'].commit()
    cursor.close()
def doTask(eid, url, ename, param):
    """Prepare ``param`` for one enterprise page and delegate to ``rp.crawlUrl``.

    Stores ``str(eid)`` in ``param['eid']``, ``ename`` in ``param['ename']``
    and ``param['preUrlip'] + url`` in ``param['url']``, loads the proxy
    list from ``proxy.txt`` and returns whatever ``rp.crawlUrl`` returns.

    Original author's note (not verifiable from this function): the crawl
    tries up to six times to fetch the URL, stores the content in the
    remote database, and yields True on success, otherwise False.
    """
    param['eid'] = str(eid)
    param['ename'] = ename
    # The IP-based prefix is used rather than the host-name one in
    # ``param['preUrl']``.
    param['url'] = param['preUrlip'] + url

    proxies = jTool.getProxy('proxy.txt')
    return rp.crawlUrl(param['conn'], param, proxies)
def mainLoop(start, end):
    """Fetch page fields for pending ``base_page_list`` rows with ids start..end.

    For each id in the inclusive range the row is selected only if its
    ``status`` is 0.  Missing or already-processed ids are skipped.  A
    pending row is passed to ``getPageField`` at most twice, each time with
    a proxy drawn at random from ``proxy.txt``; on a truthy result the
    row's ``status`` is set to ``'1'``.

    NOTE(review): the statement nesting below was reconstructed from a
    whitespace-mangled source.  As written, a successful id is not
    advanced immediately (the next pass re-selects it and skips it once it
    no longer matches ``status = 0``), and ``conn.commit()`` is reached
    only on the failure path -- confirm against the original file.
    """
    proxies = jTool.getProxy('proxy.txt')
    lastProxyIndex = len(proxies) - 1

    # Raw "Name:value" request header lines handed to getPageField as-is.
    requestHeaders = [
        'Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset:GBK,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding:gzip,deflate,sdch',
        'Accept-Language:zh-CN,zh;q=0.8',
        'Cache-Control:max-age=0',
        'Connection:keep-alive',
        'Cookie:ASP.NET_SessionId=t3isah45gu5kb4454qyxkhzy; lzstat_uv=6061202253430616218|2529639; lzstat_ss=953382219_1_1373621147_2529639; _gscu_374314293=7359234405sddy11; _gscs_374314293=73592344d74zfy11|pv:3; _gscbrs_374314293=1; ECStaticSession=ECS81',
        'Host:www.zjcredit.gov.cn:8000',
        'Pragma:no-cache',
        'Cookie:_gscu_374314293=73631708ff8h1y17; lzstat_uv=106813037832225946|2529639; ECStaticSession=ECS80; ASP.NET_SessionId=5dhxxl45gr4d0aexnf1uiu55; _gscbrs_374314293=1; lzstat_ss=815622537_1_1374448570_2529639; _gscs_374314293=t74419759zee6a318|pv:2',
        'Origin:http://www.zjcredit.gov.cn:8000',
        'Referer:http://www.zjcredit.gov.cn:8000/ListPrompts.aspx?sectionID=01&tableID=CourtNotCarryOut&associateID=00000000000000000&hasPromptHistroy=False',
        'User-Agent:Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36',
    ]

    conn = jTool.initCursor('localhost', 'root', 'root', 'rawData')
    reader = conn.cursor()
    writer = conn.cursor()

    while start <= end:
        currentId = str(start)
        reader.execute('select * from base_page_list where id = ' + currentId
                       + ' and status = 0 limit 1')
        row = reader.fetchone()
        if not row:
            # No pending row for this id: move on.
            start += 1
            continue

        corpName = row[1]
        rowID = row[3]
        print(corpName + ', ' + currentId)

        # Up to two attempts, each through a freshly drawn proxy.
        fetched = None
        for _attempt in range(2):
            proxy = str(proxies[random.randint(0, lastProxyIndex)]).strip()
            fetched = getPageField(conn, proxy, requestHeaders, currentId,
                                   rowID, corpName)
            if fetched:
                break

        if fetched:
            print('id' + currentId + ' ok')
            jTool.updateData(writer, ' where id = ' + currentId + ' ',
                             'base_page_list', {'status': '1'})
        else:
            start += 1
            conn.commit()

    reader.close()
    writer.close()
    conn.close()
def operLog(logFileName):
    """Re-check records named in a log file and record missing ones in ``error_log``.

    Each line of ``logFileName`` is split on ``,`` and, when it has more
    than two comma-separated fields, every field is split on ``:`` and
    stored into the shared ``param`` dict.  The code then looks the
    ``param['id']`` up in the ``enterprise_raw_<n>`` tables and inserts an
    ``error_log`` row when the record is absent or its ``postContent`` is
    empty.  Finally the number of lines read is printed.

    NOTE(review): this function was recovered from a whitespace-mangled
    source; the nesting of the ``for i`` loop and of the ``if``/``else``
    that follows it is a best-effort reconstruction and must be confirmed
    against the original file.
    """
    # NOTE(review): the file handle is never closed.
    file = open(logFileName)
    line = file.readline()
    param = makeParam()
    param['taskTable'] = 'item_url_task'
    param['num'] = '999'
    proxyList = jTool.getProxy('proxy.txt')
    pcount = len(proxyList)-1
    # The proxy is picked but not used anywhere below.
    proxy = proxyList[random.randint(0, pcount)]
    cursor = param['conn'].cursor()
    cursor2 = param['conn'].cursor()
    cursor3 = param['conn'].cursor()
    count = 0
    while line:
        tmp = line.split(',')
        if len(tmp)>2:
            for t in tmp:
                tt = t.split(':')
                if len(tt)>1:
                    # Simple "key:value" field; the key is stripped.
                    param[tt[0].strip()] = tt[1]
                if len(tt)>3:
                    # Value itself contained two colons (presumably a URL
                    # with scheme and port -- verify); re-join its pieces.
                    # NOTE(review): stored under the un-stripped key, unlike
                    # the branch above.
                    param[tt[0]] = tt[1]+':'+tt[2]+':'+tt[3].strip('\n')
            for i in range(10):
                ext = jTool.exsitsRecord(cursor, 'enterprise_raw_'+str(i), 'eid', param['id'])
                # No effect on the iteration itself; after the loop ends
                # ``i`` is 10.
                i += 1
            # NOTE(review): as reconstructed, only the result for the last
            # table checked survives in ``ext``.
            if not ext:
                jTool.insertDatai(cursor, 'error_log', {'eid': param['id'], 'url': param['url'], 'ename': param['ename']})
                print str(param['id'])
            else:
                try:
                    if ext!='error':
                        # NOTE(review): ``i`` is 10 here, so this targets
                        # 'enterprise_raw_10' -- confirm the intended table.
                        jTool.getField(cursor2, 'enterprise_raw_'+str(i), 'postContent', ' where eid = '+str(param['id']))
                        val = cursor2.fetchone()
                        if not val[0]:
                            jTool.insertDatai(cursor3, 'error_log', {'eid': param['id'], 'url': param['url'], 'ename': param['ename']})
                            print str(param['id'])
                except:
                    # Any failure in the check above is silently ignored.
                    pass
            param['conn'].commit()
        line = file.readline()
        # Incremented for every line read, whether or not it was processed.
        count += 1
    cursor.close()
    cursor2.close()
    cursor3.close()
    param['conn'].close()
    print 'line error :'+str(count)
def operLog(logFileName):
    """Re-crawl every record described by a ``G`` line of a log file.

    Lines whose first character is ``'G'`` are split on ``,``; each field
    is split on ``:`` and stored into the shared ``param`` dict.  After a
    line is parsed, ``param['id']`` is set from ``param['eid']`` and the
    record is crawled with ``rp.crawlUrl`` through one proxy picked at
    random from ``proxy.txt`` when the function starts.  All other lines
    (including ``'P'`` lines) are ignored.

    :param logFileName: path of the log file to replay.
    :raises KeyError: if a ``G`` line leaves ``param['eid']`` unset.

    The log file is closed on exit, including when a crawl raises.
    """
    with open(logFileName) as logFile:
        param = makeParam()
        param['taskTable'] = 'item_url_task'
        param['num'] = '999'
        proxyList = jTool.getProxy('proxy.txt')
        proxy = proxyList[random.randint(0, len(proxyList) - 1)]

        for line in logFile:
            if line[0] != 'G':
                continue
            for field in line.split(','):
                parts = field.split(':')
                if len(parts) > 1:
                    # Simple "key:value" field; the key is stripped.
                    param[parts[0].strip()] = parts[1]
                if len(parts) > 3:
                    # Value itself contained two colons; re-join its pieces.
                    # The guard used to be ``> 2`` and crashed with
                    # IndexError on ``parts[3]`` for three-part fields.
                    # NOTE(review): stored under the un-stripped key, unlike
                    # the branch above -- confirm that is intended.
                    param[parts[0]] = (parts[1] + ':' + parts[2] + ':'
                                       + parts[3].strip('\n'))
            param['id'] = param['eid']
            rp.crawlUrl(param['conn'], param, proxy)
def makeParam():
    """Build the crawler's parameter dict and attach a database connection.

    The dict carries the station name, the site's entry/query URLs, the
    default POST form fields, host-name and IP-based URL prefixes, and the
    database settings.  The object returned by ``jTool.initCursor`` for
    those settings is stored under ``'conn'``.
    """
    settings = {
        'station': '信用浙江',
        'begin_url': 'http://www.zjcredit.gov.cn:8000/CreditQuery.aspx?sectionID=02',
        'query_url': 'http://www.zjcredit.gov.cn:8000/ListQuery.aspx',
        'post_data_dic': {
            'isIntermediary': 'False',
            'isOpen': 'False',
            'pageLength': '20',
            'recordTotal': '1778190',
            'sectionID': '02',
            'sortDirection': '1',
            'sortField': 'CreditID',
        },
        'preUrl': 'http://www.zjcredit.gov.cn:8000/EnterpriseInfo.aspx?creditID=',
        'preUrlip': 'http://218.108.28.28:8000/EnterpriseInfo.aspx?creditID=',
        'basePostUrl': 'http://www.zjcredit.gov.cn:8000/GetInfoByDataSupplier.aspx',
        'basePostUrlip': 'http://218.108.28.28:8000/GetInfoByDataSupplier.aspx',
        'dbHost': 'localhost',
        'dbUser': '******',
        'dbPasswd': 'root',
        'rdb': 'rawData',
    }
    settings['conn'] = jTool.initCursor(
        settings['dbHost'],
        settings['dbUser'],
        settings['dbPasswd'],
        settings['rdb'],
    )
    return settings


# Module-level run: crawl one hard-coded enterprise page through a proxy
# picked at random from proxy.txt.
param = makeParam()
param.update({'taskTable': 'item_url_task', 'num': '999'})

proxyList = jTool.getProxy('proxy.txt')
pcount = len(proxyList) - 1
proxy = proxyList[random.randint(0, pcount)]

param.update({
    'url': 'http://www.zjcredit.gov.cn:8000/EnterpriseInfo.aspx?creditID=F651B7F17FEEDA7A',
    'eid': '69999',
    'id': '69999',
    'ename': '湖州市邮政局千金邮电所',
})
rp.crawlUrl(param['conn'], param, proxy)