def doTask(eid, url, ename, param):
    """Fill ``param`` for one enterprise record and pass it to the crawler.

    Stores ``eid`` (converted to a string), ``ename`` and the detail-page
    URL (``param['preUrlip']`` followed by ``url``) in ``param``, loads the
    proxy list from ``proxy.txt`` and returns the result of ``rp.crawlUrl``.

    The original note on this function said the URL is attempted up to six
    times, the content is then stored in a remote database, and the result
    is True on success and False otherwise. None of that logic is visible
    in this function, so it presumably describes ``rp.crawlUrl`` -- TODO
    confirm.

    NOTE(review): a later ``doTask(task, param, proxy)`` in this file
    rebinds the same name, so this four-argument version is shadowed once
    the module has finished loading.
    NOTE(review): the whole proxy list is handed to ``rp.crawlUrl`` here,
    while the other call sites in this file pass a single proxy entry --
    confirm which form ``rp.crawlUrl`` expects.
    """
    param['eid'] = str(eid)
    param['ename'] = ename

    # The IP-based prefix is used here; the hostname-based 'preUrl' entry
    # is left unused by this function.
    ipPrefix = param['preUrlip']
    param['url'] = ipPrefix + url

    allProxies = jTool.getProxy('proxy.txt')
    return rp.crawlUrl(param['conn'], param, allProxies)
def operLog(logFileName):
    """Re-run crawl requests described by the 'G' lines of a log file.

    Every line whose first character is ``'G'`` is split on commas into
    ``key:value`` fields. Each field is written into the shared parameter
    dictionary, ``param['id']`` is set from ``param['eid']``, and
    ``rp.crawlUrl`` is called with one proxy picked at random from
    ``proxy.txt``. All other lines (including those starting with ``'P'``)
    are skipped.

    The parameter dictionary is reused across lines, so a key missing from
    one line keeps the value parsed from an earlier line.

    Changes from the previous version:
      * the log file is closed deterministically via ``with``;
      * values containing colons (e.g. URLs with a port) are re-joined in
        full instead of indexing exactly four pieces, which raised
        IndexError for two-colon values and truncated longer ones;
      * such values are stored under the whitespace-stripped key, the same
        key form already used for single-colon fields.

    :param logFileName: path of the log file to read.
    :raises KeyError: if no ``eid`` value is available when a 'G' line is
        processed.
    """
    # Open the log first so a missing file fails before makeParam() runs.
    with open(logFileName) as logFile:
        param = makeParam()
        param['taskTable'] = 'item_url_task'
        param['num'] = '999'

        proxyList = jTool.getProxy('proxy.txt')
        pcount = len(proxyList) - 1
        proxy = proxyList[random.randint(0, pcount)]

        for line in logFile:
            # Lines yielded by a file are never empty, so indexing is safe.
            if line[0] != 'G':
                continue

            for field in line.split(','):
                pieces = field.split(':')
                if len(pieces) < 2:
                    # No "key:value" structure in this field.
                    continue
                key = pieces[0].strip()
                if len(pieces) == 2:
                    param[key] = pieces[1]
                else:
                    # The value itself contained ':'; restore it and drop
                    # the trailing newline of a last-on-line field.
                    param[key] = ':'.join(pieces[1:]).rstrip('\n')

            param['id'] = param['eid']
            rp.crawlUrl(param['conn'], param, proxy)
def doTask(task, param, proxy):
    """Crawl the detail page for a single task row.

    ``task`` is indexed positionally: element 0 is stored (as a string)
    under ``param['id']``, element 1 is appended to ``param['preUrlip']``
    to form ``param['url']``, and element 2 is stored under
    ``param['ename']``.

    :param task: indexable record with at least three elements.
    :param param: shared parameter dictionary; modified in place.
    :param proxy: proxy value forwarded unchanged to ``rp.crawlUrl``.
    :returns: whatever ``rp.crawlUrl`` returns.
    """
    param['id'] = str(task[0])

    urlPrefix = param['preUrlip']
    param['url'] = urlPrefix + task[1]

    param['ename'] = task[2]

    return rp.crawlUrl(param['conn'], param, proxy)
def makeParam():
    """Return a fresh parameter dictionary for the crawler.

    The dictionary holds the station name, the site's entry/query URLs,
    the default POST form fields, hostname-based and IP-based URL
    prefixes, and the database settings. The value returned by
    ``jTool.initCursor`` for those database settings is stored under
    ``'conn'``.

    NOTE(review): the database credentials are hard-coded in this
    function.
    """
    settings = {
        'station': '信用浙江',
        'begin_url': 'http://www.zjcredit.gov.cn:8000/CreditQuery.aspx?sectionID=02',
        'query_url': 'http://www.zjcredit.gov.cn:8000/ListQuery.aspx',
        'post_data_dic': {
            'isIntermediary': 'False',
            'isOpen': 'False',
            'pageLength': '20',
            'recordTotal': '1778190',
            'sectionID': '02',
            'sortDirection': '1',
            'sortField': 'CreditID',
        },
        'preUrl': 'http://www.zjcredit.gov.cn:8000/EnterpriseInfo.aspx?creditID=',
        'preUrlip': 'http://218.108.28.28:8000/EnterpriseInfo.aspx?creditID=',
        'basePostUrl': 'http://www.zjcredit.gov.cn:8000/GetInfoByDataSupplier.aspx',
        'basePostUrlip': 'http://218.108.28.28:8000/GetInfoByDataSupplier.aspx',
        'dbHost': 'localhost',
        'dbUser': '******',
        'dbPasswd': 'root',
        'rdb': 'rawData',
    }
    settings['conn'] = jTool.initCursor(
        settings['dbHost'],
        settings['dbUser'],
        settings['dbPasswd'],
        settings['rdb'],
    )
    return settings


# Module-level run: crawl one hard-coded enterprise page through a randomly
# chosen proxy. These statements execute whenever the module is loaded.
param = makeParam()
param['taskTable'] = 'item_url_task'
param['num'] = '999'

proxyList = jTool.getProxy('proxy.txt')
pcount = len(proxyList) - 1  # highest valid index; randint is inclusive
proxy = proxyList[random.randint(0, pcount)]

param['url'] = 'http://www.zjcredit.gov.cn:8000/EnterpriseInfo.aspx?creditID=F651B7F17FEEDA7A'
param['eid'] = '69999'
param['id'] = '69999'
param['ename'] = '湖州市邮政局千金邮电所'

rp.crawlUrl(param['conn'], param, proxy)