def getInfo(url, para, max_pages=10, delay=5):
    """Crawl the paginated listing at *url* and return the collected records.

    A first POST is issued only to learn the total page count. Each page,
    starting again from page 1, is then requested in turn with the page
    number written into ``para['pn']``. Requests use the ``headers`` and
    ``cookies`` names from the enclosing scope.

    Args:
        url: Endpoint that receives the POST requests.
        para: Mutable mapping of request parameters. It is updated in place,
            so after the call ``para['pn']`` holds the last requested page.
        max_pages: Upper bound on the number of pages fetched; ``None``
            removes the cap. Defaults to 10, the limit that was previously
            hard-coded.
        delay: Seconds to sleep after each page request. Defaults to 5,
            the pause that was previously hard-coded.

    Returns:
        list: The ``getInfoDetail`` results of every fetched page, in page
        order. Empty when the reported page count is zero.
    """
    generalHttp = Http()

    # Probe request: only the page count is taken from this response.
    htmlCode = generalHttp.post(url, para=para, headers=headers, cookies=cookies)
    pageCount = Parse(htmlCode).parsePage()

    last_page = pageCount if max_pages is None else min(pageCount, max_pages)

    info = []
    for i in range(1, last_page + 1):
        print('第%s页' % i)
        para['pn'] = str(i)
        htmlCode = generalHttp.post(url, para=para, headers=headers, cookies=cookies)
        # extend() grows the list in place instead of rebuilding it per page.
        info.extend(getInfoDetail(Parse(htmlCode)))
        # Sleep after every page, the last one included, so that
        # back-to-back calls of this function stay throttled as well.
        time.sleep(delay)
    return info
def getInfo(url, para):
    """Crawl every result page at *url* and store each page as it arrives.

    A first POST is issued only to learn the total page count. Each page is
    then requested with the page number written into ``para['pn']``, parsed
    with ``getInfoDetail`` and, when non-empty, handed to ``processInfo``.
    A failure on one page is logged and reported on stdout, and the crawl
    continues with the next page.

    Args:
        url: Endpoint that receives the POST requests.
        para: Mutable mapping of request parameters. It is updated in place
            (``para['pn']``) and also passed on to ``processInfo``.

    Returns:
        None. Records are passed to ``processInfo`` instead of being
        returned.
    """
    generalHttps = Https()

    # Probe request: only the page count is taken from this response.
    htmlCode = generalHttps.post(url, para=para, headers=hd)
    pageCount = Parse(htmlCode).parsePage()

    for i in range(1, pageCount + 1):
        try:
            print('第%s页开始爬取' % i)
            para['pn'] = str(i)
            htmlCode = generalHttps.post(url, para=para, headers=hd)
            info = getInfoDetail(Parse(htmlCode))
            if info:
                flag = processInfo(info, para)
                if flag is None:
                    # processInfo signals a storage failure by returning None;
                    # keep crawling, but do not report the page as stored.
                    print('存储异常')
                else:
                    print('第%s页存储完成' % i)
            else:
                print('第%s页内容为空,不存储' % i)
        except Exception as e:
            # Best-effort crawl: record the failure with its traceback and
            # move on to the next page.
            logging.exception('Process except')
            # Single-argument print() with %-formatting produces the same
            # text on Python 2 and Python 3.
            print('str(Exception):\t %s' % str(Exception))
            print('str(e):\t\t %s' % str(e))
            print('repr(e):\t %s' % repr(e))
            # Exceptions have no .message attribute on Python 3.
            print('e.message:\t %s' % getattr(e, 'message', ''))
        # Pause between page requests.
        time.sleep(2)
def getInfo(url, data):
    """Collect the records from every result page served at *url*.

    One POST is made up front to read the page count. Each page is then
    requested again with its number stored in ``data['pn']``, and the
    ``getInfoDetail`` output of every page is concatenated.

    Args:
        url: Endpoint that receives the POST requests.
        data: Mutable mapping sent as the request body; ``data['pn']`` is
            overwritten with the current page number on each iteration.

    Returns:
        list: Records of all pages, in page order.
    """
    def load_page():
        # POST the current form data and wrap the response for parsing.
        response = requests.post(url, data=data, headers=setting.headers)
        return Parse(response)

    total_pages = load_page().parsePage()

    collected = []
    for page_no in range(1, total_pages + 1):
        print("第%s页" % page_no)
        data['pn'] = str(page_no)
        page_parser = load_page()
        collected = collected + getInfoDetail(page_parser)
        # Pause between page requests.
        time.sleep(2)
    return collected
def getInfo(url, para):
    """Collect the records from every result page of the Lagou position API.

    ``init()`` is called before every request and supplies the headers,
    query parameters, cookies and base form data. The caller's ``para`` is
    overlaid on that form data so that the page number set in
    ``para['pn']`` is actually sent; previously ``para`` was updated but
    never passed to any request.

    Args:
        url: Not used. Requests always go to the hard-coded
            ``positionAjax.json`` endpoint; the parameter is kept so the
            signature stays compatible with callers.
        para: Mutable mapping of form fields. ``para['pn']`` is overwritten
            with the current page number on each iteration.

    Returns:
        list: The ``getInfoDetail`` results of every page, in page order.
    """
    def fetch_parsed():
        # Build one request from a fresh init() result and parse its body.
        req = init()
        form = req['data']
        if isinstance(form, dict):
            # Copy before merging so init()'s own data is left untouched.
            form = dict(form)
            form.update(para)
        # Call the API endpoint.
        resp = requests.post('https://www.lagou.com/jobs/positionAjax.json',
                             headers=req['headers'], params=req['params'],
                             cookies=req['cookies'], data=form)
        return Parse(resp.text)

    # Probe request: only the page count is taken from this response.
    pageCount = fetch_parsed().parsePage()
    print('pageCount', pageCount)

    info = []
    for i in range(1, pageCount + 1):
        print('第%s页' % i)
        para['pn'] = str(i)
        info.extend(getInfoDetail(fetch_parsed()))
        # Pause between page requests.
        time.sleep(3)
    return info