def readweb(): print u'get jobdict.....' configmap = {} jobnamesfile = 'dict/jobnames.pkl' if os.path.isfile(jobnamesfile): configmap = pickle.load(open(jobnamesfile)) return configmap p = Proxy() while (True): proxies = p.getproxies() try: r = requests.get(url='http://www.lagou.com/', proxies=proxies, timeout=60) break except Exception, e: p.nextip() logging.debug(str(e))
def scrapy(jobname):
    """Crawl lagou.com listings for *jobname*.

    Posts the first page of the positionAjax JSON endpoint through
    rotating proxies, reads the result count and page size, and computes
    the total number of result pages.

    NOTE(review): the visible code opens a MySQL connection/cursor that
    it never uses or closes here, and ``maxpagenum`` is unused -- the
    per-page scraping and cleanup presumably follow in a part of the
    file not shown; confirm against the full source.
    """
    # print 'crawling ' + jobname + '.....'
    p = Proxy()
    db = MySQLdb.connect(dbadd, user, password, database,
                         use_unicode=True, charset="utf8")
    cursor = db.cursor()
    req_url = 'http://www.lagou.com/jobs/positionAjax.json?'
    headers = {'content-type': 'application/json;charset=UTF-8'}
    while True:
        proxies = p.getproxies()
        try:
            req = requests.post(req_url,
                                params={'first': 'false', 'pn': 1, 'kd': jobname},
                                headers=headers, timeout=60,
                                proxies=proxies, allow_redirects=False)
            # Fix: parse the JSON body once per attempt instead of twice.
            result = req.json()['content']['positionResult']
            totalCount = result['totalCount']
            pageSize = result['pageSize']
            # Fix: round up. Plain floor division dropped the final
            # partial page (e.g. 46 results at 15/page is 4 pages, not 3).
            maxpagenum = (totalCount + pageSize - 1) // pageSize
            break
        except Exception as e:  # bad proxy or malformed response: rotate
            p.nextip()
            logging.debug(str(e))