import logging
import os
import pickle

import MySQLdb
import requests

# Proxy, get_company_info_byid and the dbadd/user/password/database
# connection settings are defined elsewhere in this project.


def readweb():
    print u'get jobdict.....'
    configmap = {}
    jobnamesfile = 'dict/jobnames.pkl'
    if os.path.isfile(jobnamesfile):
        # Reuse the cached job-name dict; open in binary mode for pickle.
        with open(jobnamesfile, 'rb') as f:
            configmap = pickle.load(f)
        return configmap
    p = Proxy()
    # No cache yet: fetch the lagou.com front page, rotating proxy IPs
    # until a request gets through.
    while True:
        proxies = p.getproxies()
        try:
            r = requests.get(url='http://www.lagou.com/',
                             proxies=proxies, timeout=60)
            break
        except Exception as e:
            p.nextip()
            logging.debug(str(e))
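
# The Proxy helper is not shown in this file; based on how it is used above,
# it exposes getproxies() (a dict for requests' proxies= argument) and
# nextip() (advance to the next candidate address). A minimal sketch under
# those assumptions; the address list here is made up for illustration:
class ProxySketch(object):
    def __init__(self, ips=None):
        # Hypothetical pool; the real Proxy presumably loads live proxy IPs.
        self.ips = ips or ['127.0.0.1:8080']
        self.index = 0

    def getproxies(self):
        # Shape expected by requests' proxies= keyword argument.
        return {'http': 'http://' + self.ips[self.index]}

    def nextip(self):
        # Rotate to the next candidate after a failed request.
        self.index = (self.index + 1) % len(self.ips)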
def get_company_description(fetchallist):
    db = MySQLdb.connect(dbadd, user, password, database,
                         use_unicode=True, charset="utf8")
    cursor = db.cursor()
    p = Proxy()
    # fetchallist holds rows whose first column is a companyId;
    # "row" avoids shadowing the builtin id().
    for row in fetchallist:
        # Retry each company, switching proxy IPs, until the update succeeds.
        while True:
            try:
                values = get_company_info_byid(row[0], p)
                values.append(row[0])  # companyId for the WHERE clause
                cursor.execute(
                    'update company set companyUrl = %s, description = %s, '
                    'fullName = %s, shortName = %s, detailPosition = %s, '
                    'industryField = %s, companySize = %s, city = %s, '
                    'financeStage = %s, profile = %s where companyId = %s',
                    values)
                db.commit()
                print u"update:", row[0]
                break
            except Exception as e:
                logging.debug(str(e))
                p.nextip()
    db.close()
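
# get_company_info_byid() is defined elsewhere; judging from the UPDATE above,
# it must return the ten company fields in placeholder order. The scraping
# itself is omitted here; this stub only documents the assumed contract:
def get_company_info_byid_stub(companyId, p):
    # [companyUrl, description, fullName, shortName, detailPosition,
    #  industryField, companySize, city, financeStage, profile]
    return [u''] * 10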
def scrapy(jobname):
    # print 'crawling ' + jobname + '.....'
    p = Proxy()
    db = MySQLdb.connect(dbadd, user, password, database,
                         use_unicode=True, charset="utf8")
    cursor = db.cursor()
    req_url = 'http://www.lagou.com/jobs/positionAjax.json?'
    headers = {'content-type': 'application/json;charset=UTF-8'}
    # Ask the positionAjax endpoint for page 1 of the keyword to learn
    # how many result pages exist, rotating proxy IPs on failure.
    while True:
        proxies = p.getproxies()
        try:
            req = requests.post(req_url,
                                params={'first': 'false', 'pn': 1, 'kd': jobname},
                                headers=headers, timeout=60,
                                proxies=proxies, allow_redirects=False)
            result = req.json()['content']['positionResult']  # parse once
            totalCount = result['totalCount']
            pageSize = result['pageSize']
            # Round up so a final partial page is not dropped.
            maxpagenum = (totalCount + pageSize - 1) // pageSize
            break
        except Exception as e:
            p.nextip()
            logging.debug(str(e))
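
# A driver sketch: readweb() returns the job-name dict, and each key is
# assumed to be a keyword that scrapy() can query. This __main__ wiring is
# illustrative, not part of the original file:
if __name__ == '__main__':
    jobdict = readweb()
    for name in jobdict:
        scrapy(name)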