def __download_resume(self, id):
    """Fetch one resume detail page from fenjianli.com by resume id.

    Retries on transport/proxy failures, re-logins when the page shows
    the login modal, and backs off when the site flags the request as
    an illegal operation.

    :param id: resume id string. NOTE: shadows the builtin ``id``; the
        name is kept for backward compatibility with existing callers.
    :return: raw response body (unicode) of the detail request.
    :raises Exception: ``PROXY_FAIL!`` after >5 transport failures,
        ``LOGIN_ACCOUNT_ERROR!`` after >5 failed re-logins,
        ``ILLEGAL_OPERATION!`` after >5 illegal-operation backoffs,
        ``IS_SINGLE_LIST!`` when the page is a single-list stub.
    """
    import base64

    urls = 'http://www.fenjianli.com/search/getDetail.htm'
    _timeout = 30
    # The detail page expects the base64-encoded id in the Referer URL.
    # (Originally the encoded id was wrapped in a one-element list and
    # read back via ids[0]; the list served no purpose.)
    encode_id = base64.b64encode(id)
    kw = self.params.get("keywords", None)
    Referer = 'http://www.fenjianli.com/search/detail.htm?ids=' + encode_id
    if kw:
        Referer = Referer + '&kw=%s' % kw
    self.headers["Referer"] = Referer
    logger.info('proxies %s of Referer %s' % (self.proxies, Referer))
    resume_param = {
        "id": id,
        "_random": random.uniform(0, 1)
    }
    logger.info('headers %s of download resume' % (self.headers))
    # NOTE(review): try_times is deliberately NOT reset between outer
    # iterations, so transport retries are capped at 5 across the whole
    # call — preserved from the original. Confirm this is intended.
    try_times = 0
    operation_times = 0
    login_times = 0
    time.sleep(random.uniform(10, 60))  # random jitter to look less bot-like
    while True:
        # Inner loop: raw HTTP fetch, retried on transport errors.
        while True:
            try_times += 1
            try:
                logger.warning('fetching params %s with %s' % (resume_param, self.proxies))
                response = self.session.post(urls, data=resume_param, headers=self.headers,
                                             timeout=_timeout, proxies=self.proxies)
                assert response.status_code == 200
                response.encoding = 'utf-8'
                break
            except Exception:
                logger.warning(
                    'fetch params %s with %s fail:\n%s' % (resume_param, self.proxies, traceback.format_exc()))
                if try_times > 5:
                    raise Exception("PROXY_FAIL!")
                else:
                    time.sleep(30)
        # Session expired: the page contains the login modal markup.
        if u'data-toggle="modal">登录</a>' in response.text and u'<h4 class="modal-title">用户登录</h4>' in response.text:
            self.session = contact.login(username, password, proxies=self.proxies)
            login_times += 1
            if login_times > 5:
                raise Exception("LOGIN_ACCOUNT_ERROR!")
            continue
        # Site flagged the request as illegal; back off 5-10 minutes and retry.
        if u"非法操作" in response.text:
            time.sleep(random.uniform(300, 600))
            operation_times += 1
            if operation_times > 5:
                raise Exception("ILLEGAL_OPERATION!")
            continue
        if "is-single-list" in response.text:
            raise Exception("IS_SINGLE_LIST!")
        return response.text
def lagou_search(params, dedup, proxies=None):
    """Run a lagou search for *params*, deduplicating via *dedup*.

    :param params: search parameter dict; validated by ``__check_params``.
    :param dedup: dedup helper passed through to ``spider``.
    :param proxies: optional requests-style proxies mapping.
    :return: result list from ``spider``, or ``[]`` when params are invalid.
    :raises AssertionError: when the module-level username/password is unset.
    """
    # BUG FIX: the original ``assert username, password`` only checked
    # username and used password as the assertion *message*, so an empty
    # password was never caught.
    assert username and password
    user_agent = nautil.user_agent()
    session = contact.login(username, password, user_agent, proxies)
    if not __check_params(params):
        return []
    param = __splice_search_urls(session, user_agent, params, proxies=proxies)
    return spider(session, param, user_agent, dedup, proxies=proxies)
def fjl_search(params, dedup, proxies=None):
    """Run a fenjianli search, dispatching on the scheme flags in *params*.

    Chooses between three resume-collection strategies depending on
    whether ``scheme_flag`` (and then ``scheme``/``scheme_index``) are
    present in *params*.

    :param params: search parameter dict.
    :param dedup: dedup helper forwarded to ``getResume``.
    :param proxies: optional requests-style proxies mapping.
    :return: whatever the chosen ``goto_resume_urls*`` method returns.
    :raises AssertionError: when the module-level username/password is unset.
    """
    # BUG FIX: the original ``assert username, password`` only checked
    # username (password was the assertion message).
    assert username and password
    session = contact.login(username, password, proxies=proxies)
    url = 'http://www.fenjianli.com/search/search.htm'
    __params = __splice_search_urls(params)
    get_resume = getResume(session, url, __params, dedup, proxies=proxies)
    if "scheme_flag" not in params:
        logger.info("params中没有shceme_flag.....")
        return get_resume.goto_resume_urls()
    if "scheme" not in params and "scheme_index" not in params:
        logger.info("params中包含shceme_flag, 但是没有scheme和scheme_index......")
        return get_resume.goto_resume_urls_without_scheme()
    logger.info("params中包含shceme_flag, 且包含scheme和scheme_index.....")
    return get_resume.goto_resume_urls_with_scheme(params)
def __search(self, param):
    """POST *param* to ``self.url`` and return the parsed JSON result.

    Makes up to 5 logical attempts; within each attempt the raw HTTP
    request is retried on transport errors (at most 5 transport
    failures across the whole call, then ``PROXY_FAIL!`` is raised).
    Re-logins when the response shows the login modal, and backs off on
    illegal-operation / malformed responses before retrying.

    :param param: form data dict for the search POST.
    :return: decoded JSON object, or ``None`` when all 5 attempts are
        exhausted without a usable response (preserved original behavior).
    :raises Exception: ``PROXY_FAIL!`` after too many transport failures.
    """
    try_times = 0
    time.sleep(random.uniform(10, 60))  # random jitter before the first hit
    for connect_times in xrange(0, 5):
        # Transport-level retry loop; exits via ``else: break`` on a 200.
        while True:
            try_times += 1
            try:
                logger.warning('fetching %s with %s data:\n%s' % (self.url, self.proxies, param))
                response = self.session.post(self.url, data=param, headers=self.headers,
                                             timeout=30, proxies=self.proxies)
                assert response
                assert response.status_code == 200
            except Exception:
                logger.warning('fetch %s with %s fail:\n%s' % (self.url, self.proxies, traceback.format_exc()))
                if try_times > 5:
                    raise Exception("PROXY_FAIL!")
                else:
                    time.sleep(30)
            else:
                break
        # Site flagged the request as illegal; short backoff, new attempt.
        if u"非法操作" in response.text:
            time.sleep(60)
            continue
        # Session expired: login modal markup present — re-login and retry.
        if u'data-toggle="modal">登录</a>' in response.text and u'<h4 class="modal-title">用户登录</h4>' in response.text:
            self.session = contact.login(username, password, proxies=self.proxies)
            time.sleep(30)
            continue
        if "totalSize" not in response.text:
            # BUG FIX: the original interpolated traceback.format_exc()
            # here with no active exception, logging a misleading "None".
            logger.warning("response with param %s , response_data: \n%s, without totalSize"
                           % (param, response.text))
            time.sleep(random.uniform(300, 600))
            continue
        if u"""'list': None""" in response.text:
            # BUG FIX: same spurious traceback.format_exc() removed here.
            logger.error("response \n%s with params \n%s error" % (response.text, param))
            time.sleep(random.uniform(300, 600))
            continue
        try:
            response_results = json.loads(response.text, encoding='utf-8')
        except Exception:
            logger.error('json parse fail:\n%s\n%s' % (response.text, traceback.format_exc()))
            time.sleep(30)
            continue
        return response_results
    # All 5 attempts exhausted: make the original implicit None explicit.
    return None
def x58_search(params, dedup, proxies=None):
    """Run a 58.com resume search for *params*.

    :param params: search parameter dict used to build the list URLs.
    :param dedup: dedup helper forwarded to ``get_resume_urls``.
    :param proxies: optional requests-style proxies mapping.
    :return: result of ``get_resume_urls`` over the built URL list.
    :raises AssertionError: when the module-level username/password is unset.
    """
    # Consistency fix: the sibling search entry points validate both
    # credentials; password is used by contact.login() right below, so
    # check it too instead of only username.
    assert username and password
    urls = get_resume_list_urls(params)
    user_agent = nautil.user_agent()
    session = contact.login(username, password, user_agent, proxies=proxies)
    return get_resume_urls(session, urls, user_agent, dedup, proxies=proxies)