def zhuopin_search(params, dedup, proxies=None):
    """Run a resume search through ``fetch_resume_impl``.

    ``username`` and ``password`` are not parameters; they are resolved from
    the enclosing (module) scope.

    :param params: search parameters, forwarded unchanged.
    :param dedup: de-duplication callable/object, forwarded unchanged.
    :param proxies: optional proxies mapping, forwarded unchanged.
    :return: whatever ``fetch_resume_impl`` returns, or ``None`` when it
        raises (the failure is logged as a warning, not propagated).
    :raises AssertionError: when either credential is empty.
    """
    # The original ``assert username, password`` only tested ``username`` and
    # used the password as the assertion *message*, so an empty password was
    # never caught and a real one could end up in a traceback.
    assert username and password, "username and password must both be set"
    user_agent = nautil.user_agent()
    try:
        return fetch_resume_impl(params, username, password, user_agent,
                                 dedup, proxies=proxies)
    except Exception as e:
        # Deliberate best-effort: report and fall through to ``None``.
        logger.warning("fetch resume failed with \n%s" % e)
        return None
def j51_search(narenkeywords, dedup=None, proxies=None):
    """Log in through ``QianChenUser`` and hand a cookie-primed session to
    ``__get_resume_page``.

    ``ctmname``, ``username`` and ``password`` are resolved from the enclosing
    (module) scope.

    :param narenkeywords: search keywords, converted by
        ``__splice_search_urls``.
    :param dedup: forwarded to ``__get_resume_page``.
    :param proxies: optional proxies mapping used for login and search.
    :return: the result of ``__get_resume_page``.
    :raises AssertionError: when either credential is empty.
    :raises Exception: ``"PROXY_FAIL!"`` after the sixth failed login attempt.
    """
    # ``assert username, password`` only checked ``username``; the password
    # was merely the assertion message.
    assert username and password, "username and password must both be set"
    session = requests.Session()
    user_agent = nautil.user_agent()
    params = __splice_search_urls(narenkeywords)

    connection_times = 0
    while True:
        connection_times += 1
        try:
            user = QianChenUser(ctmname, username, password,
                                proxies=proxies, logging=logger)
            user.login()
            _session = user.session.requests
            # Copy only the two cookies the search requests rely on.
            _session_cookies_HRUSERINFO = _session.cookies.get("HRUSERINFO")
            _session_cookies_AccessKey = _session.cookies.get(
                "AccessKey", domain="ehirelogin.51job.com")
            session.cookies.set("HRUSERINFO", _session_cookies_HRUSERINFO)
            session.cookies.set("AccessKey", _session_cookies_AccessKey)
        except Exception:
            # Previously swallowed silently; keep the retry but leave a trace.
            logger.warning("login attempt %s failed" % connection_times,
                           exc_info=True)
            if connection_times > 5:
                raise Exception("PROXY_FAIL!")
            time.sleep(random.uniform(10, 30))
        else:
            break

    return __get_resume_page(session, user_agent, params,
                             dedup=dedup, proxies=proxies)
def fetch_contact(search_data, resume_id, username, password, proxies=None):
    """Log in, look resumes up by keywords and upload what is found.

    :param search_data: search data passed to ``GetResume``.
    :param resume_id: resume id passed to ``GetResume``.
    :param username: account name used by ``Login``.
    :param password: account password used by ``Login``.
    :param proxies: optional proxies mapping.
    :return:
        * ``{"err_code": 101, "err_msg": ...}`` when login fails;
        * the ``upload`` result for the first item with ``err_code == 0``;
        * the item itself for any ``err_code`` other than ``0``/``7002``;
        * an ``err_code`` 7002 summary dict when only 7002 items were seen;
        * ``None`` when the lookup yields nothing.
    """
    logger.info("start fetch contact with search_data: %s, \nresume_id: %s"
                % (search_data, resume_id))
    user_agent = nautil.user_agent()
    logger.info("登录中......")
    is_login, session = Login(username, password, user_agent,
                              proxies=proxies).login()
    if not is_login:
        # On failure ``login()`` returns the error message in the second slot.
        return {"err_code": 101, "err_msg": session}

    ids = []
    resumes = GetResume(session, resume_id, search_data, user_agent,
                        proxies=proxies).get_resume_by_keywords()
    for resume in resumes:
        upload_resume = json.dumps(resume["err_msg"], ensure_ascii=False)
        if resume["err_code"] == 7002:
            res = upload(upload_resume, "yifeng", get_contact=True,
                         logger_in=logger)
            ids.append(res["resume_id"])
        elif resume["err_code"] == 0:
            return upload(upload_resume, "yifeng", get_contact=True,
                          logger_in=logger)
        else:
            return resume

    # ``ids`` is non-empty exactly when a 7002 item was processed, so the
    # separate ``flag_7002`` boolean was redundant. The trailing
    # ``if False:`` lookup-by-id branch could never run and was removed.
    if ids:
        resume_ids = " ".join(ids)
        return {"err_code": 7002,
                "err_msg": "找到了%s个简历, ids: %s" % (len(ids), resume_ids)}
    return None
def fetch_contact(search_data, resume_id, username, password, proxies=None):
    """Log the request, then delegate to ``fetch_contact_impl`` with a
    user agent obtained from ``nautil.user_agent()``.

    :return: whatever ``fetch_contact_impl`` returns.
    """
    start_message = ("start fetch contact with search_data: %s, \nresume_id: %s"
                     % (search_data, resume_id))
    logger.info(start_message)
    return fetch_contact_impl(
        search_data,
        resume_id,
        username,
        password,
        nautil.user_agent(),
        proxies=proxies,
    )
def lagou_search(params, dedup, proxies=None):
    """Log in via ``contact.login`` and run ``spider`` for valid ``params``.

    ``username`` and ``password`` are resolved from the enclosing (module)
    scope.

    :param params: search parameters, validated by ``__check_params``.
    :param dedup: forwarded to ``spider``.
    :param proxies: optional proxies mapping.
    :return: the result of ``spider``, or ``[]`` when ``__check_params``
        rejects ``params``.
    :raises AssertionError: when either credential is empty.
    """
    # ``assert username, password`` only checked ``username``; the password
    # was merely the assertion message.
    assert username and password, "username and password must both be set"
    user_agent = nautil.user_agent()
    # Login intentionally still happens before the parameter check, as in the
    # original, so login errors surface the same way for callers.
    session = contact.login(username, password, user_agent, proxies)
    if not __check_params(params):
        return []
    param = __splice_search_urls(session, user_agent, params, proxies=proxies)
    return spider(session, param, user_agent, dedup, proxies=proxies)
def login(username, password, proxies=None):
    """Return a logged-in ``requests`` session for fenjianli.com.

    Cookies cached in ``cookies/fjl_cookies/<username>`` (relative to the
    current working directory) are tried first. If the home page does not
    show the logged-in markers, ``__login`` is called and its cookies are
    written back to the cache file.

    :param username: account name; also used as the cache file name.
    :param password: account password, only used when a fresh login is needed.
    :param proxies: optional proxies mapping.
    :return: a ``requests.Session`` (the cached one, or the one returned by
        ``__login``).
    :raises Exception: ``"PROXY_FAIL!"`` after the sixth failed home-page
        fetch.
    """
    cookie_dir = 'cookies/fjl_cookies'
    try:
        os.makedirs(cookie_dir)
    except OSError:
        # Already present (possibly created concurrently); anything else is
        # a real error.
        if not os.path.isdir(cookie_dir):
            raise

    session = requests.Session()
    cookie_file_name = 'cookies/fjl_cookies/%s' % username
    if os.path.exists(cookie_file_name):
        try:
            with open(cookie_file_name, 'r') as cookie_file:
                session.cookies.update(json.load(cookie_file))
        except ValueError:
            # A truncated/corrupt cache used to make every later login raise;
            # ignore it and fall through to a fresh login instead.
            logger.warning('ignoring unreadable cookie file %s' % cookie_file_name)

    url = "http://www.fenjianli.com/"
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN, zh;q = 0.8",
        "Connection": "keep-alive",
        "Host": "www.fenjianli.com",
        "User-Agent": nautil.user_agent(),
    }
    _timeout = 30
    time.sleep(random.uniform(3, 10))

    try_times = 0
    while True:
        try_times += 1
        try:
            logger.warning('fetching url %s with %s' % (url, proxies))
            response = session.get(url, headers=headers, timeout=_timeout,
                                   proxies=proxies)
            assert response.status_code == 200
            response.encoding = 'utf-8'
        except Exception:
            logger.warning('fetching url %s headers %s with %s fail: \n%s'
                           % (url, headers, proxies, traceback.format_exc()))
            if try_times > 5:
                raise Exception("PROXY_FAIL!")
            time.sleep(30)
        else:
            break

    # ``response.text`` is unicode; compare against unicode literals so the
    # check does not depend on the interpreter's default encoding (the
    # sibling ``__login`` already uses ``u""`` literals).
    logged_in = (u"智联模式" in response.text
                 and u'<a href="/login/logout.htm">退出</a></li>' in response.text)
    if logged_in:
        return session

    login_session = __login(username, password, proxies=proxies)
    # Build the mapping first so a failure cannot leave a truncated file.
    _cookies = dict(login_session.cookies.items())
    with open(cookie_file_name, 'w') as cookie_file:
        json.dump(_cookies, cookie_file)
    return login_session
def fetch_contact(resume_id, user_name, user_password, logger_name=None, other_fields=None):
    """Log in, attach a private scratch folder to the session and fetch the
    contact through ``__fetch_contact``.

    :param resume_id: id of the resume to fetch.
    :param user_name: account name.
    :param user_password: account password.
    :param logger_name: accepted for interface compatibility; unused here.
    :param other_fields: accepted for interface compatibility; unused here.
    :return: whatever ``__fetch_contact`` returns.
    """
    user_agent = nautil.user_agent()
    s = login(user_name, user_password, user_agent=user_agent)

    # The folder used to be ``<tmp>/x58/<randint(1, 10000)>``; two concurrent
    # runs could draw the same number and then delete each other's files.
    # ``mkdtemp`` creates a guaranteed-unique directory under the same parent.
    base_folder = os.path.join(tempfile.gettempdir(), "x58")
    try:
        os.makedirs(base_folder)
    except OSError:
        if not os.path.isdir(base_folder):
            raise
    s.temp_folder = tempfile.mkdtemp(dir=base_folder)

    contact = __fetch_contact(s, resume_id, user_name, user_password)
    return contact
def register(self):
    """Register ``self.username`` on www.818cv.com with a random password.

    The password is one lowercase letter, then 3-6 lowercase letters or
    digits, then 2-5 digits.

    :return: the generated password.
    :raises AssertionError: when the registration response is not HTTP 200
        or does not contain the expected account-info marker.
    """
    digits = '0123456789'
    letters = 'abcdefghijklmnopqrstuvwxyz'

    register_password = random.choice(letters)
    for _ in xrange(random.randint(3, 6)):
        register_password += random.choice(digits + letters)
    for _ in xrange(random.randint(2, 5)):
        register_password += random.choice(digits)
    # NOTE(review): this writes a credential to the log in clear text; kept
    # because operators may rely on it - confirm and remove if not needed.
    logger.info('random passwd %s' % register_password)

    # The original wrapped the request in ``while True: ... break`` with an
    # unused ``try_times`` counter; it could only ever run once.
    session = requests.Session()
    session.headers.update({
        "User-Agent": nautil.user_agent(),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # Header values must be strings: recent ``requests`` releases reject
        # the integer ``1`` with ``InvalidHeader``.
        'Upgrade-Insecure-Requests': '1',
    })
    # Visit the home page first so the session picks up the site's cookies.
    session.get('http://www.818cv.com/', headers={
        "Host": "www.818cv.com",
    }, proxies=self.proxies)
    response = session.post('http://www.818cv.com/user/reg/', headers={
        "Host": "www.818cv.com",
        "Origin": "http://www.818cv.com",
        "Referer": "http://www.818cv.com/",
    }, data={
        "useremail": self.username,
        "password": register_password,
        "repassword": register_password,
    }, proxies=self.proxies)
    response.encoding = 'utf-8'
    assert response.status_code == 200
    assert '''<li class="useremail"><a title='账户信息''' in response.text, 'unexpected response %s' % response.text
    return register_password
def __fetch_contact(session, resume_id, proxies):
    """Fetch a fenjianli resume's original file and upload it.

    Posts to ``getDetail.htm`` to obtain ``originalFilePath``, downloads that
    path with the same session and passes the raw content to ``upload``.

    :param session: logged-in ``requests`` session.
    :param resume_id: resume id as ``str``/``unicode`` holding an integer.
    :param proxies: proxies mapping used for both requests.
    :return: the result of ``upload``.
    :raises AssertionError: on a wrong id type, a non-200 response or a
        detail payload without ``originalFilePath``.
    """
    assert isinstance(resume_id, (str, unicode))
    encrypt_resume_id = base64.b64encode(str(int(resume_id)))
    ua = nautil.user_agent()
    detail_page = "http://www.fenjianli.com/search/detail.htm?ids=%s" % encrypt_resume_id

    ajax_headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "User-Agent": ua,
        "Host": "www.fenjianli.com",
        "Origin": "http://www.fenjianli.com",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "X-Requested-With": "XMLHttpRequest",
        "Referer": detail_page,
    }
    logger.info('fetching resume detail >> %s' % detail_page)
    form = {'id': resume_id, '_random': random.random()}
    detail_response = session.post(
        'http://www.fenjianli.com/search/getDetail.htm',
        headers=ajax_headers,
        proxies=proxies,
        data=form,
    )
    assert detail_response.status_code == 200, detail_response.status_code

    data = json.loads(detail_response.text)
    assert 'originalFilePath' in data
    original_path = data['originalFilePath']
    logger.info('fetching path %s' % original_path)

    page_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        "User-Agent": ua,
        'Upgrade-Insecure-Requests': '1',
    }
    file_response = session.get(original_path, headers=page_headers, proxies=proxies)
    assert file_response.status_code == 200, '%s\n%s' % (
        file_response.status_code, file_response.content)
    return upload(file_response.content, 'fjl', get_contact=True, fjl_id=resume_id)
def __init__(self, session, url, params, dedup, proxies=None):
    """Keep the request context and prepare AJAX-style default headers for
    www.fenjianli.com."""
    self.session, self.params, self.url = session, params, url
    self.dedup, self.proxies = dedup, proxies

    request_headers = {"User-Agent": nautil.user_agent()}
    request_headers.update({
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "X-Requested-With": "XMLHttpRequest",
        "Host": "www.fenjianli.com",
        "Connection": "keep-alive",
        "Referer": "http://www.fenjianli.com",
        "Origin": "http://www.fenjianli.com",
    })
    self.headers = request_headers
def __init__(self, session, url, params, dedup=None, proxies=None):
    """Keep the request context, reset ``flag`` to 0 and prepare browser-style
    default headers for www.818cv.com."""
    self.session, self.url, self.params = session, url, params
    self.dedup, self.proxies = dedup, proxies
    self.flag = 0

    request_headers = {"User-Agent": nautil.user_agent()}
    request_headers.update({
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Upgrade-Insecure-Requests": "1",
        "Host": "www.818cv.com",
        "Connection": "keep-alive",
        "Referer": "http://www.818cv.com/resume/search/",
    })
    self.headers = request_headers
def __login(username, password, proxies=None):
    """Post the credentials to fenjianli's login endpoint on a new session.

    :param username: account name.
    :param password: account password.
    :param proxies: optional proxies mapping.
    :return: the ``requests.Session`` used for the login request.
    :raises Exception: ``"PROXY_FAIL!"`` after the sixth failed attempt, or
        ``"ACCOUNT_ERROR!"`` when the response reports wrong credentials.
    """
    url = "http://www.fenjianli.com/login/login.htm"
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN, zh;q = 0.8",
        "Connection": "keep-alive",
        "Host": "www.fenjianli.com",
        "User-Agent": nautil.user_agent(),
        "Referer": "http://www.fenjianli.com/login/home.htm",
        "X-Requested-With": "XMLHttpRequest"
    }
    params = {"username": username, "password": password, "rememberMe": "1"}
    _timeout = 30
    session = requests.Session()

    try_times = 0
    while True:
        try_times += 1
        try:
            logger.warning('fetching %s with %s' % (url, proxies))
            response = session.post(url, data=params, headers=headers,
                                    timeout=_timeout, proxies=proxies)
            # An empty body or a non-200 status counts as a failed attempt.
            assert response.text
            assert response.status_code == 200
            response.encoding = 'utf-8'
        except Exception:
            logger.warning('fetching url %s headers %s with %s fail:\n%s'
                           % (url, headers, proxies, traceback.format_exc()))
            if try_times > 5:
                raise Exception("PROXY_FAIL!")
            time.sleep(30)
        else:
            break

    if u"账号或密码错误" in response.text:
        # The password used to be written to the log in clear text here;
        # the account name is enough to identify the failing login.
        logger.warning("LOGIN WITH username=%s WRONG" % username)
        raise Exception("ACCOUNT_ERROR!")
    return session
def parse_check_code(session, url, source, proxies, headers=None, typeid=3040):
    """Download a captcha image and have the remote check-code service
    solve it.

    :param session: ``requests`` session used to fetch the image.
    :param url: URL of the captcha image.
    :param source: value sent as ``source`` to the service.
    :param proxies: proxies mapping used for the image download only.
    :param headers: headers for the image download; defaults to a dict with
        just a ``User-Agent`` from ``nautil.user_agent()``.
    :param typeid: value sent as ``typeid`` to the service.
    :return: ``result['result']['code']`` from the service response.
    :raises AssertionError: on a non-200 response from either request or a
        non-zero ``err_code`` from the service.
    """
    # The default used to be a dict literal in the signature: built once at
    # import time (freezing the user agent) and shared by every call.
    if headers is None:
        headers = {'User-Agent': nautil.user_agent()}
    post_data = {'typeid': typeid, 'source': source}

    image_response = session.get(url, headers=headers, proxies=proxies)
    assert image_response.status_code == 200
    image_file = [("image", ('image', image_response.content, 'image/png'))]

    # NOTE(review): no timeout and TLS verification disabled, as before.
    solver_response = requests.post("http://www.xnaren.com:9100/util/checkcode",
                                    timeout=None, data=post_data,
                                    files=image_file, verify=False)
    assert solver_response.status_code == 200
    result = solver_response.json()
    assert result['err_code'] == 0
    logger.info("code result %s from %s" % (result['result']['code'], url))
    return result['result']['code']
def x58_search(params, dedup, proxies=None):
    """Build the listing URLs for ``params``, log in via ``contact.login``
    and return ``get_resume_urls`` over them.

    ``username`` and ``password`` are resolved from the enclosing (module)
    scope; only ``username`` is asserted.
    """
    assert username
    listing_urls = get_resume_list_urls(params)
    agent = nautil.user_agent()
    logged_in_session = contact.login(username, password, agent, proxies=proxies)
    return get_resume_urls(
        logged_in_session,
        listing_urls,
        agent,
        dedup,
        proxies=proxies,
    )
raise Exception("PROXY_FAIL!") else: time.sleep(random.uniform(10, 30)) continue else: break return __get_resume_page(session, user_agent, params, dedup=dedup, proxies=proxies) if __name__ == '__main__': session = requests.Session() user_agent = nautil.user_agent() # __cookie = u"""<Cookie 51job=cenglish%3D0 for .51job.com/>, <Cookie guid=14708204822314850010 for .51job.com/>, <Cookie 51job= for ehire.51job.com/>, <Cookie ASP.NET_SessionId=5bwfnvkti3yqcawya1qpanip for ehire.51job.com/>, <Cookie AccessKey=f0fbc86c9a67433 for ehire.51job.com/>, <Cookie EhireGuid=1a66fd0c1fa54cd7aa8758102c9aba26 for ehire.51job.com/>, <Cookie LangType=Lang=&Flag=1 for ehire.51job.com/>, <Cookie Theme=Default for ehire.51job.com/>, <Cookie guid= for ehire.51job.com/>, <Cookie ASP.NET_SessionId=kesqo11d3vmqsnrrjl20fjbb for ehirelogin.51job.com/>, <Cookie AccessKey=6efe43b522ed4e5 for ehirelogin.51job.com/>, <Cookie HRUSERINFO=CtmID=2018813&DBID=2&MType=02&HRUID=2362118&UserAUTHORITY=1100111011&IsCtmLevle=1&UserName=xpsh959&IsStandard=0&LoginTime=08%2f10%2f2016+17%3a14%3a36&ExpireTime=08%2f10%2f2016+17%3a24%3a36&CtmAuthen=0000011000000001000110010000000011100001&BIsAgreed=true&IsResetPwd=true&CtmLiscense=1&AccessKey=b5492dce296abd74 for ehirelogin.51job.com/>, <Cookie LangType=Lang=&Flag=1 for ehirelogin.51job.com/>, <Cookie Theme=Default for ehirelogin.51job.com/>""" # __get_viewstate(session) p = { "destitle": { "010130084": "软件工程师" }, "education": "本科", "low_workage": "1", "sex": "只选男", "desworklocation": { "35": "北京市-北京市" }, "lastupdatetime": "最近7天", # "resumekeywords": ["PHP"] }
return session else: login_cookies = __login(username, password, user_agent, proxies=proxies) assert isinstance(login_cookies, list) login_cookie_jar = requests.cookies.RequestsCookieJar() for login_cookie in login_cookies: login_cookie_jar.set(login_cookie['name'], login_cookie['value'], domain=login_cookie['domain'], path=login_cookie['path']) with open(cookie_file_name, 'w') as cookie_file: cookie_file.write(str(login_cookies)) session.cookies.update(login_cookie_jar) logger.info("获取session成功!") return session if __name__ == '__main__': logger.setLevel(logging.INFO) logger.addHandler(logging.StreamHandler()) ip_port = '31.200.229.69:8080' ip_port = '45.76.159.179:8080' proxies = {'http': 'http://%s' % ip_port, 'https': 'https://%s' % ip_port} proxies = None __login('*****@*****.**', 'naren0925x', nautil.user_agent(), proxies=proxies) # from naren_browser.blind_browser import browse # browse('https://passport.lagou.com/login/login.html', [], url_js={}, visible=True, html_only=False, timeout=6000, proxy=proxies)
def __fetch_contact(session, resume_id, user_name, user_password, proxies=None):
    """Buy (if needed) and upload a ganji.com resume's contact details.

    Re-logs in with a solved captcha when the VIP page shows the login form,
    requests the download dialog, confirms the purchase when required,
    fetches the resume page and passes it to ``upload``. Intermediate pages
    are written into ``session.temp_folder``, which is removed on success.

    :param session: ``requests`` session carrying a ``temp_folder`` attribute.
    :param resume_id: ganji resume id (used in URLs and form fields).
    :param user_name: account name for re-login.
    :param user_password: account password for re-login.
    :param proxies: accepted but overridden with ``None`` below.
    :return: the result of ``upload``.
    :raises Exception: after more than 10 login attempts, when the account
        has no download quota left, or when the resume was deleted.
    :raises AssertionError: when an expected marker is missing from a page.
    """
    user_agent = nautil.user_agent()
    # NOTE(review): this discards the caller's ``proxies`` - looks
    # intentional (or a debugging leftover); confirm.
    proxies = None

    # NOTE(review): ``headers={}`` is a mutable default; it is only read
    # here, never mutated.
    def __session(method, url, headers={}, data=None):
        """Issue a throttled GET/POST on ``session`` and return the body
        decoded as UTF-8; asserts a truthy response with status 200."""
        logger.info('------\nRequesting %s On %s With Data:\n%s\n------' % (method, url, data))
        # time.sleep(random.uniform(4, 15))
        time.sleep(random.uniform(1, 2))
        assert method in ('get', 'post')
        request_headers = {
            "User-Agent": user_agent,
        }
        for k, v in headers.iteritems():
            request_headers[k] = v
        # ``data`` is sent as the query string for GET and as the form body
        # for POST.
        if method == 'get':
            response = session.get(url, headers=request_headers, proxies=proxies, params=data)
        if method == 'post':
            response = session.post(url, headers=request_headers, proxies=proxies, data=data)
        assert response
        assert response.status_code == 200
        response.encoding = 'utf-8'
        return response.text

    main_page = __session('get', 'http://www.ganji.com/vip')
    # The login form title on the page means the session cookies are stale.
    if '赶集用户登录' in main_page:
        logger.info('cookie fail, try login')
        logger.info('re-login')
        # The login request needs the page's ``__hash__`` token and the
        # captcha image URL, both scraped from the HTML.
        hash_value = re.search('''window.PAGE_CONFIG.__hash__ = '([^']*)';''', main_page)
        assert hash_value
        hash_value = hash_value.group(1)
        logger.info('login hash_value:%s' % hash_value)
        check_code_url = re.search(
            '''<img[^>]*id="login_img_checkcode"[^>]*src=['"]*([^'"]*)['"]*[^>]*>''', main_page)
        assert check_code_url
        check_code_url = check_code_url.group(1)
        logger.info('login check_code_url:%s' % check_code_url)
        time_stamp = str(int(time.time() * 1000))
        counter = 0
        # Retry with a freshly solved captcha until the response carries no
        # ``error_msg``; give up after 10 attempts.
        while True:
            counter += 1
            if counter > 10:
                raise Exception('try too many times to login')
            login_result = __session(
                'get', 'https://passport.ganji.com/login.php',
                headers={
                    "Host": "passport.ganji.com",
                    "Referer": "https://passport.ganji.com/login.php?next=/",
                    "X-Requested-With": "XMLHttpRequest",
                    "Connection": "keep-alive"
                },
                data={
                    "callback": "jQuery1820229177205394230_%s" % time_stamp,
                    "username": user_name,
                    "password": user_password,
                    "checkCode": parse_check_code(session, check_code_url, 'ganji', proxies),
                    "setcookies": "14",
                    "second": "",
                    "parentfunc": "",
                    "redirect_in_iframe": "",
                    "next": '/',
                    "__hash__": hash_value,
                    "_": time_stamp
                })
            # NOTE(review): the file handle is never closed explicitly.
            open(session.temp_folder + os.path.sep + 'login_result.html', 'w').write(login_result)
            if 'error_msg' in login_result:
                logger.warning('login fail with response:\n%s' % login_result)
            else:
                break
    logger.info('trying to buy contact')
    # NOTE(review): the integer header value below may be rejected by newer
    # ``requests`` releases, which require str/bytes header values.
    message = __session('get', 'http://www.ganji.com/findjob/download_resume.php', headers={
        "Host": "www.ganji.com",
        "Referer": "http://www.ganji.com/jianli/%sx.htm" % resume_id,
        "Upgrade-Insecure-Requests": 1,
    }, data={
        "source": "detail",
        "resume_type": "0",
        "findjob_puid": resume_id,
        "job_postion": "",
        "callback": "show_contact",
        "is_batch_view_resume": 0
    })
    open(session.temp_folder + os.path.sep + 'message.html', 'w').write(message)
    # Skip the purchase when the dialog says the resume was already
    # downloaded.
    if '您已下载过该简历' not in message:
        if '简历下载数不足' in message:
            raise Exception('The Ganji Account Can Not Afford this Resumes')
        elif '此帖子已删除' in message:
            raise Exception('The Ganji Resume Deleted')
        else:
            assert '确认查看' in message
            buy_url = 'http://www.ganji.com/findjob/download_resume.php?source=detail&resume_type=0&findjob_puid=%s&job_postion=&callback=show_contact&is_batch_view_resume=0' % resume_id
            tel_message = __session('post', buy_url, headers={
                "Host": "www.ganji.com",
                "Origin": "http://www.ganji.com",
                "Referer": buy_url,
                "Upgrade-Insecure-Requests": 1,
            }, data={"one_key_download_setting": 1})
            assert 'tel-code' in tel_message
    logger.info('buy contact done, try upload resume')
    resume = __session('get', 'http://www.ganji.com/jianli/%sx.htm' % resume_id)
    # NOTE(review): the scratch folder is only removed on this success path.
    shutil.rmtree(session.temp_folder)
    return upload(resume, 'ganji', get_contact=True)
def __download_resume(session, url, proxies=None):
    """
    Download one ganji resume page, solving the rate-limit captcha if shown.

    :param session: the session used to download the resume
    :param url: URL of a single resume page
    :param proxies: the proxies used for the download
    :return: the resume HTML text, or ``None`` when the post-captcha response
        says the page was not found/deleted or the resume stopped job hunting
    :raises Exception: ``"PROXY_FAIL!"`` after the sixth failed attempt
    """
    __timeout = 30
    # Third '/'-separated piece of the URL, i.e. the host of an
    # ``http://host/...`` address.
    host = url.split('/')[2]
    headers = {
        "User-Agent": nautil.user_agent(),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Host": host,
        "Referer": "http://www.ganji.com/findjob/resume_list.php"
    }
    # Long random pause before every download.
    time.sleep(random.uniform(30, 100))
    try_times = 0
    while True:
        try_times += 1
        try:
            resume_data = session.get(url, headers=headers, timeout=__timeout, proxies=proxies)
            assert resume_data.status_code == 200
            resume_data.encoding = 'utf-8'
            resume = resume_data.text
            # Page text shown when a captcha is required.
            resume_failues = u"您的访问速度太快了,如果您不是机器的话,输入下面的验证码来继续访问吧"
            # Page text for the rate-limit notice; handled as a failed
            # attempt via the ``except`` branch.
            error_message = u"亲爱的用户,您访问的速度太快"
            if error_message in resume:
                raise Exception("ERROR_MESSAGE!")
            if resume_failues in resume:
                verify_headers = {
                    "User-Agent": headers["User-Agent"],
                    "Accept-Encoding": "gzip, deflate, sdch",
                    "Accept-Language": "zh-CN,zh;q=0.8",
                    "Host": host,
                    "Referer": url
                }
                img = pq(resume).find('.error').find('span').find('img').attr(
                    'src')
                error_url = "http://" + host + img
                verify_code = parse_check_code(session, error_url, 'ganji', proxies)
                # NOTE(review): the solved code is posted as the raw body and
                # this request does not pass ``proxies`` - confirm intended.
                response = session.post(error_url, data=verify_code, headers=verify_headers, timeout=__timeout)
                # ``break`` leaves the loop, so the function returns None.
                if u"对不起!您要查看的页面没有找到或已删除" in response.text:
                    break
                if u"对不起,该简历已停止找工作了~" in response.text:
                    break
                assert response.status_code == 200
                # Captcha submitted: fetch the resume again (this also counts
                # towards ``try_times``).
                continue
        except Exception:
            logger.warning('fetch %s with %s fail:\n%s' % (url, proxies, traceback.format_exc()))
            if try_times > 5:
                raise Exception("PROXY_FAIL!")
            else:
                time.sleep(30)
        else:
            # raise Exception("SPEED_TOO_FAST!")
            return resume_data.text
def __fetch_contact(session, resume_id, user_name, user_password, proxies=None):
    """Download (if needed) and upload a 58.com resume with contact details.

    Refreshes the session cookies through ``__login`` when the account page
    shows the login form, checks the remaining download quota, triggers the
    download, then fetches the resume page and passes it to ``upload``.
    Intermediate pages are written into ``session.temp_folder``, which is
    removed on success.

    :param session: ``requests`` session carrying a ``temp_folder`` attribute.
    :param resume_id: 58.com resume id used in the URLs.
    :param user_name: account name for re-login.
    :param user_password: account password for re-login.
    :param proxies: accepted but overridden with ``None`` below.
    :return: the result of ``upload``.
    :raises Exception: when the resume requires a certified business licence
        or fewer than 5 downloads remain.
    :raises AssertionError: when an expected marker is missing from a page.
    """
    user_agent = nautil.user_agent()
    # NOTE(review): this discards the caller's ``proxies`` - confirm intended.
    proxies = None

    # NOTE(review): ``headers={}`` is a mutable default; it is only read
    # here, never mutated.
    def __session(method, url, headers={}, data=None):
        """Issue a throttled GET/POST on ``session`` and return the body
        decoded as UTF-8; ``data`` is only allowed for POST."""
        logger.info('------\nRequesting %s On %s With Data:\n%s\n------' % (method, url, data))
        time.sleep(random.uniform(4, 15))
        assert method in ('get', 'post')
        assert method == 'post' or not data
        request_headers = {
            "User-Agent": user_agent,
            "Origin": "http://jianli.58.com",
        }
        for k, v in headers.iteritems():
            request_headers[k] = v
        if method == 'get':
            response = session.get(url, headers=request_headers, proxies=proxies)
        if method == 'post':
            response = session.post(url, headers=request_headers, proxies=proxies, data=data)
        assert response
        assert response.status_code == 200
        response.encoding = 'utf-8'
        return response.text

    main_page = __session('get', 'http://my.58.com/index')
    # The login form text on the page means the session cookies are stale.
    if '普通登录方式' in main_page:
        logger.info('cookie fail, try login')
        # login_cookies = get_cookie('x58', user_name)
        login_cookies = __login(user_name, user_password, user_agent, proxies=proxies)
        # ``__login`` is expected to return a list of cookie dicts with
        # ``name``/``value``/``domain``/``path`` keys (see the loop below).
        assert isinstance(login_cookies, list)
        login_cookie_jar = requests.cookies.RequestsCookieJar()
        for login_cookie in login_cookies:
            login_cookie_jar.set(login_cookie['name'], login_cookie['value'], domain=login_cookie['domain'], path=login_cookie['path'])
        session.cookies.update(login_cookie_jar)
    message = __session(
        'get', 'http://jianli.58.com/resumemsg/?resumeid=%s&rand_code=%s&f=' %
        (resume_id, random.random()))
    if '您好,此求职者只允许在58同城认证营业执照的企业查看和下载' in message:
        raise Exception('Need Certification of Business Licence')
    # Only spend a download when the dialog does not already grant access.
    if '您可直接查看本简历' not in message:
        remain = re.search(
            ur"""您目前共有 <span class='f-f1a'>(\d+)</span> 份简历可下载""", message)
        assert remain and remain.group(
            1).isdigit(), 'Unexpected Message \n%s' % message
        remain = int(remain.group(1))
        # Refuse to go below a reserve of 5 remaining downloads.
        if remain < 5:
            raise Exception(
                'The 58 Accoun Remains Only %s Resumes To Download' % remain)
        logger.info("获取联系方式.....")
        tel = __session(
            'get',
            'http://jianli.58.com/ajax/resumemsg/?operate=userdown&rid=%s' %
            resume_id,
            headers={
                "Referer":
                "http://jianli.58.com/resumemsg/?resumeid=%s&rand_code=%s&f=" %
                (resume_id, random.random())
            })
        # NOTE(review): the file handle is never closed explicitly.
        open(session.temp_folder + os.path.sep + 'tel.html', 'w').write(tel)
        if '您可直接查看本简历' not in tel:
            assert re.search('>([\d ]*)</span',
                             tel), 'TEL NOT FOUND in html:\n%s' % tel
        # tel = tel.group(1).replace(' ', '')
    logger.info('fetch done, try upload resume')
    resume = __session('get', 'http://jianli.58.com/resume/%s/' % resume_id)
    open(session.temp_folder + os.path.sep + 'resume.html', 'w').write(resume)
    # NOTE(review): the scratch folder is only removed on this success path.
    shutil.rmtree(session.temp_folder)
    return upload(resume, 'x58', get_contact=True)
def __get_resume_urls(session, urls, dedup, proxies=None):
    """
    Walk ganji resume listing pages and yield downloaded resume pages.

    :param session: the session used for listing and resume requests
    :param urls: listing URLs; ``&page=<offset>`` is appended to each
    :param dedup: callable that receives the resume ids found on a page and
        returns the ids that still need downloading
    :param proxies: the proxies used for the requests
    :return: generator yielding each non-empty ``__download_resume`` result
    :raises Exception: ``"PROXY_FAIL!"`` on repeated fetch failures, too many
        captcha rounds, or too many pages without new resume ids
    """
    # Counts resumes handed to the downloader across all listing URLs.
    _resume_counter = 0
    headers = {
        "User-Agent": nautil.user_agent(),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Host": "www.ganji.com",
        "Referer": "http://www.ganji.com/findjob/resume_index.php"
    }
    # pages = [0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448]
    # Set to 1 once the counter reaches 300; stops paging from then on.
    resume_300_flag = 0
    for uu in urls:
        proxy_error_counter = 0
        # NOTE(review): ``set("None")`` is the set of characters
        # {'N', 'o', 'n', 'e'}, used as a never-matching initial value.
        last_resume_ids = set("None")
        for page in xrange(0, 640, 32):
            if resume_300_flag == 1:
                break
            time.sleep(random.uniform(30, 100))
            url = uu + '&page=%s' % page
            _timeout = 30
            try_times = 0
            try_parse_times = 0
            # Fetch the listing page, retrying on errors and solving the
            # captcha when the rate-limit page is shown.
            while True:
                try_times += 1
                try:
                    logger.warning('fetching %s with %s' % (url, proxies))
                    response = session.get(url, headers=headers, timeout=_timeout, proxies=proxies)
                    assert response.status_code == 200
                    # Page text shown when a captcha is required.
                    resume_failues = u"您的访问速度太快了,如果您不是机器的话,输入下面的验证码来继续访问吧"
                    # Page text for the rate-limit notice.
                    error_message = u"亲爱的用户,您访问的速度太快"
                    if error_message in response.text:
                        raise Exception("ERROR_MESSAGE!")
                    if resume_failues in response.text:
                        verify_headers = {
                            "User-Agent": headers["User-Agent"],
                            "Accept-Encoding": "gzip, deflate, sdch",
                            "Accept-Language": "zh-CN,zh;q=0.8",
                            "Host": "www.ganji.com",
                            "Referer": url
                        }
                        img = pq(response.text).find('.error').find(
                            'span').find('img').attr('src')
                        error_url = 'http://www.ganji.com' + img
                        try_parse_times += 1
                        verify_code = parse_check_code(session, error_url, 'ganji', proxies)
                        # NOTE(review): this request does not pass ``proxies``.
                        data = session.post(error_url, data=verify_code, headers=verify_headers, timeout=_timeout)
                        assert data.status_code == 200
                except Exception:
                    logger.warning('fetch %s with %s fail:\n%s' % (url, proxies, traceback.format_exc()))
                    if try_times > 5:
                        raise Exception("PROXY_FAIL!")
                    else:
                        time.sleep(30)
                else:
                    # Give up after a random number (2-5) of captcha rounds.
                    if try_parse_times > random.randint(2, 5):
                        raise Exception("PROXY_FAIL!")
                    # raise Exception("SPEED_TOO_FAST!")
                    break
            response.encoding = 'utf-8'
            response_hrefs = pq(
                response.content).find('.resume-list').find('div').find('dl')
            resume_names_urls = {}
            resume_ids_urls = {}
            for response_href in response_hrefs:
                href = pq(response_href).find('a').attr('href')
                # The first run of digits in the link is used as the id.
                href_id = re.findall(r'\d+', href)[0]
                name = pq(response_href).find('a').text()
                resume_names_urls[name] = href
                resume_ids_urls[href_id] = href
            # Count pages where every id of the last non-empty page shows up
            # again; more than 5 of them is treated as a proxy failure.
            if not last_resume_ids.difference(set(resume_ids_urls.keys())):
                proxy_error_counter += 1
                if proxy_error_counter > 5:
                    raise Exception("PROXY_FAIL!")
            if resume_ids_urls:
                last_resume_ids = set(resume_ids_urls.keys())
            rest_ids = dedup(resume_ids_urls.keys())  # de-duplicate resumes
            for id in rest_ids:
                _resume_counter += 1
                if _resume_counter < 300:
                    rest_url = resume_ids_urls[id]
                else:
                    resume_300_flag = 1
                    break
                resume = __download_resume(session, rest_url, proxies=proxies)
                if resume:
                    yield resume
            # Continue only while the page offers a "next page" link.
            if u'class="next">下一页</a>' in response.text:
                continue
            else:
                break
def __fetch_contact(session, resume_id, user_name, user_password, proxies=None):
    """Download (if needed) and upload a 58.com resume with contact details.

    Posts the credentials to the passport login endpoint when the login page
    shows the login form, checks the remaining download quota, triggers the
    download, then fetches the resume page and passes it to ``upload``.

    :param session: ``requests`` session to use.
    :param resume_id: 58.com resume id used in the URLs.
    :param user_name: account name for login.
    :param user_password: account password for login.
    :param proxies: accepted but overridden with ``None`` below.
    :return: the result of ``upload``.
    :raises Exception: when the resume requires a certified business licence
        or fewer than 5 downloads remain.
    :raises AssertionError: when an expected marker is missing from a page.
    """
    user_agent = nautil.user_agent()
    # NOTE(review): this discards the caller's ``proxies`` - confirm intended.
    proxies = None

    # NOTE(review): ``headers={}`` is a mutable default; it is only read
    # here, never mutated.
    def __session(method, url, headers={}, data=None):
        """Issue a throttled GET/POST on ``session`` and return the body
        decoded as UTF-8; ``data`` is only allowed for POST."""
        logger.info('------\nRequesting %s On %s With Data:\n%s\n------' % (method, url, data))
        time.sleep(random.uniform(4, 15))
        assert method in ('get', 'post')
        assert method == 'post' or not data
        request_headers = {
            "User-Agent": user_agent,
            "Origin": "http://jianli.58.com",
        }
        for k, v in headers.iteritems():
            request_headers[k] = v
        if method == 'get':
            response = session.get(url, headers=request_headers, proxies=proxies)
        if method == 'post':
            response = session.post(url, headers=request_headers, proxies=proxies, data=data)
        assert response
        assert response.status_code == 200
        response.encoding = 'utf-8'
        return response.text

    main_page = __session('get', 'https://passport.58.com/login')
    # NOTE(review): debugging output of the whole page to stdout.
    print main_page
    # The login form text on the page means the session is not logged in.
    if '普通登录方式' in main_page:
        logger.info('cookie fail, try login')
        # The response body of the login post is not inspected.
        __session(
            'post',
            'http://passport.58.com/douilogin',
            headers={
                "Referer":
                "http://jianli.58.com/weixinlogin.html?path=http://jianli.58.com/resumemsg/?resumeid=%s&rand_code=%s&f="
                % (resume_id, random.random()),
            },
            data={
                "domain": "58.com",
                "callback": "handleLoginResult",
                "sysIndex": "1",
                "pptusername": user_name,
                "pptpassword": user_password,
                "pptvalidatecode": ""
            })
    message = __session(
        'get', 'http://jianli.58.com/resumemsg/?resumeid=%s&rand_code=%s&f=' %
        (resume_id, random.random()))
    if '您好,此求职者只允许在58同城认证营业执照的企业查看和下载' in message:
        raise Exception('Need Certification of Business Licence')
    # Only spend a download when the dialog does not already grant access.
    if '您可直接查看本简历' not in message:
        remain = re.search(
            ur"""您目前共有 <span class='f-f1a'>(\d+)</span> 份简历可下载""", message)
        assert remain and remain.group(
            1).isdigit(), 'Unexpected Message \n%s' % message
        remain = int(remain.group(1))
        # Refuse to go below a reserve of 5 remaining downloads.
        if remain < 5:
            raise Exception(
                'The 58 Accoun Remains Only %s Resumes To Download' % remain)
        tel = __session(
            'get',
            'http://jianli.58.com/ajax/resumemsg/?operate=userdown&rid=%s' %
            resume_id,
            headers={
                "Referer":
                "http://jianli.58.com/resumemsg/?resumeid=%s&rand_code=%s&f=" %
                (resume_id, random.random())
            })
        if '您可直接查看本简历' not in tel:
            assert re.search('>([\d ]*)</span',
                             tel), 'TEL NOT FOUND in html:\n%s' % tel
        # tel = tel.group(1).replace(' ', '')
    logger.info('fetch done, try upload resume')
    resume = __session('get', 'http://jianli.58.com/resume/%s/' % resume_id)
    return upload(resume, 'x58', get_contact=True)
def __login(session, user_name, user_password, proxies=None):
    """Try a form login on passport.58.com using ``session``.

    Fetches the login page and, when it shows the login form, posts the
    credentials to ``/login/dologin``. Both pages are printed to stdout.
    Nothing is returned; any effect is whatever cookies ``session`` picks up.

    :param session: ``requests`` session to log in.
    :param user_name: account name.
    :param user_password: account password, combined with a millisecond
        timestamp and passed through ``__passwd`` before sending.
    :param proxies: accepted but overridden with ``None`` below.
    :raises AssertionError: on a non-200 response.
    """
    user_agent = nautil.user_agent()
    # NOTE(review): this discards the caller's ``proxies`` - confirm intended.
    proxies = None

    # NOTE(review): ``headers={}`` is a mutable default; it is only read
    # here, never mutated.
    def __session(method, url, headers={}, data=None):
        """Issue a GET/POST on ``session`` with browser-like headers and
        return the body decoded as UTF-8; ``data`` is only allowed for POST."""
        logger.info('------\nRequesting %s On %s With Data:\n%s\n------' % (method, url, data))
        # time.sleep(random.uniform(4, 15))
        assert method in ('get', 'post')
        assert method == 'post' or not data
        request_headers = {
            "User-Agent": user_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Content-Type": "application/x-www-form-urlencoded",
            "Upgrade-Insecure-Requests": "1",
        }
        for k, v in headers.iteritems():
            request_headers[k] = v
        if method == 'get':
            response = session.get(url, headers=request_headers, proxies=proxies)
        if method == 'post':
            response = session.post(url, headers=request_headers, proxies=proxies, data=data)
        # assert response
        assert response.status_code == 200
        response.encoding = 'utf-8'
        return response.text

    main_page = __session('get', 'https://passport.58.com/login/')
    # NOTE(review): debugging output of the whole page to stdout.
    print main_page
    # The login form text on the page means the session is not logged in.
    if '普通登录方式' in main_page:
        logger.info('cookie fail, try login')
        response_text = __session(
            'post',
            'https://passport.58.com/login/dologin',
            headers={
                "Host": "passport.58.com",
                "Origin": "http://passport.58.com",
                "Referer": "http://passport.58.com/login",
            },
            data={
                "isweak": "0",
                "source": "",
                # "domain": "58.com",
                "callback": "successFun",
                "yzmstate": "",
                "fingerprint": "_000",
                "username": user_name,
                # presumably ``__passwd`` encrypts/encodes the value the way
                # the site expects - TODO confirm against its definition.
                "password": __passwd(str(int(round(time.time() * 1000))) + user_password),
                # "password": "******",
                # "pptvalidatecode": ""
            })
        print response_text
def fetch_contact_impl(resume_id, user_name, passwd, proxies=None, logger_name=None):
    """Fetch a 51job (ehire) resume with contact details and upload it.

    Logs in, resolves the resume page through ``__fet_contanct`` and
    ``__get_resume_page``, and uploads the page as-is when the contact
    details are already visible. Otherwise it requests the download through
    ``GlobalDownload.aspx``, re-fetches the page and uploads that.

    :param resume_id: id of the resume; also sent as ``userId`` when
        downloading.
    :param user_name: account name.
    :param passwd: account password.
    :param proxies: optional proxies mapping.
    :param logger_name: when given, the module-level ``logger`` is rebound to
        ``logging.getLogger(logger_name)``.
    :return: the ``upload`` result on success; otherwise an error value: the
        second item returned by ``login``, the dict returned by a failing
        helper, or an ``{"err_code": ..., "err_msg": ...}`` dict built here
        (codes 101, 20019, 20020, 20022).
    """
    if logger_name:
        # Rebinds the module-wide logger, not just a local one.
        global logger
        logger = logging.getLogger(logger_name)
    __timeout = 30
    # proxies = {'http': 'http://120.26.80.194:60762', 'https': 'http://120.26.80.194:60762'}
    user_agent = nautil.user_agent()
    result, session = login(user_name, passwd, proxies=proxies)
    if not result:
        # On failure ``login`` returns the error value in the second slot.
        return session
    # NOTE(review): a random 1-10000 folder name can collide between
    # concurrent runs, and the folder is not removed on the error-return
    # paths below.
    session.temp_folder = os.path.join(tempfile.gettempdir(), "naren", str(random.randint(1, 10000)))
    if not os.path.isdir(session.temp_folder):
        os.makedirs(session.temp_folder)
    result = __fet_contanct(session, resume_id, user_agent, proxies=proxies)
    if result["err_code"] != 0:
        return result
    # On success ``err_msg`` carries the resume page's relative URL.
    url = "http://ehire.51job.com/%s" % result["err_msg"]
    resume_page_result = __get_resume_page(session, url, proxies=proxies)
    if resume_page_result["err_code"] != 0:
        return resume_page_result
    resume_page_text = resume_page_result["err_msg"]
    tel_mail = pq(resume_page_text).find(".infr").text()
    # Contact details already visible: phone and e-mail labels are present...
    if u"电 话:" in resume_page_text and u"E-mail:" in resume_page_text:
        logger.info("简历联系方式已存在")
        shutil.rmtree(session.temp_folder)
        return upload(resume_page_text, "j51", get_contact=True, logger_in=logger)
    # ...or the contact block contains no ``*`` masking characters.
    if "*" not in tel_mail:
        logger.info("简历联系方式已存在")
        shutil.rmtree(session.temp_folder)
        return upload(resume_page_text, "j51", get_contact=True, logger_in=logger)
    # Without an ``onclick`` on the download button the account is treated
    # as having no download permission.
    is_download = pq(resume_page_text).find(
        ".btn_down[id=UndownloadLink]").attr("onclick")
    if not is_download:
        logger.warning("当前账号没有下载权限,获取简历页失败")
        return {"err_code": 101, "err_msg": "当前账号没有下载权限!"}
    if u"点击查看联系方式!" in resume_page_text and u"简历信息" in resume_page_text:
        post_data = {
            "doType": "SearchToCompanyHr",
            "userId": resume_id,
            "strWhere": "",
        }
        post_headers = {
            "Accept": "application/xml, text/xml, */*",
            "Accept-Encoding": "gzip,deflate",
            "Content-Type": "application/x-www-form-urlencoded",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "ehire.51job.com",
            "Origin": "http://ehire.51job.com",
            "Referer": url,
            "X-Requested-With": "XMLHttpRequest",
        }
        logger.info("获取简历详情......")
        resume_text = session.post(
            url="http://ehire.51job.com/Ajax/Resume/GlobalDownload.aspx",
            headers=post_headers,
            data=post_data,
            timeout=__timeout,
            proxies=proxies)
        # The body is checked for the out-of-contract-region message before
        # the status code.
        if u"不属于以上地区" in resume_text.text:
            return {
                "err_code": 20022,
                "err_msg": "对不起,您暂时不能下载该份简历,原因是:您选中的简历中存在应聘者所在地超出合同范围的情况。请核实您的情况,若有疑问请与销售或客服人员联系。"
            }
        if resume_text.status_code != 200:
            return {"err_code": 20019, "err_msg": "获取简历失败"}
        resume_text.encoding = "utf-8"
        # Re-fetch the page, which should now show the contact details.
        # NOTE(review): ``err_code`` of this second fetch is not checked.
        resume_result = __get_resume_page(session, url, proxies=proxies)
        logger.info('fetch resume_id %s done, try upload resume' % resume_id)
        shutil.rmtree(session.temp_folder)
        return upload(resume_result["err_msg"], "j51", get_contact=True, logger_in=logger)
    else:
        return {"err_code": 20020, "err_msg": "抱歉,没有搜到您想找的简历!"}
def register(self):
    """Register ``self.username`` on www.jianlika.com and activate it via the
    e-mailed link.

    The password is one lowercase letter, then 3-6 lowercase letters or
    digits, then 2-5 digits. Each attempt uses a fresh session and a freshly
    solved captcha; a rejected captcha is retried.

    :return: the generated password, or ``'REGISTERED!'`` when the site says
        the e-mail address is already in use.
    :raises Exception: ``'REGISTER_OVERLOAD'`` when the daily registration
        limit is hit, ``'CHECKCODE_FAIL'`` when the captcha keeps being
        rejected, ``'REGISTER_TOO_LATE'`` when the activation page says the
        mailbox does not exist.
    :raises AssertionError: on an unrecognised sign-up response.
    """
    digits = '0123456789'
    letters = 'abcdefghijklmnopqrstuvwxyz'
    signup_url = 'http://www.jianlika.com/Signup/email.html'

    register_password = random.choice(letters)
    for _ in xrange(random.randint(3, 6)):
        register_password += random.choice(digits + letters)
    for _ in xrange(random.randint(2, 5)):
        register_password += random.choice(digits)
    # NOTE(review): this writes a credential to the log in clear text; kept
    # because operators may rely on it - confirm and remove if not needed.
    logger.info('random passwd %s' % register_password)

    try_times = 0
    while True:
        session = requests.Session()
        session.headers.update({
            "User-Agent": nautil.user_agent(),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            # Header values must be strings: recent ``requests`` releases
            # reject the integer ``1`` with ``InvalidHeader``.
            'Upgrade-Insecure-Requests': '1',
        })
        # Load the sign-up page first so the session gets the site's cookies.
        session.get(signup_url, headers={
            "Host": "www.jianlika.com",
        }, proxies=self.proxies)
        verify_code = parse_check_code(
            session,
            "http://www.jianlika.com/Verify/index.html?%s" %
            random.randint(1000000000000000, 9000000000000000),
            "jianlika",
            self.proxies,
            headers={
                "Accept": "image/webp,image/*,*/*;q=0.8",
                "Host": "www.jianlika.com",
                "Referer": "http://www.jianlika.com/Signup/email.html",
            },
            typeid=3050)
        # The hard-coded ``Content-Length: 85`` header was dropped: the real
        # length depends on the e-mail/password and ``requests`` computes it
        # from the body anyway.
        response = session.post(
            signup_url,
            headers={
                "Host": "www.jianlika.com",
                "Origin": "http://www.jianlika.com",
                "Referer": "http://www.jianlika.com/Signup/email.html",
            },
            data={
                "email": self.username,
                "pwd": register_password,
                "repwd": register_password,
                "verifycode": verify_code,
                "invitecode": "",
                "agree": "on"
            },
            proxies=self.proxies)
        response.encoding = 'utf-8'
        if '您今天注册次数已超限' in response.text:
            raise Exception('REGISTER_OVERLOAD')
        if '验证码不正确' in response.text:
            # Captcha rejected: retry with a new session and captcha.
            if try_times > 5:
                raise Exception('CHECKCODE_FAIL')
            try_times += 1
            continue
        if '此邮箱已被使用' in response.text:
            return 'REGISTERED!'
        assert '邮件已发送至' in response.text, '--unknown registered page---\n%s\n--unknown registered page---' % response.text
        break

    # Follow the activation link from the confirmation e-mail, reusing the
    # session of the successful sign-up.
    email_link = self.fetch_email()
    response = session.get(email_link, headers={
        'Host': 'www.jianlika.com',
    }, proxies=self.proxies)
    response.encoding = 'utf-8'
    if '此邮箱不存在' in response.text:
        raise Exception('REGISTER_TOO_LATE')
    if '此邮箱不需要激活' in response.text:
        logger.info('此邮箱不需要激活')
    if '邮箱验证成功' in response.text:
        logger.info('邮箱验证成功')
    return register_password