def next_html(account_cookies, data, retry):
    """Fetch the next result page, rotating the proxy on failure."""
    logger.info('fetching next page %s %s' % (account_cookies.get('userName', ''), retry))
    global proxy
    if retry <= 0:
        return None
    cookie = account_cookies.get('cookie')
    headers = arouse_utils.get_get_headers()
    url = 'http://ehire.51job.com/Candidate/SearchResumeNew.aspx'
    logger.info(proxy)
    result = utils.download(url=url, data=data, proxy=proxy, cookie=cookie,
                            headers=headers, method='post')
    if result['code'] != 0:
        logger.error("page request failed, retrying: retry= %s" % retry)
        proxy = utils.get_proxy()
        # pass the full account dict on retry, not just the cookie string
        return next_html(account_cookies, data, retry - 1)
    elif '用户数不够' in result['data'] or len(result['data']) < 1000:
        # '用户数不够' ("not enough users") marks an exhausted proxy
        logger.error("proxy failure, retrying: retry= %s" % retry)
        proxy = utils.get_proxy()
        return next_html(account_cookies, data, retry - 1)
    if '您的操作过于频繁,请注意劳逸结合' in result['data']:
        # rate-limit banner ("your operations are too frequent")
        return None
    return result['data']
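# Several functions in this file repeat the same download / check / rotate-proxy
# pattern. Below is a minimal consolidated sketch of that loop -- a hypothetical
# helper, assuming (as the code above does) that utils.download returns a dict
# with 'code' and 'data' keys and utils.get_proxy() returns a fresh
# requests-style proxy mapping:
def download_with_retry(url, retries=3, **kwargs):
    """Hypothetical helper: retry a download, rotating the proxy on each failure."""
    global proxy
    for _ in range(retries):
        result = utils.download(url=url, proxy=proxy, **kwargs)
        # a short body is treated as a block page, same as the checks above
        if result['code'] == 0 and len(result['data']) >= 1000:
            return result['data']
        proxy = utils.get_proxy()  # rotate and retry
    return None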
def go_to_search_html(cookie, retry):
    global proxy
    if retry <= 0:
        return None
    logger.info('navigating to the search landing page ------%s ' % retry)
    url = 'http://ehire.51job.com/Candidate/SearchResumeIndexNew.aspx'
    headers = arouse_utils.get_get_headers('http://ehire.51job.com/Navigate.aspx')
    if not proxy:
        proxy = utils.get_proxy()
    logger.info(proxy)
    utils_download = utils.download(url=url, headers=headers, proxy=proxy, cookie=cookie)
    if utils_download['code'] != 0:
        logger.error('search page error: %s %s' % (url, retry))
        if utils_download.get('data') and '<a href="/MainLogin.aspx?returl=' in utils_download['data']:
            return 'login'
        proxy = utils.get_proxy()
        return go_to_search_html(cookie, retry - 1)
    if '<a href="/MainLogin.aspx?returl=' in utils_download['data']:
        return 'login'
    viewstate = arouse_utils.find(
        '<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="(.*?)" />',
        utils_download['data'])
    if not viewstate:
        proxy = utils.get_proxy()
        # return the retried result instead of falling through with an empty viewstate
        return go_to_search_html(cookie, retry - 1)
    return viewstate
def go_to_list_html(param, cookie, viewstate, retry):
    global proxy
    logger.info('search landing page - running the initial search ------ %s' % retry)
    if retry <= 0:
        return None
    url = 'http://ehire.51job.com/Candidate/SearchResumeNew.aspx'
    # param = {"function_code": "0107", "functionName": "软件工程师",
    #          "region_code": "010000", "regionName": "北京"}
    data = arouse_utils.get_frist_post_headers(viewstate, param=param)
    logger.info(proxy)
    result = utils.download(url=url, data=data, proxy=proxy, cookie=cookie, method='post')
    if result['code'] != 0:
        logger.error("page request failed, retrying: retry= %s" % retry)
        proxy = utils.get_proxy()
        return go_to_list_html(param, cookie, viewstate, retry - 1)
    elif '用户数不够' in result['data'] or len(result['data']) < 1000:  # exhausted-proxy marker
        logger.error("proxy failure, retrying: retry= %s" % retry)
        proxy = utils.get_proxy()
        # keep the argument order consistent with the signature; the original dropped `param` here
        return go_to_list_html(param, cookie, viewstate, retry - 1)
    if '您的操作过于频繁,请注意劳逸结合' in result['data']:  # rate-limit banner
        return None
    return result['data']
def refer_list_html(account, data, retry):
    """Refresh the next page of the inbox list.

    :param account: account dict containing cookies
    :param data: form data for the refresh post
    :param retry: remaining retries
    :return: the refreshed list page, 'login' if re-login is needed, or None
    """
    global proxy
    if retry <= 0:
        return None
    headers = inbox_utils.get_headers('http://ehire.51job.com/Inbox/InboxRecentEngine.aspx?Style=1')
    # the header uses account['cookies'] while download() separately receives account['cookie']
    headers['Cookie'] = account['cookies']
    proxy = common_utils.get_proxy()
    url = 'http://ehire.51job.com/Inbox/InboxRecentEngine.aspx?Style=1'
    r = common_utils.download(url=url, headers=headers, data=data, method='post',
                              cookie=account['cookie'], proxy=proxy)
    if r.get('code') != 0:
        logger.error("list page returned an abnormal response %s" % r.get('data'))
        if r.get('data') and '<a href="/MainLogin.aspx?returl=' in r['data']:
            return 'login'
        proxy = common_utils.get_proxy()
        return refer_list_html(account, data, retry - 1)
    if '<a href="/MainLogin.aspx?returl=' in r['data']:
        return 'login'
    return r.get('data')
def run(self):
    profile, proxy = get_profile(self.profile), get_proxy(self.proxies)
    if profile is None:
        self.status_signal.emit({"msg": "Invalid profile", "status": "error"})
        return
    if proxy is None:
        self.status_signal.emit({"msg": "Invalid proxy list", "status": "error"})
        return
    if self.site == "Walmart":
        Walmart(self.task_id, self.status_signal, self.image_signal, self.product,
                profile, proxy, self.monitor_delay, self.error_delay, self.max_price)
    elif self.site == "Bestbuy":
        BestBuy(self.task_id, self.status_signal, self.image_signal, self.product,
                profile, proxy, self.monitor_delay, self.error_delay)
    elif self.site == "Target":
        Target(self.task_id, self.status_signal, self.image_signal, self.product,
               profile, proxy, self.monitor_delay, self.error_delay)
def conn_html(account, url, retry, refer_url=None, track_id=None):
    """Fetch the search list page.

    :return: the page HTML, 'login' if re-login is needed, or None
    """
    global proxy
    logger.info("connecting to %s , retries left %s" % (url, retry))
    if retry <= 0:
        return None
    headers = inbox_utils.get_headers(refer_url)
    # headers['Cookie'] = account['cookie']
    # r = requests.get(url, headers=headers, timeout=10)
    r = common_utils.download(url=url, headers=headers, cookie=account['cookie'], proxy=proxy)
    if r.get("code") != 0:
        logger.error("list page returned an abnormal response data= %s" % r.get('data'))
        if r.get('data') and '<a href="/MainLogin.aspx?returl=' in r['data']:
            return 'login'
        proxy = common_utils.get_proxy()
        return conn_html(account, url, retry - 1, refer_url, track_id)
    if '<a href="/MainLogin.aspx?returl=' in r['data']:
        return 'login'
    return r.get('data')
def run(self):
    profile, proxy = get_profile(self.profile), get_proxy(self.proxies)
    if profile is None:
        self.status_signal.emit({"msg": "Invalid profile", "status": "error"})
        return
    if proxy is None:
        self.status_signal.emit({"msg": "Invalid proxy list", "status": "error"})
        return
    if self.site == "Walmart":
        Walmart(self.task_id, self.status_signal, self.image_signal, self.wait_poll_signal,
                self.wait_condition, self.product, profile, proxy, self.monitor_delay,
                self.error_delay, self.max_price)
    elif self.site == "Bestbuy":
        BestBuy(self.status_signal, self.image_signal, self.product, profile, proxy,
                self.monitor_delay, self.error_delay)  # TODO: Re-add Discord webhook
    elif self.site == "Target":
        Target(self.task_id, self.status_signal, self.image_signal, self.product,
               profile, proxy, self.monitor_delay, self.error_delay)
    elif self.site == "GameStop":
        GameStop(self.task_id, self.status_signal, self.image_signal, self.product,
                 profile, proxy, self.monitor_delay, self.error_delay, self.max_price)
def save_mobile_imgs_to_oss(img_url, retry, trackId, headers=None):
    """Download the phone-number image and store it in OSS.

    :return: the OSS object key, or None when retries are exhausted
    """
    logger = common_utils.get_logger()
    logger.info("fetching phone-number image: %s" % trackId)
    if retry <= 0:
        return None
    try:
        r = requests.get(img_url, proxies=common_utils.get_proxy(), timeout=8)
    except Exception as e:
        logger.error(e)
        logger.error("error fetching phone-number image: %s, retrying" % trackId)
        return save_mobile_imgs_to_oss(img_url, retry - 1, trackId)
    # store in OSS
    logger.info("storing in OSS: %s " % trackId)
    auth = oss2.Auth('LTAIa3y58SBV0Kyn', 'yBZcBKhQTgtf4cV55ljpnNCSk1XWaI')
    bucket = oss2.Bucket(auth, 'http://oss-cn-beijing.aliyuncs.com', 'ocr-img')
    oss_addr = 'spider/FIVE_ONE/RESUME_INBOX/' + str(uuid.uuid1()) + '.jpg'
    try:
        # upload the image bytes, not the Response object itself
        bucket.put_object(oss_addr, r.content)
    except Exception:
        logger.error(traceback.format_exc())
        return save_mobile_imgs_to_oss(img_url, retry - 1, trackId)
    return oss_addr
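# For reference, oss2 uploads reduce to the same three calls used above; note
# that put_object takes bytes or a file-like object, which is why the response
# body (r.content) is uploaded rather than the Response itself. The credentials,
# local file, and object key below are placeholders:
import oss2

auth = oss2.Auth('<access-key-id>', '<access-key-secret>')  # placeholder keys
bucket = oss2.Bucket(auth, 'http://oss-cn-beijing.aliyuncs.com', 'ocr-img')
with open('phone.jpg', 'rb') as f:  # placeholder local file
    bucket.put_object('spider/FIVE_ONE/RESUME_INBOX/example.jpg', f.read())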
def get_page(url):
    utils.get_logger().info('get_page url %s' % url)
    for x in xrange(3):
        try:
            proxy = utils.get_proxy()
            utils.get_logger().error('get_page[use proxy %s]' % proxy)
            session = requests.session()
            content = session.get(url, headers={
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0(Windows NT 10.0; WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
                'Accept': 'text/html, application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Accept-Language': 'zh-CN,zh;q=0.8',
            }, proxies=proxy, timeout=10).content
            if content:
                utils.get_logger().info('[the page use proxy %s] ' % proxy)
                if '验证码' in content:  # page is asking for a captcha
                    utils.get_logger().info('[the page needs input validate code %s]' % url)
                else:
                    return {'content': content, 'proxy': proxy}
            else:
                utils.get_logger().info('[request returns null page %s]' % url)
        except Exception:
            utils.get_logger().error(str(traceback.format_exc()))
    return None
def start_one_job(account):
    """Run the job for one account.

    :param account: account dict containing the cookie
    """
    global proxy
    track_id = str(uuid.uuid1())
    url = "http://ehire.51job.com/Inbox/InboxRecentEngine.aspx?Style=1"
    refer_url = "http://ehire.51job.com/Navigate.aspx?ShowTips=11&PwdComplexity=N"
    proxy = common_utils.get_proxy()
    list_html = conn_html(account, url, 5, refer_url=refer_url, track_id=track_id)
    # list_html = open('text_htl').read()  # for testing
    while True:
        if list_html:
            if 'login' == list_html:
                # re-login required
                logger.error("login page encountered %s" % account['userName'])
                return 'login'
            hidEngineCvlogIds = common_utils.find(
                '<input name="hidEngineCvlogIds" type="hidden" id="hidEngineCvlogIds" value="(.*?)" />',
                list_html)
            __VIEWSTATE = common_utils.find(
                '<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="(.*?)" />',
                list_html)
            resume_ids = parse_list_html(list_html, track_id=track_id)
            if 'none' == resume_ids:
                logger.info("inbox has no mail left--%s" % account['userName'])
                return 'over'
            elif 'refer-login' == resume_ids:
                logger.error("login page encountered %s" % account['userName'])
                return 'login'
            if resume_ids:
                ids_for = list(resume_ids)
                logger.info('resume count: %s' % len(resume_ids))
                for id in ids_for:
                    info_url = ('http://ehire.51job.com/Candidate/ResumeViewFolder.aspx'
                                '?hidSeqID=%s&hidFolder=EMP' % id)
                    flag = info_main(account, info_url, track_id)
                    # flag = True
                    if 'login' == flag:
                        logger.error("login page encountered %s" % account['userName'])
                        return 'login'
                    if not flag:
                        # fetch failed: keep this id out of the refresh request
                        resume_ids.remove(id)
                # resume_ids = ['9229836941', ]  # for testing
                data = get_refer_data(resume_ids, __VIEWSTATE, hidEngineCvlogIds)
                # refresh the list for the next loop iteration; the original left
                # this commented out for testing, which loops forever on one page
                list_html = refer_list_html(account, data, 4)
            else:
                # parsing failed
                logger.error("page could not be parsed for resumes %s" % account['userName'])
                return 'error'
        else:
            # fetch failed
            logger.error("error page encountered %s" % account['userName'])
            return 'error'
def get_page(url, header):
    logger.info('get_page url %s' % url)
    for x in xrange(3):
        try:
            proxy = utils.get_proxy()
            logger.info('get_page[use proxy %s]' % proxy)
            session = requests.session()
            content = session.get(url, headers=header, proxies=proxy, timeout=10).content
            if content:
                logger.info('[the page use proxy %s] ' % proxy)
                if '验证码' in content or '机器人' in content:  # captcha / robot check
                    logger.info('[the page needs input validate code %s]' % url)
                    return None
                else:
                    return {'content': content, 'proxy': proxy}
            else:
                logger.info('[request returns null page %s]' % url)
        except Exception:
            logger.error(str(traceback.format_exc()))
    return None
def get_list(city=None, zone=None, money=None, education=None, experience=None,
             size=None, page_now=None, is_get_zone=None, proxy=None, jobtitle=None,
             **kwargs):
    logger = utils.get_logger()
    proxy = proxy if proxy else utils.get_proxy()
    result = {'code': 0}
    # list_url = 'http://www.zhipin.com/%s/e_105-d_203-s_302-y_4-b_%E6%9C%9D%E9%98%B3%E5%8C%BA/?page=%s&ka=page-next'
    city_param = 'c' + str(city)
    zone_param = 'b_' + str(zone) if zone else ''
    # industry_param = 'i' + str(industry) + '-' if industry else ''
    jobtitle_param = '-p' + str(jobtitle) if jobtitle else ''
    money_param = 'y_' + str(money) + '-' if money else ''
    education_param = 'd_' + str(education) + '-' if education else ''
    experience_param = 'e_' + str(experience) + '-' if experience else ''
    size_param = 's_' + str(size) + '-' if size else ''
    if experience_param or education_param or size_param or money_param or zone_param:
        list_url = ('http://www.zhipin.com/' + city_param + jobtitle_param + '/'
                    + experience_param + education_param + size_param + money_param
                    + zone_param + '/?page=' + str(page_now) + '&ka=page-next')
    else:
        list_url = ('http://www.zhipin.com/' + city_param + jobtitle_param
                    + '/?page=' + str(page_now) + '&ka=page-next')
    logger.info('the url, proxy is ' + list_url + ' ' + str(proxy))
    list_header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Host': 'www.zhipin.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    }
    while True:
        logger.info('download url: ' + list_url)
        try:
            response = requests.get(list_url, headers=list_header, allow_redirects=False,
                                    proxies=proxy, timeout=10)
            if response.status_code in [200, '200']:
                if len(response.text) < 1024:
                    # a suspiciously short body usually means a block page; retry
                    logger.info('get ' + response.text)
                else:
                    break
            else:
                logger.info('not get 200 when download list!!! ' + str(response.status_code))
                # result['code'] = 1
        except Exception, e:
            logger.info(str(traceback.format_exc()))
            proxy.update(utils.get_proxy())
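# For concreteness: with city=101010100, education=203, and page_now=2, the
# builder above takes the filtered branch and produces
#   http://www.zhipin.com/c101010100/d_203-/?page=2&ka=page-next
# while with no filters set it falls back to
#   http://www.zhipin.com/c101010100/?page=2&ka=page-next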
def login_bypass_ip_limit():
    """Brute-force a login endpoint that rate-limits by IP address."""
    global CUR_PROXY
    try:
        login_info = dict_queue.get(block=False)
    except Exception:  # queue is empty
        return
    username = login_info[0]
    # skip this username if a password has already been found for it
    if username in success_username:
        return
    password = login_info[1]
    payload = {
        "username": username,
        "password": password,
    }
    print('trying username: {}, password: {}'.format(username, password))
    S = requests.Session()
    while True:
        try:
            response = S.post(settings.LOGIN_LIMIT_IP_URL, data=payload,
                              proxies=CUR_PROXY, timeout=5)
            if response.status_code == 200:
                # got a normal response from the server
                soup = BeautifulSoup(response.text, 'lxml')
                if soup.find('span', text='对不起,您的访问过于频繁,请等待60秒后再操作!'):
                    # hit the "too many requests, wait 60s" banner; switch proxies
                    print("rate-limited")
                    CUR_PROXY = utils.get_proxy()
                    print("current proxy: {}".format(CUR_PROXY))
                    continue
                elif soup.find('a', attrs={'id': 'backdoor'}):
                    # correct credentials, login succeeded
                    success_queue.put(payload)
                    success_username.append(username)
                    print('[cracked: username {}, password {}]'.format(username, password))
                    # stop_brute()
                    return True
                elif soup.find('span', text='用户名或密码错误!'):
                    # "wrong username or password"
                    return False
            else:
                print("connection abnormal")
        except Exception as e:
            print(e)
def __init__(self):
    self.twitterTokens = get_tokens()
    http_proxy, https_proxy = get_proxy()
    self.api = twitter.Api(consumer_key=self.twitterTokens['consumer_key'],
                           consumer_secret=self.twitterTokens['consumer_secret'],
                           access_token_key=self.twitterTokens['access_token'],
                           access_token_secret=self.twitterTokens['access_token_secret'],
                           proxies={'http': http_proxy, 'https': https_proxy})
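# get_proxy() here is expected to return the HTTP and HTTPS endpoints
# separately; requests-compatible clients consume them as a mapping like the
# following (hosts and ports are placeholders):
# proxies = {
#     'http': 'http://10.10.1.10:3128',
#     'https': 'http://10.10.1.10:1080',
# }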
def get_detail_html(url):
    for i in range(5):
        try:
            response = requests.get(url, headers=headers, proxies=get_proxy())
            if response.status_code == 200:
                response.encoding = "gbk"
                print("current url: {}".format(url))
                return response.text
        except Exception:
            # get_proxy() is called per attempt, so the next try uses a fresh proxy
            pass
def get_page(url, header, proxy=None):
    logger.info('get_page url %s' % url)
    for x in xrange(4):
        try:
            if not proxy:
                proxy = utils.get_proxy()
            logger.info('get_page[use proxy %s]' % proxy)
            session = requests.session()
            content = session.get(url=url, headers=header, proxies=proxy, timeout=5).content
            if content:
                logger.info('[the page use proxy %s] ' % proxy)
                return {'content': content, 'proxy': proxy}
            else:
                logger.info('[request returns null page %s]' % url)
        except Exception:
            logger.error(str(traceback.format_exc()))
            proxy = utils.get_proxy()
    return None
def process(task):
    """Run one search defined by a search-condition task."""
    global logger
    if project_settings.get('useAby'):
        getproxies_ = project_settings.get('aby')
    else:
        getproxies_ = utils.get_proxy()
    logger = utils.get_logger()
    param_dict = json.loads(task['data'][0]['executeParam'], encoding="utf-8")
    result = {'code': 0}
    track_id = str(uuid.uuid1())
    page_num = 1
    if param_dict['page_num']:
        page_num = param_dict['page_num']
    while True:
        url = get_list_url(param_dict, page_num)
        list_html_list = get_html(url, 5, track_id, getproxies_)
        if list_html_list:
            logger.info("list_html success when download: " + url)
            info_list = parse_list_html(list_html_list[0], track_id, page_num)
        else:
            # abnormal page: record progress and hand the task back
            logger.error(u"failed to fetch list page: url=%s" % url)
            param_dict['page_num'] = page_num
            result['executeResult'] = 'list_html_error'
            result['executeParam'] = json.dumps(param_dict, ensure_ascii=False).encode()
            result['code'] = 1
            return result
        if 'none_jd' == info_list:
            # crawl finished for this search condition
            logger.info("no new positions for this search condition: url=%s" % url)
            logger.info('no matching positions %s' % json.dumps(param_dict))
            result['executeResult'] = u'正常完毕'  # "completed normally"
            return result
        else:
            for info in info_list:
                try:
                    info_mian(param_dict, info, track_id, getproxies_)
                except Exception, e:
                    # log the exception traceback, not the current call stack
                    logger.error(traceback.format_exc())
            page_num += 1
def get_detail_html(url):
    # url = "http://www.chinacar.com.cn/serv/list_0_0_0_0_{}.html".format(page)
    for i in range(5):
        try:
            response = requests.get(url, headers=headers, stream=True,
                                    proxies=get_proxy(), timeout=10)
            if response.status_code == 200:
                print("current url: {}".format(url))
                # print("ip=======", response.raw._connection.sock.getpeername())
                return response.text
        except Exception:
            pass
def start():
    if conf.psyco:
        try:
            import psyco
            psyco.full()
            logger.critical("Enabling psyco support.")
        except ImportError:
            logger.critical("Looks like psyco is not installed in your system. "
                            "Psyco acceleration will not be enabled.")
    if conf.http_proxy:
        try:
            conf.http_proxy = utils.get_proxy(conf.http_proxy)
        except Exception:
            logger.critical("Invalid format of HTTP-proxy. No proxy will be used.")
            conf.http_proxy = None
    logger.critical("Starting the caching resolver in a separate thread.")
    resolver.start(['mrim.mail.ru', 'avt.foto.mail.ru'])
    while 1:
        try:
            xmpp_con = transport.XMPPTransport(conf.name, conf.disconame, conf.server,
                                               conf.port, conf.passwd, logger)
            logger.critical("Connecting to XMPP server")
            xmpp_con.run()
        except KeyboardInterrupt:
            logger.critical('Got SIGINT, closing connections')
            xmpp_con.stop()
            try:
                os.unlink(conf.pidfile)
            except OSError:
                pass
            logger.critical('Shutdown')
            break
        except Exception:
            traceback.print_exc()
            logger.critical("Connection to server lost")
            logger.critical("Trying to reconnect over 5 seconds")
            try:
                xmpp_con.stop(notify=False)
                del xmpp_con
            except Exception:
                traceback.print_exc()
            time.sleep(5)
def run(self):
    profile, proxy = get_profile(self.profile), get_proxy(self.proxies)
    if profile is None:
        self.status_signal.emit({"msg": "Invalid profile", "status": "error"})
        return
    if proxy is None:
        self.status_signal.emit({"msg": "Invalid proxy list", "status": "error"})
        return
    if self.site == "Bestbuy":
        BestBuy(self.status_signal, self.image_signal, self.product, profile, proxy,
                self.monitor_delay, self.error_delay)
def get_topology_by_rg(helper, access_token, subscription_id, api_version,
                       resourceGroupName, networkWatcherName, targetResourceGroupName):
    url = ("https://management.azure.com/subscriptions/%s/resourceGroups/%s"
           "/providers/Microsoft.Network/networkWatchers/%s/topology?api-version=%s" % (
               subscription_id, resourceGroupName, networkWatcherName, api_version))
    header = {'Authorization': 'Bearer ' + access_token}
    proxies = utils.get_proxy(helper, "requests")
    try:
        r = requests.post(url, headers=header, proxies=proxies,
                          json={'targetResourceGroupName': targetResourceGroupName})
        r.raise_for_status()
        topology = json.loads(r.text)
    except Exception, e:
        raise e
    # presumably the parsed topology is meant to be handed back to the caller;
    # the original assigned it without returning it
    return topology
def get_verifycode(self, client=object, imageUrl=''):
    # fill in your own baidu-ocr API credentials
    APP_ID = 'xxxxxxxxxx'
    API_KEY = 'xxxxxxxxxxxx'
    SECRET_KEY = 'xxxxxxxxxxxxxxx'
    options = {}
    options["recognize_granularity"] = "big"
    options["detect_direction"] = "true"
    # note: the `client` parameter is ignored and rebuilt here
    client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
    retry_times = 3
    i = 0
    while i < retry_times:
        i += 1
        try:
            # image = get_file_content(tmpImageName)
            image = self.sess.get(imageUrl, proxies=utils.get_proxy(),
                                  headers=self.a_task.M_HEADERS,
                                  cookies=self.a_task.M_COOKIES, verify=False)
            # open('vc____.jpg', 'wb').write(image.content)
            response = client.numbers(image.content, options)
            debug_p('[get_verifycode] vc_code response=', response)
            # e.g. {'log_id': ..., 'words_result_num': 1,
            #       'words_result': [{'location': {...}, 'words': '4217'}]}
            words_result = response['words_result']
            verifycode = words_result[0].get('words', '')
            if not verifycode or len(verifycode) < 4:
                continue
            if len(verifycode) > 4:
                verifycode = verifycode[:4]
            debug_p('[get_verifycode] verifycode=', verifycode, 'i=', i)
            return verifycode
        except Exception:
            # burn an extra retry on hard failures to fail faster
            i += 1
            debug_p('[get_verifycode] Exception', 'i=', i,
                    'traceback=', traceback.format_exc())
    return '0000'
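# For context, the baidu-aip AipOcr.numbers call above is what reads the digits;
# stripped of the retry scaffolding, the flow is roughly the sketch below
# (credentials and the image file are placeholders):
from aip import AipOcr  # pip install baidu-aip

ocr = AipOcr('APP_ID', 'API_KEY', 'SECRET_KEY')  # placeholder credentials
with open('captcha.jpg', 'rb') as f:  # placeholder captcha image
    res = ocr.numbers(f.read(), {'recognize_granularity': 'big'})
digits = res.get('words_result', [{}])[0].get('words', '')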
def download_page(url=None, method=None, header=None, refer=None, proxy=None):
    logger = utils.get_logger()
    result = {}
    # if not header:  # the guard is commented out in the source, so any
    # passed-in header is always overwritten by the defaults below
    header = {
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0(Windows NT 10.0; WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Accept': 'text/html, application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cookie': 'ZP_OLD_FLAG=true;'
    }
    if refer:
        header['Referer'] = refer
    for x in xrange(0, 3):
        if project_settings.get('useAby'):
            proxy = project_settings.get('aby')
        else:
            proxy = utils.get_proxy()
        logger.info('download_page : %s ' % url)
        result = utils.download(url=url, headers=header, method=method,
                                allow_redirects=True, retry_time=1, proxy=proxy)
        print result
        if result['code'] == 0:
            logger.info('success when download %s-%s ' % (proxy, url))
            break
        time.sleep(1)
    result['proxy'] = ''
    return result
def _transfer_test():
    PROXY_INDEX = 0
    # Session Transfer Testing
    proxy, proxy_auth = get_proxy(PROXY_INDEX)
    service_args = [
        '--proxy={}'.format(proxy),
        '--proxy-type=http',  # 'html' is not a valid PhantomJS proxy type
        '--ignore-ssl-errors=true'
    ]
    if proxy_auth:
        service_args.append('--proxy-auth={}'.format(proxy_auth))
    driver = webdriver.PhantomJS(executable_path=PHANTOM_JS_LOCATION,
                                 service_args=service_args)
    add_to_cart(driver, cc='US')
    # Test transferring session (go to ipecho.net to see ip, go to cart to see if in cart)
    transfer_session(driver, proxy, proxy_auth, user_agent=get_user_agent(PROXY_INDEX))
    time.sleep(60 * 60)
def parse_list(data):
    logger = utils.get_logger()
    # url = data['url']
    city_url = data['cityUrl']
    page_num = data['pageNum']
    flg = True
    while flg:
        url = build_page_url(data=data, page_num=page_num)
        logger.info('requesting list page url: %s' % (url,))
        if project_settings.get('useAby'):
            proxy = project_settings.get('aby')
        else:
            proxy = utils.get_proxy()
        results = download_page(url=url, method='get', proxy=proxy)
        proxy = results['proxy']
        content = results['data']
        if '暂时无符合您条件的职位' in content or '没有符合您要求的职位' in content:
            # "no positions match your criteria"
            logger.info('no matching positions %s' % json.dumps(data, ensure_ascii=False))
            data['code'] = 200
            break
        if '您要访问的页面暂时没有找到' in content:
            # "page not found"
            logger.info('page not found, got 404 %s ' % url)
            data['code'] = 200
            break
        if 'jobs.zhaopin.com' in city_url:
            flg = parse_list_v1(page=content, page_num=page_num, data=data, refer=url, proxy=proxy)
        else:
            flg = parse_list_v2(page=content, page_num=page_num, data=data, refer=url, proxy=proxy)
        # parsed normal data from the page
        logger.info('list detail parse result %s' % (json.dumps(flg, ensure_ascii=False)))
        if flg.has_key('status') and flg.get('status'):
            data['code'] = 200
            if flg.has_key('detail_count') and flg.get('detail_count') > 0:
                page_num += 1
            else:
                data['code'] = 200
                flg = False
                break
        else:
            logger.info('list page request failed %s ' % url)
            data['code'] = 500
            flg = False
            break
        # the "these positions are also good" block marks the end of real results
        if '以下职位也很不错' in content:
            flg = False
            logger.info('page contains recommendation filler, breaking out of the loop')
            data['code'] = 200
            break
    data['pageNum'] = page_num
    return data
def reserve_a_seat(self, m_libid, m_clssrm, m_seat_num, m_coordinate, all_lib_clssrm,
                   get_hexcodejs_from_url, verify_key, reserve_url_prefix, lib_seat_info):
    func_name = '[r_s] thread=' + str(self.thread_name) + '| '
    debug_p(func_name, 'lib_seat_info=', lib_seat_info, 'libid', m_libid, 'coordinate', m_coordinate)
    requests_time_limit = 3
    self.tmp_trace_dct = {}
    # exec_time, e.g. '1564905302.614' -> '2019.08.04_15:15:02.614'
    exec_ts = time.time() + 0.1
    millisecond = str(str(exec_ts).split('.')[-1])[:3]
    t = str(time.strftime("%Y.%m.%d_%H:%M:%S", time.localtime(exec_ts))) + '.' + millisecond
    self.tmp_trace_dct['exe_time'] = t
    # candidate_libid_clssrm: list of (libid, clssrm) pairs
    if int(m_libid) > 0:
        # a concrete library was requested
        candidate_libid_clssrm = [(m_libid, m_clssrm)]
    elif int(m_libid) < 0:
        # negative libid means "any library except this one"; all_lib_clssrm is {libid: clssrm}
        all_lib_clssrm.pop(m_libid[1:], 'default_value')
        candidate_libid_clssrm = random.sample(all_lib_clssrm.items(),
                                               min(self.request_num_limit, len(all_lib_clssrm)))
    else:
        candidate_libid_clssrm = random.sample(all_lib_clssrm.items(),
                                               min(self.request_num_limit, len(all_lib_clssrm)))
    debug_p(func_name, '||candidate libid=', candidate_libid_clssrm)
    for (libid, clssrm) in candidate_libid_clssrm:
        self.tmp_trace_dct['libid'] = libid
        self.tmp_trace_dct['clssrm'] = clssrm
        if requests_time_limit <= 0:
            break
        time.sleep(3)  # test pacing
        # enter the pre-seatmap page
        if self.a_task.pattern == "PRE":
            get_hexcodejs_from_url = get_hexcodejs_from_url.format(libid=libid)
        else:  # TODAY
            get_hexcodejs_from_url = get_hexcodejs_from_url.format(libid=libid,
                                                                   now_time=int(time.time()))
        html_seatmap = utils.get_response(url=get_hexcodejs_from_url, sess=self.sess,
                                          m_headers=self.a_task.M_HEADERS,
                                          m_cookies=self.a_task.M_COOKIES,
                                          verify_key=verify_key, platform=self.a_task.platform)
        if not html_seatmap:
            # session id invalid --> treat the task as completed
            self.tmp_trace_dct['code'] = 404
            # msg: "failed to open the seat map; possibly outside the booking
            # window, no server response, or the id has expired"
            self.tmp_trace_dct['msg'] = '尝试进入座位表失败!可能:{不在预约时间, 服务器无响应, id失效}'
            debug_p(func_name, '[E]: pre_seatmap_page is none, get_hexcodejs_from_url='
                    + get_hexcodejs_from_url)
            return True
        # candidate_seat_crdnt: list of (seat_num, coordinate) pairs
        candidate_seat_crdnt = []
        if int(libid) > 0 and int(m_seat_num) > 0:
            candidate_seat_crdnt = [(m_seat_num, m_coordinate)]
        elif int(libid) <= 0:
            # assert seat_num == 0
            candidate_seat_crdnt = self.get_empty_seat(html_seatmap=html_seatmap, number=1)
        elif int(m_seat_num) <= 0:
            # m_libid > 0 and m_seat_num <= 0: pick candidates excluding m_seat_num
            candidate_seat_crdnt = self.get_empty_seat(html_seatmap=html_seatmap,
                                                       number=self.request_num_limit,
                                                       discard_seatnum=m_seat_num)
        if len(candidate_seat_crdnt) == 0:
            continue  # no candidate seat coordinates in this library
        debug_p(func_name, '||candidate seat=', candidate_seat_crdnt)
        # the soup is needed below to locate the hexcode script; the original
        # left this line commented out, which leaves `soup` undefined
        soup = BeautifulSoup(html_seatmap, 'html.parser')
        open('lxz_seatmap.html', 'w').write(html_seatmap)  # test artifact
        # fetch the hexcode JS
        debug_p(func_name, 'REG_HEXCODE_URL=', self.a_task.REG_HEXCODE_URL)
        hexch_js_url = soup.find('script', src=re.compile(self.a_task.REG_HEXCODE_URL)).get('src', '')
        debug_p(func_name, 'hexch_js_url=', hexch_js_url, 'ts=', time.time() - exec_ts + 0.1)
        hexch_js_code = requests.get(hexch_js_url, verify=False)
        hexch_js_code.encoding = 'utf8'
        hexch_js_code = hexch_js_code.text
        # splice 'return ...' into hexch_js_code ahead of the ajax call
        pattern = re.compile(r'(?<=T\.ajax_get\().*?(?=,)')
        ajax_url = pattern.search(hexch_js_code).group(0).replace('AJAX_URL', reserve_url_prefix)
        debug_p(func_name, 'ajax_url=', ajax_url, 'ts=', time.time() - exec_ts + 0.1)
        hexch_js_code = re.sub(r'T\.ajax_get', 'return %s ; T.ajax_get' % ajax_url, hexch_js_code)
        for seat_num, cordinate in candidate_seat_crdnt:
            self.tmp_trace_dct['libid'] = libid
            self.tmp_trace_dct['clssrm'] = clssrm
            self.tmp_trace_dct['seat_num'] = seat_num
            self.tmp_trace_dct['cordinate'] = cordinate
            self.tmp_trace_dct['code'] = ''
            self.tmp_trace_dct['msg'] = '没有合适的'  # "no suitable seat"
            if requests_time_limit <= 0:
                break
            time.sleep(3)  # test pacing
            # execute hexch_js_code to build the reservation URL
            tmp = execjs.compile(hexch_js_code)
            http_hexch_seatinfo = tmp.call('reserve_seat', libid, cordinate)
            debug_p(func_name, 'http_hexch_seatinfo=', http_hexch_seatinfo,
                    'ts=', time.time() - exec_ts + 0.1)
            try_times_limit = 1  # allow one verify-code retry
            vc_code = ''
            while True:
                time.sleep(3)
                # reserve the seat
                requests_time_limit -= 1
                t = time.time() + 0.1
                millisecond = str(str(t).split('.')[-1])[:3]
                exe_time = str(time.strftime("%Y.%m.%d_%H:%M:%S", time.localtime(t))) + '.' + millisecond
                debug_p(func_name, 'request, tmp_trace_dct=', self.tmp_trace_dct)
                response = self.sess.get(http_hexch_seatinfo + vc_code, proxies=utils.get_proxy(),
                                         headers=self.a_task.M_HEADERS,
                                         cookies=self.a_task.M_COOKIES, verify=False)
                debug_p(func_name, 'reserve response=', response.text[:300])
                code, msg = self.parse_response(response=response)  # code is an int
                self.tmp_trace_dct['code'] = code
                self.tmp_trace_dct['msg'] = msg
                if code != 1000:
                    break
                elif code == 1000 and try_times_limit > 0:
                    try_times_limit -= 1
                    # a verify code is required
                    vc_code = self.get_verifycode(imageUrl=self.a_task.CURRENT_URL['verifycode_page'])
                else:
                    break  # out of verify-code retries
            # success, or failed but completed
            completed_flag = self.check_msg(self.tmp_trace_dct.get('msg', '没有合适的'))
            self.tmp_trace_dct['completed_flag'] = completed_flag
            # record a copy of the trace, then reset it
            self.trace_dct_ls += [dict(self.tmp_trace_dct.items())]
            self.tmp_trace_dct = {}
            if code == 0 or completed_flag:
                # task done; discard any remaining candidate seats
                return True
    # normal exit: no seat reserved, completed_flag stays 'continue';
    # the caller should try the next candidate round
    self.tmp_trace_dct['clssrm'] = self.tmp_trace_dct.get('clssrm', '没有合适的')
    self.tmp_trace_dct['seat_num'] = self.tmp_trace_dct.get('seat_num', '没有合适的')
    self.tmp_trace_dct['completed_flag'] = self.tmp_trace_dct.get('completed_flag', 'continue')
    return False
def run(url):
    valid_url = check_if_valid_url(url)
    if not valid_url:
        print("=> Invalid URL, must start with http://www.\n")
        return
    print("=> URL is valid\n")
    drivers = []
    for session_num in range(0, len(PROXIES) if USE_PROXIES else NUM_SESSIONS):
        service_args = []
        if USE_PROXIES:
            proxy, proxy_auth = get_proxy(session_num)
            service_args = [
                '--proxy={}'.format(proxy),
                '--proxy-type=http',
                '--ignore-ssl-errors=true',
            ]
            if proxy_auth:
                service_args.append('--proxy-auth={}'.format(proxy_auth))
            logging.debug(service_args)
        user_agent = get_user_agent(session_num)
        desired_capabilities = dict(DesiredCapabilities.PHANTOMJS)
        desired_capabilities['phantomjs.page.settings.userAgent'] = user_agent
        desired_capabilities['phantomjs.page.customHeaders.User-Agent'] = user_agent
        desired_capabilities['phantomjs.page.customHeaders.customHeaders'] = \
            {'Accept': 'text/html', 'Content-type': 'text/html', 'Cache-Control': 'max-age=0'}
        # pass the capabilities built above; the original constructed them but never used them
        driver = webdriver.PhantomJS(executable_path=PHANTOM_JS_LOCATION,
                                     service_args=service_args,
                                     desired_capabilities=desired_capabilities)
        driver.set_page_load_timeout(30)
        drivers.append(driver)
    for driver in drivers:
        load_session(driver, url)
    # Now the product pages are loaded; keep checking whether the hmac cookie is
    # set on one of our sessions. If so, transfer that session to a Chrome
    # browser to let you check out.
    opened_drivers = []
    while True:
        logging.info("Checking for hmac in all session cookies.. [10s interval]")
        for session_num, driver in enumerate(drivers):
            if check_if_past_queue(driver) and driver not in opened_drivers:
                opened_drivers.append(driver)
                logging.info("[HMAC] Cookie found on session {}".format(driver.session_id))
                # Open the browser in a new thread
                user_agent = get_user_agent(session_num)
                if USE_PROXIES:
                    proxy, proxy_auth = get_proxy(session_num)
                else:
                    proxy = None
                    proxy_auth = None
                threading.Thread(target=transfer_session, kwargs={
                    'driver': driver,
                    'proxy': proxy,
                    'proxy_auth': proxy_auth,
                    'user_agent': user_agent
                }).start()
        time.sleep(10)
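# Both run() and _transfer_test() above hand the proxy to PhantomJS as CLI
# flags via service_args rather than through capabilities; in isolation the
# pattern is just the sketch below (host, credentials, and binary path are
# placeholders):
from selenium import webdriver

service_args = [
    '--proxy=10.10.1.10:3128',   # placeholder host:port
    '--proxy-type=http',
    '--proxy-auth=user:pass',    # placeholder credentials
    '--ignore-ssl-errors=true',
]
driver = webdriver.PhantomJS(executable_path='/usr/local/bin/phantomjs',  # placeholder path
                             service_args=service_args)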
def main():
    logger = utils.get_logger()
    global zones
    global industrys
    proxy = utils.get_proxy()
    if not proxy:
        logger.info('did not get proxy, quit!!!')
        return
    job_file = open('keys_number', 'r')
    jobtitles = job_file.readlines()
    job_file.close()
    apply_origin_task = False
    origin_task = {"city": "101010100", "zone": "%E6%9C%9D%E9%98%B3%E5%8C%BA",
                   "money": 5, "jobtitle": "170501", "education": 205}
    for city in CITY_LIST:
        logger.info('---------------------------------------------------------')
        if apply_origin_task and 'city' in origin_task:
            if city[0] != origin_task['city']:
                continue
            else:
                origin_task.pop('city')
                if not origin_task:
                    apply_origin_task = False
                    continue
        print apply_origin_task, origin_task, city
        process_dict = {'city': city[0], 'cityName': city[1]}
        list_result = get_list(page_now=30, is_get_zone=True, proxy=proxy, **process_dict)
        logger.info('1================' + str(list_result))
        if (len(list_result['jobs']) <= 14) and (not apply_origin_task):
            task_file = open('task_file', 'a')
            task_file.write(json.dumps(process_dict, ensure_ascii=False) + '\n')
            task_file.close()
            continue
        for zone in list_result['zone']:
            if apply_origin_task and 'zone' in origin_task:
                if zone[0] != origin_task['zone']:
                    continue
                else:
                    origin_task.pop('zone')
                    if not origin_task:
                        apply_origin_task = False
                        continue
            else:
                process_dict = {'city': city[0], 'zone': zone[0],
                                'cityName': city[1] + '-' + urllib.unquote(zone[0]).decode('utf8')}
                list_result = get_list(page_now=30, proxy=proxy, **process_dict)
                logger.info('2================' + str(list_result))
                if len(list_result['jobs']) <= 14:
                    task_file = open('task_file', 'a')
                    task_file.write(json.dumps(process_dict, ensure_ascii=False) + '\n')
                    task_file.close()
                    continue
            for jobtitle in jobtitles:
                if apply_origin_task and 'jobtitle' in origin_task:
                    if jobtitle.split()[0] != origin_task['jobtitle']:
                        continue
                    else:
                        origin_task.pop('jobtitle')
                        if not origin_task:
                            apply_origin_task = False
                            continue
                else:
                    process_dict = {'city': city[0], 'zone': zone[0],
                                    'jobtitle': jobtitle.split()[0],
                                    'cityName': city[1] + '-' + urllib.unquote(zone[0]).decode('utf8')}
                    list_result = get_list(page_now=30, proxy=proxy, **process_dict)
                    logger.info('3================' + str(list_result))
                    if len(list_result['jobs']) <= 14:
                        task_file = open('task_file', 'a')
                        task_file.write(json.dumps(process_dict, ensure_ascii=False) + '\n')
                        task_file.close()
                        continue
                for money in MONEY_LIST:
                    if apply_origin_task and 'money' in origin_task:
                        if money != origin_task['money']:
                            continue
                        else:
                            origin_task.pop('money')
                            if not origin_task:
                                apply_origin_task = False
                                continue
                    else:
                        process_dict = {'city': city[0], 'zone': zone[0],
                                        'jobtitle': jobtitle.split()[0], 'money': money,
                                        'cityName': city[1] + '-' + urllib.unquote(zone[0]).decode('utf8')}
                        list_result = get_list(page_now=30, proxy=proxy, **process_dict)
                        logger.info('4================' + str(list_result))
                        if len(list_result['jobs']) <= 14:
                            task_file = open('task_file', 'a')
                            task_file.write(json.dumps(process_dict, ensure_ascii=False) + '\n')
                            task_file.close()
                            continue
                    for education in EDUCATION_LIST:
                        if apply_origin_task and 'education' in origin_task:
                            if education != origin_task['education']:
                                continue
                            else:
                                origin_task.pop('education')
                                if not origin_task:
                                    apply_origin_task = False
                                    continue
                        else:
                            process_dict = {'city': city[0], 'zone': zone[0],
                                            'jobtitle': jobtitle.split()[0], 'money': money,
                                            'education': education,
                                            'cityName': city[1] + '-' + urllib.unquote(zone[0]).decode('utf8')}
                            list_result = get_list(page_now=30, proxy=proxy, **process_dict)
                            logger.info('5================' + str(list_result))
                            if len(list_result['jobs']) <= 14:
                                task_file = open('task_file', 'a')
                                task_file.write(json.dumps(process_dict, ensure_ascii=False) + '\n')
                                task_file.close()
                                continue
                        for experience in EXPERIENCE_LIST:
                            if apply_origin_task and 'experience' in origin_task:
                                if experience != origin_task['experience']:
                                    continue
                                else:
                                    origin_task.pop('experience')
                                    if not origin_task:
                                        apply_origin_task = False
                                        continue
                            else:
                                process_dict = {'city': city[0], 'zone': zone[0],
                                                'jobtitle': jobtitle.split()[0], 'money': money,
                                                'education': education, 'experience': experience,
                                                'cityName': city[1] + '-' + urllib.unquote(zone[0]).decode('utf8')}
                                list_result = get_list(page_now=30, proxy=proxy, **process_dict)
                                logger.info('6================' + str(list_result))
                                if len(list_result['jobs']) <= 14:
                                    task_file = open('task_file', 'a')
                                    task_file.write(json.dumps(process_dict, ensure_ascii=False) + '\n')
                                    task_file.close()
                                    continue
                            for size in SIZE_LIST:
                                if apply_origin_task and 'size' in origin_task:
                                    if size != origin_task['size']:
                                        continue
                                    else:
                                        origin_task.pop('size')
                                        if not origin_task:
                                            apply_origin_task = False
                                            continue
                                else:
                                    process_dict = {'city': city[0], 'zone': zone[0],
                                                    'jobtitle': jobtitle.split()[0], 'money': money,
                                                    'education': education, 'experience': experience,
                                                    'size': size,
                                                    'cityName': city[1] + '-' + urllib.unquote(zone[0]).decode('utf8')}
                                    task_file = open('task_file', 'a')
                                    task_file.write(json.dumps(process_dict, ensure_ascii=False) + '\n')
                                    task_file.close()
def process(task):
    logger = utils.get_logger()
    logger.info('process jd_liepin start!!!')
    result = {'code': 0}
    redis_client = get_redis_client()
    task_data_list = task.get('data', [])
    if not task_data_list or not task_data_list[0]['executeParam']:
        logger.info('did not get task_data_list!!!')
        result['code'] = 1
        return result
    task_data = json.loads(task_data_list[0]['executeParam'])
    if set(['zone']) - set(task_data.keys()):
        logger.info('not get full keys:' + str(task_data.keys()))
        result['code'] = 2
        return result
    logger.info('deal with ' + str(task_data))
    task_data['pagenum'] = int(task_data.get('pagenum', 0))
    get_next_page_tag = True
    proxy = utils.get_proxy()
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Host': 'www.liepin.com',
        # 'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    }
    headers = {}  # note: the source immediately discards the detailed headers above
    while get_next_page_tag:
        zone = task_data.get('zone', '')
        money = task_data.get('money', '')
        compkind = task_data.get('compkind', '')
        jobkind = task_data.get('jobkind', '')
        size = task_data.get('size', '')
        jobtitle = task_data.get('jobTitles', '')
        key_word = task_data.get('key', '')
        dqs_param = str(zone) if zone else ''
        jobTitles_param = str(jobtitle) if jobtitle else ''
        money_param = str(money) if money else ''
        compkind_param = str(compkind) if compkind else ''
        jobkind_param = str(jobkind) if jobkind else ''
        size_param = str(size) if size else ''
        key_word_param = str(key_word) if key_word else ''
        list_url = ('https://www.liepin.com/zhaopin/?pubTime=1&jobTitles=' + jobTitles_param
                    + '&searchType=1&dqs=' + dqs_param
                    + '&industryType=&industries=&salary=&key=&d_pageSize=40&d_curPage='
                    + str(task_data['pagenum']) + '&&init=-1')
        try:
            for x in xrange(settings.project_settings['DOWNLOAD_RETRY_TIMES']):
                logger.info('start download list:' + list_url)
                list_result = utils.download(url=list_url, proxy=proxy, headers=headers)
                if not list_result['code']:
                    if len(list_result['data']) < 1024:
                        # a short body is a block page; rotate the proxy and retry
                        logger.info('get ' + list_result['data'])
                    else:
                        break
                proxy = utils.get_proxy()
                time.sleep(2)
            else:
                logger.info('error when download:' + list_url)
                result['executeParam'] = json.dumps(task_data)
                return result
            if list_result['code']:
                logger.info('get error when download list:' + str(list_result))
                raise Exception
            logger.info('success when download:' + list_url)
            tree_root = etree.HTML(list_result['data'])
            job_list = tree_root.xpath('//ul[@class="sojob-list"]/li')
            if not job_list:
                logger.info('did not get job_list, return!!!')
                logger.info(u'no positions match the conditions %s' % task_data_list[0]['executeParam'])
                get_next_page_tag = False
                break
            job_count_number = 0
            download_day = str(time.localtime().tm_mon) + '-' + str(time.localtime().tm_mday)
            for job_index, job in enumerate(job_list):
                if job.attrib['class'] == 'downgrade-search':  # fixed typo: job.atrrib
                    if not job_count_number:
                        logger.info(u'no positions match the conditions %s'
                                    % task_data_list[0]['executeParam'])
                        get_next_page_tag = False
                        break
                try:
                    job_info = job.xpath('./div[@class="sojob-item-main clearfix"]'
                                         '/div[@class="job-info"]')[0]
                    if job_info:
                        job_url = 'https://www.liepin.com' + job_info.xpath('./h3/a')[0].attrib['href']
                    else:
                        continue
                    job_count_number += 1
                    # check whether this job was already downloaded today
                    job_key = 'liepin_jd_' + job_url.split('/')[-1].split('.')[0]
                    has_find_in_redis = False
                    try:
                        job_download_time = redis_client.get(job_key)
                        if job_download_time == download_day:
                            has_find_in_redis = True
                        else:
                            redis_client.set(job_key, download_day)
                    except Exception, e:
                        logger.info('get error when use redis.')
                    if has_find_in_redis:
                        logger.info('has find %s in redis' % job_key)
                        continue
                    else:
                        logger.info('not find %s in redis' % job_key)
                    urgent_flag = 0
                    type_flag = 2
                    if job.xpath('.//i[@class="icon icon-red-triangle"]'):
                        urgent_flag = 1
                    if job.xpath('.//i[@class="icon icon-blue-triangle"]'):
                        type_flag = 1
                    if job.xpath('.//i[@class="icon icon-orange-triangle"]'):
                        type_flag = 3
                    time.sleep(5)
                    job_content = {
                        'content': '',
                        'type': type_flag,
                        'urgentFlag': urgent_flag,
                    }
                    for x in xrange(settings.project_settings['DOWNLOAD_RETRY_TIMES']):
                        logger.info('start download job:' + job_url)
                        job_result = utils.download(url=job_url, proxy=proxy, headers=headers)
                        if not job_result['code']:
                            # check the body just downloaded (the source tested
                            # list_result here, almost certainly a copy-paste slip)
                            if len(job_result['data']) < 1024:
                                logger.info('get ' + job_result['data'])
                            else:
                                break
                        proxy = utils.get_proxy()
                        time.sleep(2)
                    else:
                        logger.info('error when download:' + job_url)
                        continue
                    if job_result['code']:
                        logger.info('get error when download job_url:' + job_url + str(job_result))
                        continue
                    logger.info('success when download:' + job_url)
                    job_root = etree.HTML(job_result['data'])
                    company_urls = job_root.xpath('//div[@class="title-info"]/h3/a')
                    company_info = ''
                    if not company_urls or not company_urls[0].attrib.get('href', ''):
                        logger.info('not get company_urls')
                    else:
                        company_url = company_urls[0].attrib['href']
                        for x in xrange(settings.project_settings['DOWNLOAD_RETRY_TIMES']):
                            logger.info('start download company:' + company_url)
                            company_result = utils.download(url=company_url, proxy=proxy,
                                                            headers=headers)
                            if not company_result['code']:
                                if len(company_result['data']) < 1024:  # same slip as above
                                    logger.info('get ' + company_result['data'])
                                else:
                                    break
                            proxy = utils.get_proxy()
                            time.sleep(2)
                        else:
                            logger.info('error when download:' + company_url)
                            continue
                        if company_result['code']:
                            logger.info('get error when download company_url:' + company_url
                                        + str(company_result))
                        else:
                            logger.info('success when download:' + company_url)
                            company_info = company_result['data']
                    job_content['content'] = job_result['data'].encode('utf8')
                    job_str = json.dumps(job_content, ensure_ascii=False)
                    trace_uuid = str(uuid.uuid1())
                    sql = ('insert into jd_raw (source, content, createBy, trackId, createtime, '
                           'pageUrl, searchConditions, pageNum, pageIndex, contactInfo) values ("'
                           + settings.project_settings['SOURCE']
                           + '", %s, "python", %s, now(), %s, %s, %s, %s, %s)')
                    sql_value = (job_str, trace_uuid, job_url,
                                 json.dumps(task_data, ensure_ascii=False),
                                 task_data['pagenum'], job_index,
                                 company_info.encode('utf8'))
                    kafka_data = {
                        "channelType": "WEB",
                        "content": {
                            "content": job_str,
                            "id": '',
                            "createBy": "python",
                            "createTime": int(time.time() * 1000),
                            "ip": proxy,
                            "jdUpdateTime": '',
                            "source": settings.project_settings['SOURCE'],
                            "trackId": '',
                            'contactInfo': company_info.encode('utf8'),
                            'searchConditions': json.dumps(task_data, ensure_ascii=False),
                            'pageUrl': job_url,
                        },
                        "interfaceType": "PARSE",
                        "resourceDataType": "RAW",
                        "resourceType": settings.project_settings['RESOURCE_TYPE'],
                        'protocolType': 'HTTP',
                        "source": settings.project_settings['SOURCE'],
                        "trackId": '',
                    }
                    utils.save_data(sql, sql_value, kafka_data)
                except Exception, e:
                    logger.info('get error when download:' + job_url + str(traceback.format_exc()))
                    continue
        except Exception, e:
            # the matching handler for this outer try (where the bare `raise
            # Exception` above is clearly meant to land) is cut off in the
            # source; minimally, hand the task back so it can be retried
            logger.info('get error when process list:' + str(traceback.format_exc()))
            result['executeParam'] = json.dumps(task_data)
            return result
        # the source is also cut off before the page counter advances; without
        # this the while loop would refetch the same page forever
        task_data['pagenum'] += 1
def parse_next_page(cookie, page_num, max_page):
    post_url = "http://rd2.zhaopin.com/rdapply/resumes/apply/search?SF_1_1_38=2,9&orderBy=CreateTime"
    params = {
        "PageList2": "",
        "DColumn_hidden": "",  # the source listed this key twice; duplicates collapse in a dict literal
        "searchKeyword": "",
        "curSubmitRecord": "1797",
        "curMaxPageNum": "90",
        "buttonAsse": "导入测评系统",  # "import into assessment system"
        "buttonInfo": "发通知信",      # "send notification letter"
        "SF_1_1_50": "1",
        "SF_1_1_51": "-1",
        "SF_1_1_45": "",
        "SF_1_1_44": "",
        "SF_1_1_52": "0",
        "SF_1_1_49": "0",
        "IsInvited": "0",
        "position_city": "[%%POSITION_CITY%%]",
        "deptName": "",
        "select_unique_id": "",
        "selectedResumeList": "",
        "PageNo": "",
        "PosState": "",
        "MinRowID": "",
        "MaxRowID": "2722819791",
        "RowsCount": "123",
        "PagesCount": "5",
        "PageType": "0",
        "CurrentPageNum": page_num,
        "Position_IDs": "[%%POSITION_IDS%%]",
        "Position_ID": "[%%POSITION_ID%%]",
        "SortType": "0",
        "isCmpSum": "0",
        "SelectIndex_Opt": "0",
        "Resume_count": "0",
        "CID": "44036673",
        "forwardingEmailList": "",
        "click_search_op_type": "-1",
        "X-Requested-With": "XMLHttpRequest",  # stray leading space removed from the key
    }
    headers = {
        "Host": "rd2.zhaopin.com",
        "Accept": "*/*",
        "Origin": "http://rd2.zhaopin.com",
        "X-Requested-With": "XMLHttpRequest",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
        "Content-Type": "application/x-www-form-urlencoded",
        "Referer": "http://rd2.zhaopin.com/RdApply/Resumes/Apply/index",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cookie": cookie,
    }
    session = requests.session()
    for x in xrange(3):
        proxy = utils.get_proxy()
        page = session.post(url=post_url, headers=headers, data=params,
                            proxies=proxy, timeout=10).content
        parse_list(page, page_num, max_page, cookie)
        # note: returns on the first pass, so this retry loop only runs once as written
        return False