def get_html(url, retry, track_id, getproxies_):
    """
    Fetch a list page.
    :return: [html, proxy_ip] on success, None once all retries are used up
    """
    if not url.startswith('https'):
        url = "https:" + url
    if retry <= 0:
        return None
    logger.info('Connecting: %s , retry= %s' % (url, retry))
    if project_settings.get('useAby'):
        getproxies_ = project_settings.get('aby')
    proxy_ip = None
    if getproxies_:
        proxy_ip = find(r'.*?(\d+\.\d+\.\d+\.\d+:\d+)', getproxies_.get("http"))
    result = utils.download(url=url, headers=get_info_headers(),
                            proxy=getproxies_, encoding='gbk')
    if result['code'] != 0:
        logger.error("Failed to fetch page, track_id= %s , retrying: retry= %s" % (track_id, retry))
        # getproxies_.update(utils.get_proxy())
        return get_html(url, retry - 1, track_id, getproxies_)
    elif '用户数不够' in result['data'] \
            or '在线用户数超过' in result['data'] \
            or len(result['data']) < 1000:
        # The proxy provider answered with an error page ("not enough users" /
        # "online user limit exceeded") or the body is too short to be a real list page.
        logger.error("Proxy error, track_id= %s , retrying: retry= %s" % (track_id, retry))
        # getproxies_.update(utils.get_proxy())
        return get_html(url, retry - 1, track_id, getproxies_)
    return [result['data'], proxy_ip]
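
# get_html above depends on a `find` helper (imported elsewhere in this project)
# to pull the "ip:port" part out of the proxy URL. Its real implementation is not
# shown here; the hypothetical sketch below only illustrates the assumed contract
# (first capture group, or None) without shadowing the actual helper.
import re


def _find_sketch(pattern, text):
    # Hypothetical stand-in for `find`: return the first capture group of
    # `pattern` found in `text`, or None when there is no match or no text.
    if not text:
        return None
    match = re.search(pattern, text)
    return match.group(1) if match else None

# Example: _find_sketch(r'.*?(\d+\.\d+\.\d+\.\d+:\d+)', 'http://10.0.0.1:8888')
# returns '10.0.0.1:8888'.
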
def parse_url(base_url, page_num):
    logger = utils.get_logger()
    url = None
    # u0 means no date filter, u1 means the last three days; default to the
    # smallest search window (three days).
    u = 1
    if project_settings.get('U'):
        u = project_settings.get("U")
    if base_url and page_num:
        url = base_url + '/u' + str(u) + 'o' + str(page_num)
        logger.info('URL for the current page: %s' % url)
    return url
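
# A quick illustration of the suffix parse_url appends: the base URL below is
# made up, and the expected '/u1o2' ending assumes the U setting is absent so
# that u falls back to 1 (last three days).
def _parse_url_example():
    # e.g. 'https://example.invalid/search/u1o2'
    return parse_url('https://example.invalid/search', 2)
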
def process(task):
    """
    Run one crawl for a single search condition.
    :return: result dict describing how the task finished
    """
    global logger
    if project_settings.get('useAby'):
        getproxies_ = project_settings.get('aby')
    else:
        getproxies_ = utils.get_proxy()
    logger = utils.get_logger()
    param_dict = json.loads(task['data'][0]['executeParam'], encoding="utf-8")
    result = {'code': 0}
    track_id = str(uuid.uuid1())
    page_num = 1
    if param_dict.get('page_num'):
        page_num = param_dict['page_num']
    while True:
        url = get_list_url(param_dict, page_num)
        list_html_list = get_html(url, 5, track_id, getproxies_)
        if list_html_list:
            logger.info("list_html success when download: " + url)
            info_list = parse_list_html(list_html_list[0], track_id, page_num)
        else:
            # The list page could not be fetched; record the current page so the
            # task can be resumed later.
            logger.error(u"Failed to fetch list page: url=%s" % url)
            param_dict['page_num'] = page_num
            result['executeResult'] = 'list_html_error'
            result['executeParam'] = json.dumps(param_dict, ensure_ascii=False).encode()
            result['code'] = 1
            return result
        if 'none_jd' == info_list:
            # Nothing left to crawl for this search condition.
            logger.info("No new jobs for this search condition: url=%s" % url)
            logger.info('No matching jobs: %s' % json.dumps(param_dict))
            result['executeResult'] = u'正常完毕'  # "finished normally"; kept as-is for downstream consumers
            return result
        for info in info_list:
            try:
                info_mian(param_dict, info, track_id, getproxies_)
            except Exception:
                # Log the full traceback of the failed detail crawl and move on
                # to the next job.
                logger.error(traceback.format_exc())
        page_num += 1
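
# Rough shape of the task payload process() expects, reconstructed from the
# field accesses above. Only 'executeParam' (a JSON string) and its 'page_num'
# key are certain; the remaining search-condition fields depend on what
# get_list_url reads and are shown here only as a hypothetical placeholder.
import json


def _example_task():
    execute_param = {
        'page_num': 1,
        # ... plus whatever search-condition fields get_list_url consumes ...
    }
    return {'data': [{'executeParam': json.dumps(execute_param, ensure_ascii=False)}]}
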
def build_page_url(data=None, page_num=None):
    logger = utils.get_logger()
    if not page_num:
        page_num = 1
    city_url = data['cityUrl']
    func_code = data['funcCode']
    # pd=3 means the last three days, pd=1 means today only; default to the last three days.
    pd = 3
    if project_settings.get('PD'):
        pd = project_settings.get('PD')
    if 'https://jobs.zhaopin.com/' in city_url:
        # Static city pages take path segments: .../sj<funcCode>/pd<days>/p<page>
        return city_url + 'sj' + str(func_code) + '/pd' + str(pd) + '/p' + str(page_num)
    else:
        # Search-result pages take query-string parameters instead.
        return city_url + '&isfilter=1&pd=' + str(pd) + '&p=' + str(page_num) + '&sj=' + str(func_code)
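
# Hypothetical illustration of the two URL layouts build_page_url produces; the
# cityUrl and funcCode values below are made up, and pd defaults to 3 when the
# PD setting is absent.
def _build_page_url_examples():
    # Static city pages use path segments, e.g.
    #   https://jobs.zhaopin.com/beijing/sj160000/pd3/p2
    static_url = build_page_url(data={'cityUrl': 'https://jobs.zhaopin.com/beijing/',
                                      'funcCode': '160000'}, page_num=2)
    # Search-result pages use query-string parameters, e.g.
    #   ...&isfilter=1&pd=3&p=2&sj=160000
    search_url = build_page_url(data={'cityUrl': 'https://sou.zhaopin.com/jobs/searchresult.ashx?jl=beijing',
                                      'funcCode': '160000'}, page_num=2)
    return static_url, search_url
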
def download_page(url=None, method=None, header=None, refer=None, proxy=None):
    logger = utils.get_logger()
    result = {}
    # if not header:
    header = {
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cookie': 'ZP_OLD_FLAG=true;'
    }
    if refer:
        header['Referer'] = refer
    # Try up to three times, fetching a fresh proxy before each attempt.
    for x in xrange(0, 3):
        # proxy = utils.get_proxy()
        if project_settings.get('useAby'):
            proxy = project_settings.get('aby')
        else:
            proxy = utils.get_proxy()
        logger.info('download_page : %s ' % url)
        result = utils.download(url=url, headers=header, method=method,
                                allow_redirects=True, retry_time=1, proxy=proxy)
        print result
        if result['code'] == 0:
            logger.info('success when download %s-%s ' % (proxy, url))
            break
        time.sleep(1)
    # The caller reads result['proxy']; it is cleared here rather than set to
    # the proxy that was actually used.
    result['proxy'] = ''
    return result
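
# download_page builds on utils.download, whose source is not shown in this
# file. From the call sites it appears to return a dict with at least 'code'
# (0 on success) and 'data' (the decoded body). The requests-based sketch below
# is a hypothetical stand-in for that contract, not the project's actual
# implementation; retry_time is accepted only to mirror the assumed signature.
import requests


def _download_sketch(url, headers=None, method='get', proxy=None,
                     allow_redirects=True, retry_time=1, encoding=None):
    try:
        resp = requests.request(method or 'get', url, headers=headers,
                                proxies=proxy, allow_redirects=allow_redirects,
                                timeout=30)
        if encoding:
            resp.encoding = encoding
        return {'code': 0 if resp.status_code == 200 else resp.status_code,
                'data': resp.text}
    except requests.RequestException as e:
        return {'code': -1, 'data': '', 'error': str(e)}
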
def parse_list(data):
    logger = utils.get_logger()
    # url = data['url']
    city_url = data['cityUrl']
    page_num = data['pageNum']
    flg = True
    while flg:
        url = build_page_url(data=data, page_num=page_num)
        logger.info('Requesting list page url : %s' % (url,))
        if project_settings.get('useAby'):
            proxy = project_settings.get('aby')
        else:
            proxy = utils.get_proxy()
        results = download_page(url=url, method='get', proxy=proxy)
        proxy = results['proxy']
        # Guard against a failed download that carries no body.
        content = results.get('data') or ''
        if '暂时无符合您条件的职位' in content or '没有符合您要求的职位' in content:
            # "No jobs match your criteria": nothing to crawl for this condition.
            logger.info('No matching jobs: %s' % json.dumps(data, ensure_ascii=False))
            data['code'] = 200
            flg = True
            break
        if '您要访问的页面暂时没有找到' in content:
            # "The page you requested was not found": treat the 404 page as a normal end.
            logger.info('Page not found (404): %s ' % url)
            data['code'] = 200
            flg = True
            break
        if 'jobs.zhaopin.com' in city_url:
            flg = parse_list_v1(page=content, page_num=page_num, data=data, refer=url, proxy=proxy)
        else:
            flg = parse_list_v2(page=content, page_num=page_num, data=data, refer=url, proxy=proxy)
        # flg is now the parser's result dict; a truthy 'status' means data was parsed normally.
        logger.info('List page parse result: %s' % (json.dumps(flg, ensure_ascii=False)))
        if flg.get('status'):
            data['code'] = 200
            if flg.get('detail_count', 0) > 0:
                page_num += 1
            else:
                data['code'] = 200
                flg = False
                break
        else:
            logger.info('Failed to access list page: %s ' % url)
            data['code'] = 500
            flg = False
            break
        # The "以下职位也很不错" ("these jobs may also interest you") block marks the end
        # of relevant results, so stop paging here.
        if '以下职位也很不错' in content:
            flg = False
            logger.info('Page contains "以下职位也很不错", breaking out of the loop')
            data['code'] = 200
            break
    data['pageNum'] = page_num
    return data
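
# Minimal sketch of driving parse_list for one city / function-code pair; the
# values below are made up and real task data may carry additional fields. On
# return, data['code'] is 200 for a normal finish, 500 when a list page could
# not be fetched, and data['pageNum'] records how far paging got.
def _parse_list_example():
    data = {
        'cityUrl': 'https://jobs.zhaopin.com/beijing/',
        'funcCode': '160000',
        'pageNum': 1,
    }
    return parse_list(data)
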