Example No. 1
def get_html(url, retry, track_id, getproxies_):
    """
    Fetch a list page.
    :return: None on failure
    """
    if not url.startswith('https'):
        url = "https:" + url
    if retry <= 0:
        return None
    logger.info('connecting to: %s , retry= %s' % (url, retry))
    if project_settings.get('useAby'):
        getproxies_ = project_settings.get('aby')
    proxy_ip = None
    if getproxies_:
        proxy_ip = find(r'.*?(\d+\.\d+\.\d+\.\d+:\d+)', getproxies_.get("http"))

    result = utils.download(url=url,
                            headers=get_info_headers(),
                            proxy=getproxies_,
                            encoding='gbk')
    if result['code'] != 0:
        logger.error("failed to fetch page, track_id= %s , retrying: retry= %s" % (track_id, retry))
        # getproxies_.update(utils.get_proxy())
        return get_html(url, retry - 1, track_id, getproxies_)
    elif '用户数不够' in result['data'] \
            or '在线用户数超过' in result['data'] \
            or len(result['data']) < 1000:
        # the proxy gateway answered "not enough users" / "online user limit
        # exceeded", or the body is too short to be a real list page
        logger.error("proxy error, track_id= %s , retrying: retry= %s" % (track_id, retry))
        # getproxies_.update(utils.get_proxy())
        return get_html(url, retry - 1, track_id, getproxies_)
    return [result['data'], proxy_ip]
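A minimal usage sketch (not from the source): the proxy dict, URL, and track id below are placeholders, and logger, project_settings, utils, get_info_headers, and find are assumed to come from the surrounding module. On success the helper returns [html_text, 'ip:port']; once the retries are exhausted it returns None.

# hypothetical call; the proxy address and track id are made up
proxies = {'http': 'http://10.0.0.1:8888', 'https': 'http://10.0.0.1:8888'}
page = get_html('//www.example.com/jobs/list', 5, 'track-123', proxies)
if page:
    html, proxy_ip = page  # list-page HTML plus the 'ip:port' that served it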
Example No. 2
def parse_url(base_url, page_num):
    logger = utils.get_logger()
    url = None
    # u0 means no time limit, u1 means the last three days; default to the narrowest window of three days
    u = 1
    if project_settings.get('U'):
        u = project_settings.get("U")
    if base_url and page_num:
        url = base_url + '/u' + str(u) + 'o' + str(page_num)
        logger.info('URL for the current page: %s' % url)
    return url
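As a rough illustration, with a hypothetical base_url the function only appends a '/u{u}o{page_num}' suffix; u defaults to 1 (last three days) unless project_settings['U'] overrides it.

# hypothetical base URL; only the suffix format comes from parse_url above
url = parse_url('https://sou.example.com/jobs/p_python', 2)
# with no 'U' override this yields 'https://sou.example.com/jobs/p_python/u1o2'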
Example No. 3
def process(task):
    """
        根据一个搜索条件开始一项搜索
        :return:
        """
    global logger
    if project_settings.get('useAby'):
        getproxies_ = project_settings.get('aby')
    else:
        getproxies_ = utils.get_proxy()
    logger = utils.get_logger()
    param_dict = json.loads(task['data'][0]['executeParam'], encoding="utf-8")

    result = {'code': 0}
    track_id = str(uuid.uuid1())

    page_num = 1
    if param_dict.get('page_num'):
        page_num = param_dict['page_num']
    while True:
        url = get_list_url(param_dict, page_num)

        list_html_list = get_html(url, 5, track_id, getproxies_)
        if list_html_list:
            logger.info("list_html success when download: " + url)
            info_list = parse_list_html(list_html_list[0], track_id, page_num)
        else:
            # list page could not be fetched
            logger.error(u"failed to fetch list page: url=%s" % url)
            param_dict['page_num'] = page_num
            result['executeResult'] = 'list_html_error'
            result['executeParam'] = json.dumps(param_dict,
                                                ensure_ascii=False).encode()
            result['code'] = 1
            return result

        if 'none_jd' == info_list:
            # nothing left to crawl for this condition
            logger.info("no new positions for this search condition: url=%s" % url)
            logger.info('no matching positions %s' % json.dumps(param_dict))
            result['executeResult'] = u'正常完毕'  # "finished normally"
            return result
        else:
            for info in info_list:
                try:
                    info_mian(param_dict, info, track_id, getproxies_)
                except Exception:
                    logger.error(traceback.format_exc())

        page_num += 1
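The returned dict is a small status contract: code stays 0 and executeResult becomes u'正常完毕' ("finished normally") when the search runs to the end, while a failed list page sets code to 1, executeResult to 'list_html_error', and writes the page reached back into executeParam. A caller sketch with a hypothetical task payload:

# hypothetical task envelope shaped like task['data'][0]['executeParam'] above
task = {'data': [{'executeParam': json.dumps({'page_num': 1})}]}
outcome = process(task)
if outcome['code'] != 0:
    # resume later from the page number stored back into executeParam
    resume_param = json.loads(outcome['executeParam'])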
Example No. 4
def build_page_url(data=None, page_num=None):
    logger = utils.get_logger()
    if not page_num:
        page_num = 1
    city_url = data['cityUrl']
    func_code = data['funcCode']
    # pd=3 means the last three days, pd=1 means only today; default to the last three days
    pd = 3
    if project_settings.get('PD'):
        pd = project_settings.get('PD')
    if 'https://jobs.zhaopin.com/' in city_url:
        return city_url + 'sj' + str(func_code) + '/pd' + str(pd) + '/p' + str(
            page_num)
    else:
        return city_url + '&isfilter=1&pd=' + str(pd) + '&p=' + str(
            page_num) + '&sj=' + str(func_code)
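A sketch of the two URL shapes, using hypothetical city URLs and a hypothetical funcCode; only the concatenation pattern is taken from the code above (pd defaults to 3 unless project_settings['PD'] overrides it).

# path-style city URL (contains 'https://jobs.zhaopin.com/')
build_page_url({'cityUrl': 'https://jobs.zhaopin.com/beijing/', 'funcCode': '2100'}, 2)
# -> 'https://jobs.zhaopin.com/beijing/sj2100/pd3/p2'

# query-style city URL (any other host)
build_page_url({'cityUrl': 'https://sou.zhaopin.com/?jl=530', 'funcCode': '2100'}, 2)
# -> 'https://sou.zhaopin.com/?jl=530&isfilter=1&pd=3&p=2&sj=2100'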
Example No. 5
def download_page(url=None, method=None, header=None, refer=None, proxy=None):
    logger = utils.get_logger()
    result = {}
    # if not header:
    header = {
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cookie': 'ZP_OLD_FLAG=true;'
    }
    if refer:
        header['Referer'] = refer
    for x in xrange(0, 3):
        # proxy = utils.get_proxy()
        if project_settings.get('useAby'):
            proxy = project_settings.get('aby')
        else:
            proxy = utils.get_proxy()
        logger.info('download_page : %s ' % url)

        result = utils.download(url=url,
                                headers=header,
                                method=method,
                                allow_redirects=True,
                                retry_time=1,
                                proxy=proxy)
        logger.debug(result)
        if result['code'] == 0:
            logger.info('success when download %s-%s ' % (proxy, url))
            break
        time.sleep(1)

    result['proxy'] = ''
    return result
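download_page assumes utils.download returns a dict with at least 'code' (0 on success) and 'data' (the page body), the same contract get_html and parse_list rely on. A minimal call sketch with a hypothetical detail-page URL; proxy selection happens inside the function, so nothing needs to be passed for proxy:

# hypothetical URL and referer
res = download_page(url='https://jobs.example.com/detail/123.htm',
                    method='get',
                    refer='https://jobs.example.com/list')
if res.get('code') == 0:
    body = res['data']  # page body returned by utils.download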
Example No. 6
def parse_list(data):
    logger = utils.get_logger()
    # url = data['url']
    city_url = data['cityUrl']
    page_num = data['pageNum']
    flg = True
    while flg:
        url = build_page_url(data=data, page_num=page_num)
        logger.info('requesting list page url : %s' % (url, ))
        if project_settings.get('useAby'):
            proxy = project_settings.get('aby')
        else:
            proxy = utils.get_proxy()
        results = download_page(url=url, method='get', proxy=proxy)
        proxy = results['proxy']
        content = results['data']
        # the page text signals "no positions match your criteria"
        if '暂时无符合您条件的职位' in content or '没有符合您要求的职位' in content:
            logger.info('no matching positions %s' % json.dumps(data, ensure_ascii=False))
            data['code'] = 200
            flg = True
            break
        # the page text signals "page not found" (soft 404)
        if '您要访问的页面暂时没有找到' in content:
            logger.info('page not found, got 404 %s ' % url)
            data['code'] = 200
            flg = True
            break
        if 'jobs.zhaopin.com' in city_url:
            flg = parse_list_v1(page=content,
                                page_num=page_num,
                                data=data,
                                refer=url,
                                proxy=proxy)
        else:
            flg = parse_list_v2(page=content,
                                page_num=page_num,
                                data=data,
                                refer=url,
                                proxy=proxy)

        # some detail data was parsed from the list page
        logger.info('list page detail parse result %s' %
                    json.dumps(flg, ensure_ascii=False))
        if 'status' in flg and flg.get('status'):
            data['code'] = 200
            if 'detail_count' in flg and flg.get('detail_count') > 0:
                page_num += 1
            else:
                data['code'] = 200
                flg = False
                break
        else:
            logger.info('failed to fetch list page %s ' % url)
            data['code'] = 500
            flg = False
            break
        # the "these positions may also look good" block means the real results
        # for this search are exhausted, so stop paging
        if '以下职位也很不错' in content:
            flg = False
            logger.info('found the fallback recommendation block, breaking out of the loop')
            data['code'] = 200
            break
    data['pageNum'] = page_num
    return data
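parse_list expects parse_list_v1/parse_list_v2 to return a dict carrying a 'status' flag and a 'detail_count'; paging continues only while details keep appearing, and the input dict is returned with code 200/500 plus the last pageNum reached. A usage sketch with a hypothetical search descriptor:

# hypothetical descriptor; the keys mirror the lookups inside parse_list
data = {'cityUrl': 'https://jobs.zhaopin.com/beijing/', 'funcCode': '2100', 'pageNum': 1}
data = parse_list(data)
if data['code'] == 200:
    last_page = data['pageNum']  # last page number that was requested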