Example 1
def get_all_city_company_count(cls):
    """
    Get the number of companies on Lagou for every city.
    :return: {'city_id': total_company_count, ....}
    """
    cities = CityModel.list()
    city_company_counts_dict = {}
    for city in cities:
        headers = generate_http_header()
        # proxies = {"https": redis_instance.srandmember(constants.REDIS_PROXY_KEY).decode()}
        url = constants.CITY_COMPANY_URL.format(city=city.id,
                                                finance_stage=0,
                                                industry=0)
        params = {
            'first': False,
            'pn': 1,
            'sortField': 1,
            'havemark': 0,
        }
        response = requests.get(url=url,
                                params=params,
                                headers=headers,
                                timeout=constants.TIMEOUT).json()
        city_company_counts_dict[city.id] = int(response['totalCount'])
        time.sleep(constants.MIN_SLEEP_TIME)
    return city_company_counts_dict
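
All examples in this section assume the standard libraries (requests, time, logging, lxml.etree) plus project-internal helpers such as constants, CityModel, Cookies, generate_http_header, crawler_sleep, RequestsError and format_tag, none of which are shown on this page. As a rough illustration only, a generate_http_header() helper could look like the sketch below; the header values and the handling of the is_crawl_jobs_count flag are assumptions, not the project's actual implementation.

import random

# Hypothetical sketch: rotate a User-Agent and return headers for Lagou requests.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15',
]

def generate_http_header(is_crawl_jobs_count=False):
    headers = {
        'User-Agent': random.choice(USER_AGENTS),
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.lagou.com/',
    }
    if is_crawl_jobs_count:
        # Assumption: the jobs-count endpoint is an AJAX POST, so mark it as such.
        headers['X-Requested-With'] = 'XMLHttpRequest'
    return headers
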
Example 2
def request_jobs_count_json(city, keyword):
    query_string = {'needAddtionalResult': False}
    if city != '全国':  # '全国' = nationwide, i.e. no city filter
        query_string['city'] = city
    form_data = {
        'first': False,
        'pn': 1,
        'kd': keyword.name
    }
    headers = generate_http_header(is_crawl_jobs_count=True)
    crawler_sleep()
    try:
        cookies = Cookies.get_random_cookies()
        response = requests.post(url=constants.JOB_JSON_URL,
                                 params=query_string,
                                 data=form_data,
                                 headers=headers,
                                 cookies=cookies,
                                 allow_redirects=False,
                                 timeout=constants.TIMEOUT)
        response_json = response.json()
        if 'content' not in response_json:
            Cookies.remove_cookies(cookies)
            raise RequestsError(error_log='wrong response content')
    except RequestException as e:
        logging.error(e)
        raise RequestsError(error_log=e)
    return response_json
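
The Cookies class used above (get_random_cookies / remove_cookies) is also project-internal. The following is a minimal sketch of the interface these examples rely on, assuming a simple in-memory pool; the real project may back it with Redis or a database.

import random

class Cookies(object):
    # Hypothetical in-memory cookie pool; only the interface matters here.
    _pool = []  # list of cookie dicts usable by requests

    @classmethod
    def get_random_cookies(cls):
        # Any cookie set from the pool will do for a single request.
        return random.choice(cls._pool) if cls._pool else {}

    @classmethod
    def remove_cookies(cls, cookies):
        # Discard a cookie set that produced a bad response (likely blocked).
        if cookies in cls._pool:
            cls._pool.remove(cookies)
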
Example 3
def requests_job_detail_data(job_id):
    """请求职位详情页数据"""
    headers = generate_http_header()
    crawler_sleep()
    try:
        response = requests.get(
            url=constants.JOB_DETAIL_URL.format(job_id=job_id),
            headers=headers,
            cookies=Cookies.get_random_cookies(),
            allow_redirects=False,
            timeout=constants.TIMEOUT)
    except RequestException as e:
        logging.error(e)
        raise RequestsError(error_log=e)
    html = etree.HTML(response.text)
    department = html.xpath(
        '//div[@class="job-name"]/div[@class="company"]/text()')
    description = html.xpath('//dd[@class="job_bt"]/div//text()')
    keywords = html.xpath(
        '//dd[@class="job_request"]//li[@class="labels"]/text()')
    return format_tag(department, description, keywords, job_id)
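
Each xpath() call above returns a list of raw text nodes, so format_tag() (not shown on this page) presumably strips and joins them before they are stored. A purely hypothetical sketch of the job-detail variant, for illustration only:

def format_tag(department, description, keywords, job_id):
    # Hypothetical: clean and join the text-node lists returned by xpath().
    return {
        'job_id': job_id,
        'department': ''.join(department).strip(),
        'description': '\n'.join(s.strip() for s in description if s.strip()),
        'keywords': [k.strip() for k in keywords if k.strip()],
    }
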
Example 4
def requests_company_detail_data(company_id):
    """请求公司详情页数据"""
    headers = generate_http_header()
    crawler_sleep()
    try:
        response = requests.get(
            url=constants.COMPANY_DETAIL_URL.format(company_id=company_id),
            headers=headers,
            cookies=Cookies.get_random_cookies(),
            allow_redirects=False,
            timeout=constants.TIMEOUT)
    except RequestException as e:
        logging.error(e)
        raise RequestsError(error_log=e)
    html = etree.HTML(response.text)
    advantage = html.xpath('//div[@id="tags_container"]//li/text()')
    size = html.xpath('//div[@id="basic_container"]//li[3]/span/text()')
    address = html.xpath('//p[@class="mlist_li_desc"]/text()')
    introduce = html.xpath('//span[@class="company_content"]//text()')

    return format_tag(advantage, address, size, introduce, company_id)
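
RequestsError, raised by every example, is likewise project-specific. A minimal sketch, assuming it only needs to carry the error_log value passed in above:

class RequestsError(Exception):
    # Hypothetical sketch: wrap whatever the caller passed as error_log.
    def __init__(self, error_log=''):
        self.error_log = error_log
        super(RequestsError, self).__init__(str(error_log))
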
Example 5
def request_job_json(company_id, page_no):
    params = {
        'companyId': company_id,
        'positionFirstType': u"技术",  # u"技术" = the "Technology" position category
        'pageNo': page_no,
        'pageSize': 10,
    }
    headers = generate_http_header()
    crawler_sleep()
    try:
        cookies = Cookies.get_random_cookies()
        response_json = requests.get(url=constants.COMPANY_JOB_URL,
                                     params=params,
                                     headers=headers,
                                     cookies=cookies,
                                     timeout=constants.TIMEOUT).json()
        if 'content' not in response_json:
            Cookies.remove_cookies(cookies)
            raise RequestsError(error_log='wrong response content')
    except RequestException as e:
        logging.error(e)
        raise RequestsError(error_log=e)
    return response_json
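
One possible way to drive request_job_json() page by page is sketched below; the exact layout of the 'content' payload is an assumption here, so the result-extraction path is illustrative only.

def crawl_company_jobs(company_id):
    # Hypothetical usage sketch: keep requesting pages until none come back.
    jobs = []
    page_no = 1
    while True:
        response_json = request_job_json(company_id, page_no)
        # The nested keys below are assumptions about the JSON structure.
        results = response_json.get('content', {}).get('data', {}).get('page', {}).get('result', [])
        if not results:
            break
        jobs.extend(results)
        page_no += 1
    return jobs
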
Example 6
def request_company_json(url, page_no):
    params = {
        'first': False,
        'pn': page_no,
        'sortField': 1,
        'havemark': 0,
    }
    headers = generate_http_header()
    crawler_sleep()
    try:
        cookies = Cookies.get_random_cookies()
        response_json = requests.get(url=url,
                                     params=params,
                                     headers=headers,
                                     cookies=cookies,
                                     allow_redirects=False,
                                     timeout=constants.TIMEOUT).json()
        if 'totalCount' not in response_json:
            Cookies.remove_cookies(cookies)
            raise RequestsError(error_log='wrong response content')
    except RequestException as e:
        logging.error(e)
        raise RequestsError(error_log=e)
    return response_json
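
request_company_json() reports a totalCount, which a caller could use to decide how many pages to fetch. A sketch of that pattern, assuming a page size of 16 companies per page (the actual per-page size is determined by the Lagou endpoint, not by this code):

COMPANY_PAGE_SIZE = 16  # assumption; adjust to whatever the endpoint actually returns

def iter_company_pages(url):
    # Hypothetical usage sketch: yield every page of a city's company list.
    first_page = request_company_json(url, page_no=1)
    yield first_page
    total_count = int(first_page['totalCount'])
    last_page = (total_count + COMPANY_PAGE_SIZE - 1) // COMPANY_PAGE_SIZE
    for page_no in range(2, last_page + 1):
        yield request_company_json(url, page_no)
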