def get_all_city_company_count(cls):
    """
    Collect the company count of every city returned by ``CityModel.list()``.

    One GET request per city is sent to ``constants.CITY_COMPANY_URL``
    (formatted with the city's ``id``, ``finance_stage=0`` and ``industry=0``)
    and the ``totalCount`` field of the JSON reply is stored as an ``int``.
    The loop sleeps for ``constants.MIN_SLEEP_TIME`` after each request.

    :return: {city_id: total_company_count, ....}
    """
    # The query string is the same for every city, so it is built only once.
    query_params = {
        'first': False,
        'pn': 1,
        'sortField': 1,
        'havemark': 0,
    }
    counts_by_city_id = {}
    for city in CityModel.list():
        request_headers = generate_http_header()
        company_list_url = constants.CITY_COMPANY_URL.format(
            city=city.id, finance_stage=0, industry=0)
        reply = requests.get(
            url=company_list_url,
            params=query_params,
            headers=request_headers,
            timeout=constants.TIMEOUT,
        )
        payload = reply.json()
        counts_by_city_id[city.id] = int(payload['totalCount'])
        time.sleep(constants.MIN_SLEEP_TIME)
    return counts_by_city_id
def request_jobs_count_json(city, keyword):
    """
    POST a keyword search to ``constants.JOB_JSON_URL`` and return the JSON reply.

    :param city: city name sent as the ``city`` query parameter; the value
        '全国' (nationwide) means no ``city`` parameter is sent at all
    :param keyword: object whose ``name`` attribute is sent as the ``kd`` form field
    :return: the decoded JSON body, which contains a 'content' entry
    :raises RequestsError: if the request fails, the body is not valid JSON,
        or the reply lacks 'content' (the cookies used for that request are
        then also discarded through ``Cookies.remove_cookies``)
    """
    query_string = {'needAddtionalResult': False}
    if city != '全国':
        query_string['city'] = city
    form_data = {
        'first': False,
        'pn': 1,
        'kd': keyword.name,
    }
    headers = generate_http_header(is_crawl_jobs_count=True)
    crawler_sleep()
    try:
        cookies = Cookies.get_random_cookies()
        response = requests.post(url=constants.JOB_JSON_URL,
                                 params=query_string,
                                 data=form_data,
                                 headers=headers,
                                 cookies=cookies,
                                 allow_redirects=False,
                                 timeout=constants.TIMEOUT)
        response_json = response.json()
        if 'content' not in response_json:
            # A reply without 'content' is treated as a sign the cookies are
            # no longer usable, so they are removed before failing.
            Cookies.remove_cookies(cookies)
            raise RequestsError(error_log='wrong response content')
    except (RequestException, ValueError) as e:
        # Older requests releases (before 2.27) report a non-JSON body from
        # Response.json() as a plain ValueError rather than a RequestException
        # subclass; catch it too so callers always receive RequestsError.
        logging.error(e)
        raise RequestsError(error_log=e)
    return response_json
def requests_job_detail_data(job_id):
    """
    Request the job detail page and extract its fields.

    The page at ``constants.JOB_DETAIL_URL`` (formatted with ``job_id``) is
    fetched without following redirects, parsed with ``etree.HTML`` and three
    XPath queries are run against it.

    :param job_id: value substituted into ``constants.JOB_DETAIL_URL``
    :return: whatever ``format_tag`` returns for the department, description
        and keyword nodes together with ``job_id``
    :raises RequestsError: if the HTTP request raises a ``RequestException``
    """
    request_headers = generate_http_header()
    crawler_sleep()
    detail_url = constants.JOB_DETAIL_URL.format(job_id=job_id)
    try:
        session_cookies = Cookies.get_random_cookies()
        page = requests.get(
            url=detail_url,
            headers=request_headers,
            cookies=session_cookies,
            allow_redirects=False,
            timeout=constants.TIMEOUT,
        )
    except RequestException as error:
        logging.error(error)
        raise RequestsError(error_log=error)

    tree = etree.HTML(page.text)
    department_nodes = tree.xpath(
        '//div[@class="job-name"]/div[@class="company"]/text()')
    description_nodes = tree.xpath('//dd[@class="job_bt"]/div//text()')
    keyword_nodes = tree.xpath(
        '//dd[@class="job_request"]//li[@class="labels"]/text()')
    return format_tag(department_nodes, description_nodes, keyword_nodes, job_id)
def requests_company_detail_data(company_id):
    """
    Request the company detail page and extract its fields.

    The page at ``constants.COMPANY_DETAIL_URL`` (formatted with
    ``company_id``) is fetched without following redirects, parsed with
    ``etree.HTML`` and four XPath queries are run against it.

    :param company_id: value substituted into ``constants.COMPANY_DETAIL_URL``
    :return: whatever ``format_tag`` returns for the advantage, address, size
        and introduction nodes together with ``company_id``
    :raises RequestsError: if the HTTP request raises a ``RequestException``
    """
    request_headers = generate_http_header()
    crawler_sleep()
    detail_url = constants.COMPANY_DETAIL_URL.format(company_id=company_id)
    try:
        session_cookies = Cookies.get_random_cookies()
        page = requests.get(
            url=detail_url,
            headers=request_headers,
            cookies=session_cookies,
            allow_redirects=False,
            timeout=constants.TIMEOUT,
        )
    except RequestException as error:
        logging.error(error)
        raise RequestsError(error_log=error)

    tree = etree.HTML(page.text)
    advantage_nodes = tree.xpath('//div[@id="tags_container"]//li/text()')
    size_nodes = tree.xpath('//div[@id="basic_container"]//li[3]/span/text()')
    address_nodes = tree.xpath('//p[@class="mlist_li_desc"]/text()')
    introduce_nodes = tree.xpath('//span[@class="company_content"]//text()')
    # Argument order differs from extraction order: address precedes size.
    return format_tag(advantage_nodes, address_nodes, size_nodes,
                      introduce_nodes, company_id)
def request_job_json(company_id, page_no):
    """
    GET one page of a company's positions from ``constants.COMPANY_JOB_URL``.

    The query asks for ``positionFirstType`` "技术" (technology) with a page
    size of 10.

    :param company_id: sent as the ``companyId`` query parameter
    :param page_no: sent as the ``pageNo`` query parameter
    :return: the decoded JSON body, which contains a 'content' entry
    :raises RequestsError: if the request fails, the body is not valid JSON,
        or the reply lacks 'content' (the cookies used for that request are
        then also discarded through ``Cookies.remove_cookies``)
    """
    prams = {
        'companyId': company_id,
        'positionFirstType': u"技术",
        'pageNo': page_no,
        'pageSize': 10,
    }
    headers = generate_http_header()
    crawler_sleep()
    try:
        cookies = Cookies.get_random_cookies()
        response_json = requests.get(url=constants.COMPANY_JOB_URL,
                                     params=prams,
                                     headers=headers,
                                     cookies=cookies,
                                     timeout=constants.TIMEOUT).json()
        if 'content' not in response_json:
            # A reply without 'content' is treated as a sign the cookies are
            # no longer usable, so they are removed before failing.
            Cookies.remove_cookies(cookies)
            raise RequestsError(error_log='wrong response content')
    except (RequestException, ValueError) as e:
        # Older requests releases (before 2.27) report a non-JSON body from
        # Response.json() as a plain ValueError rather than a RequestException
        # subclass; catch it too so callers always receive RequestsError.
        logging.error(e)
        raise RequestsError(error_log=e)
    return response_json
def request_company_json(url, page_no):
    """
    GET one page of a company listing from ``url`` and return the JSON reply.

    :param url: fully formatted listing URL to request
    :param page_no: sent as the ``pn`` query parameter
    :return: the decoded JSON body, which contains a 'totalCount' entry
    :raises RequestsError: if the request fails, the body is not valid JSON,
        or the reply lacks 'totalCount' (the cookies used for that request
        are then also discarded through ``Cookies.remove_cookies``)
    """
    prams = {
        'first': False,
        'pn': page_no,
        'sortField': 1,
        'havemark': 0,
    }
    headers = generate_http_header()
    crawler_sleep()
    try:
        cookies = Cookies.get_random_cookies()
        response_json = requests.get(url=url,
                                     params=prams,
                                     headers=headers,
                                     cookies=cookies,
                                     allow_redirects=False,
                                     timeout=constants.TIMEOUT).json()
        if 'totalCount' not in response_json:
            # A reply without 'totalCount' is treated as a sign the cookies
            # are no longer usable, so they are removed before failing.
            Cookies.remove_cookies(cookies)
            raise RequestsError(error_log='wrong response content')
    except (RequestException, ValueError) as e:
        # Older requests releases (before 2.27) report a non-JSON body from
        # Response.json() as a plain ValueError rather than a RequestException
        # subclass; catch it too so callers always receive RequestsError.
        logging.error(e)
        raise RequestsError(error_log=e)
    return response_json