def parse_detail_utils_zhaopin(self, response, value):
    """Build a Job51Item from a zhaopin.com job-detail HTML page.

    Args:
        response: the scrapy Response for the detail page.
        value: keyword (regex, case-insensitive) the job title must contain.

    Returns:
        The loaded Job51Item, or None when the title does not match the
        keyword (or the title node is missing from the page).
    """
    # Default to "" so re.search never receives None — extract_first()
    # returns None when the h1 node is absent, which would raise TypeError.
    contain_key_word = response.xpath(
        "//div[@class='main1 cl main1-stat']//h1/text()").extract_first("")
    m = re.search(value, contain_key_word, re.IGNORECASE)
    if not m:
        return None
    itemloader = Job51ItemLoader(item=Job51Item(), response=response)
    itemloader.add_value("url", response.url)
    itemloader.add_value("url_obj_id", get_md5(response.url))
    itemloader.add_value("title", contain_key_word)
    str_salary = response.xpath(
        "//div[@class='l info-money']/strong/text()").extract_first("")
    # Always record salary fields: the original skipped them entirely when
    # the string was neither '元/月' nor '面议', leaving the item incomplete.
    salary_min = salary_max = 0.0
    if '元/月' in str_salary:
        # Expected shape: "<min>-<max>元/月", e.g. "5000-8000元/月".
        try:
            low, high = str_salary.split("-")
            salary_min = float(low)
            salary_max = float(high.strip().split("元")[0].strip())
        except Exception as e:
            # Unparseable salary: fall back to 0.0 like the sibling parsers.
            print('error str_salary', str_salary)
            print(e)
            salary_min = salary_max = 0.0
    # '面议' (negotiable) and any unrecognised format both record 0.0.
    itemloader.add_value("salary_min", salary_min)
    itemloader.add_value("salary_max", salary_max)
    job_city = response.xpath(
        "//div[@class='info-three l']/span/a/text()").extract_first("")
    itemloader.add_value("job_city", job_city)
    experience_year = response.xpath(
        "//div[@class='info-three l']/span[2]/text()").extract_first("")
    itemloader.add_value("experience_year", experience_year)
    education_need = response.xpath(
        "//div[@class='info-three l']/span[3]/text()").extract_first("")
    itemloader.add_value("education_need", education_need)
    # The page exposes no publish date; crawl time is used as a stand-in.
    itemloader.add_value("publish_date", datetime.now())
    job_advantage_tags_list = response.xpath(
        "//div[@class='welfare']//ul//li/text()").extract()
    # A single space placeholder keeps downstream consumers from seeing "".
    job_advantage_tags = (','.join(job_advantage_tags_list)
                          if job_advantage_tags_list else " ")
    position_info_list = response.xpath(
        "//div[@class='responsibility pos-common']//text()").extract()
    position_info = (','.join(position_info_list)
                     if position_info_list else " ")
    itemloader.add_value("job_advantage_tags", job_advantage_tags)
    itemloader.add_value("position_info", position_info)
    # Detail page carries no category; '未分类' means "uncategorised".
    itemloader.add_value("job_classification", "未分类")
    itemloader.add_value("crawl_time", datetime.now())
    return itemloader.load_item()
def parse_detail(self, response):
    """Parse one page of the zhaopin.com search-API JSON response.

    Yields a Job51Item for every result whose jobName matches the 'java'
    keyword, then schedules a Request for the next 60-result page while
    more results remain.

    Args:
        response: scrapy Response whose body is the search-API JSON;
            response.meta may carry 'meta_data' (current start offset)
            and 'page' (current page number).
    """
    if response.status != 200:
        return
    value = 'java'  # keyword the job title must contain
    dict_obj = json.loads(response.text)
    data = dict_obj.get("data", '')
    if data:
        code = dict_obj['code']
        numFound = data['numFound']
        if code == 200 and numFound > 0:
            for i, result in enumerate(data['results']):
                contain_key_word = result['jobName']
                if not re.search(value, contain_key_word, re.IGNORECASE):
                    continue
                itemloader = Job51ItemLoader(item=Job51Item(),
                                             response=response)
                itemloader.add_value("url", result['positionURL'])
                # Every result on the page shares response.url, so a
                # timestamp suffix keeps the ids distinct.
                itemloader.add_value(
                    "url_obj_id",
                    get_md5(response.url) + str(int(time.time())))
                itemloader.add_value("title", contain_key_word)
                str_salary = result['salary']
                if 'K' in str_salary:
                    # Expected shape: "<min>K-<max>K".
                    try:
                        low, high = str_salary.split("-")
                        salary_min = float(
                            low.strip().split("K")[0].strip()) * 1000
                        salary_max = float(
                            high.strip().split("K")[0].strip()) * 1000
                        itemloader.add_value("salary_min", salary_min)
                        itemloader.add_value("salary_max", salary_max)
                    except Exception as e:
                        print('error str_salary', str_salary)
                        print(e)
                else:
                    print('str_salary error', str_salary)
                    itemloader.add_value("salary_min", 0)
                    itemloader.add_value("salary_max", 0)
                itemloader.add_value("job_city",
                                     result['city']['display'])
                experience_year = result['workingExp']['name']
                print('experience_year', experience_year)
                itemloader.add_value("experience_year", experience_year)
                education_need = result['eduLevel']['name']
                print('education_need', education_need)
                itemloader.add_value("education_need", education_need)
                job_advantage_tags_list = result['welfare']
                job_advantage_tags = (','.join(job_advantage_tags_list)
                                      if job_advantage_tags_list else " ")
                # Fetch the full description from the detail page itself.
                position_info_list = \
                    self.get_position_info_contains_job_request_list(
                        result['positionURL'])
                if not position_info_list:
                    print('error response url', response.url)
                    print('error item 第几个', i)
                    position_info = " "
                else:
                    position_info = ','.join(position_info_list)
                itemloader.add_value("job_advantage_tags",
                                     job_advantage_tags)
                itemloader.add_value("position_info", position_info)
                itemloader.add_value("job_classification",
                                     result['jobType']['display'])
                itemloader.add_value("crawl_time", datetime.now())
                # createDate looks like "YYYY-MM-DD HH:MM:SS"; keep the
                # date part only.
                publish_date = \
                    result['createDate'].strip().split(" ")[0].strip()
                itemloader.add_value("publish_date", publish_date)
                yield itemloader.load_item()
    # Pagination: integer defaults (0 / 1) instead of '' — the original
    # compared '' < int and computed '' + 1, both TypeError when meta was
    # absent.  Reuse dict_obj instead of re-parsing response.text, and
    # only paginate when 'data' is actually present.
    start = response.meta.get('meta_data', 0)
    if data and start < data['numFound'] - 60:
        meta_data = 60 + start
        print('start', meta_data)
        page = response.meta.get('page', 1) + 1
        print('current_page', page)
        dic_page = {"p": page, "jl": "489", "kw": "java", "kt": "3"}
        data_str = '{0}'.format(dic_page)
        print(data_str)
        from urllib import parse
        url_data = parse.quote(string=data_str, encoding="utf-8")
        url_next = ('https://fe-api.zhaopin.com/c/i/sou?start={start}'
                    '&pageSize=60&cityId=489&workExperience=-1'
                    '&education=-1&companyType=-1&employmentType=-1'
                    '&jobWelfareTag=-1&kw=java&kt=3'
                    '&lastUrlQuery={lastUrlQuery}').format(
                        start=meta_data, lastUrlQuery=url_data)
        yield Request(url=url_next, callback=self.parse_detail,
                      meta={'meta_data': meta_data, 'page': page})
def parse_detail_utils(self, response, value):
    """Build a Job51Item from a 51job job-detail HTML page.

    Args:
        response: the scrapy Response for the detail page.
        value: keyword (regex, case-insensitive) the job title must contain.

    Returns:
        The loaded Job51Item, or None when the title does not match the
        keyword (or the title node is missing from the page).
    """
    # Default to "" so .strip() / re.search never receive None — the
    # original called .extract_first().strip() and crashed with
    # AttributeError when the h1 node was absent.
    contain_key_word = response.xpath(
        "//div[@class='tHeader tHjob']//h1/text()").extract_first("").strip()
    m = re.search(value, contain_key_word, re.IGNORECASE)
    if not m:
        return None
    itemloader = Job51ItemLoader(item=Job51Item(), response=response)
    itemloader.add_value("url", response.url)
    itemloader.add_value("url_obj_id",
                         get_md5(response.url) + str(int(time.time())))
    itemloader.add_value("title", contain_key_word)
    try:
        # Evaluate the salary xpath once (the original ran it twice).
        str_salary = response.xpath(
            "/html/body/div[3]/div[2]/div[2]/div/div[1]/strong//text()"
        ).extract_first("")
        salary_min = salary_max = 0
        if str_salary:
            if '千/月' in str_salary:
                # "<min>-<max>千/月": thousands of yuan per month.
                list_str = str_salary.split("-")
                salary_min = float(list_str[0]) * 1000
                salary_max = float(
                    list_str[1].strip().split("千")[0].strip()) * 1000
            elif '万/月' in str_salary:
                # "<min>-<max>万/月": ten-thousands of yuan per month.
                list_str = str_salary.strip().split("-")
                salary_min = float(list_str[0]) * 10000
                salary_max = float(
                    list_str[1].strip().split("万")[0].strip()) * 10000
            elif '万/年' in str_salary:
                # "<min>-<max>万/年": yearly figure converted to monthly.
                list_str = str_salary.strip().split("-")
                salary_min = float(list_str[0]) * 10000 / 12
                salary_max = float(
                    list_str[1].strip().split("万")[0].strip()) * 10000 / 12
        itemloader.add_value("salary_min", salary_min)
        itemloader.add_value("salary_max", salary_max)
    except Exception as e:
        print("str_salary error")
        print(e)
        itemloader.add_value("salary_min", 0)
        itemloader.add_value("salary_max", 0)
    # The @title attribute packs city/experience/education/headcount/date
    # into one "|"-separated string; default to "" so .strip() is safe.
    info = response.xpath(
        "//p[@class='msg ltype']/@title").extract_first("")
    job_city = info.strip().split("|")[0].strip()
    itemloader.add_value("job_city", job_city)
    experience_year = find_in_list(self, key="经验", list_name=info)
    itemloader.add_value("experience_year", experience_year)
    try:
        education_need = info.strip().split("|")[2].strip()
        if '人' in education_need:
            # When education is absent the third field is the headcount
            # ("N人"), so record "无" (none) instead.
            education_need = "无"
        itemloader.add_value("education_need", education_need)
    except Exception as e:
        # Best effort: fewer than three fields means no education info.
        print("education_need error null")
        print(e)
    publish_date = find_in_list(self, key="发布", list_name=info)
    itemloader.add_value("publish_date", publish_date)
    job_advantage_tags_list = response.xpath(
        "//div[@class='t1']//span/text()").extract()
    # A single space placeholder keeps downstream consumers from seeing "".
    job_advantage_tags = (','.join(job_advantage_tags_list)
                          if job_advantage_tags_list else " ")
    position_info_list = response.xpath(
        "//div[@class='bmsg job_msg inbox']/p//text()").extract()
    position_info = (','.join(position_info_list)
                     if position_info_list else " ")
    itemloader.add_value("job_advantage_tags", job_advantage_tags)
    itemloader.add_value("position_info", position_info)
    job_classification = response.xpath(
        "//div[@class='tCompany_main']//div[@class='mt10']/p[1]//a/text()"
    ).extract_first("")
    itemloader.add_value("job_classification", job_classification)
    itemloader.add_value("crawl_time", datetime.now())
    return itemloader.load_item()