def parse_detail_utils_zhaopin(self, response, value):
    """Parse a zhaopin.com job-detail HTML page into a Job51Item.

    Only pages whose <h1> title matches *value* (case-insensitive regex
    search) yield an item; otherwise the method implicitly returns None.

    :param response: Scrapy Response for the detail page
    :param value: regex pattern (e.g. 'java') the job title must contain
    :return: loaded Job51Item, or None when the title does not match
    """
    # Default to "" so a missing <h1> cannot crash re.search with None.
    contain_key_word = response.xpath(
        "//div[@class='main1 cl main1-stat']//h1/text()").extract_first("")
    m = re.search(value, contain_key_word, re.IGNORECASE)
    if m:
        itemloader = Job51ItemLoader(item=Job51Item(), response=response)
        itemloader.add_value("url", response.url)
        itemloader.add_value("url_obj_id", get_md5(response.url))
        itemloader.add_value("title", contain_key_word)
        str_salary = response.xpath(
            "//div[@class='l info-money']/strong/text()").extract_first("")
        if '元/月' in str_salary:
            # Expected form: "<min>-<max>元/月"; guard against variants so a
            # malformed string cannot abort the whole item.
            try:
                list_str = str_salary.split("-")
                salary_min = float(list_str[0])
                salary_max = float(list_str[1].strip().split("元")[0].strip())
            except Exception as e:
                print('error str_salary', str_salary)
                print(e)
                salary_min = 0.0
                salary_max = 0.0
        elif '面议' in str_salary:
            # "negotiable" — no numeric salary published.
            salary_min = 0.0
            salary_max = 0.0
        else:
            # Unknown salary format: record zeros instead of silently leaving
            # the fields unset (consistent with the other parsers here).
            print('str_salary error', str_salary)
            salary_min = 0.0
            salary_max = 0.0
        itemloader.add_value("salary_min", salary_min)
        itemloader.add_value("salary_max", salary_max)
        job_city = response.xpath(
            "//div[@class='info-three l']/span/a/text()").extract_first("")
        itemloader.add_value("job_city", job_city)
        experience_year = response.xpath(
            "//div[@class='info-three l']/span[2]/text()").extract_first("")
        itemloader.add_value("experience_year", experience_year)
        education_need = response.xpath(
            "//div[@class='info-three l']/span[3]/text()").extract_first("")
        itemloader.add_value("education_need", education_need)
        itemloader.add_value("publish_date", datetime.now())
        job_advantage_tags_list = response.xpath(
            "//div[@class='welfare']//ul//li/text()").extract()
        # Single-space placeholder (not "") when nothing was scraped, so
        # downstream storage still receives a value.
        if job_advantage_tags_list:
            job_advantage_tags = ','.join(job_advantage_tags_list)
        else:
            job_advantage_tags = " "
        position_info_contains_job_request_list = response.xpath(
            "//div[@class='responsibility pos-common']//text()").extract()
        if position_info_contains_job_request_list:
            position_info_contains_job_request = ','.join(
                position_info_contains_job_request_list)
        else:
            position_info_contains_job_request = " "
        itemloader.add_value("job_advantage_tags", job_advantage_tags)
        itemloader.add_value("position_info",
                             position_info_contains_job_request)
        # "未分类" = "unclassified"; this page variant exposes no category.
        itemloader.add_value("job_classification", "未分类")
        itemloader.add_value("crawl_time", datetime.now())
        item = itemloader.load_item()
        return item
def parse_detail(self, response):
    """Parse one page of zhaopin.com's JSON search API.

    Yields a Job51Item for each result whose jobName matches the
    hard-coded keyword, then yields a Request for the next result page
    (pageSize=60) while more results remain.
    """
    if response.status == 200:
        # Keyword filter applied to every result's jobName below.
        value = 'java'
        dict_obj = json.loads(response.text)
        if dict_obj.get("data", ''):
            code = dict_obj['code']
            numFound = dict_obj['data']['numFound']
            # print(type((json.loads(response.text))['data']['numFound']))
            # print(type(response.meta['meta_data']))
            list_len = len(dict_obj['data']['results'])
            if code == 200 and numFound > 0:
                for i in range(list_len):
                    contain_key_word = dict_obj['data']['results'][i][
                        'jobName']
                    m = re.search(value, contain_key_word, re.IGNORECASE)
                    if m:
                        itemloader = Job51ItemLoader(item=Job51Item(),
                                                     response=response)
                        itemloader.add_value(
                            "url",
                            dict_obj['data']['results'][i]['positionURL'])
                        # Timestamp suffix keeps ids unique even though every
                        # result on this page shares the same response.url.
                        itemloader.add_value(
                            "url_obj_id",
                            get_md5(response.url) + str(int(time.time())))
                        itemloader.add_value("title", contain_key_word)
                        str_salary = dict_obj['data']['results'][i][
                            'salary']
                        # Salary arrives as e.g. "10K-15K"; convert to yuan.
                        if 'K' in str_salary:
                            try:
                                list_str = str_salary.split("-")
                                salary_min = float(list_str[0].strip(
                                ).split("K")[0].strip()) * 1000
                                salary_max = float(list_str[1].strip(
                                ).split("K")[0].strip()) * 1000
                                itemloader.add_value(
                                    "salary_min", salary_min)
                                itemloader.add_value(
                                    "salary_max", salary_max)
                            except Exception as e:
                                print('error str_salary', str_salary)
                                print(e)
                        else:
                            print('str_salary error', str_salary)
                            itemloader.add_value("salary_min", 0)
                            itemloader.add_value("salary_max", 0)
                        job_city = dict_obj['data']['results'][i]['city'][
                            'display']
                        itemloader.add_value("job_city", job_city)
                        experience_year = dict_obj['data']['results'][i][
                            'workingExp']['name']
                        print('experience_year', experience_year)
                        itemloader.add_value("experience_year",
                                             experience_year)
                        education_need = dict_obj['data']['results'][i][
                            'eduLevel']['name']
                        print('education_need', education_need)
                        itemloader.add_value("education_need",
                                             education_need)
                        job_advantage_tags_list = dict_obj['data'][
                            'results'][i]['welfare']
                        if len(job_advantage_tags_list) == 0:
                            job_advantage_tags = " "
                        else:
                            job_advantage_tags = ','.join(
                                job_advantage_tags_list)
                        # Fetches the job-description text from the detail
                        # page synchronously (helper defined elsewhere).
                        position_info_contains_job_request_list = self.get_position_info_contains_job_request_list(
                            dict_obj['data']['results'][i]['positionURL'])
                        if len(position_info_contains_job_request_list
                               ) == 0:
                            print('error response url', response.url)
                            print('error item 第几个', i)
                            position_info_contains_job_request = " "
                        else:
                            position_info_contains_job_request = ','.join(
                                position_info_contains_job_request_list)
                        itemloader.add_value("job_advantage_tags",
                                             job_advantage_tags)
                        itemloader.add_value(
                            "position_info",
                            position_info_contains_job_request)
                        itemloader.add_value(
                            "job_classification", dict_obj['data']
                            ['results'][i]['jobType']['display'])
                        itemloader.add_value("crawl_time", datetime.now())
                        # Keep only the date part of "YYYY-MM-DD HH:MM:SS".
                        # NOTE(review): the trailing + "" is a no-op.
                        publish_date = dict_obj['data']['results'][i][
                            'createDate'].strip().split(
                                " ")[0].strip() + ""
                        itemloader.add_value("publish_date", publish_date)
                        item = itemloader.load_item()
                        yield item
    # Pagination: request the next 60-result slice while the running offset
    # ('meta_data') is still more than one page short of numFound.
    # NOTE(review): the '' default makes this comparison raise TypeError in
    # Python 3 whenever 'meta_data' is absent from response.meta — presumably
    # the initial request always sets it; confirm against the start_requests.
    if response.status == 200 and response.meta.get(
            'meta_data',
            '') < (json.loads(response.text))['data']['numFound'] - 60:
        meta_data = 60 + response.meta.get('meta_data', '')
        print('start', meta_data)
        page = response.meta.get('page', '') + 1
        print('current_page', page)
        # lastUrlQuery echoes the search parameters of the previous page.
        dic_page = {"p": 1, "jl": "489", "kw": "java", "kt": "3"}
        dic_page['p'] = page
        data = '{0}'.format(dic_page)
        print(data)
        from urllib import parse
        url_data = parse.quote(string=data, encoding="utf-8")
        url_next = 'https://fe-api.zhaopin.com/c/i/sou?start={start}&pageSize=60&cityId=489&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=java&kt=3&lastUrlQuery={lastUrlQuery}'.format(
            start=meta_data, lastUrlQuery=url_data)
        yield Request(url=url_next,
                      callback=self.parse_detail,
                      meta={
                          'meta_data': meta_data,
                          'page': page
                      })
def parse_job(self, response):
    """Parse a Lagou job-detail page into a LagouJobItem.

    On a 302 response this first attempts to solve Lagou's captcha via
    the ruokuai answering service and submit it through Selenium, then
    returns without producing an item. Otherwise it extracts the job
    fields when the title matches one of the tracked tech keywords.
    """
    global global_result
    if response.status == 302:
        # Redirected — Lagou is probably serving a captcha page.
        print("302")
        print(response.url)
        try:
            time.sleep(1)
            src = response.xpath(
                "//img[@id='captcha']/@src").extract_first("")
            if src:
                print('src:', src)
                img_src = "https://www.lagou.com" + src
                try:
                    # Download the captcha image, save it locally, and send
                    # it to the ruokuai recognition service (type 3040).
                    image = Image.open(
                        BytesIO((requests.get(img_src)).content))
                    image.save('verify2.gif')
                    rcf = RClientFour(self.ruokuai_username,
                                      self.ruokuai_passwd)
                    image = open('verify2.gif', 'rb').read()
                    global_result = rcf.rk_create_code(image,
                                                       3040).get('Result')
                    print('result:', global_result)
                except IOError:
                    # Message: "check whether your ruokuai credits ran out"
                    print('*****检查自己的快豆是不是没了****')
                    pass
                # time.sleep(100000)
                # NOTE(review): this launches a brand-new Chrome with no page
                # loaded before looking up #code/#submit, so the submission
                # below cannot find those elements as written — confirm
                # whether an already-navigated shared browser was intended.
                browser = webdriver.Chrome(
                    executable_path="/home/wqh/下载/chromedriver")
                browser.find_element_by_xpath("//*[@id='code']").send_keys(
                    global_result)
                browser.find_element_by_xpath("//a[@id='submit']").click()
                return
        except Exception as e:
            print(e)
            print('不是验证页面')
            pass
    title = response.xpath(
        "/html/body/div[2]/div/div[1]/div/span").extract_first("")
    item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    list_type = []
    flag = False
    # Collect every tracked technology keyword present in the title.
    # NOTE(review): the bare "go" pattern also matches substrings such as
    # "Django" — confirm whether a word-boundary match was intended.
    m = re.search("java", title, re.IGNORECASE)
    if m:
        flag = True
        list_type.append("java")
    if re.search("python", title, re.IGNORECASE):
        flag = True
        list_type.append("python")
    if re.search("人工智能", title, re.IGNORECASE):
        flag = True
        list_type.append("人工智能")
    if re.search("算法", title, re.IGNORECASE):
        flag = True
        list_type.append("算法")
    if re.search("大数据", title, re.IGNORECASE):
        flag = True
        list_type.append("大数据")
    if re.search("C\+\+", title, re.IGNORECASE):
        flag = True
        list_type.append("C++")
    if re.search("go", title, re.IGNORECASE):
        flag = True
        list_type.append("go")
    if flag:
        # Parse the Lagou job posting fields.
        item_loader.add_value("type", list_type)
        item_loader.add_css("title", ".job-name::attr(title)")
        item_loader.add_value("url", response.url)
        # Timestamp suffix keeps the id unique across repeated crawls.
        item_loader.add_value(
            "url_obj_id",
            get_md5(response.url) + str(int(time.time())))
        str_salary = response.xpath(
            "//span[@class='salary']/text()").extract_first("")
        # Salary arrives as e.g. "10k-15k"; convert to yuan per month.
        if 'k' in str_salary:
            try:
                list_str = str_salary.split("-")
                salary_min = float(
                    list_str[0].strip().split("k")[0].strip()) * 1000
                salary_max = float(
                    list_str[1].strip().split("k")[0].strip()) * 1000
                item_loader.add_value("salary_min", salary_min)
                item_loader.add_value("salary_max", salary_max)
            except Exception as e:
                print('error str_salary', str_salary)
                print(e)
        else:
            print('str_salary error', str_salary)
            item_loader.add_value("salary_min", 0)
            item_loader.add_value("salary_max", 0)
        # item_loader.add_css("salary", ".job_request .salary::text")
        item_loader.add_xpath(
            "job_city", "//*[@class='job_request']/p/span[2]/text()")
        item_loader.add_xpath(
            "experience_year",
            "//*[@class='job_request']/p/span[3]/text()")
        item_loader.add_xpath(
            "education_need",
            "//*[@class='job_request']/p/span[4]/text()")
        item_loader.add_xpath(
            "job_type", "//*[@class='job_request']/p/span[5]/text()")
        # This variant records the raw title as the classification.
        item_loader.add_value("job_classification", title)
        item_loader.add_css("publish_date", ".publish_time::text")
        item_loader.add_css("job_advantage_tags", ".job-advantage p::text")
        item_loader.add_css("position_info", ".job_bt div")
        item_loader.add_css("job_addr", ".work_addr")
        item_loader.add_css("company_name",
                            "#job_company dt a img::attr(alt)")
        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_value("crawl_time", datetime.now())
        job_item = item_loader.load_item()
        return job_item
def parse_job(self, response):
    """Parse a Lagou job-detail page into a LagouJobItem.

    Extracts the job fields only when the page title contains one of the
    tracked technology keywords; otherwise returns None implicitly.

    NOTE(review): this class appears to define parse_job twice — this
    later definition shadows the earlier captcha-handling one; confirm
    which is intended to be active.

    :param response: Scrapy Response for the detail page
    :return: loaded LagouJobItem, or None when no keyword matches
    """
    title = response.xpath(
        "/html/body/div[2]/div/div[1]/div/span").extract_first("")
    item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    # (regex pattern, label) pairs — the pattern and the recorded label
    # differ only for C++ where the '+' must be escaped in the regex.
    keyword_patterns = [
        ("java", "java"),
        ("python", "python"),
        ("人工智能", "人工智能"),
        ("算法", "算法"),
        ("大数据", "大数据"),
        (r"C\+\+", "C++"),
        ("go", "go"),
    ]
    list_type = [
        label for pattern, label in keyword_patterns
        if re.search(pattern, title, re.IGNORECASE)
    ]
    if list_type:
        # Parse the Lagou job posting fields.
        item_loader.add_value("type", list_type)
        item_loader.add_css("title", ".job-name::attr(title)")
        item_loader.add_value("url", response.url)
        # Timestamp suffix keeps the id unique across repeated crawls.
        item_loader.add_value(
            "url_obj_id",
            get_md5(response.url) + str(int(time.time())))
        str_salary = response.xpath(
            "//span[@class='salary']/text()").extract_first("")
        # Salary arrives as e.g. "10k-15k"; convert to yuan per month.
        if 'k' in str_salary:
            try:
                list_str = str_salary.split("-")
                salary_min = float(
                    list_str[0].strip().split("k")[0].strip()) * 1000
                salary_max = float(
                    list_str[1].strip().split("k")[0].strip()) * 1000
                item_loader.add_value("salary_min", salary_min)
                item_loader.add_value("salary_max", salary_max)
            except Exception as e:
                print('error str_salary', str_salary)
                print(e)
        else:
            print('str_salary error', str_salary)
            item_loader.add_value("salary_min", 0)
            item_loader.add_value("salary_max", 0)
        # item_loader.add_css("salary", ".job_request .salary::text")
        item_loader.add_xpath(
            "job_city", "//*[@class='job_request']/p/span[2]/text()")
        item_loader.add_xpath(
            "experience_year",
            "//*[@class='job_request']/p/span[3]/text()")
        item_loader.add_xpath(
            "education_need",
            "//*[@class='job_request']/p/span[4]/text()")
        item_loader.add_xpath(
            "job_type", "//*[@class='job_request']/p/span[5]/text()")
        try:
            item_loader.add_css("job_classification",
                                '.position-label li::text')
        except Exception as e:
            print("job_classification error")
            print(e)
            # BUG FIX: the fallback previously used add_value(), which stored
            # the literal selector string '.job-name::attr(title)' as data;
            # use add_css() so the title is actually extracted.
            item_loader.add_css("job_classification",
                                '.job-name::attr(title)')
        item_loader.add_css("publish_date", ".publish_time::text")
        item_loader.add_css("job_advantage_tags", ".job-advantage p::text")
        item_loader.add_css("position_info", ".job_bt div")
        item_loader.add_css("job_addr", ".work_addr")
        item_loader.add_css("company_name",
                            "#job_company dt a img::attr(alt)")
        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_value("crawl_time", datetime.now())
        job_item = item_loader.load_item()
        return job_item
def parse_detail_utils(self, response, value):
    """Parse a 51job (tHjob layout) detail page into a Job51Item.

    Only pages whose <h1> title matches *value* (case-insensitive regex
    search) yield an item; otherwise the method implicitly returns None.

    :param response: Scrapy Response for the detail page
    :param value: regex pattern (e.g. 'java') the job title must contain
    :return: loaded Job51Item, or None when the title does not match
    """
    # Default to "" so a page without the expected <h1> is simply skipped
    # instead of raising AttributeError on None.strip().
    contain_key_word = response.xpath(
        "//div[@class='tHeader tHjob']//h1/text()").extract_first("").strip()
    m = re.search(value, contain_key_word, re.IGNORECASE)
    if m:
        itemloader = Job51ItemLoader(item=Job51Item(), response=response)
        itemloader.add_value("url", response.url)
        # Timestamp suffix keeps the id unique across repeated crawls.
        itemloader.add_value(
            "url_obj_id",
            get_md5(response.url) + str(int(time.time())))
        itemloader.add_value("title", contain_key_word)
        # Normalize every salary form to yuan per month; 0/0 when absent
        # or unparseable.
        salary_min = 0
        salary_max = 0
        try:
            # Evaluate the salary XPath once (the original queried it twice).
            str_salary = response.xpath(
                "/html/body/div[3]/div[2]/div[2]/div/div[1]/strong//text()"
            ).extract_first("")
            if '千/月' in str_salary:
                # "<min>-<max>千/月" — thousands of yuan per month.
                list_str = str_salary.split("-")
                salary_min = float(list_str[0]) * 1000
                salary_max = float(
                    list_str[1].strip().split("千")[0].strip()) * 1000
            elif '万/月' in str_salary:
                # "<min>-<max>万/月" — ten-thousands of yuan per month.
                list_str = str_salary.strip().split("-")
                salary_min = float(list_str[0]) * 10000
                salary_max = float(
                    list_str[1].strip().split("万")[0].strip()) * 10000
            elif '万/年' in str_salary:
                # "<min>-<max>万/年" — yearly figure converted to monthly.
                list_str = str_salary.strip().split("-")
                salary_min = float(list_str[0]) * 10000 / 12
                salary_max = float(
                    list_str[1].strip().split("万")[0].strip()) * 10000 / 12
        except Exception as e:
            print("str_salary error")
            print(e)
            salary_min = 0
            salary_max = 0
        itemloader.add_value("salary_min", salary_min)
        itemloader.add_value("salary_max", salary_max)
        # The @title attribute holds a "|"-separated summary line —
        # presumably "city | experience | education | headcount | date";
        # default to "" so a missing attribute cannot crash .strip().
        info = response.xpath(
            "//p[@class='msg ltype']/@title").extract_first("")
        job_city = info.strip().split("|")[0].strip()
        # find_in_list is a module-level helper that locates the segment
        # containing the key ("经验" = experience, "发布" = published).
        experience_year = find_in_list(self, key="经验", list_name=info)
        itemloader.add_value("job_city", job_city)
        itemloader.add_value("experience_year", experience_year)
        try:
            education_need = info.strip().split("|")[2].strip()
            # The third segment is sometimes a headcount ("招X人") rather
            # than an education level; record "无" (none) in that case.
            if '人' in education_need:
                education_need = "无"
            itemloader.add_value("education_need", education_need)
        except Exception as e:
            print("education_need error null")
            print(e)
        publish_date = find_in_list(self, key="发布", list_name=info)
        itemloader.add_value("publish_date", publish_date)
        job_advantage_tags_list = response.xpath(
            "//div[@class='t1']//span/text()").extract()
        # Single-space placeholder (not "") when nothing was scraped, so
        # downstream storage still receives a value.
        if job_advantage_tags_list:
            job_advantage_tags = ','.join(job_advantage_tags_list)
        else:
            job_advantage_tags = " "
        position_info_contains_job_request_list = response.xpath(
            "//div[@class='bmsg job_msg inbox']/p//text()").extract()
        if position_info_contains_job_request_list:
            position_info_contains_job_request = ','.join(
                position_info_contains_job_request_list)
        else:
            position_info_contains_job_request = " "
        itemloader.add_value("job_advantage_tags", job_advantage_tags)
        itemloader.add_value("position_info",
                             position_info_contains_job_request)
        job_classification = response.xpath(
            "//div[@class='tCompany_main']//div[@class='mt10']/p[1]//a/text()"
        ).extract_first("")
        itemloader.add_value("job_classification", job_classification)
        itemloader.add_value("crawl_time", datetime.now())
        item = itemloader.load_item()
        return item