def parse_job(self, response):
    """Load the job-detail fields found in ``response`` into a LagouJobItem.

    Every selector result is handed to a ``LagouJobItemLoader``; the item it
    produces is returned.
    """
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    page_url = response.url

    loader.add_css("title", ".job-name::attr(title)")
    loader.add_value("url", page_url)
    loader.add_value("url_object_id", get_md5(page_url))

    # These fields are read from consecutive <span> children of
    # .job_request/p; XPath positions start at 1.
    span_fields = ("salary", "job_city", "work_years", "degree_need", "job_type")
    for position, field_name in enumerate(span_fields, start=1):
        loader.add_xpath(
            field_name,
            "//*[@class='job_request']/p/span[%d]/text()" % position,
        )

    css_fields = (
        ("tags", ".position-label li::text"),
        ("publish_time", ".publish_time::text"),
        ("job_advantage", ".job-advantage p::text"),
        ("job_desc", ".job_bt div"),
        ("job_addr", ".work_addr"),
        ("company_name", "#job_company dt a div h2::text"),
        ("company_url", "#job_company dt a::attr(href)"),
    )
    for field_name, selector in css_fields:
        loader.add_css(field_name, selector)

    loader.add_value("crawl_time", datetime.now())
    return loader.load_item()
def parse_job(self, response):
    """Parse a Lagou job-detail page into a LagouJobItem.

    Args:
        response: The page response; it is queried through
            ``LagouJobItemLoader`` and ``response.css``.

    Returns:
        The item produced by ``LagouJobItemLoader.load_item()``.

    A page without a ``.publish_time`` text node no longer raises
    ``IndexError``; the ``publish_time`` field is simply not added.
    """
    item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    item_loader.add_css("title", ".job-name::attr(title)")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("salary", ".job_request .salary::text")
    item_loader.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text()")
    item_loader.add_xpath("work_years", "//*[@class='job_request']/p/span[3]/text()")
    item_loader.add_xpath("degree_need", "//*[@class='job_request']/p/span[4]/text()")
    item_loader.add_xpath("job_type", "//*[@class='job_request']/p/span[5]/text()")
    item_loader.add_css("tags", '.position-label li::text')

    # Keep only the part before the first space of the publish-time text.
    # extract_first() returns None instead of raising when nothing matches.
    raw_publish_time = response.css(".publish_time::text").extract_first()
    if raw_publish_time:
        publish_time = raw_publish_time.split(" ")[0].strip()
        item_loader.add_value("publish_time", publish_time)

    item_loader.add_css("job_advantage", ".job-advantage p::text")
    item_loader.add_css("job_desc", ".job_bt div")
    item_loader.add_css("job_addr", ".work_addr")
    item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
    item_loader.add_css("company_url", "#job_company dt a::attr(href)")
    item_loader.add_value("crawl_time", datetime.now())

    # NOTE(review): this pause is kept so the crawl rate does not change.
    # If this runs as a Scrapy callback, time.sleep() blocks the whole
    # event loop; consider the DOWNLOAD_DELAY setting instead.
    time.sleep(3)

    job_item = item_loader.load_item()
    return job_item
def parse_job(self, response):
    """Parse a Lagou job posting page and return the loaded LagouJobItem."""
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    job_url = response.url

    loader.add_css("title", ".job-name::attr(title)")
    loader.add_value("url", job_url)
    loader.add_value("url_object_id", get_md5(job_url))
    loader.add_css("salary", ".job_request .salary::text")

    # The next four fields are picked by <span> position, which is easier
    # to express in XPath than in CSS.
    positional_fields = ("job_city", "work_years", "degree_need", "job_type")
    for span_index, field_name in enumerate(positional_fields, start=2):
        loader.add_xpath(
            field_name,
            "//*[@class='job_request']/p/span[{}]/text()".format(span_index),
        )

    loader.add_css("tags", ".position-label li::text")
    loader.add_css("publish_time", ".publish_time::text")
    loader.add_css("job_advantage", ".job-advantage p::text")

    # Take the whole HTML of the description here.
    loader.add_css("job_desc", ".job_bt div")

    # Some address parts sit under <a> tags, so the text cannot be taken
    # directly; grab the whole element and process it later.
    loader.add_css("job_addr", ".work_addr")

    # Note: job_company is an id, hence "#" rather than ".".
    loader.add_css("company_name", "#job_company dt a img::attr(alt)")
    loader.add_css("company_url", "#job_company dt a::attr(href)")

    loader.add_value("crawl_time", datetime.now())

    # TODO: should this be a yield rather than a return?
    return loader.load_item()
def parse_job(self, response):
    """Collect the job-detail fields from ``response`` into a LagouJobItem.

    Returns the item built by ``LagouJobItemLoader.load_item()``.
    """
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    source_url = response.url

    loader.add_css("title", ".job-name::attr(title)")
    loader.add_css("tags", ".job_request ul li::text")
    loader.add_value("url", source_url)
    loader.add_value("url_object_id", get_md5(source_url))
    loader.add_css("salary", ".salary::text")

    # Spans 2..5 under .job_request/h3 are mapped to these fields, in order.
    h3_span_fields = ("job_city", "work_years", "degree_need", "job_type")
    for span_no, field_name in enumerate(h3_span_fields, start=2):
        loader.add_xpath(
            field_name,
            "//*[@class='job_request']/h3/span[%d]/text()" % span_no,
        )

    for field_name, selector in (
        ("publish_time", ".publish_time::text"),
        ("job_advantage", ".job-advantage p::text"),
        ("job_desc", ".job_bt div"),
        ("job_address", ".work_addr"),
        ("company_url", "#job_company dt a::attr(href)"),
        ("company_name", ".job_company_content em::text"),
    ):
        loader.add_css(field_name, selector)

    # Two separate clock reads, one per timestamp field.
    loader.add_value("crawl_time", datetime.datetime.now())
    loader.add_value("crawl_update_time", datetime.datetime.now())

    return loader.load_item()
def parse_job(self, response):
    """Parse a job posting page and return the loaded LagouJobItem."""
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    posting_url = response.url

    loader.add_css("title", ".job-name::attr(title)")
    loader.add_value("url", posting_url)
    loader.add_value("url_object_id", get_md5(posting_url))
    loader.add_css("salary", ".job_request .salary::text")

    # Fields taken from the 2nd..5th <span> under .job_request/p.
    xpath_fields = (
        ("job_city", "//*[@class='job_request']/p/span[2]/text()"),
        ("work_years", "//*[@class='job_request']/p/span[3]/text()"),
        ("degree_need", "//*[@class='job_request']/p/span[4]/text()"),
        ("job_type", "//*[@class='job_request']/p/span[5]/text()"),
    )
    for field_name, query in xpath_fields:
        loader.add_xpath(field_name, query)

    css_fields = (
        ("tags", ".position-label li::text"),
        ("publish_time", ".publish_time::text"),
        ("job_advantage", ".job-advantage p::text"),
        ("job_desc", ".job_bt div"),
        ("job_addr", ".work_addr"),
        ("company_name", "#job_company dt a img::attr(alt)"),
        ("company_url", "#job_company dt a::attr(href)"),
    )
    for field_name, query in css_fields:
        loader.add_css(field_name, query)

    loader.add_value("crawl_time", datetime.datetime.now())
    return loader.load_item()

# Disabled overrides, kept for reference:
# def parse_start_url(self, response):
#     return []
#
# def process_results(self, response, results):
#     return results
def parse_job(self, response):
    """Fill a job item from ``response`` via ``LagouJobItemLoader`` and return it."""
    # NOTE(review): "LagouJonItem" may be a typo for "LagouJobItem" -- confirm
    # against the items module; the name is kept exactly as referenced here.
    loader = LagouJobItemLoader(item=LagouJonItem(), response=response)
    url = response.url

    loader.add_css('title', '.job-name::attr(title)')
    loader.add_value('url', url)
    loader.add_value('url_object_id', get_md5(url))
    loader.add_css('salary', '.job_request .salary::text')

    # <span> positions 2..5 inside .job_request/p, in field order.
    for offset, field_name in enumerate(
        ('job_city', 'work_years', 'degree_need', 'job_type')
    ):
        loader.add_xpath(
            field_name,
            "//*[@class='job_request']/p/span[%d]/text()" % (offset + 2),
        )

    remaining_css = [
        ('publish_time', '.publish_time::text'),
        ('tags', '.position-label li::text'),
        ('job_advantage', '.job-advantage p::text'),
        ('job_desc', '.job_bt div'),
        ('job_addr', '.work_addr'),
        ('company_url', '#job_company dt a::attr(href)'),
        ('company_name', '#job_company dt a img::attr(alt)'),
    ]
    for field_name, selector in remaining_css:
        loader.add_css(field_name, selector)

    loader.add_value('crawl_time', datetime.now())
    return loader.load_item()
def parse_job(self, response):
    """Build a LagouJobItem from the job-detail page in ``response``.

    Only the page URL and selector-derived fields are loaded; no hash id or
    crawl timestamp is added by this variant.
    """
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

    loader.add_css("title", ".job-name span::text")
    loader.add_value("url", response.url)
    loader.add_css("salary", ".salary::text")

    # Spans 2..5 under .job_request/p map to these fields, in order.
    span_no = 2
    for field_name in ("job_city", "work_years", "degree_need", "job_type"):
        loader.add_xpath(
            field_name,
            "//*[@class='job_request']/p/span[{}]/text()".format(span_no),
        )
        span_no += 1

    for field_name, selector in (
        ("publish_time", ".publish_time::text"),
        ("job_advantage", ".job-advantage p::text"),
        ("job_desc", ".job_bt div"),
        ("job_addr", ".work_addr"),
        ("company_url", "#job_company dt a::attr(href)"),
        ("company_name", "#job_company dt a div h2::text"),
    ):
        loader.add_css(field_name, selector)

    return loader.load_item()
def parse_job(self, response):
    """Parse the job-information page and return the loaded LagouJobItem."""
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

    loader.add_css("title", '.job-name::attr(title)')

    # Original note: min/max salary and min/max work years are left to be
    # handled later, in the item.
    loader.add_css("salary", '.job_request .salary::text')
    request_span_fields = ("job_city", "work_years", "degree_need", "job_type")
    for span_index, field_name in enumerate(request_span_fields, start=2):
        loader.add_xpath(
            field_name,
            '//*[@class="job_request"]/p/span[%d]/text()' % span_index,
        )

    for field_name, selector in (
        ("tags", '.position-label li::text'),
        ("publish_time", '.publish_time::text'),
        ("job_advantage", '.job-advantage p::text'),
        ("job_desc", '.job_bt div'),
    ):
        loader.add_css(field_name, selector)

    # Original note: extracting the location is done in the item.
    loader.add_css("job_address", '.work_addr')

    loader.add_css("company_name", '.job_company dt a img::attr(alt)')
    loader.add_css("company_url", '.job_company dt a::attr(href)')

    loader.add_value("crawl_time", datetime.now())
    page_url = response.url
    loader.add_value("url_object_id", get_md5(page_url))
    loader.add_value("url", page_url)

    return loader.load_item()
def parse_item(self, response):
    """Parse a Lagou job page, print the loaded item and return it."""
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    page_url = response.url

    loader.add_css('title', '.job-name span::text')
    loader.add_value('url', page_url)
    loader.add_value('url_obj_id', get_md5(page_url))
    loader.add_css('salary', '.job_request .salary ::text')

    # Spans 2..5 under dd.job_request/p, in field order.
    span_fields = ('job_city', 'work_years', 'degree_need', 'job_type')
    for span_index, field_name in enumerate(span_fields, start=2):
        loader.add_xpath(
            field_name,
            '//dd[@class="job_request"]/p/span[%d]/text()' % span_index,
        )

    css_fields = (
        ('tags', '.position-label li::text'),
        ('publish_time', '.publish_time::text'),
        ('job_advantage', '.job-advantage p::text'),
        ('job_desc', '.job_bt div'),
        ('job_addr', '.work_addr'),
        ('company_url', '#job_company a::attr(href)'),
        ('company_name', '#job_company img::attr(alt)'),
    )
    for field_name, selector in css_fields:
        loader.add_css(field_name, selector)

    # Only the date part of the current time is stored.
    today = datetime.datetime.now().date()
    loader.add_value('craw_time', today)
    loader.add_value('craw_update_time', '')

    job_item = loader.load_item()
    print(job_item)
    return job_item
def parse_job(self, response):
    """Parse a Lagou job posting page and return the loaded LagouJobItem."""
    # Leftover template code, kept for reference:
    # i = {}
    # i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()

    # Original note: keep processing logic here minimal; data cleaning is
    # meant to live in the ItemLoader. CSS reminder: ".class", "#id".
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    job_url = response.url

    loader.add_css("title", ".job-name::attr(title)")
    loader.add_value("url", job_url)
    # Original note: see the md5 usage in jobbole4.
    loader.add_value("url_object_id", get_md5(job_url))
    loader.add_css("salary", ".job_request .salary::text")

    span_queries = (
        ("job_city", "//*[@class='job_request']/p/span[2]/text()"),
        ("work_years", "//*[@class='job_request']/p/span[3]/text()"),
        ("degree_need", "//*[@class='job_request']/p/span[4]/text()"),
        ("job_type", "//*[@class='job_request']/p/span[5]/text()"),
    )
    for field_name, query in span_queries:
        loader.add_xpath(field_name, query)

    loader.add_css("tags", ".position-label li::text")
    # Original note: publish_time still needs str conversion and splitting.
    loader.add_css("publish_time", ".publish_time::text")

    for field_name, selector in (
        ("job_advantage", ".job-advantage p::text"),
        ("job_desc", ".job_bt div"),
        ("job_addr", ".work_addr"),
        ("company_name", "#job_company dt a img::attr(alt)"),
        ("company_url", "#job_company a::attr(href)"),
    ):
        loader.add_css(field_name, selector)

    loader.add_value("crawl_time", datetime.now())
    return loader.load_item()
def parse_job(self, response):
    """Load job-detail fields from ``response`` and return the LagouJobItem."""
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    # Leftover template code, kept for reference:
    # i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
    # i['name'] = response.xpath('//div[@id="name"]').extract()
    # i['description'] = response.xpath('//div[@id="description"]').extract()
    current_url = response.url

    loader.add_css("title", ".job-name::attr(title)")
    loader.add_value("url", current_url)
    loader.add_value("url_object_id", get_md5(current_url))
    loader.add_css("salary", ".job_request p span.salary::text")

    # Spans 2..5 under dd.job_request/p map to these fields, in order.
    dd_span_fields = ("job_city", "work_years", "degree_need", "job_type")
    for span_index, field_name in enumerate(dd_span_fields, start=2):
        loader.add_xpath(
            field_name,
            "//dd[@class='job_request']/p/span[%d]/text()" % span_index,
        )

    css_fields = (
        ("publish_time", ".job_request p.publish_time::text"),
        ("job_advantage", ".job-advantage p::text"),
        ("job_desc", ".job_bt div p"),
        ("job_addr", ".work_addr"),
        ("tags", ".position-label.clearfix li::text"),
        ("company_name", ".job_company dt a img::attr(alt)"),
        ("company_url", ".job_company dt a::attr(href)"),
    )
    for field_name, selector in css_fields:
        loader.add_css(field_name, selector)

    loader.add_value("crawl_time", datetime.datetime.now())
    # item_loader.add_css("crawl_update_time",".work_addr")

    return loader.load_item()
def parse_job(self, response):
    """Load job-detail fields from ``response`` and return the LagouJobItem.

    ``crawl_time`` is stored as a ``'%Y-%m-%d %H:%M:%S'`` formatted string.
    """
    # Disabled logging setup, kept for reference:
    # logger = logging.getLogger()
    # formatter = logging.Formatter('%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
    #
    # file_hadler = logging.FileHandler(filename=LOG_FILE, encoding='utf-8')
    # file_hadler.setLevel(level=logging.DEBUG)
    # file_hadler.setFormatter(formatter)
    #
    # stream_handler = logging.StreamHandler()
    # stream_handler.setLevel(logging.DEBUG)
    # stream_handler.setFormatter(formatter)
    #
    # logger.addHandler(file_hadler)
    # logger.addHandler(stream_handler)
    #
    # logger.info(response.url)

    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    page_url = response.url

    loader.add_css("title", ".job-name::attr(title)")
    loader.add_value("url", page_url)
    loader.add_value('url_object_id', get_md5(page_url))
    loader.add_css("salary", ".job_request .salary::text")

    # Spans 2..5 under .job_request/h3 map to these fields, in order.
    h3_span_fields = ("job_city", "work_years", "degree_need", "job_type")
    for span_index, field_name in enumerate(h3_span_fields, start=2):
        loader.add_xpath(
            field_name,
            "//*[@class='job_request']/h3/span[%d]/text()" % span_index,
        )

    for field_name, selector in (
        ("tags", '.position-label li::text'),
        ('publish_time', '.publish_time::text'),
        ('job_advantage', '.job-advantage p::text'),
        ('job_desc', '.job_bt div'),
        ('job_addr', '.work_addr'),
        ('company_name', '#job_company dt a img::attr(alt)'),
        ('company_url', '#job_company dt a::attr(href)'),
    ):
        loader.add_css(field_name, selector)

    crawl_stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    loader.add_value('crawl_time', crawl_stamp)

    return loader.load_item()
def parse(self, response):
    """Load job-detail fields from ``response`` and return the LagouJobItem.

    ``crawl_time`` is stored as a ``'%Y-%m-%d %H:%M:%S'`` formatted string.
    """
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    detail_url = response.url

    loader.add_css("title", ".job-name::attr(title)")
    loader.add_value("url", detail_url)
    loader.add_value('url_object_id', get_md5(detail_url))
    loader.add_css("salary", ".job_request .salary::text")

    # Fields taken from the 2nd..5th <span> under .job_request/h3.
    xpath_fields = (
        ("job_city", "//*[@class='job_request']/h3/span[2]/text()"),
        ("work_years", "//*[@class='job_request']/h3/span[3]/text()"),
        ("degree_need", "//*[@class='job_request']/h3/span[4]/text()"),
        ("job_type", "//*[@class='job_request']/h3/span[5]/text()"),
    )
    for field_name, query in xpath_fields:
        loader.add_xpath(field_name, query)

    css_fields = (
        ("tags", '.position-label li::text'),
        ('publish_time', '.publish_time::text'),
        ('job_advantage', '.job-advantage p::text'),
        ('job_desc', '.job_bt div'),
        ('job_addr', '.work_addr'),
        ('company_name', '#job_company dt a img::attr(alt)'),
        ('company_url', '#job_company dt a::attr(href)'),
    )
    for field_name, query in css_fields:
        loader.add_css(field_name, query)

    timestamp_text = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    loader.add_value('crawl_time', timestamp_text)

    return loader.load_item()