def parse_item(self, response):
    """Parse a Lagou job-detail page into a populated LagouJobItem.

    Field cleaning is delegated to the LagouJobItemLoader processors, so
    this method only declares which selector feeds which field.

    :param response: the job-detail page response (selectors assume the
        Lagou job page layout — verify against the live site).
    :returns: the loaded LagouJobItem.
    """
    item_loader = LagouJobItemLoader(LagouJobItem(), response)
    item_loader.add_css("title", ".job-name::attr(title)")
    item_loader.add_value("url", response.url)
    # Stable hash of the URL serves as the primary key for de-duplication.
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("salary", ".job_request .salary::text")
    item_loader.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text()")
    item_loader.add_xpath("work_years", "//*[@class='job_request']/p/span[3]/text()")
    item_loader.add_xpath("degree_need", "//*[@class='job_request']/p/span[4]/text()")
    item_loader.add_xpath("job_type", "//*[@class='job_request']/p/span[5]/text()")
    item_loader.add_css("tags", ".position-label li::text")
    item_loader.add_css("publish_time", ".publish_time::text")
    item_loader.add_css("job_advantage", ".job-advantage p::text")
    item_loader.add_css("job_desc", ".job_bt div")
    item_loader.add_css("job_addr", ".work_addr")
    item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
    item_loader.add_css("company_url", "#job_company dt a::attr(href)")
    # Capture a single timestamp so crawl_time and crawl_update_time are
    # identical for a freshly crawled record; two separate datetime.now()
    # calls (as before) produced slightly different values.
    now = datetime.now()
    item_loader.add_value("crawl_time", now)
    item_loader.add_value("crawl_update_time", now)
    return item_loader.load_item()
def parse_detail(self, response):
    """Parse an itcodemonkey article page and yield an ItcodemonkeyArticleItem.

    Uses an ItemLoader so per-field cleaning lives in the item definition.

    :param response: the article page response.
    :yields: the loaded ItcodemonkeyArticleItem.
    """
    # Extract the article's concrete fields.
    # (The previous dead `article_item = ItcodemonkeyArticleItem()` assignment
    # was removed — load_item() below produces the item.)
    item_loader = JobBoleArticleItemLoader(item=ItcodemonkeyArticleItem(), response=response)
    # CSS-selector based fields.
    item_loader.add_css(
        'title',
        'body > div.container.tc-main > div.row > div.span9 > div > h2::text'
    )
    item_loader.add_css(
        'create_time',
        'body > div.container.tc-main > div.row > div.span9 > div > div.article-infobox > span::text'
    )
    item_loader.add_css(
        'classify',
        'body > div.container.tc-main > div.row > div.span9 > div > div.article-infobox > span > a::text'
    )
    item_loader.add_css('content', '#article_content')
    # Directly-supplied values.
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse a JobBole article page and yield a JobBoleArticleItem.

    Earlier revisions extracted every field by hand with xpath/css and
    regex post-processing; that commented-out code has been removed in
    favor of the ItemLoader, whose field processors perform the same
    cleaning in one place.

    :param response: the article page response; ``response.meta`` is
        expected to carry ``front_image_url`` set by the listing parser.
    :yields: the loaded JobBoleArticleItem.
    """
    # Cover image URL forwarded from the listing page via request meta.
    front_image_url = response.meta.get("front_image_url", "")
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    # Stable hash of the URL used as the item's unique key.
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # Wrapped in a list because the image pipeline expects an iterable of URLs.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse a JobBole article page and yield a JobBoleArticle item.

    The previous hand-rolled xpath extraction (with regex cleanup of the
    praise/fav/comment counters) was kept here as commented-out code; it
    has been deleted — the ItemLoader path below covers the same fields
    and centralizes the cleaning in the item's field processors.

    :param response: the article page response; ``response.meta`` is
        expected to carry ``front_image_url`` set by the listing parser.
    :yields: the loaded JobBoleArticle item.
    """
    # Cover image URL forwarded from the listing page via request meta.
    front_image_url = response.meta.get("front_image_url", "")
    item_loader = ArticleItemLoader(item=JobBoleArticle(), response=response)
    item_loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
    item_loader.add_xpath(
        'create_time',
        '//p[@class="entry-meta-hide-on-mobile"]/text()')
    item_loader.add_xpath('praise_nums', '//div[@class="post-adds"]/span/h10/text()')
    item_loader.add_xpath(
        'fav_nums',
        '//span[contains(@class,"bookmark-btn")]/text()')
    item_loader.add_xpath('comment_nums', '//a[contains(@href,"comment")]/span/text()')
    item_loader.add_xpath('content', '//div[@class="entry"]//p/text()')
    item_loader.add_xpath(
        'tags',
        '//p[contains(@class,"hide-on-mobile")]/a/text()')
    # Stable hash of the URL used as the item's unique key.
    item_loader.add_value('url_object_id', get_md5(response.url))
    # Wrapped in a list because the image pipeline expects an iterable of URLs.
    item_loader.add_value('image_url', [front_image_url])
    item_loader.add_xpath('author', '//div[@class="copyright-area"]/a/text()')
    item_loader.add_value('url', response.url)
    ArticleItem = item_loader.load_item()
    yield ArticleItem
def to_md5url(value):
    """Return the MD5 digest of *value* (a URL), via the shared helper."""
    return common.get_md5(value)