def parse_job(self, response):
    """Parse a Lagou job-detail page into a LagouJobItem via the item loader."""
    loader = ArticleItemLoader(item=LagouJobItem(), response=response)
    # Identity / bookkeeping fields known without parsing the body.
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("crawl_time", datetime.now())
    # Header fields from the job_request block.
    loader.add_css("title", ".job-name::attr(title)")
    loader.add_css("salary", ".job_request .salary::text")
    loader.add_xpath("job_city", "//*[@class='job_request']//span[2]/text()")
    loader.add_xpath("work_years", "//*[@class='job_request']//span[3]/text()")
    loader.add_xpath("degree_need", "//*[@class='job_request']//span[4]/text()")
    loader.add_xpath("job_type", "//*[@class='job_request']//span[5]/text()")
    # Posting body and company details.
    loader.add_css("tags", ".position-label li::text")
    loader.add_css("publish_time", ".publish_time::text")
    loader.add_css("job_advantage", ".job-advantage p::text")
    loader.add_css("job_desc", ".job_bt div")
    loader.add_css("job_addr", ".work_addr")
    loader.add_css("company_name", "#job_company dt a img::attr(alt)")
    loader.add_css("company_url", "#job_company dt a::attr(href)")
    return loader.load_item()
def parse_details(self, response):
    """Parse a JobBole article page and yield a populated JobBoleArticleItem.

    Fix: dropped the dead ``article_item = JobBoleArticleItem()`` created at the
    top of the original — it was discarded when ``load_item()`` rebuilt the item.
    """
    # Cover-image URL is forwarded from the listing callback via request meta.
    front_image_url = response.meta.get("front_image_url", "")
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    # load_item() applies the loader rules/processors and builds the item;
    # yielding hands it to the configured pipelines.
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse a JobBole article page and yield a JobBoleArticleItem.

    Fix: removed the dead ``article_item = JobBoleArticleItem()`` instantiation,
    which was immediately overwritten by ``item_loader.load_item()``.
    """
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    article_item = item_loader.load_item()
    yield article_item
def parse_content(self, response):
    """Parse a Douban movie page into a DouBanItem.

    Structured fields use loader CSS/XPath rules; area/language/nickname are
    pulled from the free-text ``#info`` block with ``self.info_rule`` and the
    release time from the title with ``self.time_rule``.

    Fix: the original indexed ``extract()[0]`` and ``info_list[0..2]`` blindly
    and raised IndexError on pages where those nodes/matches are missing; the
    lookups are now guarded. Dead commented-out rules were removed.
    """
    item_loader = ArticleItemLoader(item=DouBanItem(), response=response)
    item_loader.add_value("url", response.url)
    item_loader.add_xpath("title", "//div[@id='content']/h1/span[1]/text()")
    item_loader.add_xpath("director", "//div[@id='info']/span[1]/span[2]/a/text()")
    item_loader.add_css("score", "div.rating_self strong::text")
    item_loader.add_xpath("introduction", "//span[@property='v:summary']/text()")
    item_loader.add_xpath("front_image_url", "//*[@id='mainpic']/a/img/@src")
    # The #info block is unstructured text; collect values matching info_rule.
    infos = response.xpath("//*[@id='info']/text()").extract()
    info_list = []
    for info in infos:
        match_re = re.match(self.info_rule, info.strip())
        if match_re:
            info_list.append(match_re.group(1))
    # Release time lives in the second <span> of the page title.
    time_text = response.xpath("//div[@id='content']/h1/span[2]/text()").extract_first()
    if time_text:
        match_re = re.match(self.time_rule, time_text)
        if match_re:
            item_loader.add_value("time", match_re.group(1))
    # Positional mapping assumes info_rule matches appear in this order on the
    # page — presumably area, language, alias; verify against live markup.
    if len(info_list) > 0:
        item_loader.add_value("area", info_list[0])
    if len(info_list) > 1:
        item_loader.add_value("language", info_list[1])
    if len(info_list) > 2:
        item_loader.add_value("nickname", info_list[2])
    douban_item = item_loader.load_item()
    yield douban_item
def parse_question(self, response):
    """Extract a ZhihuQuestionItem from a question page, then request its answers."""
    question_id = response.meta.get('question_id', '')
    loader = ArticleItemLoader(item=ZhihuQuestionItem(), response=response)
    # Values known up front.
    loader.add_value("url", response.url)
    loader.add_value("zhihu_id", question_id)
    # Page-scraped fields.
    loader.add_css("title", "h1.QuestionHeader-title::text")
    loader.add_css("content", ".QuestionHeader-detail")
    loader.add_css("answer_num", ".List-headerText span::text")
    loader.add_css("comments_num", ".QuestionHeader-Comment span::text")
    loader.add_css("watch_user_num",
                   ".NumberBoard:first-child .NumberBoard-itemValue::text")
    loader.add_css("click_num",
                   ".NumberBoard:last-child .NumberBoard-itemValue::text")
    loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    question_item = loader.load_item()
    # Kick off the answers API for this question (limit 5, offset 0).
    answer_url = self.start_answer_url.format(question_id, 5, 0)
    yield scrapy.Request(answer_url, headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
def parse_detail(self, response):
    """Parse an article page, choosing selector rules by URL shape, and yield the item.

    Bug fixes vs. the original:
    - ``response.url.find(x)`` was used as a boolean: ``str.find`` returns -1
      (truthy) when the substring is ABSENT and 0 (falsy) when it is at the
      start, so the branch choice was effectively inverted. Replaced with
      substring membership (``in``).
    - The fallback branch configured the loader but never called
      ``load_item()``, so an empty ``JobBoleArticleItem()`` was yielded.
      ``load_item()`` now runs after either branch.
    """
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    if "/Index/newslist" in response.url or "/index" in response.url:
        # News-list style article page.
        item_loader.add_css("title", ".article-title::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "#date-topic::text")
        item_loader.add_css("content", ".article-content")
    else:
        # Generic fallback: use the <title> tag and the raw body text.
        item_loader.add_css("title", "title::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "title::text")
        item_loader.add_css("content", "body::text")
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse a JobBole article page and yield a JobBoleArticleItem.

    Fix: the original hand-extracted every field (title, dates, counts, tags,
    content) into ``article_item`` and then rebuilt the item from scratch with
    an ItemLoader, discarding all of the manual work — only the loader path
    survives, so the dead duplicate extraction has been removed.
    """
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    article_item = item_loader.load_item()
    yield article_item
def parse_question(self, response):
    """Parse a V2EX topic page into a V2exQuItem.

    Cleanups: removed the dead ``content = ""`` pre-initialisation and the
    trailing ``pass``; replaced ``len(x) == 0`` checks with truthiness.
    """
    question_loader = ArticleItemLoader(item=V2exQuItem(), response=response)
    question_loader.add_xpath("title", "//div[@class='header']/h1/text()")
    # Prefer the markdown body; fall back to the plain topic_content div.
    mar_content = response.xpath("//div[@class='markdown_body']").extract()
    if not mar_content:
        content = "".join(
            response.xpath("//div[@class='topic_content']").extract()
        ).replace("\n", "")
    else:
        content = "".join(mar_content).replace("\n", "")
    match_re1 = re.match(self.content_rule, content)
    if match_re1:
        question_loader.add_value("content", match_re1.group(1))
    # The gray span is absent when a topic has no replies.
    comment_count = response.xpath(
        "//div[@class='cell']/span[@class='gray']/text()").extract()
    if not comment_count:
        question_loader.add_value("comment_count", 0)
    else:
        match_re2 = re.match(self.comment_rule, comment_count[0])
        if match_re2:
            question_loader.add_value("comment_count", match_re2.group(1))
    # NOTE(review): user_id is randomized and created_date is the crawl time,
    # not values scraped from the page — presumably intentional; confirm.
    question_loader.add_value("user_id", random.randint(2, 14))
    question_loader.add_value("created_date", time.time())
    question_item = question_loader.load_item()
    yield question_item
def parse_detail(self, response):
    """Extract the concrete article fields from a detail page.

    :type response: HtmlResponse
    :param response: the downloaded article page
    :return: yields one populated JobBoleArticleItem to the pipelines
    """
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    # Values known without parsing the document.
    loader.add_value('url_object_id', get_md5(response.url))
    loader.add_value('url', response.url)
    loader.add_value('front_image_url', response.meta.get("front_image_url", ""))
    # CSS-extracted fields.
    loader.add_css('title', '.entry-header h1::text')
    loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
    loader.add_css('praise_nums', '.vote-post-up h10::text')
    loader.add_css('comment_nums', "a[href='#article-comment'] span::text")
    loader.add_css('fav_nums', '.bookmark-btn::text')
    loader.add_css('tags', 'p.entry-meta-hide-on-mobile a::text')
    loader.add_css('content', 'div.entry')
    yield loader.load_item()
def parse_detail(self, response):
    """Parse a JobBole article page, counting successful detail requests.

    Fixes: corrected the misspelled stats key ``"ArticleDetail_Success_Reqeust"``
    to ``"ArticleDetail_Success_Request"`` (check any dashboard that reads the
    old key), and removed the dead ``JobBoleArticleItem()`` instantiation that
    was discarded by ``load_item()``.
    """
    try:
        # Best-effort bookkeeping via the Scrapy stats collector.
        self.crawler.stats.inc_value("ArticleDetail_Success_Request")
    except Exception:
        pass  # stats must never break parsing
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    article_item = item_loader.load_item()
    yield article_item
def parse_list(self, response):
    """Parse a BiQuGe book index page: yield the book item, then crawl each chapter.

    Fixes: renamed the local ``list`` (shadowed the builtin) to
    ``chapter_links``; removed the dead ``post_urls`` accumulator and the
    trailing ``pass``.
    """
    # Book cover image (may be absent -> empty string), made absolute.
    post_url = response.xpath('//*[@id="fmimg"]/img/@src').extract_first("")
    front_image_url = parse.urljoin(response.url, post_url)
    # Relative links to every chapter of the book.
    chapter_links = response.xpath('//*[@id="list"]/dl/dd/a/@href').extract()
    item_loader = ArticleItemLoader(item=BiQuGeListItem(), response=response)
    item_loader.add_value("url_object_id", get_md5(response.url))  # MD5 id
    item_loader.add_xpath("title", '//*[@id="info"]/h1/text()')  # book title
    item_loader.add_xpath("author", '//*[@id="info"]/p[1]/text()')  # author
    item_loader.add_xpath("last_update_time", '//*[@id="info"]/p[3]/text()')  # last update
    item_loader.add_value("front_image_url", [front_image_url])  # cover download link
    article_item = item_loader.load_item()
    yield article_item
    # Crawl every chapter detail page, forwarding the book id through meta.
    for each in chapter_links:
        yield Request(
            url=parse.urljoin(response.url, each),
            meta={"url_object_id": article_item['url_object_id']},
            callback=self.parse_details)
def parse_detail(self, response):
    """Parse a cnblogs post page into a CnblogsArticleItem.

    Fixes vs. the original:
    - deleted the large block of commented-out manual extraction code;
    - renamed the first parameter ``selfs`` to the conventional ``self``
      (positional, so callers are unaffected);
    - populate ``url_object_id`` via ``get_md5`` — the dead code set it but
      the live loader path forgot it.
    View/comment counts are rendered client-side and cannot be scraped here.
    """
    item_loader = ArticleItemLoader(item=CnblogsArticleItem(), response=response)
    item_loader.add_css("title", '#cb_post_title_url::text')
    item_loader.add_css('create_date', '#post-date::text')
    item_loader.add_css('author', '.postDesc a::text')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    article_item = item_loader.load_item()
    yield article_item
def parse_job(self, response):
    """Parse a Lagou job posting into a LaGouItem via the item loader."""
    loader = ArticleItemLoader(item=LaGouItem(), response=response)
    # URL / identity fields.
    loader.add_value("job_url", response.url)
    loader.add_value("job_url_id", get_md5(response.url))
    # Job header block.
    loader.add_css("job_name", '.job-name::attr(title)')
    loader.add_css("salary", ".salary::text")
    loader.add_xpath("work_city", "//dd[@class='job_request']/p/span[2]/text()")
    loader.add_xpath("job_exp", "//dd[@class='job_request']/p/span[3]/text()")
    loader.add_xpath("edu", "//dd[@class='job_request']/p/span[4]/text()")
    loader.add_xpath("job_type", "//dd[@class='job_request']/p/span[5]/text()")
    # Company and posting body.
    loader.add_css("company_name", "#job_company .b2::attr(alt)")
    loader.add_css("company_url", ".job_company dt a::attr(href)")
    loader.add_css("work_addr", ".work_addr")
    loader.add_css("create_date", ".publish_time::text")
    loader.add_css("job_advantage", ".job-advantage p::text")
    loader.add_css("job_desc", ".job_bt div")
    loader.add_css("tag", ".position-label li")
    return loader.load_item()
def parse_detail(self, response):
    """Scrape one article page and yield an ArticleItem.

    Cleanups vs. the original: removed an unused regex + ``findall`` over the
    URL and the unused ``url`` local; replaced ``== None`` with ``is None``;
    renamed the ``time`` local so it no longer shadows the module name.
    """
    title = response.xpath(
        "//div[@class='entry-header']/h1/text()").extract()[0]
    pub_time = response.xpath(
        "//p[@class='entry-meta-hide-on-mobile']/text()"
    ).extract()[0].strip().replace("·", "")
    # Praise counter is absent on some articles; fall back to 0.
    redianzang = response.css(".register-user-only h10::text").extract_first()
    if redianzang is None:
        redianzang = 0
    redianzang = ('%s%s' % (redianzang, '点赞'))
    shoucang = response.xpath(
        "//span[contains(@class, 'bookmark-btn')]/text()").extract()[0]
    pinglun = response.xpath(
        "//a[@href='#article-comment']/span/text()").extract()[0]
    context = response.xpath("//div[@class='entry']").extract()[0]
    item_loader = ArticleItemLoader(item=ArticleItem(), response=response)
    item_loader.add_value('title', title)
    item_loader.add_value('times', pub_time)
    item_loader.add_value('redianzang', redianzang)
    item_loader.add_value('shoucang', shoucang)
    item_loader.add_value('pinglun', pinglun)
    item_loader.add_value('context', context)
    item_loader.add_value('url', response.url)
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse a JobBole article into a JobboleArticleItem using the item loader."""
    front_image_url = response.meta.get('front_image_url', "")  # article cover image
    loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    # Fields known without parsing the document.
    loader.add_value("url", response.url)
    loader.add_value('url_object_id', get_md5(response.url))
    loader.add_value('front_image_url', [front_image_url])
    # CSS-extracted fields.
    loader.add_css("title", '.entry-header>h1::text')
    loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
    # Some articles carry no praise counter at all; record "0" then.
    if not response.css('.vote-post-up>h10::text'):
        loader.add_value("praise_number", "0")
    else:
        loader.add_css("praise_number", '.vote-post-up>h10::text')
    loader.add_css("comment_nums", 'a[href="#article-comment"]>span::text')
    loader.add_css('fav_nums', '.bookmark-btn::text')
    loader.add_css('tags', 'p.entry-meta-hide-on-mobile>a::text')
    loader.add_css('content', 'div.entry')
    yield loader.load_item()
def parse_detail(self, response):
    """Parse a software detail page into an LvChaSoftItem."""
    front_image_url = response.meta.get("front_image_url", "")  # cover image from meta
    loader = ArticleItemLoader(item=LvChaSoftItem(), response=response)
    # Identity fields.
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", [front_image_url])
    # Page-scraped fields.
    loader.add_xpath("title", "//div[@id='soft_title']/text()")
    loader.add_xpath("type", "//*[@id='main1k']/div[3]/a[3]/text()")
    loader.add_xpath("size", "//em[@id='ljdx']/text()")
    loader.add_xpath(
        "update_time",
        "//*[@id='main1k']/div[4]/div[2]/div[2]/div[1]/p[6]/em/text()")
    loader.add_xpath("content", "//*[@class='rjjsbox']/p/text()")
    loader.add_xpath("tag", "//*[@class='fllist clearfix']/p[4]/em/text()")
    # NOTE(review): fav_nums reads a CSS class attribute, not text — presumably
    # decoded by a loader processor; confirm in the item definition.
    loader.add_xpath("fav_nums", "//*[@class='fllist clearfix']/p[5]/em/@class")
    loader.add_xpath(
        "download_urls", "//*[@class='clearfix count_down']/dd/a[1]/@href")
    yield loader.load_item()
def parse_detail(self, response):
    """Extract article fields from a JobBole page with XPath loader rules."""
    cover_url = response.meta.get("front_image_url", "")
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    # Direct values.
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", [cover_url])
    # XPath-extracted fields.
    loader.add_xpath("title", '//div[@class="entry-header"]/h1/text()')
    loader.add_xpath("create_date",
                     "//p[@class='entry-meta-hide-on-mobile']/text()")
    loader.add_xpath("praise_nums",
                     "//span[contains(@class, 'vote-post-up')]/h10/text()")
    loader.add_xpath("comment_nums",
                     "//a[@href='#article-comment']/span/text()")
    loader.add_xpath("fav_nums",
                     "//span[contains(@class, 'bookmark-btn')]/text()")
    loader.add_xpath("tags",
                     "//p[@class='entry-meta-hide-on-mobile']/a/text()")
    loader.add_xpath("content", "//div[@class='entry']")
    yield loader.load_item()
def parse_detail(self, response):
    """Parse a JobBole article page via item-loader CSS rules and yield the item.

    Fix: deleted the ~35-line commented-out manual-extraction block that
    duplicated these loader rules field-for-field.
    """
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    # load_item() applies the processors and builds the final item.
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse a JobBole article page with item-loader CSS rules.

    Fix: deleted the commented-out manual-extraction block that duplicated
    these loader rules.
    """
    front_image_url = response.meta.get('front_image_url', '')  # article cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css('title', '.entry-header h1::text')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
    item_loader.add_value('front_image_url', [front_image_url])
    item_loader.add_css('praise_nums', '.vote-post-up h10::text')
    item_loader.add_css('comment_nums', "a[href='#article-comment'] span::text")
    item_loader.add_css('fav_nums', '.bookmark-btn::text')
    item_loader.add_css('tags', 'p.entry-meta-hide-on-mobile a::text')
    item_loader.add_css('content', "div.entry")
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse an article page into a JobBoleArticleItem (loader-based).

    Fix: deleted the commented-out manual extraction that duplicated these
    rules (including its commented regex experiment).
    """
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_css("date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_css("content", "div.entry")
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_value("url", response.url)
    item_loader.add_value("front_image_url", [front_image_url])
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse a JobBole article page into a JobboleArticleItem.

    Fixes: removed the large commented-out manual-extraction block and the
    dead ``article_item = JobboleArticleItem()`` that ``load_item()``
    immediately overwrote.
    """
    # NOTE(review): the meta key "font_image_url" looks like a typo for
    # "front_image_url" — it must match whatever the listing callback puts in
    # meta, so it is left unchanged; verify against the caller.
    front_image_url = response.meta.get("font_image_url", "")
    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_xpath("create_date",
                          "//p[@class='entry-meta-hide-on-mobile']/text()")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_xpath("praise_nums",
                          "//span[contains(@class, 'vote-post-up')]/h10/text()")
    item_loader.add_xpath("comment_nums",
                          "//a[@href='#article-comment']/span/text()")
    item_loader.add_xpath("fav_nums",
                          "//span[contains(@class, 'bookmark-btn')]/text()")
    item_loader.add_xpath("content", "//div[@class='entry']")
    item_loader.add_xpath("tags",
                          "//p[@class='entry-meta-hide-on-mobile']/a/text()")
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse a JobBole article page via CSS item-loader rules.

    Fixes: deleted the commented-out manual extraction that duplicated these
    rules, and rejoined the ``comment_nums`` selector string that had been
    wrapped mid-literal.
    """
    # NOTE(review): meta key "font_img_url" is probably a typo for
    # "front_image_url"; it must match the producing callback, so left as-is.
    front_img_url = response.meta.get("font_img_url", "")
    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_css('title', '.entry-header h1::text')
    item_loader.add_css('create_date', '.entry-meta p::text')
    item_loader.add_css('praise_nums', '.post-adds span h10::text')
    item_loader.add_css('fav_nums', '.bookmark-btn::text')
    item_loader.add_css('comment_nums', 'a[href="#article-comment"] span::text')
    item_loader.add_css('tags', '.entry-meta p a::text')
    item_loader.add_css('content', '.entry')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_value('front_img_url', [front_img_url])
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse one Jobbole article detail page into an ArticleItem.

    Uses ItemLoader so that all per-field cleaning lives in the item's
    input/output processors rather than inline in the spider.
    Yields a single populated ArticleItem.
    """
    # Article cover image URL, forwarded by the listing page via Request.meta.
    front_image_url = response.meta.get("front_image_url", "")

    item_loader = ArticleItemLoader(item=ArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # List-wrapped: the images pipeline expects an iterable of URLs.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")

    article_item = item_loader.load_item()
    yield article_item
def parse(self, response):
    """Yield a PDFItem whose `file_urls` points at a (currently hard-coded) PDF.

    The response body is parsed as JSON purely as a sanity check — a malformed
    payload raises here instead of silently yielding the item. The parsed data
    is not otherwise used yet.
    """
    # TODO(review): the JSON payload ([0]["data"]) is decoded but discarded —
    # presumably the real PDF URLs should come from it instead of the
    # hard-coded sample below.
    json.loads(response.text)[0]["data"]

    item_loader = ArticleItemLoader(item=PDFItem(), response=response)
    url_list = [
        "http://reportdocs.static.szse.cn/UpFiles/fxklwxhj/CDD00079356200.pdf"
    ]
    item_loader.add_value("file_urls", url_list)
    pdf_item = item_loader.load_item()
    yield pdf_item
def parse_detail(self, response):
    """Parse a cnblogs news detail page, then chain a request for its counters.

    Builds the article item from the page, stores it in Request.meta, and
    issues an async request to the GetAjaxNewsInfo endpoint (praise/fav/comment
    counts), handled by `self.parse_nums`.
    """
    # The numeric post id is embedded in the URL; without it the counters
    # endpoint cannot be built, so bail out early (fixes a NameError that
    # occurred when the pattern did not match).
    match_re = re.match(r".*?(\d+)", response.url)
    if not match_re:
        return
    post_id = match_re.group(1)

    item_loader = ArticleItemLoader(item=ArticlespiderItem(), response=response)
    item_loader.add_css("title", "#news_title a::text")
    item_loader.add_css("content", "#news_content")
    item_loader.add_css("tags", ".news_tags a::text")
    item_loader.add_css("create_time", "#news_info .time::text")
    item_loader.add_value("url", response.url)
    if response.meta.get('front_image_url', []):
        item_loader.add_value('front_image_url', response.meta.get('front_image_url', []))
    article_item = item_loader.load_item()

    # BUGFIX: the key was previously misspelled "front_image_ur;" so this
    # branch never fired and front_image_url was always clobbered to [].
    if response.meta.get("front_image_url", ""):
        article_item['front_image_url'] = [response.meta.get('front_image_url', "")]
    else:
        article_item['front_image_url'] = []

    # A leading "/" in the path keeps urljoin anchored at the site root.
    yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
                  meta={"article_item": article_item},
                  callback=self.parse_nums)
def parse_detail(self, response):
    """Load a Jobbole article page into a JobBoleArticleItem via ItemLoader.

    Field cleaning (date parsing, number extraction, tag filtering) is expected
    to live in the ArticleItemLoader processors. Yields one item per page.
    """
    # Cover image URL handed over from the listing page through Request.meta.
    front_image_url = response.meta.get("front_image_url", "")

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # List-wrapped for the images pipeline.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    # NOTE(review): content extraction is disabled — confirm whether this is
    # intentional (e.g. to keep the item small) before re-enabling.
    # item_loader.add_css("content", "div.entry")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Yield one GZZBItem per populated table row on the detail page.

    Rows come from the `.ge2_content` table; the first two cell texts are the
    code and name, the date is fixed for this dataset.
    """
    for row in response.css(".ge2_content tr"):
        cells = row.css("td::text").extract()
        loader = ArticleItemLoader(item=GZZBItem(), response=response)
        if not cells:
            # Header/empty rows carry no <td> text — skip them.
            continue
        loader.add_value("code", cells[0])
        loader.add_value("name", cells[1])
        loader.add_value("date", "2017-05")
        yield loader.load_item()
def parse_detail(self, response):
    """Load a Jobbole article page into a JobboleArticleItem (XPath variant).

    All field cleaning is delegated to the ArticleItemLoader processors.
    Yields a single populated item.
    """
    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_xpath(
        'create_date', '//p[@class="entry-meta-hide-on-mobile"]/text()')

    # Cover image forwarded from the listing page; list-wrapped for pipelines.
    front_image_url = response.meta.get("front_image_url", "")
    item_loader.add_value('front_image_url', [front_image_url])

    item_loader.add_xpath('comment_nums',
                          '//a[@href="#article-comment"]/span/text()')
    # Favourite count node is absent on some pages — default to '0' so the
    # processor always receives a value (same effect as the old None-check).
    fav_nums = response.xpath(
        '//div[@class="post-adds"]/span[2]/h10/text()').extract_first('0')
    item_loader.add_value('fav_nums', fav_nums)

    item_loader.add_xpath('tags', '//div[@class="entry-meta"]/p/a/text()')
    item_loader.add_xpath('content', '//div[@class="entry"]')

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Load a Jobbole article page into a JobBoleArticleItem (XPath/CSS mix).

    Cleaning (date parsing, count extraction) is expected to happen in the
    ArticleItemLoader processors. Yields one item per article page.
    """
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
    item_loader.add_xpath(
        "create_date", "//p[@class='entry-meta-hide-on-mobile']/text()")
    # NOTE(review): these two fields are deliberately left disabled — confirm
    # before re-enabling.
    # item_loader.add_xpath("vote_post_up", "//span[contains(@class, 'vote-post-up')]/h10/text()")
    # item_loader.add_xpath("bookmark_nums", "//span[contains(@class, 'bookmark-btn')]/text()")
    item_loader.add_xpath("comment_nums",
                          "//a[@href='#article-comment']/span/text()")
    item_loader.add_css("content", "div.entry")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    # NOTE(review): this passes a bare string, while sibling spiders wrap the
    # cover URL in a list for the images pipeline — verify the processor
    # handles a plain string before changing it.
    item_loader.add_value("front_image_url",
                          response.meta.get("front_image_url", ""))

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Load a Jobbole article page into a JobbboleItem via ItemLoader.

    Date formatting and numeric extraction are handled by the loader's field
    processors. Yields a single populated item.

    NOTE(review): unlike the other spiders in this project, no url_object_id
    is populated here — confirm whether that field exists on JobbboleItem.
    """
    item_loader = ArticleItemLoader(item=JobbboleItem(), response=response)
    item_loader.add_xpath("title", '//*[@class="entry-header"]/h1/text()')
    item_loader.add_value("url", response.url)
    item_loader.add_xpath("create_date", '//*[@class="entry-meta"]/p/text()')
    item_loader.add_xpath(
        "praise_nums", "//span[contains(@class,'vote-post-up')]/h10/text()")
    item_loader.add_xpath(
        "book_mark", "//span[contains(@class,'bookmark-btn')]/text()")
    item_loader.add_xpath("comment_nums",
                          "//a[@href='#article-comment']/span/text()")
    item_loader.add_xpath("content", "//*[@class='entry']/p/text()")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Populate a JobBoleArticleItem from an article detail page.

    Declares every CSS-selected field via a rule table, then adds the
    URL-derived values and the cover image forwarded through Request.meta.
    Yields the loaded item.
    """
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    cover_url = response.meta.get("front_image_url", "")

    css_rules = (
        ("title", ".entry-header h1::text"),
        ("create_date", "p.entry-meta-hide-on-mobile::text"),
        ("praise_nums", ".vote-post-up h10::text"),
        ("comment_nums", "a[href='#article-comment'] span::text"),
        ("fav_nums", ".bookmark-btn::text"),
        ("tags", "p.entry-meta-hide-on-mobile a::text"),
        ("content", "div.entry"),
    )
    for field, selector in css_rules:
        loader.add_css(field, selector)

    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", [cover_url])

    yield loader.load_item()