def parse_job(self, response):
    """Parse a Lagou job-detail page into a LagouJobItem via the item loader."""
    loader = ArticleItemLoader(item=LagouJobItem(), response=response)
    loader.add_css("title", ".job-name span::text")
    loader.add_value("url", response.url)
    loader.add_css("salary", ".salary::text")
    # The job_request <p> holds city / experience / degree / job-type in
    # fixed span positions 2-5.
    span_xpath = "//*[@class='job_request']/p/span[{0}]/text()"
    for pos, field in enumerate(("job_city", "work_years", "degree_need", "job_type"), start=2):
        loader.add_xpath(field, span_xpath.format(pos))
    loader.add_css("publish_time", ".publish_time::text")
    loader.add_css("job_advantage", ".job-advantage p::text")
    loader.add_css("job_desc", ".job_bt div")
    loader.add_css("job_addr", ".work_addr")
    loader.add_css("company_url", "#job_company dt a::attr(href)")
    loader.add_css("company_name", "#job_company dt a div h2::text")
    return loader.load_item()
def get_detail_use_item_loader(self, response):
    """Populate a JobBoleArticleItem with an item loader.

    Fields collected through the loader come back as lists unless the
    item/loader definitions apply output processors.
    """
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    cover_url = response.meta.get("front_image_url", "")  # article cover image
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", [cover_url])
    loader.add_xpath("title", "//div[@class = 'entry-header']/h1/text()")
    loader.add_xpath("create_date", "//div[@class='entry-meta']/p/text()")
    loader.add_xpath("praise_nums", "//div[@class='post-adds']//h10/text()")  # up-vote count
    loader.add_xpath("fav_nums", "//div[@class='post-adds']/span[2]/text()")  # bookmark count
    loader.add_xpath(
        "comment_nums",
        "//span[@class='btn-bluet-bigger href-style hide-on-480']/text()")  # comment count
    loader.add_xpath("content", "//div[@class='entry']")  # full article body HTML
    loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
    yield loader.load_item()
def parse(self, response):
    """Render the JD category page with selenium, hover each top menu entry
    so sub-menus load, then yield one JDIndexItem per sub-category and a
    detail Request for each.
    """
    browser = self.brower
    browser.get(response.url)
    browser.execute_script("window.scrollTo(0,250)")
    # Hover each of the 15 top-level menu entries to trigger the
    # lazily rendered sub-category panels.
    for idx in range(1, 16):
        menu_entry = browser.find_element_by_xpath(
            "html/body/div[5]/div[1]/div[1]/div/ul/li[{0}]".format(idx))
        ActionChains(browser).move_to_element(menu_entry).perform()
    # Re-parse the fully rendered page with a scrapy Selector.
    page_selector = Selector(text=browser.page_source)
    for category in page_selector.css(".cate_detail_item"):
        # Top-level category name and link.
        main_link = category.css(".cate_detail_tit_lk")
        pro_name = main_link.css("::text").extract_first("")
        pro_url = parse.urljoin(response.url,
                                main_link.css("::attr(href)").extract_first(""))
        for sub_link in category.css(".cate_detail_con_lk"):
            # Second-level category under the current top-level one.
            name = sub_link.css("::text").extract_first("")
            url = parse.urljoin(response.url,
                                sub_link.css("::attr(href)").extract_first(""))
            loader = ArticleItemLoader(item=JDIndexItem(), response=response)
            loader.add_value("index_name", str(name))
            loader.add_value("url", str(url))
            loader.add_value("pro_name", str(pro_name))
            loader.add_value("pro_url", str(pro_url))
            yield loader.load_item()
            yield Request(url=url, meta={"name": name},
                          callback=self.parse_detail, dont_filter=True)
def parse_detail(self, response):
    """Render a JD product-list page with selenium, yield one JDDetailItem
    per product plus a comment-page Request, then follow pagination.

    FIXES:
    - The pagination guard compared the *joined* URL against "", but
      ``urljoin(base, "")`` returns the base URL, so an empty next-href
      could never stop pagination and (with dont_filter=True) the same
      page was re-queued forever. The raw href is now tested first.
    - ``extract_first()`` for the product link now has a "" default so a
      missing node cannot pass None into urljoin (TypeError).
    - Renamed local ``type`` → ``item_type`` (was shadowing the builtin).
    """
    self.brower.get(response.url)
    # Scroll near the bottom so lazily loaded products render.
    self.brower.execute_script("window.scrollTo(0,document.body.scrollHeight-1000)")
    import time
    time.sleep(2)  # crude wait for the AJAX-loaded product grid
    selecter = Selector(text=self.brower.page_source)
    sort_name = response.meta.get("name", "")  # category name from the index page
    for detail_item in selecter.css(".gl-i-wrap.j-sku-item"):
        uid = uuid.uuid4()  # foreign key linking this item to its comment record
        item_loder = ArticleItemLoader(item=JDDetailItem(), response=response)
        url = parse.urljoin(
            response.url,
            detail_item.css("div.gl-i-wrap.j-sku-item div.p-img a::attr(href)").extract_first(""))
        price = detail_item.css("div.p-price strong i::text").extract_first("")
        name = detail_item.css("div.p-name em::text").extract_first("")
        commit = detail_item.css("div.p-commit span.buy-score em::text").extract_first("暂无推荐指数")
        item_type = detail_item.css("div.p-icons.J-pro-icons i::text").extract_first("")
        shopname = detail_item.css("div.p-shop span a::text").extract_first("")
        item_loder.add_value("uid", uid)
        item_loder.add_value("url", url)
        item_loder.add_value("price", price)
        item_loder.add_value("jdname", name)
        item_loder.add_value("jdcommit", commit)
        item_loder.add_value("jdtype", item_type)
        item_loder.add_value("shopname", shopname)
        item_loder.add_value("sort_name", sort_name)
        yield item_loder.load_item()
        yield Request(url=url, meta={"detail_url": url, "uid": uid},
                      callback=self.parse_comment, dont_filter=True)
    # Follow pagination only when a next-page href actually exists.
    next_href = response.css(".pn-next::attr(href)").extract_first("")
    if next_href:
        yield Request(url=parse.urljoin(response.url, next_href),
                      callback=self.parse_detail,
                      meta={"name": sort_name}, dont_filter=True)
def parse_detail(self, response):
    """Build a JobBoleArticleItem from an article page with the custom
    ArticleItemLoader (value post-processing lives in the item/loader
    definitions — presumably TakeFirst/MapCompose; confirm in items.py).
    """
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
    loader.add_xpath('tag', '//div/p[@class="entry-meta-hide-on-mobile"]/a/text()')
    # The image pipeline expects a list of URLs.
    loader.add_value('front_image_url', [response.meta.get('front_image_url', '')])
    loader.add_value('url', response.url)
    loader.add_xpath('content', '//div[@class="entry"]/p/text()')
    loader.add_xpath('praise_nums', '//div[@class="post-adds"]/span/h10/text()')
    loader.add_xpath('comment_nums', '//div[@class="post-adds"]/a/span/text()')
    loader.add_xpath('fav_nums', '//div[@class="post-adds"]/span[2]/text()')
    loader.add_xpath('create_date', '//div/p[@class="entry-meta-hide-on-mobile"]/text()[1]')
    loader.add_value('url_object_id', common.get_md5(response.url))
    yield loader.load_item()
def parse_detail(self, response):
    """Parse a Jobbole article page into a JobboleArticleItem.

    FIX: removed an unused ``JobboleArticleItem()`` instantiation at the
    top of the function (the loader builds its own item and the variable
    was only reassigned from ``load_item()``), along with the dead
    commented-out manual extraction code it belonged to.
    """
    front_img_url = response.meta.get('front_img_url', '')  # cover image from the list page
    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_css('title', 'div.entry-header > h1::text')
    item_loader.add_css('create_date', '.entry-meta-hide-on-mobile::text')
    item_loader.add_css('praise_num', '.post-adds .vote-post-up h10::text')
    item_loader.add_css('fav_num', '.post-adds .bookmark-btn::text')  # numeric part handled by the item's re processor
    item_loader.add_css('comment_num',
                        'a[href="#article-comment"] span::text')  # re-processed in items.py
    item_loader.add_css('tag', '.entry-meta .entry-meta-hide-on-mobile a::text')
    item_loader.add_css('content', '.entry')
    item_loader.add_value('front_img_url', [front_img_url])  # image pipeline expects a list
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    yield item_loader.load_item()
def parse_content(self, response):
    """Fill a JobboleArticleItem through the custom ArticleItemLoader,
    which post-processes the loader's raw list values (see items.py)."""
    cover = response.meta.get("front_image_url", "")  # article cover image
    loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    loader.add_css("title", ".entry-header h1::text")
    loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", [cover])  # image pipeline expects a list
    for field, css in (
        ("praise_nums", ".vote-post-up h10::text"),
        ("comment_nums", "a[href='#article-comment'] span::text"),
        ("fav_nums", ".bookmark-btn::text"),
        ("tags", "p.entry-meta-hide-on-mobile a::text"),
        ("content", "div.entry"),
    ):
        loader.add_css(field, css)
    yield loader.load_item()
def parse_details(self, response):
    """Extract the article's concrete fields with the item loader and hand
    the resulting JobBoleArticleItem to the pipelines."""
    cover_url = response.meta.get("front_image_url", "")  # article cover image
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    for field, css in (
        ("title", ".entry-header h1::text"),
        ("create_date", ".entry-meta-hide-on-mobile::text"),
        ("tag", ".entry-meta-hide-on-mobile a::text"),
        ("content", "div.entry"),
        ("fav_nums", ".bookmark-btn::text"),
    ):
        loader.add_css(field, css)
    loader.add_value("front_image_url", [cover_url])  # image pipeline expects a list
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    # The yielded item flows into pipelines.py.
    yield loader.load_item()
def parse_detail(self, response):
    """Load a JobBoleArticleItem from an article page via the item loader."""
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    cover_url = response.meta.get("front_image_url", "")  # article cover image
    loader.add_css("title", ".entry-header h1::text")
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", cutils.get_md5(response.url))
    loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    loader.add_value("front_image_url", [cover_url])  # image pipeline expects a list
    for field, css in (
        ("praise_nums", ".vote-post-up h10::text"),
        ("comment_nums", "a[href='#article-comment'] span::text"),
        ("fav_nums", ".bookmark-btn::text"),
        ("tags", "p.entry-meta-hide-on-mobile a::text"),
        ("content", "div.entry"),
    ):
        loader.add_css(field, css)
    yield loader.load_item()
def parse_comment(self, response):
    """Open a JD product page with selenium, click into the comment tab and
    yield one JDCommentItem with the product's parameters and review stats.

    FIX: every ``extract_first()`` whose result is concatenated now carries
    an explicit ``""`` default — previously a missing node returned None
    and the ``+ ";"`` concatenation raised TypeError.
    """
    detail_url = response.meta.get("detail_url", "")  # product-page URL from the previous request
    uid = response.meta.get("uid", "")                # foreign key to the detail item
    self.brower.get(detail_url)
    self.brower.execute_script("window.scrollTo(0,1000)")
    # Switch to the comment tab so the review data is rendered.
    self.brower.find_element_by_css_selector("li[data-anchor=\"#comment\"]").click()
    import time
    time.sleep(2)  # crude wait for the AJAX-loaded comment panel
    select = Selector(text=self.brower.page_source)
    # Product spec list, joined as "spec;spec;...".
    shopMes = "".join(
        li.css("::text").extract_first("") + ";"
        for li in select.css("ul.parameter2.p-parameter-list li"))
    buySourse = select.xpath(".//*[@id='buy-rate']/a/text()").extract_first("无购买指数")
    # Rating-distribution buckets, joined as "label""count"";" per anchor.
    items = "".join(
        a.css("::text").extract_first("") + a.css("em::text").extract_first("") + ";"
        for a in select.css("ul.filter-list li a"))
    # Headline rating: label + percentage.
    goodtext = select.css("div.comment-percent strong.percent-tit::text").extract_first("未获取到!")
    source = select.xpath(".//*[@id='comment']/div[2]/div[1]/div[1]/div/text()").extract_first("100") + "%"
    shopSourse = goodtext + ":" + source
    # Review tag cloud; fall back to a fixed marker when absent.
    tag_spans = select.css("div.tag-list span")
    if tag_spans:
        ShopParameter = "".join(
            span.css("::text").extract_first("") + ";" for span in tag_spans)
    else:
        ShopParameter = "暂无评论记录"
    items_loder = ArticleItemLoader(item=JDCommentItem(), response=response)
    items_loder.add_value("uid", uid)
    items_loder.add_value("shopParams", shopMes)
    items_loder.add_value("buy_sourse", buySourse)
    items_loder.add_value("user_comment", items)
    items_loder.add_value("good_sourse", shopSourse)
    items_loder.add_value("user_comment_Detail", ShopParameter)
    yield items_loder.load_item()
def parse_detail(self, response):
    """Parse a tuicool article page into a QiushibaikeItem.

    FIX: removed a dead ``QiushibaikeItem()`` instantiation — its value was
    never used (the loader builds its own item and the variable was only
    reassigned from ``load_item()``).
    """
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    flagTrue = response.meta.get("flag", "")                    # marker passed from the list page
    # Absolute link to the article's original source.
    original = "http://www.tuicool.com/" + response.css("span.from a::attr(href)").extract_first("")
    item_loader = ArticleItemLoader(item=QiushibaikeItem(), response=response)
    item_loader.add_css("title", ".article_row_fluid div:nth-child(1) h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "span.timestamp::text")
    item_loader.add_value("front_image_url", [front_image_url])  # image pipeline expects a list
    item_loader.add_value("sites", original)
    item_loader.add_value("flag", flagTrue)
    item_loader.add_css("original", "div.source a::text")
    item_loader.add_css("tags", "span.new-label::text")
    item_loader.add_css("content", "div.article_body")
    yield item_loader.load_item()
def parse_detail(self, response):
    """Build a JobBoleArticleItem through the item loader and yield it."""
    cover_url = response.meta.get('front_image_url', '')  # article cover image
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    loader.add_css('title', '.entry-header h1::text')
    loader.add_value('url_object_id', get_md5(response.url))
    loader.add_css('create_date', '.entry-meta-hide-on-mobile::text')
    loader.add_value('url', response.url)
    loader.add_value('front_image_url', [cover_url])  # image pipeline expects a list
    for field, css in (
        ('praise_num', '.vote-post-up h10::text'),
        ('fav_num', '.bookmark-btn::text'),
        ('com_num', "a[href='#article-comment'] span::text"),
        ('content', 'div.entry'),
        ('tags', 'p.entry-meta-hide-on-mobile a::text'),
    ):
        loader.add_css(field, css)
    yield loader.load_item()
def parse_question(self, response):
    """Extract a ZhihuQuestionItem from a question page and kick off the
    first answers-API request for that question."""
    zhihu_id = response.meta.get("question_id", "")  # question id carried in request meta
    loader = ArticleItemLoader(item=ZhihuQuestionItem(), response=response)
    loader.add_css("title", "h1.QuestionHeader-title::text")
    loader.add_css("content", ".QuestionHeader-detail")
    loader.add_value("url", response.url)
    loader.add_value("zhihu_id", zhihu_id)
    for field, css in (
        ("answer_num", ".List-headerText span::text"),
        ("comments_num", ".QuestionHeader-Comment button::text"),
        ("watch_user_num", ".NumberBoard-itemValue::text"),
        ("topics", ".QuestionHeader-topics .Popover div::text"),
    ):
        loader.add_css(field, css)
    question_item = loader.load_item()
    # Queue the paged answers endpoint (limit=20, offset=0) before emitting the item.
    yield scrapy.Request(self.start_answer_url.format(zhihu_id, 20, 0),
                         headers=self.headers, callback=self.parse_answer)
    yield question_item
def parse(self, response):
    """Parse a Jobbole article-list page: yield one item per article entry,
    then (once per spider run) queue requests for every remaining list page.

    FIXES:
    - ``print last_index_number`` was a Python-2 print *statement* in an
      otherwise Python-3 file (SyntaxError); it is now a function call.
    - The cover check compared ``extract()``'s result against None, but
      extract() always returns a list, so the 'No cover' fallback branch
      was unreachable; it now tests for an empty result.
    """
    if response.status != 200 or len(response.text) == 0:
        logger.error('Current response is invalid')
        return
    # Article entries on the current list page.
    article_nodes = response.xpath(
        '//div[@class="grid-8"]/div[@class="post floated-thumb"]')
    if article_nodes is not None:
        logger.info('Current artticle list len is {0} and type is {1}'.format(
            len(article_nodes), type(article_nodes)))
        for article in article_nodes:
            itemloader = ArticleItemLoader(response=response,
                                           item=LpythonspiderItem())
            thumb = article.xpath('./div[@class="post-thumb"]/a/img/@src').extract()
            if not thumb:  # FIX: was `is None`, which extract() never returns
                logger.info('cur article no cover')
                itemloader.add_value('thumb', 'No cover')
            else:
                itemloader.add_value('thumb', thumb)
            itemloader.add_value('title', article.xpath(
                './div[@class="post-meta"]/p/a[@class="archive-title"]/text()').extract())
            itemloader.add_value('date', article.xpath(
                './div[@class="post-meta"]/p/text()').extract())
            itemloader.add_value('type', article.xpath(
                './div[@class="post-meta"]/p/a[@rel="category tag"]/text()').extract())
            itemloader.add_value('summary', article.xpath(
                './div[@class="post-meta"]/span[@class="excerpt"]/p/text()').extract())
            link = article.xpath(
                './div[@class="post-meta"]/p/a[@class="archive-title"]/@href').extract()
            itemloader.add_value('link', link)
            # NOTE(review): get_md5 receives the *list* returned by extract();
            # presumably it stringifies its argument — confirm, or pass link[0].
            itemloader.add_value('object_id', common.get_md5(link))
            yield itemloader.load_item()
    if self.already_push_all_request is not True:
        # Read the last page number from the pagination bar, then queue
        # pages 2..last exactly once.
        pager_links = response.xpath(
            '//div[@class="grid-8"]/div[@class="navigation margin-20"]/a[@class="page-numbers"]')
        last_page_index = pager_links[-1].xpath('text()').extract_first()
        print(type(last_page_index))
        last_index_number = int(last_page_index)
        print(last_index_number)  # FIX: Python-3 print call
        format_url = 'http://python.jobbole.com/all-posts/page/{0}/'
        for next_page_index in range(2, last_index_number + 1):
            next_page_request_url = format_url.format(next_page_index)
            print(' will lpush to redis and url is %s' % next_page_request_url)
            yield Request(url=next_page_request_url)
        self.already_push_all_request = True
def parse_detail(self, response):
    """Parse a Jobbole article page into a JobBoleArticleItem via the
    custom ArticleItemLoader.

    FIX: dropped the unused ``JobBoleArticleItem()`` created at the top of
    the function — a leftover of the commented-out manual extraction code;
    the loader instantiates its own item.
    """
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])  # image pipeline expects a list
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    yield item_loader.load_item()
def parse_detail(self, response):
    """Parse the article-detail response into a JobBoleArticleItem.

    FIX: removed an unused ``JobBoleArticleItem()`` instantiation at the
    top of the function (the item loader creates its own item instance),
    along with the dead commented-out xpath/css extraction it served.
    """
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_css("create_date", ".entry-meta-hide-on-mobile::text")
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_value("front_image_url", [front_image_url])  # image pipeline expects a list
    yield item_loader.load_item()
def parse_detail(self, response):
    """Parse an article page with the custom ArticleItemLoader.

    The custom loader (rather than plain ItemLoader) is used so values are
    post-processed in items.py via TakeFirst/MapCompose output processors.
    """
    # Cover image passed in request meta; '' when absent.
    cover_url = response.meta.get('front_image_url', '')
    loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    loader.add_css("title", ".entry-header h1::text")
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))  # stable MD5 key for the URL
    loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    loader.add_value("front_image_url", [cover_url])  # image pipeline expects a list
    for field, css in (
        ("praise_nums", ".vote-post-up h10::text"),
        ("comment_nums", "a[href='#article-comment'] span::text"),
        ("fav_nums", ".bookmark-btn::text"),
        ("tags", "p.entry-meta-hide-on-mobile a::text"),
        ("content", "div.entry"),
    ):
        loader.add_css(field, css)
    # Yield hands the item to pipelines.py.
    yield loader.load_item()
def parse_detail(self, response):
    """Parse one article-detail page into a JobBoleArticleItem.

    Scrapy callback.  All field extraction is delegated to
    ArticleItemLoader; its input/output processors (declared on the item
    fields in items.py) take care of unwrapping the selector lists and
    regex-extracting the numeric counts, so this method only declares
    *where* each field comes from.

    :param response: detail-page response; ``response.meta`` may carry
        ``front_image_url`` placed there by the listing-page callback.
    :return: yields the loaded item, which Scrapy routes to pipelines.
    """
    # Cover-image URL travels via request meta from the listing page;
    # fall back to "" so the loader always receives a value.
    front_image_url = response.meta.get("front_image_url", "")

    # ItemLoader keeps the mapping declarative and centralises value
    # post-processing in items.py instead of inline parsing here.
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # The images pipeline expects a *list* of URLs, hence the wrapping.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("vote_numbers", ".vote-post-up h10::text")
    item_loader.add_css("bookmark_numbers", ".bookmark-btn::text")
    item_loader.add_css("comment_numbers", "a[href='#article-comment'] span::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    # Whole HTML fragment (no ::text) so the article keeps its markup.
    item_loader.add_css("content", "div.entry")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Extract article fields from a detail page via ArticleItemLoader.

    Scrapy callback.  The raw xpath/css extraction that used to live
    here has been replaced by declarative loader calls; list-unwrapping
    and number extraction happen in the field processors in items.py.

    :param response: detail-page response; ``response.meta`` may carry
        ``front_image_url`` set by the listing-page request.
    :return: yields the populated JobBoleArticleItem to the pipelines.
    """
    # Cover image forwarded from the listing page through request meta.
    front_image_url = response.meta.get("front_image_url", "")

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    # NOTE(review): field name "create_data" (not "create_date") matches
    # the item definition this spider uses — do not "fix" the spelling here.
    item_loader.add_css("create_data", "p.entry-meta-hide-on-mobile ::text")
    # Scrapy's image pipeline requires a list of URLs.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("fav_nums", ".bookmark-btn ::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] ::text")
    item_loader.add_css("tags", ".entry-meta-hide-on-mobile a::text")
    # Full element (no ::text) so the article body keeps its HTML.
    item_loader.add_css("content", ".entry")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Extract the concrete fields of one article page.

    Scrapy callback.  Uses ArticleItemLoader with xpath selectors; the
    per-field cleanup (taking the first value, regex'ing out counts,
    date parsing) is expected to live in the item's field processors.

    :param response: detail-page response; ``response.meta`` may carry
        ``front_img`` (cover-image URL) set by the listing callback.
    :return: yields the loaded JobboleArticleItem to the pipelines.
    """
    # Cover image handed over from the listing page via request meta.
    image = response.meta.get("front_img", "")

    # Declarative field mapping via the item loader.
    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    # The image pipeline expects a list of URLs.
    item_loader.add_value("front_img_url", [image])
    item_loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
    item_loader.add_xpath("create_time", "//p[@class='entry-meta-hide-on-mobile']/text()")
    item_loader.add_xpath("praise_num", "//span[contains(@class,'vote-post-up')]/h10/text()")
    item_loader.add_xpath("fav_num", "//span[contains(@class,'bookmark-btn')]/text()")
    item_loader.add_xpath("comment_num", "//a[@href='#article-comment']/span/text()")
    # //text() yields the article's text nodes (markup stripped).
    item_loader.add_xpath("content", "//div[@class='entry']//text()")
    item_loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")

    article_item = item_loader.load_item()
    yield article_item