def parse_category(self, response):
    """Parse a category listing page.

    Yields one CategoryItem describing the category, then one Request per
    listing page (as selected by ``self.parse_partial``).

    :param response: listing-page Response; ``meta['link_text']`` carries the
        category name set by the requesting callback.
    """
    category_loader = DefaultItemLoader(item=CategoryItem(), response=response)
    category_loader.add_value('name', response.meta['link_text'])
    category_loader.add_value('parent', map_category(CATEGORY_MAP, response.url))
    category_loader.add_value('category_type', 1)
    # Pagination summary text; self.get_two() splits it into
    # (page count, total entry count).
    page_and_total = response.xpath(
        "//span[contains(@class, 'pagination-btn count')]/text()"
    ).extract_first(None)
    try:
        page_nums, total_nums = self.get_two(page_and_total)
    except Exception:
        # Counter missing or unparsable (e.g. page_and_total is None):
        # treat the category as empty. Was a bare ``except:``, which would
        # also swallow KeyboardInterrupt/SystemExit — narrowed to Exception.
        page_nums, total_nums = 0, 0
    category_loader.add_value("publish_nums", total_nums)
    category_item = category_loader.load_item()
    yield category_item
    for page in self.parse_partial(range(1, page_nums + 1)):
        request_url = response.url + "#/all/default/0/{}/".format(page)
        # dont_filter: the fragment-only URLs would otherwise be deduplicated.
        yield scrapy.Request(url=request_url,
                             callback=self.parse_page,
                             dont_filter=True,
                             meta={"url": request_url})
def parse_article(self, response):
    """Build an ArticleItem from an article page, then optionally its comments.

    :param response: article detail page Response
    """
    loader = DefaultItemLoader(ArticleItem(), response=response)
    # Values derived from the request URL rather than the page body.
    loader.add_value("url", response.url)
    loader.add_value("cid", get_id(response.url))
    # Field name -> XPath expression, applied uniformly below.
    xpath_fields = (
        ("author", "//a[@class='up-name']/text()"),
        ("cover_img_url", "//div[@class='banner-img-holder']/@style"),
        ("title", "//h1[@class='title']/text()"),
        ("desc", "//div[contains(@class, 'article-holder')]/p/text()"),
        ("img_box", "//figure[@class='img-box']/img/@data-src"),
        ("views", "//div[@class='article-data']/span[1]/text()"),
        ("likes", "//div[@class='article-data']/span[2]/text()"),
        ("comments", "//div[@class='article-data']/span[3]/text()"),
        ("coins", "//div[@class='coin-btn']/div/span/text()"),
        ("collections", "//div[@class='fav-btn']/div/span/text()"),
        ("shares", "//div[@class='share-btn']/div/span/text()"),
        ("publish_time", "//span[@class='create-time']/text()"),
        ("category", "//a[@class='category-link']/span/text()"),
        ("tags", "//li[@class='tag-item']/span[2]/text()"),
    )
    for field, xpath in xpath_fields:
        loader.add_xpath(field, xpath)
    yield loader.load_item()
    if PARSE_COMMENTS:
        yield from self.gen_comments_item(response)
def parse_detail(self, response):
    """Parse a video page into a VideoItem, then optionally its comments.

    :param response: video detail page Response; ``meta`` may carry
        pre-fetched ``comments`` and ``likes`` counts.
    """
    loader = DefaultItemLoader(VideoItem(), response=response)
    # Values known from the request/meta rather than the page body.
    loader.add_value("vid", get_id(response.url))
    loader.add_value("url", response.url)
    loader.add_value("comments", response.meta.get("comments", 0))
    loader.add_value("likes", response.meta.get("likes", 0))
    # Field name -> XPath expression, applied uniformly below.
    xpath_fields = (
        ("author",
         "//div[contains(@class, 'user')]/a[contains(@class, 'name')]/text()"),
        ("title", "//h1/@title"),
        ("desc", "//div[@id='v_desc']/div[contains(@class, 'info')]"),
        ("play_nums",
         "substring(//span[contains(@class, 'v play')]/@title, 5)"),
        ("danmu_nums",
         "substring(//span[contains(@class, 'v dm')]/@title, 5)"),
        ("coins", "substring(//span[@report-id='coinbtn1']/@title, 6)"),
        ("collections", "substring(//span[@report-id='collect1']/@title, 5)"),
        ("shares", "//div[@id='playpage_share']//span[@class='num']/text()"),
        ("publish_time", "//time/text()"),
        ("category", "//div[contains(@class, 'tminfo')]/span/a/text()"),
        ("tags", "//ul[contains(@class, 'tag-area')]/li/a/text()"),
    )
    for field, xpath in xpath_fields:
        loader.add_xpath(field, xpath)
    detail_item = loader.load_item()
    self.crawler.stats.inc_value("detail_item")
    yield detail_item
    if PARSE_COMMENTS:
        yield from self.gen_comments_item(response)
def _gen_reply_comments(self, response, comment, user, cnt):
    """
    Yield reply ("floor-in-floor") comment items for one main comment.

    :param response: Response object; ``meta['browser']`` carries the
        Selenium driver used to click through reply pagination.
    :param comment: selenium element of the main comment node
    :param user: selenium element of the main comment's author (used as the
        default reply target when none is parsed from the reply text)
    :param cnt: count for pagination (index fed into the paginate script)
    :return: generator of CommentItem
    """
    browser = response.meta.get("browser")
    # Locate the reply pagination widget and the reply nodes.
    try:
        reply_page_element = comment.find_element_by_xpath(
            "div/div[@class='paging-box']/span[@class='result']")
        replys = comment.find_elements_by_xpath(
            "div/div[@class='reply-box']/div[contains(@class , 'reply-item')]"
        )
        reply_page_num = GetNumber()(reply_page_element.text)[0]
        find_next = True
    except Exception:
        # No pagination widget found: nothing to page through.
        # Was a bare ``except:`` (would also catch KeyboardInterrupt);
        # narrowed to Exception without changing the fallback values.
        reply_page_num = 2
        replys = []
        find_next = False
    # Reply floors carry no floor number of their own, so count manually.
    floor = 1
    for page in range(2, reply_page_num + 1):
        for reply in replys:
            # One item per reply inside this thread.
            rl_item_loader = DefaultItemLoader(item=CommentItem(),
                                               response=response)
            rl_user = reply.find_element_by_xpath(
                "div/div[@class='user']/a")
            rl_desc = reply.find_element_by_xpath(
                "div/div[@class='user']/span[@class='text-con']")
            rl_info = reply.find_element_by_xpath("div/div[@class='info']")
            # Replies expose no platform badge; record as unknown.
            rl_plad = "未知"
            rl_cur_time = rl_info.find_element_by_xpath(
                "span[@class='time']")
            rl_like = rl_info.find_element_by_xpath("span[@class='like']")
            # find_reply_person splits "回复 @xxx: text" into the addressee
            # and the remaining text (stashed on the element as ``.data``).
            rl_person, rl_desc.data = find_reply_person(rl_desc.text)
            rl_item_loader.add_value("source", response.url)
            rl_item_loader.add_value("sid", get_source(response.url))
            rl_item_loader.add_value("person", rl_user.text)
            rl_item_loader.add_value(
                "desc", getattr(rl_desc, "data", rl_desc.text))
            rl_item_loader.add_value("likes", rl_like.text)
            rl_item_loader.add_value("plat_from", rl_plad)
            rl_item_loader.add_value("reply_person",
                                     rl_person if rl_person else user.text)
            rl_item_loader.add_value("floor", floor)
            rl_item_loader.add_value("is_main", False)
            rl_item_loader.add_value("publish_time",
                                     createDatetime(rl_cur_time.text))
            rl_item = rl_item_loader.load_item()
            floor += 1
            yield rl_item
        # More than one reply page: click to the next page and re-collect.
        if find_next:
            browser.implicitly_wait(3)
            browser.execute_script(self.script.replyPaginate.format(cnt))
            replys = comment.find_elements_by_xpath(
                "div/div[@class='reply-box']/div[contains(@class , 'reply-item')]"
            )
            time.sleep(2)
def _gen_main_comments(self, response, comment, user):
    """
    Yield the CommentItem for one main-floor comment.

    :param response: Response object
    :param comment: selenium element of the comment node
    :param user: selenium element of the comment author link
    :return: generator yielding a single CommentItem
    """
    body = comment.find_element_by_xpath("div/p[@class='text']")
    # Metadata row beneath the comment text.
    meta_box = comment.find_element_by_xpath("div/div[@class='info']")
    floor_node = meta_box.find_element_by_xpath("span[@class='floor']")
    time_node = meta_box.find_element_by_xpath("span[@class='time']")
    like_node = meta_box.find_element_by_xpath("span[@class='like']")
    # The platform badge is optional; when absent fall back to "PC端".
    try:
        plat_node = meta_box.find_element_by_xpath("span[@class='plad']")
    except Exception:
        plat_node = None
    loader = DefaultItemLoader(item=CommentItem(), response=response)
    for field, value in (
        ("source", response.url),
        ("sid", get_source(response.url)),
        ("person", user.text),
        ("desc", body.text),
        ("likes", like_node.text),
        ("plat_from", plat_node.text if plat_node else "PC端"),
        ("reply_person", ""),
        ("floor", floor_node.text),
        ("is_main", True),
        ("publish_time", createDatetime(time_node.text)),
    ):
        loader.add_value(field, value)
    yield loader.load_item()