def parse_detail(self, response):
    """Build a W3cItem from a W3C tutorial detail page.

    The subsection title was extracted on the listing page and is
    forwarded here through ``response.meta``.
    """
    item = W3cItem()
    page_url = response.url
    # Title travels in meta from the previous parse step.
    item['sub_title'] = response.meta.get("sub_title")
    # Gather every text node under the content container, then normalize.
    raw_text = response.xpath("//div[@class = 'content-bg']//text()").getall()
    item['content'] = self.process_data(raw_text)
    item['url'] = page_url
    item['url_object_id'] = get_md5(page_url)
    yield item
def parse(self, response):
    """Parse an OSChina article page into an OSChinaItem.

    ``url``/``url_object_id``/``title`` go through the ItemLoader so its
    input/output processors apply; ``content`` and ``post_date`` need the
    manual helpers (``process_data`` / ``date_produce``) and are assigned
    directly on the item.
    """
    oschina_item = OSChinaItem()
    oschina_itemloader = OSChinaItemLoader(item=oschina_item, response=response)
    oschina_itemloader.add_value("url", response.url)
    oschina_itemloader.add_value("url_object_id", get_md5(response.url))
    oschina_itemloader.add_xpath("title", "//div[@class = 'article-detail']//h2[@class = 'header']/text()")
    # Article body: children of the content div that carry no class attribute.
    content = self.process_data(response.xpath("//div[@id = 'articleContent']/*[not(@class)]").getall())
    post_date = self.date_produce(response.xpath("//div[contains(@class,'meta-wrap')]/div[1]/text()").getall())
    # Capture load_item()'s return explicitly, consistent with every other
    # spider in this file. The original discarded it, which only worked
    # because the loader mutates the item it was given in place.
    oschina_item = oschina_itemloader.load_item()
    # Assign manual fields after load_item(): these field names were never
    # added to the loader, so nothing can overwrite them.
    oschina_item['content'] = content
    oschina_item['post_date'] = post_date
    yield oschina_item
def parse(self, response):
    """Extract a Runoob tutorial page into a RunoobItem."""
    item = RunoobItem()
    page_url = response.url
    # The first <h1> inside the content area is the section title.
    heading_nodes = response.xpath("//div[@id = 'content']//h1[1]//text()").getall()
    sub_title = self.process_data(heading_nodes)
    if sub_title == "":
        # Page has no usable heading — fall back to the site name.
        sub_title = "菜鸟教程"
    body_nodes = response.xpath("//div[@id = 'content']//*//text()").getall()
    item['content'] = self.process_data(body_nodes)
    item['sub_title'] = sub_title
    item['url'] = page_url
    item['url_object_id'] = get_md5(page_url)
    yield item
def parse(self, response):
    """Scrape a Bole article page entirely through the ItemLoader."""
    # ItemLoader-style extraction: all cleaning is delegated to the
    # loader's input/output processors, no manual parsing needed here.
    loader = BoleItemLoader(item=BoleItem(), response=response)
    loader.add_css("title", ".entry-header h1::text")
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_css("post_date", "p.entry-meta-hide-on-mobile::text")
    loader.add_css("content", "div.entry")
    yield loader.load_item()
def parse(self, response):
    """Scrape an ITPUB article: metadata via the loader, body text by hand."""
    item = ItPubItem()
    loader = ItPubItemLoader(item=item, response=response)
    loader.add_css("title", "h1.preview-title::text")
    loader.add_xpath("post_date", "//div[@class = 'mess']/span[3]//text()")
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    # Body is cleaned manually with process_data(); "content" is never added
    # to the loader, so load_item() cannot overwrite this assignment.
    body_nodes = response.xpath("//div[@class = 'preview-main']//text()").getall()
    item["content"] = self.process_data(body_nodes)
    yield loader.load_item()
def parse_detail(self, response):
    """Parse a cnblogs post detail page; the title comes from the listing page."""
    # break_crawl()  # (disabled) abort once 20000 rows have been crawled
    item = CnblogItem()
    loader = CnblogItemLoader(item=item, response=response)
    loader.add_xpath("post_date", "//span[@id = 'post-date']//text()")
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    # Title was scraped on the index page and forwarded via request meta.
    item['title'] = response.meta.get("title")
    # Full post body: every text node under the article container.
    body_nodes = response.xpath("//div[@id = 'cnblogs_post_body']//*//text()").getall()
    item['content'] = self.process_data(body_nodes)
    yield loader.load_item()
def parse(self, response):
    """Scrape a 51CTO blog article into an ItCtoItem."""
    item = ItCtoItem()
    loader = ItCtoItemLoader(item=item, response=response)
    loader.add_css("title", "h1.artical-title::text")
    loader.add_css("post_date", "div.artical-title-list a.time::text")
    # Reader count — left disabled, as in the original extraction:
    # loader.add_css("read_count", "div.artical-title-list a.read::text")
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    # Body text is cleaned manually; "content" is not a loader field, so
    # load_item() leaves this assignment intact.
    body_nodes = response.xpath(
        "//div[contains(@class,'artical-content')]//text()").getall()
    item["content"] = self.process_data(body_nodes)
    yield loader.load_item()
def parse(self, response):
    """Parse a course article page into a CourseBaiItem."""
    course_bai_item = CourseBaiItem()
    # Title: all text nodes of the article heading. The original called
    # .get() here (a single string, or None) while every other
    # process_data() call in this file receives a .getall() list; use
    # .getall() so process_data always sees a list of text fragments
    # instead of iterating a lone string character by character.
    sub_title = self.process_data(
        response.xpath(
            "//div[@id = 'navs']//h1[@class = 'article-title']//text()").
        getall())
    # Body: only children without an inline style attribute are kept.
    content = self.process_data(
        response.xpath(
            "//div[@class = 'article-content']/*[not(@style)]//text()").
        getall())
    url = response.url
    course_bai_item['sub_title'] = sub_title
    course_bai_item['content'] = content
    course_bai_item['url'] = url
    course_bai_item['url_object_id'] = get_md5(url)
    yield course_bai_item