Ejemplo n.º 1
0
    def parse_detail(self, response):
        """Build and yield a ``W3cItem`` from a detail-page response.

        The sub-title is read from ``response.meta`` under the key
        ``"sub_title"``; the content is the list of text nodes found under
        ``div.content-bg`` after being passed through ``self.process_data``.
        """
        item = W3cItem()

        inherited_sub_title = response.meta.get("sub_title")
        page_url = response.url
        text_nodes = response.xpath(
            "//div[@class = 'content-bg']//text()").getall()
        body_text = self.process_data(text_nodes)
        # The identifier is derived from the URL via get_md5.
        page_id = get_md5(page_url)

        item['content'] = body_text
        item['sub_title'] = inherited_sub_title
        item['url'] = page_url
        item['url_object_id'] = page_id
        yield item
Ejemplo n.º 2
0
    def parse(self, response):
        """Build and yield an ``OSChinaItem`` from an article response.

        ``url``, ``url_object_id`` and ``title`` are collected through an
        ``OSChinaItemLoader``; ``content`` and ``post_date`` are computed by
        ``self.process_data`` / ``self.date_produce`` and written directly
        onto the item before the loader is asked to load it.
        """
        item = OSChinaItem()
        loader = OSChinaItemLoader(item=item, response=response)

        # Fields handled by the loader.
        page_url = response.url
        loader.add_value("url", page_url)
        loader.add_value("url_object_id", get_md5(page_url))
        loader.add_xpath(
            "title",
            "//div[@class = 'article-detail']//h2[@class = 'header']/text()")

        # Fields computed by hand from raw selector results.
        body_nodes = response.xpath(
            "//div[@id = 'articleContent']/*[not(@class)]").getall()
        meta_nodes = response.xpath(
            "//div[contains(@class,'meta-wrap')]/div[1]/text()").getall()
        item['content'] = self.process_data(body_nodes)
        item['post_date'] = self.date_produce(meta_nodes)

        # The return value is intentionally unused here: the same item object
        # that was handed to the loader is the one yielded.
        loader.load_item()
        yield item
Ejemplo n.º 3
0
    def parse(self, response):
        """Build and yield a ``RunoobItem`` from a tutorial page response.

        The sub-title comes from the first ``h1`` under ``div#content``; when
        ``self.process_data`` returns an empty string for it, a fixed default
        title is used instead.
        """
        item = RunoobItem()

        heading = self.process_data(
            response.xpath("//div[@id = 'content']//h1[1]//text()").getall())
        # Fall back to the default title only for an exactly-empty string.
        heading = "菜鸟教程" if heading == "" else heading

        body_nodes = response.xpath(
            "//div[@id = 'content']//*//text()").getall()
        body_text = self.process_data(body_nodes)
        page_url = response.url
        page_id = get_md5(page_url)

        item['content'] = body_text
        item['sub_title'] = heading
        item['url'] = page_url
        item['url_object_id'] = page_id
        yield item
Ejemplo n.º 4
0
    def parse(self, response):
        """Build and yield a ``BoleItem`` using only a ``BoleItemLoader``.

        Every field (``title``, ``url``, ``url_object_id``, ``post_date``,
        ``content``) is registered on the loader; the loaded item is yielded.
        """
        # Extraction via the item-loader approach.
        loader = BoleItemLoader(item=BoleItem(), response=response)

        loader.add_css("title", ".entry-header h1::text")
        page_url = response.url
        loader.add_value("url", page_url)
        loader.add_value("url_object_id", get_md5(page_url))
        loader.add_css("post_date", "p.entry-meta-hide-on-mobile::text")
        loader.add_css("content", "div.entry")

        yield loader.load_item()
Ejemplo n.º 5
0
    def parse(self, response):
        """Build and yield an ``ItPubItem`` from an article response.

        ``title``, ``post_date``, ``url`` and ``url_object_id`` go through an
        ``ItPubItemLoader``; ``content`` is produced by ``self.process_data``
        from the text nodes under ``div.preview-main`` and set directly on
        the item before loading.
        """
        item = ItPubItem()
        loader = ItPubItemLoader(item=item, response=response)

        # Loader-managed fields.
        loader.add_css("title", "h1.preview-title::text")
        loader.add_xpath(
            "post_date", "//div[@class = 'mess']/span[3]//text()")
        page_url = response.url
        loader.add_value("url", page_url)
        loader.add_value("url_object_id", get_md5(page_url))

        # Content is assembled by hand and written onto the item itself.
        text_nodes = response.xpath(
            "//div[@class = 'preview-main']//text()").getall()
        item["content"] = self.process_data(text_nodes)

        yield loader.load_item()
Ejemplo n.º 6
0
    def parse_detail(self, response):
        """Build and yield a ``CnblogItem`` from a blog-post response.

        The title is taken from ``response.meta`` (key ``"title"``), the
        content from the text nodes under ``div#cnblogs_post_body`` via
        ``self.process_data``; ``post_date``, ``url`` and ``url_object_id``
        are collected through a ``CnblogItemLoader``.
        """
        # Disabled hook (per the original note: interrupt the crawl once
        # 20000 rows of data have been crawled):
        # break_crawl()

        item = CnblogItem()
        loader = CnblogItemLoader(item=item, response=response)

        # Article title handed down from the previous level through meta.
        inherited_title = response.meta.get("title")

        # Parse the article content.
        text_nodes = response.xpath(
            "//div[@id = 'cnblogs_post_body']//*//text()").getall()
        body_text = self.process_data(text_nodes)

        loader.add_xpath("post_date", "//span[@id = 'post-date']//text()")
        page_url = response.url
        loader.add_value("url", page_url)
        loader.add_value("url_object_id", get_md5(page_url))

        item['title'] = inherited_title
        item['content'] = body_text
        yield loader.load_item()
Ejemplo n.º 7
0
    def parse(self, response):
        """Build and yield an ``ItCtoItem`` from an article response.

        ``title``, ``post_date``, ``url`` and ``url_object_id`` are collected
        through an ``ItCtoItemLoader``; ``content`` is produced by
        ``self.process_data`` from the text nodes under the element whose
        class contains ``artical-content`` and is set directly on the item.
        """
        item = ItCtoItem()
        loader = ItCtoItemLoader(item=item, response=response)

        loader.add_css("title", "h1.artical-title::text")
        loader.add_css("post_date", "div.artical-title-list a.time::text")
        # Read count (currently disabled):
        # loader.add_css("read_count", "div.artical-title-list a.read::text")
        page_url = response.url
        loader.add_value("url", page_url)
        loader.add_value("url_object_id", get_md5(page_url))

        text_nodes = response.xpath(
            "//div[contains(@class,'artical-content')]//text()").getall()
        item["content"] = self.process_data(text_nodes)

        yield loader.load_item()
Ejemplo n.º 8
0
    def parse(self, response):
        """Build and yield a ``CourseBaiItem`` from an article response.

        Both the sub-title and the content are passed through
        ``self.process_data``; the URL and an identifier derived from it via
        ``get_md5`` are stored alongside them.
        """
        item = CourseBaiItem()

        # NOTE(review): the title is fetched with .get() while the content
        # below uses .getall(); confirm self.process_data accepts both forms.
        title_selection = response.xpath(
            "//div[@id = 'navs']//h1[@class = 'article-title']//text()")
        heading = self.process_data(title_selection.get())

        body_selection = response.xpath(
            "//div[@class = 'article-content']/*[not(@style)]//text()")
        body_text = self.process_data(body_selection.getall())

        page_url = response.url
        page_id = get_md5(page_url)

        item['sub_title'] = heading
        item['content'] = body_text
        item['url'] = page_url
        item['url_object_id'] = page_id

        yield item