コード例 #1
0
    def parse_article(self, response):
        """Extract one article from the response and yield an Article item.

        The publication time is normalized via ``transform_time_fmt`` when
        the page provides one; the scraped-item counter is bumped and a
        progress line is logged every 100 items.
        """
        body_sel = response.css("#main-article-body article.main-article")

        published = body_sel.css(
            "#article-meta .byline-dateline time::text").extract_first()
        # extract_first() yields None when the selector matched nothing;
        # only normalize an actual timestamp string.
        if published is not None:
            published = self.transform_time_fmt(
                published, src_fmt="%d/%m/%Y %H:%M")

        self.article_scraped_count += 1
        if not self.article_scraped_count % 100:
            self.logger.info("Spider {}: Crawl {} items".format(
                self.name, self.article_scraped_count))

        yield Article(
            url=response.url,
            lang=self.lang,
            title=body_sel.css("#headline ::text").extract_first(),
            category=response.meta["category"],
            intro=body_sel.css(".cms-desc ::text").extract_first(),
            content=' '.join(body_sel.css("#article-body ::text").extract()),
            time=published)
コード例 #2
0
    def parse_article(self, response):
        """Parse an article page and yield a normalized Article item.

        Extracts title/intro/content/time, converts the timestamp to the
        project-wide format, and logs progress every 20 scraped items.
        """
        url = response.url
        lang = self.lang
        category = response.meta["category"]
        title = response.css(".article-title ::text").extract_first()
        intro = ' '.join(response.css(".article-detail-hd>p ::text").extract())
        # Body paragraphs outside the intro block; the last two text nodes
        # are dropped (presumably trailing boilerplate — confirm against
        # the live page layout).
        content = ' '.join(
            response.xpath(
                "//div[@class='article-detail']//"
                "p[not(ancestor::div[@class='article-detail-hd'])]//text()").
            extract()[:-2])
        time = response.css(".datetime span::text").extract()

        # Transform time to uniform format.
        # NOTE: extract() returns a list and never None, so the previous
        # `time is not None` guard always passed; an empty match joined to
        # "" and crashed transform_time_fmt. Guard on truthiness instead
        # and fall back to None when the page has no timestamp.
        if time:
            time = "".join(time)
            time = self.transform_time_fmt(time, src_fmt="%H:%M %d/%m/%Y")
        else:
            time = None

        self.article_scraped_count += 1
        self.print_num_scraped_items(every=20)

        yield Article(url=url,
                      lang=lang,
                      title=title,
                      category=category,
                      intro=intro,
                      content=content,
                      time=time)
コード例 #3
0
    def parse_article(self, response):
        """Scrape one product-style detail page and yield an Article item.

        This layout exposes no intro or timestamp, so those fields are
        emitted as empty strings.
        """
        item_fields = {
            "url": response.url,
            "lang": self.lang,
            "title": response.css(
                ".postDetail>.detail_product .product_name::text"
            ).extract_first(),
            "category": response.meta["category"],
            "intro": "",
            "content": ' '.join(
                response.css(
                    ".postDetail .full_description_inside ::text").extract()),
            "time": "",
        }

        self.article_scraped_count += 1
        if not self.article_scraped_count % 100:
            self.logger.info("Spider {}: Crawl {} items".format(
                self.name, self.article_scraped_count))

        yield Article(**item_fields)
コード例 #4
0
    def parse_article(self, response):
        """Parse an ASP.NET-style article page and yield an Article item."""
        root = response.xpath("//div[@id='ctl00_IDContent_ctl00_divContent']")

        published = root.css("div.box26>span::text").extract_first()
        # Transform time to uniform format: drop everything up to and
        # including the first ", ", then replace " - " with "_" so the
        # string matches "%d/%m/%Y_%H:%M".
        if published is not None:
            published = published[published.find(", ") + 2:]
            published = '_'.join(published.split(" - "))
            published = self.transform_time_fmt(
                published, src_fmt="%d/%m/%Y_%H:%M")

        self.article_scraped_count += 1
        if not self.article_scraped_count % 100:
            self.logger.info("Spider {}: Crawl {} items".format(
                self.name, self.article_scraped_count))

        yield Article(
            url=response.url,
            lang=self.lang,
            title=root.css("h1.fon31.mgb15::text").extract_first(),
            category=response.meta["category"],
            intro=' '.join(root.css("h2.fon33::text").extract()),
            content=' '.join(root.css("#divNewsContent ::text").extract()),
            time=published,
        )
コード例 #5
0
    def parse_article(self, response):
        """Parse an article page and yield a normalized Article item.

        Reads the fields out of the ``section.sidebar_1`` container and
        converts the timestamp to the project-wide format.
        """
        section = response.css("section.sidebar_1")

        url = response.url
        lang = self.lang
        title = section.css(".title_news_detail::text").extract_first()
        category = response.meta["category"]
        intro = section.css(".description::text").extract_first()
        content = section.css("article.content_detail ::text").extract()
        content = ' '.join(content)
        time = section.css("span.time::text").extract()

        # Transform time to uniform format.
        # NOTE: extract() returns a list and never None, so the previous
        # `time is not None` guard always passed; with no match the empty
        # string reached transform_time_fmt and raised. Guard on truthiness
        # instead and fall back to None.
        if time:
            # Join the text nodes with ", ", re-split on ", " (this also
            # splits inside the nodes), drop the first piece, truncate the
            # clock piece to HH:MM, and rebuild as "dd/mm/yyyy_HH:MM".
            time = ", ".join(time)
            time = time.split(", ")
            time[-1] = time[-1][:5]
            time = '_'.join(time[1:])
            time = self.transform_time_fmt(time, src_fmt="%d/%m/%Y_%H:%M")
        else:
            time = None

        self.article_scraped_count += 1
        if self.article_scraped_count % 100 == 0:
            self.logger.info("Spider {}: Crawl {} items".format(
                self.name, self.article_scraped_count))

        yield Article(
            url=url,
            lang=lang,
            title=title,
            category=category,
            intro=intro,
            content=content,
            time=time
        )
コード例 #6
0
    def parse_article_type1(self, response):
        """Parse the "type 1" article layout (example: game.thanhnien.vn)."""
        stamp = response.css(".details-heading time ::text").extract_first()
        # Transform time to uniform format: keep the first whitespace token
        # (parsed as %H:%M) and the last one (parsed as %d/%m/%Y).
        if stamp is not None:
            pieces = stamp.split()
            stamp = self.transform_time_fmt(
                ' '.join([pieces[0], pieces[-1]]), src_fmt="%H:%M %d/%m/%Y")

        self.article_scraped_count += 1
        if not self.article_scraped_count % 100:
            self.logger.info("Spider {}: Crawl {} items".format(
                self.name, self.article_scraped_count))

        yield Article(
            url=response.url,
            lang=self.lang,
            title=response.css(".main-title::text").extract_first(),
            category=response.meta["category"],
            intro=' '.join(
                response.css(".details-content .sapo ::text").extract()),
            content=' '.join(
                response.xpath(
                    "//div[@id='abody']//text()[not(ancestor::script)]"
                ).extract()),
            time=stamp,
        )
コード例 #7
0
    def parse_article(self, response):
        """Parse a blog-style article page and yield an Article item.

        This layout carries no intro text; ``intro`` is emitted empty.
        """
        post = response.css("#main-content article")

        published = post.css(".updated ::text").extract_first()
        # Date-only timestamp ("%Y-%m-%d"); normalize it when present.
        if published is not None:
            published = self.transform_time_fmt(published, src_fmt="%Y-%m-%d")

        self.article_scraped_count += 1
        if not self.article_scraped_count % 100:
            self.logger.info("Spider {}: Crawl {} items".format(
                self.name, self.article_scraped_count))

        yield Article(
            url=response.url,
            lang=self.lang,
            title=post.css(".post-title ::text").extract_first(),
            category=response.meta["category"],
            intro='',
            content=' '.join(
                post.xpath(
                    ".//div[@class='entry']//text()[not(ancestor::script)]"
                ).extract()),
            time=published,
        )
コード例 #8
0
    def parse_article(self, response):
        """Parse an article page and yield an Article item.

        Note: this layout provides no intro; the field is a single space,
        matching the original behavior.
        """
        stamp = response.css(
            ".header .info_item_popup .note_gera:first-child span::text"
        ).extract_first()
        # Date-only timestamp ("%d/%m/%Y"); normalize it when present.
        if stamp is not None:
            stamp = self.transform_time_fmt(stamp, src_fmt="%d/%m/%Y")

        self.article_scraped_count += 1
        if not self.article_scraped_count % 100:
            self.logger.info("Spider {}: Crawl {} items".format(
                self.name, self.article_scraped_count))

        yield Article(
            url=response.url,
            lang=self.lang,
            title=response.css(".header .title::text").extract_first(),
            category=response.meta["category"],
            intro=' ',
            content=' '.join(
                response.xpath(
                    "//div[@id='NewsContent']//text()[not(ancestor::script)]"
                ).extract()),
            time=stamp,
        )
コード例 #9
0
    def parse_article(self, response):
        """Parse an article under the .contentleft column and yield it."""
        left = response.css(".contentleft")

        stamp = left.css(
            "#ContentRightHeight .ngayxuatban::text").extract_first()
        # Trim surrounding whitespace, then normalize the timestamp.
        if stamp is not None:
            stamp = self.transform_time_fmt(
                stamp.strip(), src_fmt="%d/%m/%Y %H:%M")

        self.article_scraped_count += 1
        if not self.article_scraped_count % 100:
            self.logger.info("Spider {}: Crawl {} items".format(
                self.name, self.article_scraped_count))

        yield Article(
            url=response.url,
            lang=self.lang,
            title=left.css(".titledetail h1::text").extract_first(),
            category=response.meta["category"],
            intro=left.css(
                "#ContentRightHeight .sapo::text").extract_first(),
            content=' '.join(
                left.css(
                    "#ContentRightHeight #divNewsContent ::text").extract()),
            time=stamp,
        )
コード例 #10
0
    def parse_article(self, response):
        """Parse a table-layout article page and yield an Article item."""
        media_table = response.css("div.media table")

        stamp = media_table.css(
            "div.icon_date_top>div.pull-left::text").extract_first()
        # Transform time to uniform format: drop everything before the
        # first ", " and join the remaining pieces with "_" so the string
        # matches "%d/%m/%Y_%H:%M:%S".
        if stamp is not None:
            stamp = '_'.join(stamp.split(", ")[1:])
            stamp = self.transform_time_fmt(stamp, src_fmt="%d/%m/%Y_%H:%M:%S")

        self.article_scraped_count += 1
        if not self.article_scraped_count % 100:
            self.logger.info("Spider {}: Crawl {} items".format(
                self.name, self.article_scraped_count))

        yield Article(
            url=response.url,
            lang=self.lang,
            title=media_table.css("div.ndtitle ::text").extract_first(),
            category=response.meta["category"],
            intro=media_table.css(
                "div.ndcontent.ndb p ::text").extract_first(),
            content=' '.join(
                media_table.css("div[class=ndcontent] ::text").extract()),
            time=stamp,
        )
コード例 #11
0
    def parse_article(self, response):
        """Build an Article from metadata gathered by the listing parser.

        Title, category, intro and time were extracted upstream and passed
        through ``response.meta``; only the article body is read from this
        page. (Removed an unused ``section = response.css(...)`` local that
        performed a needless CSS query.)
        """
        meta = response.meta

        url = response.url
        lang = self.lang
        title = meta["title"]
        category = meta["category"]
        intro = meta["intro"]
        # Full text of the article body container.
        content = ' '.join(
            response.xpath("//div[@id='ArticleContent']//text()").extract())
        time = meta["time"]

        self.article_scraped_count += 1
        if self.article_scraped_count % 100 == 0:
            self.logger.info("Spider {}: Crawl {} items".format(
                self.name, self.article_scraped_count))

        yield Article(
            url=url,
            lang=lang,
            title=title,
            category=category,
            intro=intro,
            content=content,
            time=time
        )