def parse_post(self, response):
    """Build a ``NewsItem`` from an article page and yield it.

    ``timestamp`` and ``date`` are always emitted as empty strings by this
    callback. A missing ``<h1>`` or teaser produces an empty string instead
    of raising.

    :param response: The scrapy response
    :return: generator yielding a single ``NewsItem``
    """
    author = self.parse_author(response)
    # .get() gives None when the xpath matches nothing, so fall back to ''
    # before calling .strip().
    title = (response.xpath('//h1/text()').get() or '').strip()
    # Evaluate the teaser xpath once; an absent or empty teaser becomes ''.
    subhead = response.xpath('//div[@class="teaser_detail"]/text()').get() or ''
    item = NewsItem(
        title=title,
        timestamp='',
        content_html=response.xpath('//div[@class="content_detail"]').get(),
        body=self.parse_body(response),
        link=response.url,
        subhead=subhead,
        pic=self.parse_pictures(response),
        date='',
        author=author
    )
    yield item
Example #2
0
 def parse_post(self, response):
     """Build a ``NewsItem`` from an article page and yield it.

     The article container is selected once and reused for both
     ``content_html`` and the text ``body``. A missing container gives an
     empty ``body`` and a missing summary gives an empty ``subhead`` instead
     of raising.

     :param response: The scrapy response
     :return: generator yielding a single ``NewsItem``
     """
     author = self.parse_author(response)
     time_format, short_date = self.parse_date(response)
     content_html = response.xpath('//*[@class="Content"]').get()
     # .get() gives None when nothing matches; guard before .strip().
     subhead = response.xpath(
         '//div[@id="ARTICLEVIEW"]//*[@class="SGTOSummary"]/text()').get()
     item = NewsItem(
         title=response.xpath('//*[@id="ARTICLEVIEW"]//p[@class="SGTOTitle"]/text()').get(),
         timestamp=time_format,
         content_html=content_html,
         # html2text cannot process None, so only convert real markup.
         body=html2text.html2text(content_html) if content_html else '',
         link=response.url,
         subhead=subhead.strip() if subhead else '',
         pic=self.parse_pictures(response),
         date=short_date,
         author=author
     )
     yield item
 def parse_post(self, response):
     """Build a ``NewsItem`` from an article page and yield it.

     The entry-content section is selected once and reused for both
     ``content_html`` and the text ``body``; when the section is missing,
     ``body`` is ``''`` instead of raising. ``subhead`` is always ``''``.

     :param response: The scrapy response
     :return: generator yielding a single ``NewsItem``
     """
     author = self.parse_author(response)
     time_format, short_date = self.parse_date(response)
     content_html = response.xpath(
         '//section[@class="cb-entry-content clearfix"]').get()
     item = NewsItem(
         title=response.xpath('//h1/text()').get(),
         timestamp=time_format,
         content_html=content_html,
         # html2text cannot process None, so only convert real markup.
         body=html2text.html2text(content_html) if content_html else '',
         link=response.url,
         subhead='',
         pic=self.parse_pictures(response),
         date=short_date,
         author=author
     )
     yield item
Example #4
0
 def parse_post(self, response):
     """Build a ``NewsItem`` from an article page and yield it.

     The widget wrapper is selected once and reused for both
     ``content_html`` and the text ``body``; when the wrapper is missing,
     ``body`` is ``''`` instead of raising. ``subhead`` is always ``''``.

     :param response: The scrapy response
     :return: generator yielding a single ``NewsItem``
     """
     author = self.parse_author(response)
     time_format, short_date = self.parse_date(response)
     content_html = response.xpath(
         '//div[@class="elementor-widget-wrap"]').get()
     item = NewsItem(
         title=response.xpath('//h2[@class="post-title"]/text()').get(),
         timestamp=time_format,
         content_html=content_html,
         # html2text cannot process None, so only convert real markup.
         body=html2text.html2text(content_html) if content_html else '',
         link=response.url,
         subhead='',
         pic=self.parse_pictures(response),
         date=short_date,
         author=author
     )
     yield item
Example #5
0
 def parse_post(self, response):
     """Build a ``NewsItem`` from an article page and yield it.

     Title and subhead are taken from the ``pt1:pbl16`` / ``pt1:pbl18``
     elements and converted to text with html2text; an element that is not
     present yields ``''`` instead of raising. ``pic`` and ``author`` are
     always ``''``.

     :param response: The scrapy response
     :return: generator yielding a single ``NewsItem``
     """
     time_format, short_date = self.parse_date(response)
     content, html = self.parse_content(response)
     # .get() gives None when nothing matches and html2text cannot process
     # None, so each fragment is converted only when it exists.
     title_html = response.xpath('//*[@id="pt1:pbl16"]').get()
     subhead_html = response.xpath('//*[@id="pt1:pbl18"]').get()
     item = NewsItem(
         title=html2text.html2text(title_html) if title_html else '',
         timestamp=time_format,
         content_html=html,
         body=content,
         link=response.url,
         subhead=html2text.html2text(subhead_html) if subhead_html else '',
         pic='',
         date=short_date,
         author='')
     yield item
 def parse_post(self, response):
     """Build a ``NewsItem`` from an article page and yield it.

     The description block is selected once and reused for both
     ``content_html`` and the text ``body``. A missing description gives an
     empty ``body`` and a missing short description gives an empty
     ``subhead`` instead of raising.

     :param response: The scrapy response
     :return: generator yielding a single ``NewsItem``
     """
     author = self.parse_author(response)
     time_format, short_date = self.parse_date(response)
     content_html = response.xpath('//div[@class="description"]').get()
     # .get() gives None when nothing matches; guard before .strip().
     subhead = response.xpath('//div[@class="shortDesc"]/text()').get()
     item = NewsItem(
         title=response.xpath('//h1/text()').get(),
         timestamp=time_format,
         content_html=content_html,
         # html2text cannot process None, so only convert real markup.
         body=html2text.html2text(content_html) if content_html else '',
         link=response.url,
         subhead=subhead.strip() if subhead else '',
         pic=self.parse_pictures(response),
         date=short_date,
         author=author
     )
     yield item
 def parse_post(self, response):
     """Build a ``NewsItem`` from an article page and yield it.

     The content block is selected once and reused for both
     ``content_html`` and the text ``body``. ``body`` and ``subhead`` are
     produced with html2text and fall back to ``''`` when their source
     element is missing instead of raising.

     :param response: The scrapy response
     :return: generator yielding a single ``NewsItem``
     """
     time_format, short_date = self.parse_date(response)
     content_html = response.xpath('//div[@class="content-detail"]').get()
     subhead_html = response.xpath('//div[@class="brief-detail"]').get()
     item = NewsItem(
         title=response.xpath('//h1[@itemprop="headline"]/text()').get(),
         timestamp=time_format,
         content_html=content_html,
         # html2text cannot process None, so only convert real markup.
         body=html2text.html2text(content_html) if content_html else '',
         link=response.url,
         subhead=html2text.html2text(subhead_html) if subhead_html else '',
         pic=self.parse_pictures(response),
         date=short_date,
         author=self.parse_author(response))
     yield item
    def parse_post(self, response):
        """Build a ``NewsItem`` from an article page and yield it.

        Title and subhead are whitespace-stripped; when either is absent
        from the page the field is ``''`` instead of raising.

        :param response: The scrapy response
        :return: generator yielding a single ``NewsItem``
        """
        author = self.parse_author(response)
        time_format, short_date = self.parse_date(response)
        # .get() gives None when nothing matches, so fall back to '' before
        # calling .strip().
        title = response.xpath('//h1/text()').get() or ''
        subhead = response.xpath(
            '//div[@class="sapo_detail fr"]/text()').get() or ''
        item = NewsItem(
            title=title.strip(),
            timestamp=time_format,
            # NOTE(review): the id is spelled "cotent_detail" — confirm it
            # matches the site's markup.
            content_html=response.xpath(
                '//*[@id="cotent_detail"]').get(),
            body=self.parse_body(response),
            link=response.url,
            subhead=subhead.strip(),
            pic=self.parse_pictures(response),
            date=short_date,
            author=author
        )
        yield item
    def parse_post(self, response):
        """Build a ``NewsItem`` from an article page and yield it.

        Title and subhead are whitespace-stripped; when either is absent
        from the page the field is ``''`` instead of raising.

        :param response: The scrapy response
        :return: generator yielding a single ``NewsItem``
        """
        author = self.parse_author(response)
        time_format, short_date = self.parse_date(response)
        # .get() gives None when nothing matches, so fall back to '' before
        # calling .strip().
        title = response.xpath('//h1/text()').get() or ''
        subhead = response.xpath(
            '//p[@class="detail-sapo"]/text()').get() or ''
        item = NewsItem(
            title=title.strip(),
            timestamp=time_format,
            content_html=response.xpath(
                '//div[@class="article-content __MASTERCMS_CONTENT __MB_CONTENT_FOR_PRINTER"]'
            ).get(),
            body=self.parse_body(response),
            link=response.url,
            subhead=subhead.strip(),
            pic=self.parse_pictures(response),
            date=short_date,
            author=author)
        yield item
Example #10
0
    def parse_post(self, response):
        """Yield one ``NewsItem`` built from an article page.

            Yielded item structure:
                {
                    'title': string,                        # The title of the article
                    'author'                                # The author of the article, or '' if none
                    'subhead': string,                      # The subtitle of the article; always '' here
                    'print': string,                        # The page number of the article; always '' here
                    'date': string in '%Y-%m-%d' format,    # The publish date of the article
                    'body': string                          # The body of the article, '' if the content block is missing
                    'pic_list': string in                   # The link of pictures of the article, or '' if there are no pictures
                                                            f"{link1}|{text2}&&..." format
                    'original_link': string                 # The url of the article
                    'source': string                        # Always '' here
                }

        :param response: The scrapy response
        :return: generator yielding a single ``NewsItem``
        """
        author = self.parse_author(response)
        # Only the short date is stored; the timestamp returned by parse_date
        # is not set on the item.
        _, short_date = self.parse_date(response)
        content_html = response.xpath(
            '//div[@class="news_content entry-content"]').get()
        item = NewsItem(
            title=response.xpath('//h1/text()').get(),
            print='',
            # html2text cannot process None, so only convert real markup.
            body=html2text.html2text(content_html) if content_html else '',
            original_link=response.url,
            subhead='',
            pic_list=self.parse_pictures(response),
            date=short_date,
            author=author,
            source='')
        yield item
Example #11
0
    def parse_post(self, response):
        """Yield one ``NewsItem`` for an article page, or nothing if it has no title.

            Yielded item structure:
                {
                    'title': string,                        # The title of the article
                    'author'                                # Always '' here
                    'subhead': string,                      # The subtitle of the article, or '' if there is no subtitle
                    'date': string in '%Y-%m-%d' format,    # The publish date of the article
                    'timestamp': datetime                   # The ISO publish date of the article
                    'content_html': ...                     # First value returned by parse_content
                    'body': ...                             # Second value returned by parse_content
                    'pic': string in                        # The link of pictures of the article, or '' if there are no pictures
                                                            f"{link1}|{text2}&&..." format
                    'link': string                          # The url of the article
                }

        :param response: The scrapy response
        :return: generator yielding at most one ``NewsItem``
        """
        title = response.xpath(
            '//h1[@class="post-title main-title"]/text()').get()
        if not title:
            # Nothing is emitted for pages without the main title, so skip
            # the remaining parsing work as well.
            return
        time_format, short_date = self.parse_date(response)
        content, html = self.parse_content(response)
        subhead = response.xpath(
            '//h2[@class="post-sapo"]/strong/text()').get()
        item = NewsItem(
            title=title,
            timestamp=time_format,
            # NOTE(review): the local named `content` feeds content_html and
            # the one named `html` feeds body — confirm parse_content's
            # return order; the mapping is kept exactly as it was.
            content_html=content,
            body=html,
            link=response.url,
            # .get() gives None when nothing matches; use '' for no subtitle.
            subhead=subhead or '',
            pic=self.parse_pictures(response),
            date=short_date,
            author='')
        yield item
Example #12
0
    def parse_post(self, response):
        """Yield one ``NewsItem`` built from an article page.

            Yielded item structure:
                {
                    'title': string,                        # The title of the article, '' if not found
                    'author'                                # The author of the article, or '' if none
                    'subhead': string,                      # The subtitle of the article, or '' if there is no subtitle
                    'date': string in '%Y-%m-%d' format,    # The publish date of the article
                    'body': string                          # The body of the article, '' if the content block is missing
                    'pic_list': string in                   # The link of pictures of the article, or '' if there are no pictures
                                                            f"{link1}|{text2}&&..." format
                    'original_link': string                 # The url of the article
                }

        :param response: The scrapy response
        :return: generator yielding a single ``NewsItem``
        """
        # Only the short date is stored; the timestamp returned by parse_date
        # is not set on the item.
        _, short_date = self.parse_date(response)
        # .get() gives None when nothing matches, and neither .strip() nor
        # html2text can handle None, so every lookup is guarded.
        title = response.xpath('//h1[@title]/text()').get() or ''
        body_html = response.xpath('//*[@id="noidung"]').get()
        subhead_html = response.xpath('//*[@id="noidung"]/h2').get()
        item = NewsItem(
            title=title.strip(),
            body=html2text.html2text(body_html) if body_html else '',
            original_link=response.url,
            subhead=html2text.html2text(subhead_html) if subhead_html else '',
            pic_list=self.parse_pictures(response),
            date=short_date,
            author=response.xpath('//cite/text()').get() or '')
        yield item
    def parse_post(self, response):
        """Yield one ``NewsItem`` built from an article page.

            Yielded item structure:
                {
                    'title': string,                        # The title of the article
                    'author'                                # The author of the article (from parse_author)
                    'subhead': string,                      # The subtitle of the article, or '' if there is no subtitle
                    'date': string in '%Y-%m-%d' format,    # The publish date of the article
                    'timestamp': datetime                   # The ISO publish date of the article
                    'content_html': string                  # Raw HTML of the content block
                    'body': string                          # The body of the article, '' if the content block is missing
                    'pic': string in                        # The link of pictures of the article, or '' if there are no pictures
                                                            f"{link1}|{text2}&&..." format
                    'link': string                          # The url of the article
                }

        :param response: The scrapy response
        :return: generator yielding a single ``NewsItem``
        """
        author = self.parse_author(response)
        time_format, short_date = self.parse_date(response)
        # Select the content block once and reuse it for both fields.
        content_html = response.xpath('//div[@class="col-md-12"]').get()
        subhead = response.xpath(
            '//div[@class="des f-roboto-b t-16-mb dt-des"]/text()').get()
        item = NewsItem(
            title=response.xpath('//h1/text()').get(),
            timestamp=time_format,
            content_html=content_html,
            # html2text cannot process None, so only convert real markup.
            body=html2text.html2text(content_html) if content_html else '',
            link=response.url,
            # .get() gives None when nothing matches; use '' for no subtitle.
            subhead=subhead or '',
            pic=self.parse_pictures(response),
            date=short_date,
            author=author)
        yield item
Example #14
0
    def parse_post(self, response):
        """Yield one ``NewsItem`` built from an article page.

            Yielded item structure:
                {
                    'title': string,                        # The title of the article
                    'author'                                # The author of the article, or '' if none
                    'subhead': string,                      # The subtitle of the article, or '' if there is no subtitle
                    'date': string in '%Y-%m-%d' format,    # The publish date of the article
                    'timestamp': string in ISO 8601 format  # The ISO publish date of the article
                    'content_html': string                  # Raw HTML of the article body element
                    'body': string                          # The body of the article, '' if the body element is missing
                    'pic': string in                        # The link of pictures of the article, or '' if there are no pictures
                                                            f"{link1}|{text2}&&..." format
                    'link': string                          # The url of the article
                }

        :param response: The scrapy response
        :return: generator yielding a single ``NewsItem``
        """
        content_html = response.xpath('//*[@id="abody"]').get()
        subhead_html = response.xpath('//div[@class="sapo"]').get()
        item = NewsItem(
            title=response.xpath(
                '//*[@class="details__headline"]/text()').get(),
            timestamp=self.parse_timestamp(response, 'iso'),
            content_html=content_html,
            # .get() gives None when nothing matches and html2text cannot
            # process None, so only convert real markup.
            body=html2text(content_html) if content_html else '',
            link=response.url,
            subhead=html2text(subhead_html) if subhead_html else '',
            pic=self.parse_pic(response),
            date=self.parse_timestamp(response, 'date'),
            author=self.parse_author(response))
        yield item
Example #15
0
 def parse_post(self, response):
     """Build a ``NewsItem`` (including tags and category) and yield it.

     The author is the second text node inside the ``author`` div; when
     that node is missing or empty the author is ``''``. The main content
     block is selected once and reused for both ``content_html`` and the
     text ``body``; a missing block gives an empty ``body``.

     :param response: The scrapy response
     :return: generator yielding a single ``NewsItem``
     """
     tags = response.xpath(
         '//div[@class="tags-container"]/ul/li/a/@title').extract()
     cate = response.xpath(
         '//meta[@property="article:section"]//@content').get()
     # Check the length before indexing: indexing first raised IndexError
     # whenever fewer than two text nodes were present, so the '' fallback
     # could never apply to that case.
     author_texts = response.xpath('//div[@class="author"]').css(
         '::text').extract()
     author = author_texts[1] if len(author_texts) > 1 else ''
     content_html = response.xpath(
         '//div[@class="main-content-body"]').get()
     item = NewsItem(
         title=response.xpath('//h1[@class="article-title"]/text()').get(),
         timestamp=self.parse_timestamp(response),
         content_html=content_html,
         # html2text cannot process None, so only convert real markup.
         body=html2text.html2text(content_html) if content_html else '',
         tags=tags,
         category=cate,
         link=response.url,
         subhead=response.xpath('//h2[@class="sapo"]/text()').get(),
         pic=self.parse_pictures(response),
         date=self.parse_date(response),
         author=author)
     yield item