Example #1
0
    def parse_article(self, response):
        """Parse a Zing news article page and chain a request for its comments.

        Collects ld+json metadata, <meta> tag values, body text, image and
        video URLs into ``article``, then yields a request to the comment API
        with the partial article carried in ``response.meta``.
        """
        article = {}

        # get ld_json (best effort: the script tag may be missing or malformed)
        try:
            ld_json = response.xpath(
                '//script[contains(text(),"NewsArticle")]/text()').get()
            ld_json_dict = json.loads(ld_json)
            # NOTE(review): `time` appears to be a project helper module
            # (has timestamp_converter), not stdlib time — confirm.
            ld_json_dict = time.timestamp_converter(ld_json_dict)
            article.update(ld_json_dict)
        except Exception:
            # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate
            pass

        # get meta elements
        elems = {
            'meta-description': response.xpath("//meta[@name='description']/@content").get(),
            'meta-keywords': response.xpath("//meta[@name='keywords']/@content").get(),
            'meta-title': response.xpath("//meta[@name='title']/@content").get(),
            'meta-copyright': response.xpath("//meta[@name='copyright']/@content").get(),
            'meta-author': response.xpath("//meta[@name='author']/@content").get(),
            'language': response.xpath('//meta[@http-equiv = "content-language"]/@content').get(),
            'geo.placename': response.xpath('//meta[@name = "geo.placename"]/@content').get(),
            # NOTE(review): 'geo.position' reads the geo.region meta tag — looks
            # like a copy/paste slip, kept as-is to preserve the output schema.
            'geo.position': response.xpath('//meta[@name = "geo.region"]/@content').get(),
            'geo.region': response.xpath('//meta[@name = "geo.region"]/@content').get(),
            'meta-article:author': response.xpath("//meta[@property='article:author']/@content").get(),
            'meta-article:publisher': response.xpath("//meta[@property='article:publisher']/@content").get(),
            'category': response.xpath('//p[@class = "the-article-category"]/a/text()').get(),
            'organization': 'zing',
            'related_urls': response.xpath('//div[@class = "article-list layout-grid-3"]//article/p/a/@href').getall(),
            'url': response.url
        }
        article.update(elems)
        article.update(response.meta['viral'])

        # get content — single join instead of quadratic string `+=`
        content = ''.join(
            text.strip()
            for text in response.xpath('//*[@id="page-article"]/div[@class="page-wrapper"]/descendant::div[@class = "the-article-body"]/p/text()').getall())
        article.update({'content': content})
        article.update({'word_count': len(content.split())})

        # get image url
        images = {
            'image' + str(index): src
            for index, src in enumerate(response.xpath('//*[@id="page-article"]/div[@class="page-wrapper"]/descendant::table[@class = "picture"]//img/@src').getall(), 1)
        }
        article.update({'image-urls': images})

        # get video url
        videos = {
            'video' + str(index): src
            for index, src in enumerate(response.xpath('//figure[@class="video cms-video"]/@data-video-src').getall(), 1)
        }
        article.update({'video urls': videos})

        # get comments — `article_id` avoids shadowing the `id` builtin
        article_id = response.xpath('//@article-id').get()
        if article_id is None:
            # the original crashed on None + str concatenation; bail out instead
            return
        cmt_request = "https://api.news.zing.vn/api/comment.aspx?action=get&id=" + article_id
        yield scrapy.Request(cmt_request, callback=self.parse_comments, meta={'article': article})
Example #2
0
    def parse_article(self, response):
        """Parse a VnExpress article page and chain a Facebook like-count request.

        Extracts ld+json data, publish/modify dates, <meta> values, author,
        body text, thumbnails and related links, then (when a share URL exists)
        yields a request for the article's Facebook like count with the article
        and comment ids carried in ``meta``.
        """
        article = dict()
        title = response.xpath('(//h1[@class="title_news_detail mb10"]/text())|(//h1[@class="title"]/text())').get()
        if title is None:
            # not an article page — nothing to yield
            return
        # get ld_json (best effort)
        try:
            ld_json = response.xpath('//script[contains(text(),"NewsArticle")]/text()').get()
            ld_json = json.loads(ld_json)
            # NOTE(review): `time` is a project helper module, not stdlib time
            ld_json = time.timestamp_converter(ld_json)
            article.update(ld_json)
        except Exception:
            # narrowed from a bare `except:`
            pass
        # fall back to meta tags when ld+json lacked the dates
        if 'datePublished' not in article:
            datePublished = response.xpath('(//meta[@name="pubdate"]/@content)').get()
            if datePublished is not None:
                article.update({'datePublished': time.Vnex_timestamp(datePublished.strip())})
            else:
                article.update({'datePublished': response.xpath('//meta[@name="its_publication"]/@content').get()})
        if 'dateModified' not in article:
            dateModified = response.xpath('(//meta[@itemprop="dateModified"]/@content)').get()
            if dateModified is not None:
                article.update({'dateModified': time.Vnex_timestamp(dateModified.strip())})
            else:
                article.update({'dateModified': response.xpath('//meta[@name="article_updatetime"]/@content').get()})
        article.update({'link': response.url, 'title': title})
        # get meta
        article.update({'type': response.xpath("//head/meta[@property='og:type']/@content").get()})
        article.update({'description': response.xpath("//head/meta[@name='description']/@content").get()})
        article.update({'keywords': response.xpath("//head/meta[@name='keywords']/@content").get()})
        article.update({'copyright': response.xpath("//head/meta[@name='copyright']/@content").get()})
        article.update({'language': response.xpath("//head/meta[@name='Language']/@content").get()})
        article.update({'geo_place_name': response.xpath("//meta[@name = 'geo.placename']/@content").get()})
        article.update({'geo_region': response.xpath("//meta[@name = 'geo.region']/@content").get()})
        article.update({'geo_position': response.xpath("//meta[@name = 'geo.position']/@content").get()})
        # NOTE: the breadcrumb value below always overwrote the earlier
        # article:section read, so that dead 'category' write was removed.
        article.update({'category': response.xpath("(//li[@class='start']/h4/a/text())|(//li[@class='start have_cap2 ']/h4/a/text())").get()})
        article.update({'organization': 'Vnexpress'})
        # author & body text — single join instead of quadratic `+=`
        author = ''.join(text.strip() for text in response.xpath(
            '(//section[@class="container"]/section[@class="wrap_sidebar_12"]/section['
            '@class="sidebar_1"]/article[@class="content_detail fck_detail width_common '
            'block_ads_connect"]/p[@class="Normal"]/strong/text())|(//p['
            '@class="author_mail"]/strong/text())|(//p['
            '@style="text-align:right;"]/strong/text())').getall())
        article.update({'author': author})
        content = ''.join(text.strip() for text in response.xpath(
            '(//article[@class="content_detail fck_detail width_common '
            'block_ads_connect"]/p/text())|(//div[@class="desc_cation"]/p/text())|(//div['
            '@class="desc_cation"]/p/strong/text())|(//div[contains(@class,'
            '"box_tableinsert") or contains(@class,"box_quangcao") or contains(@class,'
            '"box_brief_info")]//p//text())|(//div[@class="WordSection1"]/p/text())|(//td/p[@class="Image"]/text())').getall())
        article.update({'content_article': content})
        # `content` is always a str, so the old `is not None` check could never
        # reach its -1 branch; use truthiness so an empty body yields -1 as intended.
        article.update({'word_count': len(content.split()) if content else -1})
        # get image (getall() returns a list, never None — old None check was dead)
        article.update({'thumbnail': response.xpath('(//td/img/@src)|(//div[@class="item_slide_show clearfix"]/div/img/@src)').getall()})
        # get relate_url — paths must be relative ('./'): the original '/@title'
        # and '/@href' were absolute queries from the document root and always
        # returned None, so every related entry was empty.
        relate_urls = []
        for tag in response.xpath('//ul[@class="list_title"]/li/a[@data-event-action="article_box_related"]'):
            relate_urls.append({
                'headline': tag.xpath('./@title').get(),
                'url': "https://vnexpress.vn" + str(tag.xpath('./@href').extract_first()),
            })
        article.update({"related_url": relate_urls})
        # comment-widget ids; without all three the like request cannot be built
        objectid = response.xpath('//head/meta[@name="tt_article_id"]/@content').get()
        siteid = response.xpath('//head/meta[@name="tt_site_id"]/@content').get()
        categoryid = response.xpath('//head/meta[@name="tt_category_id"]/@content').get()
        if objectid is None or siteid is None or categoryid is None:
            return
        id_article = {'objectid': objectid, 'siteid': siteid, 'categoryid': categoryid}
        url_like = response.xpath('//meta[@name="its_url"]/@content').get()
        if url_like is not None:
            # get total like via the Facebook like-button plugin
            like_request = "https://www.facebook.com/plugins/like.php?href=" + url_like + "&layout=button_count"
            yield scrapy.Request(like_request, callback=self.parse_like, meta={'article': article, 'id_article': id_article})
Example #3
0
    def parse_article(self, response):
        """Parse a Dan Tri article page and chain a Facebook like-count request.

        Gathers ld+json data, <meta> values, body text, images, hashtags and
        videos, then yields a request to the Facebook like endpoint whose exact
        form depends on which article-id field the page exposes.
        """
        article = {}

        # get ld_json (best effort: script tag may be absent or malformed)
        try:
            ld_json = response.xpath(
                "//script[contains(text(),'NewsArticle')]/text()").get()
            ld_json_dict = json.loads(ld_json)
            # NOTE(review): `time` is a project helper module, not stdlib time
            ld_json_dict = time.timestamp_converter(ld_json_dict)
            article.update(ld_json_dict)
        except Exception:
            # narrowed from a bare `except:`
            pass

        # get meta
        elems = {
            'meta-description': response.xpath("//meta[@name='description']/@content").get(),
            'meta-keywords': response.xpath("//meta[@name='keywords']/@content").get(),
            'meta-title': response.xpath("//meta[@name='title']/@content").get(),
            'meta-copyright': response.xpath("//meta[@name='copyright']/@content").get(),
            'meta-author': response.xpath("//meta[@name='author']/@content").get(),
            'language': response.xpath('//meta[@http-equiv = "content-language"]/@content').get(),
            'geo.placename': response.xpath('//meta[@name = "geo.placename"]/@content').get(),
            # NOTE(review): 'geo.position' reads the geo.region meta tag —
            # likely a copy/paste slip, kept to preserve the output schema.
            'geo.position': response.xpath('//meta[@name = "geo.region"]/@content').get(),
            'geo.region': response.xpath('//meta[@name = "geo.region"]/@content').get(),
            'meta-article:author': response.xpath("//meta[@property='article:author']/@content").get(),
            'meta-article:publisher': response.xpath("//meta[@property='article:publisher']/@content").get(),
            'category': response.xpath('//a[@class = "breadcrumbitem1"][contains(@href, "htm")]/span/text()').get(),
            'organization': 'dân trí',
            'url': response.url,
            'related_urls': response.xpath('//div[@class = "article-oldnew"]//div/div[@class = "article-oldnew-img"]/a/@href').getall()
        }
        article.update(elems)

        # get content — the site has three body layouts; concatenate stripped
        # paragraph texts from all of them (single join, not quadratic `+=`)
        parts = []
        for xp in ('//*[@id="divNewsContent"]/p/text()',
                   '//*[@class = "detail-content"]/p/text()',
                   '//div[@class="e-body"]//p/text()'):
            parts.extend(text.strip() for text in response.xpath(xp).getall())
        content = ''.join(parts)
        article.update({'content': content})
        article.update({'word_count': len(content.split())})

        # get image url — number images consecutively across the three layouts.
        # (The original restarted numbering at 1 for the third loop whenever the
        # second xpath matched nothing, overwriting keys from the first loop.)
        srcs = (response.xpath('//*[@id="divNewsContent"]//img/@src').getall()
                + response.xpath('//*[@class = "detail-content"]//img/@src').getall()
                + response.xpath('//div[@class="e-body"]//figure[contains(@class,"image")]//@src').getall())
        images = {'image' + str(index): src for index, src in enumerate(srcs, 1)}
        article.update({'image-urls': images})

        # get hashtags
        hashtags = {}
        for index, href in enumerate(
                response.xpath('//span[@class = "news-tags-item"]/a/@href').getall(), 1):
            hashtags['tag' + str(index)] = href
        article.update({'hash-tags': hashtags})

        # get video url
        videos = {}
        for index, src in enumerate(
                response.xpath('//div[@class="e-body"]/figure[@class = "video"]//@data-src').getall(), 1):
            videos['video' + str(index)] = "vcdn.dantri.com.vn/" + src
        # NOTE(review): unlike images, videos are merged at the top level
        # ('video1', ...) instead of under a wrapping key; kept as-is to
        # preserve the output schema, but this looks like an oversight.
        article.update(videos)

        # get likes — `article_id` avoids shadowing the `id` builtin; the URL
        # template depends on which hidden id field the page carries.
        article_id = response.xpath('//*[@id="hdNewsId"]/@value').get()
        if article_id is not None:
            like_request = "https://www.facebook.com/v2.3/plugins/like.php?action=like&app_id=164035690775918&channel=https%3A%2F%2Fstaticxx.facebook.com%2Fconnect%2Fxd_arbiter.php%3Fversion%3D44%23cb%3Df31c1be4fdc1a28%26domain%3Ddantri.com.vn%26origin%3Dhttps%253A%252F%252Fdantri.com.vn%252Ff3a046e102e74f4%26relation%3Dparent.parent&container_width=0&href=https%3A%2F%2Fdantri.com.vn%2Fnews-" + \
                article_id + ".htm&layout=button_count&locale=vi_VN&sdk=joey&share=false&show_faces=false&size=small"
        else:
            article_id = response.xpath('//*[@id="hidDistID"]/@value').get()
            if article_id is not None:
                like_request = "https://www.facebook.com/plugins/like.php?href=" + response.url + \
                    "&send=false&share=true&layout=standard&width=450&show_faces=false&action=like&colorscheme=light&font&height=35&"
            else:
                # last resort: extract the numeric id from the URL slug
                pv1 = response.url.find('.htm')
                pv2 = response.url.find('-', pv1 - 20) + 1
                article_id = response.url[pv2:pv1]
                like_request = "https://www.facebook.com/v2.3/plugins/like.php?action=like&app_id=164035690775918&channel=https%3A%2F%2Fstaticxx.facebook.com%2Fconnect%2Fxd_arbiter.php%3Fversion%3D44%23cb%3Df322cc0314d7894%26domain%3Ddantri.com.vn%26origin%3Dhttps%253A%252F%252Fdantri.com.vn%252Ffe7c5846d65f58%26relation%3Dparent.parent&container_width=0&href=https%3A%2F%2Fdantri.com.vn%2Fnews-" + \
                    article_id + ".htm&layout=button_count&locale=vi_VN&sdk=joey&share=false&show_faces=false"
        yield scrapy.Request(like_request,
                             callback=self.parse_likes,
                             meta={
                                 'article': article,
                                 'id': article_id
                             })
Example #4
0
    def parse_article(self, response):
        """Parse a Soha article page and chain a share/like-count request.

        ``response.meta['atc_type']`` distinguishes 'normal' pages (which carry
        a NewsArticle ld+json block) from other page types. Yields a request to
        the sharefb counter service with the partial article in ``meta``.
        """
        atc_type = response.meta['atc_type']

        article = {}

        # get ld_json — only 'normal' pages embed the NewsArticle script.
        # NOTE(review): json.loads is unguarded here, so a missing script tag
        # fails the whole parse; presumably 'normal' guarantees its presence.
        if atc_type == 'normal':
            ld_json = response.xpath(
                '//*[@id="Head1"]//script[contains(text(),"NewsArticle")]/text()'
            ).get()
            ld_json_dict = json.loads(ld_json)
            # `time` is a project helper module, not stdlib time
            ld_json_dict = time.timestamp_converter(ld_json_dict)
            article.update(ld_json_dict)

        # category from the BreadcrumbList ld+json (best effort)
        try:
            cate_json = response.xpath(
                '//script[contains(text(), "BreadcrumbList")]/text()').get().strip()
            cate_json = json.loads(cate_json)
            category = cate_json.get('itemListElement')[1].get('item').get('name')
            article.update({'category': category})
        except Exception:
            # narrowed from a bare `except:`; also dropped the unused `cate` alias
            pass

        # get meta elements
        elems = {
            'meta-description': response.xpath("//meta[@name='description']/@content").get(),
            'meta-keywords': response.xpath("//meta[@name='keywords']/@content").get(),
            'meta-title': response.xpath("//meta[@name='title']/@content").get(),
            'meta-copyright': response.xpath("//meta[@name='copyright']/@content").get(),
            'meta-author': response.xpath("//meta[@name='author']/@content").get(),
            'language': response.xpath('//meta[@http-equiv = "content-language"]/@content').get(),
            'geo.placename': response.xpath('//meta[@name = "geo.placename"]/@content').get(),
            # NOTE(review): 'geo.position' reads the geo.region meta tag —
            # likely a copy/paste slip, kept to preserve the output schema.
            'geo.position': response.xpath('//meta[@name = "geo.region"]/@content').get(),
            'geo.region': response.xpath('//meta[@name = "geo.region"]/@content').get(),
            'meta-article:author': response.xpath("//meta[@property='article:author']/@content").get(),
            'meta-article:publisher': response.xpath("//meta[@property='article:publisher']/@content").get(),
            'organization': 'soha',
            'url': response.url,
            # 'related_urls': response.xpath('//div[@class = "article-oldnew"]//div/div[@class = "article-oldnew-img"]/a/@href').getall()
        }
        article.update(elems)

        # get content — NB: this spider intentionally does not strip the texts
        content = ''.join(response.xpath(
            '//div[@class = "clearfix news-content"]/p/text()').getall())
        article.update({'content': content})
        article.update({'word_count': len(content.split())})

        # get image url
        images = {
            'image' + str(index): src
            for index, src in enumerate(
                response.xpath('//div[@class = "clearfix news-content"]/div[@type = "Photo"]//@src').getall(), 1)
        }
        article.update({'image-urls': images})

        # get likes,comments from the sharefb counter service
        yield scrapy.Request(
            "https://sharefb.cnnd.vn/?urls=" + response.url,
            callback=self.parse_interations,
            headers={
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Origin': 'https://soha.vn',
                'Referer': response.url,
                'Sec-Fetch-Mode': 'cors',
                'User-Agent': 'Mozilla/5.0 (Windows 10 Win64 x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
            },
            meta={
                'article': article,
                'atc_type': atc_type
            })
Example #5
0
 def parse_item(self, response):
     """Parse a Báo Mới article page and yield the scraped article dict.

     Collects ld+json data, <meta> values, author, body text, thumbnails and
     related links; logs progress via ``self.articleCount`` (assumed to be
     initialised by the spider) and yields the finished dict.
     """
     article = dict()
     # get title, link — bail out early on non-article pages
     title = response.xpath(
         '//div[@class="article"]/h1[@class="article__header"]/text()'
     ).extract_first()
     if title is None:
         return
     # get ld_json (best effort)
     try:
         ld_json = response.xpath(
             "//script[@type='application/ld+json'][1]/text()").get()
         ld_json = json.loads(ld_json)
         # NOTE(review): `time` is a project helper module, not stdlib time
         ld_json = time.timestamp_converter(ld_json)
         article.update(ld_json)
     except Exception:
         # narrowed from a bare `except:`
         pass
     # get meta (one batched update; same keys and final values as before)
     article.update({
         'type': response.xpath("//head/meta[@property='og:type']/@content").get(),
         'description': response.xpath("//head/meta[@name='description']/@content").get(),
         'keywords': response.xpath("//head/meta[@name='keywords']/@content").get(),
         'category': response.xpath("//head/meta[@property='article:section']/@content").get(),
         'copyright': response.xpath("//head/meta[@name='copyright']/@content").get(),
         'Language': response.xpath("//head/meta[@name='Language']/@content").get(),
         'geo_place_name': response.xpath("//meta[@name = 'geo.placename']/@content").get(),
         'geo_region': response.xpath("//meta[@name = 'geo.region']/@content").get(),
         'geo_position': response.xpath("//meta[@name = 'geo.position']/@content").get(),
         'organization': 'Báo mới',
         'title': title,
         'link': response.url,
     })
     # author, content, word_count — single join instead of quadratic `+=`
     author = ''.join(text.strip() for text in response.xpath(
         '(//div[@id="ArticleContent"]/p[@class="t-j"]/span/text())|(//div[@class="article__body"]/p['
         '@class="body-text body-author"]/strong/text())|(//p[@class="body-text body-author"]/strong/text())'
     ).getall())
     article.update({'author': author})
     content = ''.join(text.strip() for text in response.xpath(
         '(//div[@id="ArticleContent"]/p[@class="t-j"]/text())|(//div[@class="article__body"]/p['
         '@class="body-text"]/text())|(//div[@class="article__sapo"]/text())'
     ).getall())
     article.update({'content_article': content})
     article.update({'word_count': len(content.split())})
     # get image
     article.update({
         'thumbnail': response.xpath('//p[@class="body-image"]/img/@src').getall()
     })
     # get related_url: one {headline, url} dict per related-article link
     relate_url = []
     for tag in response.xpath('//div[@data-track="detail|related"]/div/h4'):
         relate_url.append({
             'headline': tag.xpath('a/@title').get(),
             'url': str(tag.xpath('a/@href').extract_first()),
         })
     article.update({"related_url": relate_url})
     # progress log; lazy %-style args per logging convention
     self.logger.info("#%d: Scraping %s", self.articleCount,
                      article.get('link'))
     self.articleCount += 1
     yield article
Example #6
0
    def parse_item(self, response):
        """Parse a Tuoi Tre article page and chain a like-count request.

        Collects publish/modify dates, <meta> values, author, body text,
        thumbnails, the og:image record and related links; returns early when
        any comment-widget id is missing, otherwise yields a request to the
        like-count endpoint with article and widget ids in ``meta``.
        """
        article = dict()
        title = response.xpath(
            '//head/meta[@property="og:title"]/@content').extract_first()
        if title is None:
            # not an article page — nothing to yield
            return

        date = {
            'datePublished': response.xpath(
                '//meta[@property="article:published_time"]/@content').get(),
            'dateModified': response.xpath(
                '//meta[@property="article:modified_time"]/@content').get(),
        }
        # Best effort: converter may fail on missing/odd values. (The old
        # `if date is not None` guard was always true — `date` is a dict.)
        try:
            # NOTE(review): `time` is a project helper module, not stdlib time
            article.update(time.timestamp_converter(date))
        except Exception:
            # narrowed from a bare `except:`
            pass

        article.update({'title': title, 'link': response.url})
        # get meta (one batched update; same keys and final values as before)
        article.update({
            'headline': response.xpath('//meta[@itemprop="headline"]/@content').get(),
            'type': response.xpath("//meta[@property='og:type']/@content").get(),
            'description': response.xpath("//meta[@name='description']/@content").get(),
            'keywords': response.xpath("//meta[@name='keywords']/@content").get(),
            'category': response.xpath("//meta[@property='article:section']/@content").get(),
            'copyright': response.xpath("//meta[@name='copyright']/@content").get(),
            'language': response.xpath("//meta[@name='Language']/@content").get(),
            'geo_place_name': response.xpath("//meta[@name = 'geo.placename']/@content").get(),
            'geo_region': response.xpath("//meta[@name = 'geo.region']/@content").get(),
            'geo_position': response.xpath("//meta[@name = 'geo.position']/@content").get(),
            'organization': 'Tuổi trẻ',
        })

        # author & content — single join instead of quadratic `+=`
        author = ''.join(text.strip() for text in response.xpath(
            '(//div|//p)[contains(@class, "author") or contains(@class, "author_single") or contains(@class,"authorvideo") or contains(@class,"credit-text")]//text()'
        ).getall())
        article.update({'author': author})
        content = ''.join(text.strip() for text in response.xpath(
            '//div[contains(@id,"main-detail-body") or contains(@class,"sp-detail-content") or contains(@class,"fck")]/p//text()'
        ).getall())
        article.update({'content_article': content})
        article.update({'word_count': len(content.split())})

        # get thumbnail
        article.update({'thumbnail': response.xpath(
            '(//div[@type="Photo"]/div/a/img/@src)|(//div[@type="Photo"]/div/img/@src)|(//td/a/img/@src)'
        ).getall()})

        # get images: a single og:image record wrapped in a list
        image = {
            'url': response.xpath('//meta[@property="og:image"]/@content').get(),
            'alt': response.xpath('//meta[@property="og:image:alt"]/@content').get(),
            'width': response.xpath('//meta[@property="og:image:width"]/@content').get(),
            'height': response.xpath('//meta[@property="og:image:height"]/@content').get(),
        }
        article.update({'image': [image]})

        # get relate_url: one {headline, url} dict per related-article link
        relate_url = []
        for tag in response.xpath(
                '//ul[@class="list-news"]/li/div[@class="name-title"]'):
            relate_url.append({
                'headline': tag.xpath('a/text()').get(),
                'url': "https://tuoitre.vn" + str(tag.xpath('a/@href').extract_first()),
            })
        article.update({"related_url": relate_url})

        # comment-widget ids — abort when any is missing (no like request
        # possible); replaces four `if x is None: return 0 / else: x = x` blocks
        objectid = response.xpath(
            '//div[@id="tagandnetwork"]/div[@class="tagandtopicandbanner"]/section/@data-objectid'
        ).get()
        datasort = response.xpath(
            '//div[@id="tagandnetwork"]/div[@class="tagandtopicandbanner"]/section/@data-sort'
        ).get()
        pagesize = response.xpath(
            '//div[@id="tagandnetwork"]/div[@class="tagandtopicandbanner"]/section/@data-pagesize'
        ).get()
        objecttype = response.xpath(
            '//div[@id="tagandnetwork"]/div[@class="tagandtopicandbanner"]/section/@data-objecttype'
        ).get()
        if objectid is None or datasort is None or pagesize is None or objecttype is None:
            return
        id_article = {
            'objectid': objectid,
            'datasort': datasort,
            'pagesize': pagesize,
            'objecttype': objecttype,
        }

        # get total likes
        total_like = "https://s1.tuoitre.vn/count-object.htm?newsId=" + objectid
        yield scrapy.Request(total_like,
                             callback=self.parse_like,
                             headers={
                                 'Accept': '*/*',
                                 'Origin': 'https://tuoitre.vn',
                                 'Referer': response.url,
                                 'Sec-Fetch-Mode': 'cors',
                             },
                             meta={
                                 'article': article,
                                 'id_article': id_article
                             })
Example #7
0
    def parse_article(self, response):
        """Parse a kenh14.vn article: ld+json structured data, meta tags,
        body content, media urls and hashtags, then chain a request for
        Facebook share counts.

        Yields a scrapy.Request whose callback (``get_inter``) receives the
        article dict and the comment-API parameter list via ``meta``.
        """
        article = {}

        # Best-effort ld+json block; `time` is a project helper module whose
        # timestamp_converter normalises the date fields.
        try:
            ld_json = response.xpath(
                "//script[contains(text(),'NewsArticle')]/text()").get()
            ld_json_dict = json.loads(ld_json)
            ld_json_dict = time.timestamp_converter(ld_json_dict)
            article.update(ld_json_dict)
        except Exception:
            # missing or malformed ld+json is ignored
            pass

        # get meta
        elems = {
            'meta-description': response.xpath("//meta[@name='description']/@content").get(),
            'meta-keywords': response.xpath("//meta[@name='keywords']/@content").get(),
            'meta-title': response.xpath("//meta[@name='title']/@content").get(),
            'meta-copyright': response.xpath("//meta[@name='copyright']/@content").get(),
            'meta-author': response.xpath("//meta[@name='author']/@content").get(),
            'language': response.xpath('//meta[@http-equiv = "content-language"]/@content').get(),
            'geo.placename': response.xpath('//meta[@name = "geo.placename"]/@content').get(),
            # BUG FIX: this key previously read the geo.region meta tag
            'geo.position': response.xpath('//meta[@name = "geo.position"]/@content').get(),
            'geo.region': response.xpath('//meta[@name = "geo.region"]/@content').get(),
            'meta-article:author': response.xpath("//meta[@property='article:author']/@content").get(),
            'meta-article:publisher': response.xpath("//meta[@property='article:publisher']/@content").get(),
            'category': response.xpath('//li[@class = "kmli active"]/a/text()').get(),
            'organization': 'kênh 14',
            'related_urls': response.xpath('//div[@class = "kds-same-category clearfix"]//div[@class = "rowccm"]/li/a/@href').getall(),
            'url': response.url
        }
        article.update(elems)

        # get content (join is linear; repeated += is quadratic)
        content = ''.join(
            text.strip()
            for text in response.xpath(
                '//div[@class = "knc-content"]//p//text()').getall())
        article.update({'content': content})
        article.update({'word_count': len(content.split())})

        # get image urls, keyed image1..imageN
        images = {
            'image' + str(index): src
            for index, src in enumerate(
                response.xpath(
                    '//div[@class = "knc-content"]//div[@type = "Photo"]//@src'
                ).getall(), 1)
        }
        article.update({'image-urls': images})

        # get video urls, keyed video1..videoN
        videos = {
            'video' + str(index): src
            for index, src in enumerate(
                response.xpath('//div[@type="VideoStream"]/@data-src').getall(), 1)
        }
        article.update({'video-urls': videos})

        # get hashtags, keyed tag1..tagN
        hashtags = {
            'tag' + str(index): href
            for index, href in enumerate(
                response.xpath('//ul[@class="knt-list"]/li//@href').getall(), 1)
        }
        article.update({'hash-tags': hashtags})

        # Extract the argument tuple passed to MINGID_IFRAME_FUNC.mingidGenIfram
        # inside an inline script; those parameters drive the comment request.
        comments_paras = response.xpath(
            '//script[@type="text/javascript"][contains(text(),"comment")]/text()').get()
        pv0 = comments_paras.find("MINGID_IFRAME_FUNC.mingidGenIfram")
        pv1 = comments_paras.find("(", pv0)
        pv2 = comments_paras.find(")", pv1) + 1
        paras = comments_paras[pv1:pv2]
        # parameter list used to request comments
        para_list = list(ast.literal_eval(paras))

        # get interactions (Facebook share counts)
        inter_request = "https://sharefb.cnnd.vn/?urls=" + response.url
        yield scrapy.Request(inter_request, callback=self.get_inter, meta={'article': article, 'paras': para_list}, headers={
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Origin': 'https://soha.vn',
            'Referer': 'https://soha.vn/chiu-suc-ep-khong-lo-tu-my-tq-ngam-ngui-buong-tay-bo-roi-du-an-dau-mo-5-ti-usd-voi-doi-tac-lau-nam-20191007161429421.htm',
            'Sec-Fetch-Mode': 'cors',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
        })
Example #8
0
    def parse_article(self, response):
        """Parse a techtalk article: ld+json, meta tags, content, media,
        hashtags and view count, then chain a Facebook like-count request."""
        article = {}

        # get ld_json (best-effort structured data)
        try:
            ld_json = response.xpath(
                '//script[contains(text(),"Article")]/text()').get()
            ld_json_dict = json.loads(ld_json)
            ld_json_dict = time.timestamp_converter(ld_json_dict)
            article.update(ld_json_dict)
        except Exception:
            # missing or malformed ld+json is ignored
            pass

        # get meta
        elems = {
            'meta-description': response.xpath("//meta[@name='description']/@content").get(),
            'meta-keywords': response.xpath("//meta[@name='keywords']/@content").get(),
            'meta-title': response.xpath("//meta[@name='title']/@content").get(),
            'meta-copyright': response.xpath("//meta[@name='copyright']/@content").get(),
            'meta-author': response.xpath("//meta[@name='author']/@content").get(),
            'language': response.xpath('//meta[@http-equiv = "content-language"]/@content').get(),
            'geo.placename': response.xpath('//meta[@name = "geo.placename"]/@content').get(),
            # BUG FIX: this key previously read the geo.region meta tag
            'geo.position': response.xpath('//meta[@name = "geo.position"]/@content').get(),
            'geo.region': response.xpath('//meta[@name = "geo.region"]/@content').get(),
            'meta-article:author': response.xpath("//meta[@property='article:author']/@content").get(),
            'meta-article:publisher': response.xpath("//meta[@property='article:publisher']/@content").get(),
            'organization': 'techtalk',
            'url': response.url,
        }
        article.update(elems)

        # category is the second breadcrumb entry; absent on some pages
        try:
            article.update({'category': response.xpath(
                '//a[@class = "entry-crumb"]')[1].xpath('./span/text()').get()})
        except IndexError:
            pass

        # get content (join is linear; repeated += is quadratic)
        content = ''.join(
            text.strip()
            for text in response.xpath(
                '//div[@class = "td-post-content"]//p/text()').getall())
        article.update({'content': content})
        article.update({'word_count': len(content.split())})

        # get image urls, keyed image1..imageN
        images = {
            'image' + str(index): src
            for index, src in enumerate(
                response.xpath(
                    '//div[@class="td-post-content"]//*[contains(@class,"image") or contains(@class,"Image")]//@src'
                ).getall(), 1)
        }
        article.update({'image-urls': images})

        # get video urls, keyed video1..videoN
        # NOTE(review): the 'video urls' key (with a space) differs from the
        # 'video-urls' key used by sibling spiders; kept for backward
        # compatibility with downstream consumers.
        videos = {
            'video' + str(index): src
            for index, src in enumerate(
                response.xpath(
                    '//div[@class="td-post-content"]//iframe/@src').getall(), 1)
        }
        article.update({'video urls': videos})

        # get hashtags, keyed tag1..tagN
        hashtags = {
            'tag' + str(index): href
            for index, href in enumerate(
                response.xpath(
                    '//ul[@class = "td-tags td-post-small-box clearfix"]//@href'
                ).getall(), 1)
        }
        article.update({'hash-tags': hashtags})

        # get views
        views = response.xpath('//div[@class="td-post-views"]//text()').get()
        article.update({'views': views})

        # get likes via the Facebook like-button plugin
        like_request = "https://www.facebook.com/plugins/like.php?href="+response.url + \
            "&layout=button_count&show_faces=false&width=105&action=like&colorscheme=light&height=21"
        yield scrapy.Request(like_request, callback=self.parse_likes, meta={'article': article, 'url': response.url})
Example #9
0
    def parse_item(self, response):
        """Parse an afamily.vn article page and chain a request for
        Facebook like/comment counts.

        Returns 0 early when the page has no NewsArticle ld+json (i.e. it
        is not an article page) or the ld+json cannot be parsed.
        """
        article = dict()
        image = dict()
        images = []
        try:
            ld_json = response.xpath(
                '//script[contains(text(),"NewsArticle")]/text()').get()
            if ld_json is None:
                return 0
            # (removed a redundant `ld_json = ld_json` self-assignment)
            ld_json = json.loads(ld_json)
            ld_json = time.timestamp_converter(ld_json)
            article.update(ld_json)
        except ValueError:
            # json.loads failed: not a parseable article page
            return 0
        title = response.xpath('//meta[@property="og:title"]/@content').get()
        link = response.url
        article.update({'title': title, 'link': link})
        # get meta (one update keeps each key/xpath pair on a single line)
        article.update({
            'type': response.xpath("//head/meta[@property='og:type']/@content").get(),
            'description': response.xpath("//head/meta[@name='description']/@content").get(),
            'keywords': response.xpath("//meta[@name='keywords']/@content").get(),
            'category': response.xpath("//meta[@property='article:section']/@content").get(),
            'copyright': response.xpath("//meta[@name='copyright']/@content").get(),
            'language': response.xpath("//meta[@name='Language']/@content").get(),
            'geo_place_name': response.xpath("//meta[@name = 'geo.placename']/@content").get(),
            'geo_region': response.xpath("//meta[@name = 'geo.region']/@content").get(),
            'geo_position': response.xpath("//meta[@name = 'geo.position']/@content").get(),
            'organization': 'Afamily',
        })
        # title (detail-page heading overrides og:title) and body content
        title = response.xpath(
            '//div[@class="w700 mr-40 fl"]/h1/text()').getall()
        article.update({'title': title})
        content = ''.join(
            text.strip() for text in response.xpath(
                '(//div[@id="af-detail-content"]/p/text())|(//div[@data-role="content"]/div/span/text())|(//p['
                '@class="MsoNormal"]/text())|(//*[@id="af-detail-content"]/div/div/div/text())|(//*['
                '@id="af-detail-content"]/div/div/div/span/text())|(//*[@id="af-detail-content"]/div/div/p/text())'
            ).getall())
        article.update({'content_article': content})
        # content is always a str here, so the previous `is not None` check
        # (with its unreachable word_count = -1 branch) was dead code
        article.update({'word_count': len(content.split())})
        # og:image plus its alt/width/height, when present
        url_image = response.xpath(
            '//meta[@property="og:image"]/@content').get()
        if url_image is not None:
            image.update({
                'url': url_image,
                'alt': response.xpath('//meta[@property="og:image:alt"]/@content').get(),
                'width': response.xpath('//meta[@property="og:image:width"]/@content').get(),
                'height': response.xpath('//meta[@property="og:image:height"]/@content').get(),
            })
            images.append(image)
            article.update({'image': images})

        # get thumbnail candidates (several gallery/photo layouts)
        thumbnail = response.xpath(
            '(//div[@class="VCSortableInPreviewMode LayoutAlbumWrapper alignJustify noCaption"]/div/div/div/figure/a/@href)|(//div[@type="Photo"]/div/a/img/@src)|(//figure[@type="Photo"]/div/a/img/@src)|(//a[@class="detail-img-lightbox"]/img/@src)'
        ).getall()
        article.update({'thumbnail': thumbnail})
        # NOTE(review): debug leftover — dumps every parsed page to
        # ./body.html (overwritten on each parse); consider removing.
        with open("body.html", "wb") as f:
            f.write(response.body)

        # get likes,comments
        yield scrapy.Request(
            'http://sharefb.cnnd.vn/?urls=' + response.url,
            callback=self.parse_interations,
            headers={
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Origin': 'https://afamily.vn',
                'Sec-Fetch-Mode': 'cors',
                'Referer': article.get('link')
            },
            meta={'article': article})
Example #10
0
    def parse_article(self, response):
        """Parse a thanhnien.vn article: structured data, meta tags, videos,
        content, images and the on-page comment tree, then chain a Facebook
        like-count request."""
        article = {}

        # get ld_json — this page embeds a JSON array; element 0 is the article
        try:
            ld_json = response.xpath(
                "//script[contains(text(),'NewsArticle')]/text()").get()
            ld_json_dict = json.loads(ld_json)[0]
            ld_json_dict = time.timestamp_converter(ld_json_dict)
            article.update(ld_json_dict)
        except Exception:
            # missing or malformed ld+json is ignored
            pass
        # get meta elements
        elems = {
            'meta-description': response.xpath("//meta[@name='description']/@content").get(),
            'meta-keywords': response.xpath("//meta[@name='keywords']/@content").get(),
            'meta-title': response.xpath("//meta[@name='title']/@content").get(),
            'meta-copyright': response.xpath("//meta[@name='copyright']/@content").get(),
            'meta-author': response.xpath("//meta[@name='author']/@content").get(),
            'language': response.xpath('//meta[@http-equiv = "content-language"]/@content').get(),
            'geo.placename': response.xpath('//meta[@name = "geo.placename"]/@content').get(),
            # BUG FIX: this key previously read the geo.region meta tag
            'geo.position': response.xpath('//meta[@name = "geo.position"]/@content').get(),
            'geo.region': response.xpath('//meta[@name = "geo.region"]/@content').get(),
            'meta-article:author': response.xpath("//meta[@property='article:author']/@content").get(),
            'meta-article:publisher': response.xpath("//meta[@property='article:publisher']/@content").get(),
            'category': response.xpath('//h2[@class = "headline"]/a/text()').get(),
            'organization': 'thanh niên',
            'url': response.url,
        }
        article.update(elems)

        # get video url
        videos = []

        # inline player script: slice the src="..." attribute value out by hand
        try:
            url_finder = response.xpath(
                '//figure[@itemprop = "associatedMedia"]/script/text()').get()
            pv1 = url_finder.find("src")
            pv2 = url_finder[pv1:].find('"') + pv1 + 1
            pv3 = url_finder[pv2:].find('"') + pv2
            videos.append(url_finder[pv2:pv3])
        except Exception:
            pass

        # table-embedded video (may be None; appended regardless, as before)
        videos.append(response.xpath(
            '//table[@class="video"]//@data-video-src').get())

        article.update({'videos-url': videos})

        # get content: styled paragraphs, then bare divs inside the body
        content = ''.join(
            text.strip() for text in response.xpath(
                '//div[@id="abody"]//p[contains(@style,"margin")or contains(@style,"text")]/text()'
            ).getall())
        content += ''.join(
            text.strip()
            for text in response.xpath('//*[@id="abody"]//div/text()').getall())
        article.update({'content': content})
        article.update({'word_count': len(content.split())})

        # get image urls: avatar images first, then full-size images,
        # numbered continuously image1..imageN
        images = {}
        ava_index = 0
        for ava_index, src in enumerate(
                response.xpath(
                    '//*[@id="contentAvatar"]//a/img/@src').getall(), 1):
            images.update({'image' + str(ava_index): src})
        index = ava_index + 1
        for index, src in enumerate(
                response.xpath('//*[@class="imagefull"]//@data-src').getall(),
                index):
            images.update({'image' + str(index): src})

        article.update({'image-urls': images})

        def _stripped(value):
            # strip() only when the xpath matched; None passes through
            return value.strip() if value is not None else None

        def _like_count(raw):
            # like-button text -> first digit token, '0' when no digits,
            # None when the node is absent
            if raw is None:
                return None
            digits = [s for s in raw.strip().split() if s.isdigit()]
            return digits[0] if digits else '0'

        # get comments: one primary comment per container div, with any
        # secondary (reply) comments nested under it
        comments_count = response.xpath('//*[@id="commentcount"]/text()').get()
        article.update({'comments-count': comments_count})
        comments = []

        for comment in response.xpath('//*[@id="commentcontainer"]/div'):
            primary_comment = comment.xpath(
                './div[@class = "primary-comment"]')
            primary_ava = primary_comment.xpath(
                './/div[@class = "ava"]/img/@data-src').get()
            primary_user = _stripped(primary_comment.xpath(
                './/div[@class = "data"]/div[@class = "meta"]/h4/text()').get())
            primary_geo = _stripped(primary_comment.xpath(
                './/div[@class = "data"]/div[@class = "meta"]/time/text()').get())
            primary_content = _stripped(primary_comment.xpath(
                './/div[@class = "data"]/div[@class = "comment"]/text()').get())
            primary_time = primary_comment.xpath(
                './/div[@class = "meta"]/time/@rel').get()
            primary_likes = _like_count(primary_comment.xpath(
                './/div[@class = "data"]/div[@class = "reply"]//a[@class = "likebtn"]//text()'
            ).get())

            secondary_dict = []
            counter = 0
            for counter, reply in enumerate(
                    comment.xpath('.//div[@class = "secondary-comment"]'), 1):
                secondary_dict.append({
                    'SenderAvatar': reply.xpath(
                        './/div[@class = "ava"]/img/@data-src').get(),
                    'SenderFullName': _stripped(reply.xpath(
                        './/div[@class = "data"]/div[@class = "meta"]/h4/text()'
                    ).get()),
                    'PublishedGeo': _stripped(reply.xpath(
                        './/div[@class = "data"]/div[@class = "meta"]/time/text()'
                    ).get()),
                    'CommentContent': _stripped(reply.xpath(
                        './/div[@class = "data"]/div[@class = "comment"]/text()'
                    ).get()),
                    'CreatedDate': reply.xpath(
                        './/div[@class = "meta"]/time/@rel').get(),
                    'Liked': _like_count(reply.xpath(
                        './/div[@class = "data"]/div[@class = "reply"]//a[@class = "likebtn"]//text()'
                    ).get()),
                    'Replies-count': 0,
                    'Replies': []
                })

            comments.append({
                'SenderAvatar': primary_ava,
                'SenderFullName': primary_user,
                'PublishedGeo': primary_geo,
                'CommentContent': primary_content,
                'CreatedDate': primary_time,
                'Liked': primary_likes,
                'Replies-count': counter,
                'Replies': secondary_dict if counter != 0 else None
            })
        article.update({'comments': comments})

        # get likes: share URL from the zalo button, falling back to fb-share;
        # percent-encoded by hand for the like.php href parameter
        url = response.xpath(
            '//li[@class = "zalo-share-button"]/@data-href').get()
        if url is None:
            url = response.xpath('//li[@class="fb-share"]/a/@href').get()
        url = url.replace("=", "%3D")
        url = url.replace("/", "%2F")
        url = url.replace(":", "%3A")

        like_request = "https://www.facebook.com/v3.1/plugins/like.php?action=like&app_id=288067561729014&channel=https%3A%2F%2Fstaticxx.facebook.com%2Fconnect%2Fxd_arbiter.php%3Fversion%3D44%23cb%3Df1b1dac16a53484%26domain%3Dthanhnien.vn%26origin%3Dhttps%253A%252F%252Fthanhnien.vn%252Ff20b42488425504%26relation%3Dparent.parent&container_width=0&href=" + \
            url+"&layout=button_count&locale=en_US&sdk=joey&share=true&show_faces=false&size=large"
        yield scrapy.Request(like_request,
                             callback=self.parse_likes,
                             meta={'article': article})
Example #11
0
    def parse_item(self, response):
        """Parse a cafef.vn article page; pages without an h1 title are
        skipped entirely.

        Related-article links are collected into the article dict *before*
        the interactions request is yielded, so the callback's ``meta``
        deterministically carries the complete record (previously the
        update happened after the yield).
        """
        article = dict()
        title_arr = response.xpath('//h1[@class="title"]/text()').get()
        if title_arr is not None:
            title = title_arr.strip()
            # get ld_json (best-effort structured data)
            try:
                ld_json = response.xpath('//script[contains(text(),"NewsArticle")]/text()').get()
                # (removed a redundant `ld_json = ld_json` self-assignment)
                ld_json = json.loads(ld_json)
                ld_json = time.timestamp_converter(ld_json)
                article.update(ld_json)
            except Exception:
                pass
            # get headline
            article.update({'headline': response.xpath("//meta[@itemprop='headline']/@content").get()})
            # get thumbnail: every <img src> on the page, stringified list
            image_list = response.xpath('//div/img/@src').getall()
            article.update({'thumbnail': str(image_list)})
            # get meta
            article.update({
                'type': response.xpath("//head/meta[@property='og:type']/@content").get(),
                'description': response.xpath("//meta[@name='description']/@content").get(),
                'keywords': response.xpath("//meta[@name='keywords']/@content").get(),
                'category': response.xpath("//meta[@property='article:section']/@content").get(),
                'copyright': response.xpath("//meta[@name='copyright']/@content").get(),
                'author': response.xpath("//meta[@name='author']/@content").get(),
                'language': response.xpath("//meta[@name='Language']/@content").get(),
                'geo_place_name': response.xpath("//meta[@name = 'geo.placename']/@content").get(),
                'geo_region': response.xpath("//meta[@name = 'geo.region']/@content").get(),
                'geo_position': response.xpath("//meta[@name = 'geo.position']/@content").get(),
                'organization': 'Cafef',
            })
            # get title, link; the byline author overrides the meta author
            article.update({'title': title, 'link': response.url})
            article.update({'author': response.xpath("//p[@class='author']/text()").get()})
            # get contents
            content = ''.join(
                text.strip() for text in response.xpath(
                    '(//div[@class="contentdetail"]/span/p/text())|(//div[@class="companyIntro"]/text())').getall())
            article.update({'content_article': content})
            # content is always a str, so the previous None check (and its
            # unreachable word_count = -1 branch) was dead code
            article.update({'word_count': len(content.split())})

            # get relate_url (before the yield — see docstring)
            relate_url = []
            htags = response.xpath('//div[@class="bg-tit-samecate"]/h4')
            for tag in htags:
                relate_url.append({
                    'headline': tag.xpath('a/@title').get(),
                    'url': "https://cafef.vn" + str(tag.xpath('a/@href').extract_first()),
                })
            article.update({"related_url": str(relate_url)})

            # get likes,comments
            yield scrapy.Request("https://sharefb.cnnd.vn/?urls=" + response.url,
                                 callback=self.parse_interactions,
                                 headers={'Accept': 'application/json, text/javascript, */*; q=0.01',
                                          'Origin': 'https://cafef.vn',
                                          'Referer': response.url,
                                          'Sec-Fetch-Mode': 'cors',
                                          },
                                 meta={'article': article})
Example #12
0
    def parse_article(self, response):
        """Parse a viblo.asia post: meta tags, engagement counters and
        content, then request the post's comments from the JSON API.

        Returns a scrapy.Request whose callback (``parse_comments``)
        receives the article dict via ``meta``.
        """
        article = {}

        # get ld_json (best-effort; falls back from 'Article' to 'NewsArticle')
        try:
            ld_json = response.xpath(
                "//script[contains(text(),'Article')]/text()").get()
            if (ld_json is None):
                ld_json = response.xpath(
                    "//script[contains(text(),'NewsArticle')]/text()").get()
            ld_json_dict = json.loads(ld_json)
            ld_json_dict = time.timestamp_converter(ld_json_dict)
            article.update(ld_json_dict)
        except Exception:
            pass

        # get meta
        elems = {
            'meta-description': response.xpath("//meta[@name='description']/@content").get(),
            'meta-keywords': response.xpath("//meta[@name='keywords']/@content").get(),
            'meta-title': response.xpath("//meta[@name='title']/@content").get(),
            'meta-copyright': response.xpath("//meta[@name='copyright']/@content").get(),
            'meta-author': response.xpath("//meta[@name='author']/@content").get(),
            'meta-content-language': response.xpath('//meta[@name = "content-language"]/@content').get(),
            'meta-geo.placename': response.xpath('//meta[@name = "geo.placename"]/@content').get(),
            # BUG FIX: this key previously read the geo.region meta tag
            'meta-geo.position': response.xpath('//meta[@name = "geo.position"]/@content').get(),
            'meta-geo.region': response.xpath('//meta[@name = "geo.region"]/@content').get(),
            'meta-article:author': response.xpath("//meta[@property='article:author']/@content").get(),
            'meta-article:publisher': response.xpath("//meta[@property='article:publisher']/@content").get(),
            'url': response.url,
            'category': 'viblo article',
            'organization': 'viblo',
            'related-urls': response.xpath(
                '//div[@class = "related-posts-box"]//div[contains(@class, "post-card__title")]//a/@href'
            ).getall()
        }
        article.update(elems)

        # hashtags were collected by the listing-page callback
        article.update({'hash-tags': response.meta['hash-tags']})

        # get views from the tooltip text; first digit token, else '0'
        views = response.xpath(
            '//div[contains(@data-original-title, "Views:")]/@data-original-title'
        ).get()
        if views is not None:
            digits = [s for s in views.split() if s.isdigit()]
            article.update({'view-count': digits[0] if digits else '0'})

        # get likes / upvote count (rendered as "+N")
        likes = response.xpath(
            '//div[@class = "votes votes--side post-actions__vote mb-1"]/div/text()'
        ).get()
        if likes is not None:
            likes = likes.replace('+', '').replace('\n', '').strip()
            article.update({'likes-counter': likes})

        # get comments count; default '0' when the button is absent
        comment_count = response.xpath(
            '//div[@class = "post-meta__item mr-1"]//button[@class = "el-button el-button--text"]/span/text()'
        ).get()
        if comment_count is not None:
            article.update(
                {'comments-count': comment_count.replace('\n', '').strip()})
        else:
            article.update({'comments-count': '0'})

        # get content (join is linear; repeated += is quadratic)
        content = ''.join(
            text.strip() for text in response.xpath(
                '//div[contains(@class, "md-contents article-content__body")]//text()'
            ).getall())
        article.update({'content': content})
        article.update({'word_count': len(content.split())})

        # get image urls, keyed image1..imageN
        images = {
            'image' + str(index): src
            for index, src in enumerate(
                response.xpath(
                    '//div[contains(@class, "md-contents article-content__body")]//img/@src'
                ).getall(), 1)
        }
        article.update({'image-urls': images})

        # get comments: the post id is the last '-'-separated url segment
        # (local renamed from `id`, which shadowed the builtin)
        post_id = response.url.rsplit('-', 1)[-1]
        comment_url = "https://viblo.asia/api/posts/" + post_id + "/comments"
        return scrapy.Request(comment_url,
                              callback=self.parse_comments,
                              meta={'article': article})
Example #13
0
    def parse_article(self, response):
        """Parse a nguoiduatin.vn article: ld+json, meta tags, content and
        images, then chain either an easyvideo embed lookup (when the page
        has a video) or a Facebook like-count request."""
        article = {}

        try:
            # get ld_json; remove_ctrl strips control characters that would
            # break json.loads
            ld_json = response.xpath(
                '//html/head/script[contains(text(),"NewsArticle")]/text()'
            ).get()
            ld_json = remove_ctrl(ld_json)
            ld_json_dict = json.loads(ld_json)
            ld_json_dict = time.timestamp_converter(ld_json_dict)
            article.update(ld_json_dict)
        except Exception:
            # missing or malformed ld+json is ignored
            pass

        # get meta
        elems = {
            'meta-description': response.xpath("//meta[@name='description']/@content").get(),
            'meta-keywords': response.xpath("//meta[@name='keywords']/@content").get(),
            'meta-title': response.xpath("//meta[@name='title']/@content").get(),
            'meta-copyright': response.xpath("//meta[@name='copyright']/@content").get(),
            'meta-author': response.xpath("//meta[@name='author']/@content").get(),
            'language': response.xpath('//meta[@http-equiv = "content-language"]/@content').get(),
            'geo.placename': response.xpath('//meta[@name = "geo.placename"]/@content').get(),
            # BUG FIX: this key previously read the geo.region meta tag
            'geo.position': response.xpath('//meta[@name = "geo.position"]/@content').get(),
            'geo.region': response.xpath('//meta[@name = "geo.region"]/@content').get(),
            'meta-article:author': response.xpath("//meta[@property='article:author']/@content").get(),
            'meta-article:publisher': response.xpath("//meta[@property='article:publisher']/@content").get(),
            'category': response.xpath(
                '//li[@class = "f-rsb m-auto nav-item position-relative d-inline-block active"]/a/text()'
            ).get(),
            'organization': 'người đưa tin',
            'url': response.url,
            'related_urls': response.xpath(
                '//section[@class = "article-content clearfix"]/following-sibling::section[@class = "row"]//li[@class = "box-news row pb-3 clearfix py-3 border-bottom "]/a/@href'
            ).getall()
        }
        article.update(elems)

        # get content: article body text plus centered box paragraphs
        content = ''.join(
            text.strip() for text in response.xpath(
                '/html/body//section[@class = "article-content clearfix"]/article//text()'
            ).getall())
        content += ''.join(
            text.strip() for text in response.xpath(
                '//div[@class = "box-center"]/p/text()').getall())
        article.update({'content': content})
        article.update({'word_count': len(content.split())})

        # get image urls: captioned figures first, then full-width/box
        # images, numbered continuously image1..imageN
        images = {}
        type1_index = 0
        for type1_index, src in enumerate(
                response.xpath(
                    '/html/body//section[@class = "article-content clearfix"]//figure[@class = "tplCaption image"]/img/@src'
                ).getall(), 1):
            images.update({'image' + str(type1_index): src})
        type2_index = type1_index + 1
        for type2_index, src in enumerate(
                response.xpath(
                    '//*[contains(@class,"image-full-width") or contains(@class,"box")]/img/@src'
                ).getall(), type2_index):
            images.update({'image' + str(type2_index): src})
        article.update({'image-urls': images})

        # article id: prefer the data-id attribute; otherwise slice the
        # trailing "a<digits>.html" token out of the url
        # (local renamed from `id`, which shadowed the builtin)
        url = response.url.replace('https://www.nguoiduatin.vn/', '')
        article_id = response.xpath('//@data-id').get()
        if article_id is None:
            pv1 = response.url.find('.html')
            pv2 = response.url.find('a', pv1 - 7) + 1
            article_id = response.url[pv2:pv1]

        # get video urls via the easyvideo embed, when present
        id_finder = response.xpath(
            '//script[contains(@src,"//embed.easyvideo.vn/play")]/@src').get()
        if id_finder is not None:
            easyvideo_id = id_finder.replace('//embed.easyvideo.vn/play', '')
            video_finder = "https://embed.easyvideo.vn/render/" + \
                easyvideo_id+"?targetId=MeCloudLoader_"+easyvideo_id
            yield scrapy.Request(video_finder,
                                 callback=self.parse_video,
                                 meta={
                                     'article': article,
                                     'url': url,
                                     'id': article_id
                                 })
        else:
            # get likes
            like_request = "https://www.facebook.com/v2.9/plugins/like.php?action=like&app_id=1069396303196363&channel=https%3A%2F%2Fstaticxx.facebook.com%2Fconnect%2Fxd_arbiter.php%3Fversion%3D44%23cb%3Df122fdd10517174%26domain%3Dwww.nguoiduatin.vn%26origin%3Dhttps%253A%252F%252Fwww.nguoiduatin.vn%252Ff3f7ea1e941e5e4%26relation%3Dparent.parent&container_width=410&href=https%3A%2F%2Fwww.nguoiduatin.vn%2F" + url + "&layout=button_count&locale=vi_VN&sdk=joey&share=true&size=small"
            yield scrapy.Request(like_request,
                                 callback=self.parse_likes,
                                 meta={
                                     'article': article,
                                     'id': article_id
                                 })
Example #14
0
    def parse_item(self, response):
        """Parse a Saostar article page into an ``article`` dict and yield a
        follow-up request that fetches the Facebook like count.

        Extracts meta/OpenGraph fields, the lead image, body text, thumbnails
        and related links, then requests the FB like-button page with the
        article attached in ``meta['data']`` (consumed by ``parse_like``).
        Pages without the expected ``@data-title`` heading are skipped.
        """
        article = dict()
        title = response.xpath('//div[@class="head-article"]/h1/@data-title').get()
        if title is None:
            # Not an article page (or the markup changed): emit nothing.
            return

        # --- meta / structured fields ---------------------------------
        article.update({
            'headline': response.xpath('//meta[@itemprop="headline"]/@content').get(),
            'datePublished': response.xpath('//time[@itemprop="datePublished"]/@datetime').get(),
            'dateModified': response.xpath('//time[@itemprop="dateModified"]/@datetime').get(),
            'publisher': response.xpath('//div[@itemprop="publisher"]/span/text()').get(),
            'type': response.xpath("//head/meta[@property='og:type']/@content").get(),
            'description': response.xpath("//head/meta[@name='description']/@content").get(),
            'keywords': response.xpath("//head/meta[@name='keywords']/@content").get(),
            'category': response.xpath("//head/meta[@property='article:section']/@content").get(),
            'copyright': response.xpath("//head/meta[@name='copyright']/@content").get(),
            'Language': response.xpath("//head/meta[@name='Language']/@content").get(),
            'geo_place_name': response.xpath("//meta[@name = 'geo.placename']/@content").get(),
            'geo_region': response.xpath("//meta[@name = 'geo.region']/@content").get(),
            'geo_position': response.xpath("//meta[@name = 'geo.position']/@content").get(),
            'organization': 'Saostar',
        })
        # Normalise date strings to timestamps (``time`` here is the
        # project's helper module, not stdlib ``time``).
        article = time.timestamp_converter(article)

        # --- lead image (OpenGraph) -----------------------------------
        url_img = response.xpath('//meta[@property="og:image"]/@content').get()
        if url_img is not None:
            image = {
                'url': url_img,
                'alt': response.xpath('//meta[@property="og:image:alt"]/@content').get(),
                'width': response.xpath('//meta[@property="og:image:width"]/@content').get(),
                'height': response.xpath('//meta[@property="og:image:height"]/@content').get(),
            }
            article.update({'image': [image]})

        # --- title, link, author, body --------------------------------
        article.update({'title': title, 'link': response.url})
        article.update({'author': response.xpath("//span[@class='writer']/text()").get()})
        # Build the body with join instead of quadratic ``+=``.
        content = ''.join(
            text.strip()
            for text in response.xpath('(//div[@id="content_detail"]/p/text())|'
                                       '(//span['
                                       '@class="wp-caption-text"]/text())').getall())
        article.update({'content_article': content})
        # ``content`` is always a str here, so word_count is always computed
        # (the old ``if content is not None`` / ``word_count = -1`` branch
        # was dead code).
        article.update({'word_count': len(content.split())})

        # --- thumbnails -----------------------------------------------
        thumbnail = response.xpath('(//p/a/img/@src)|(//strong/a/img/@src)|(//div/a/img/@src)').getall()
        # BUGFIX: ``thumbnail is not []`` compared identity with a fresh list
        # literal and was always True; test truthiness instead.
        if thumbnail:
            article.update({'thumbnail': thumbnail})

        # --- related articles -----------------------------------------
        relate_url = []
        htags = response.xpath(
            '(//div[@class="content-block"]/div[@class="post mt15 js-post "]/h4[@class="post-title pl15 dis-inline-block"])|(//h3[@class="post-title mb10"])')
        for tag in htags:
            headline = tag.xpath('a/text()').get()
            # BUGFIX: ``headline is not []`` was always True; only keep
            # entries that actually have a headline.
            if headline is not None:
                relate_url.append({
                    'headline': headline,
                    'url': str(tag.xpath('a/@href').get()),
                })
        if htags:
            # Key is only present when the related-articles section exists,
            # matching the original conditional behaviour.
            article.update({"related_url": relate_url})

        # --- interactions (FB like count) -----------------------------
        url = response.xpath('//meta[@itemprop="url"]/@content').get()
        if url is None:
            # Canonical-URL meta missing: fall back to the crawled URL so the
            # concatenation below cannot raise TypeError on None.
            url = response.url
        like_request = "https://www.facebook.com/v2.8/plugins/like.php?action=like&channel=https%3A%2F%2Fstaticxx" \
                       ".facebook.com%2Fconnect%2Fxd_arbiter.php%3Fversion%3D44%23cb%3Df37cc7337bc398%26domain" \
                       "%3Dsaostar.vn%26origin%3Dhttps%253A%252F%252Fsaostar.vn%252Ff3ecd646e17999%26relation" \
                       "%3Dparent.parent&container_width=0&href=" + url \
                       + "&layout=button_count&locale=vi_VN&sdk=joey&share=true&show_faces=false"
        yield scrapy.Request(like_request, callback=self.parse_like, meta={'data': article})
Example #15
0
    def parse_item(self, response):
        """Parse a VTV article page into an ``article`` dict and yield a
        request to the sharefb comment-count endpoint.

        Merges the page's ld+json block (if parseable), meta tags, body text,
        thumbnails and related links, then requests the comment counter with
        the article attached in ``meta['article']`` (consumed by
        ``parse_comment``). Pages without a recognisable title are skipped.
        """
        article = dict()
        title = response.xpath(
            '(//h1[@class="title_detail"]/text())|(//div[@class="infomationdetail clearfix"]/h1/text())'
        ).get()
        if title is None:
            # Not an article page (or the markup changed): emit nothing.
            return

        # --- ld+json --------------------------------------------------
        ld_json = response.xpath(
            '//head/script[@type="application/ld+json"]/text()').get()
        if ld_json is not None:
            try:
                # ``time`` is the project's helper module (not stdlib time).
                article.update(time.timestamp_converter(json.loads(ld_json)))
            except ValueError:
                # Malformed JSON-LD: fall through to meta tags only.
                pass

        # Override ld+json dates with the page's own meta tags when present.
        # BUGFIX: the sources were crossed — ``pubdate`` (publication date)
        # was written into dateModified and ``lastmod`` into datePublished.
        if 'datePublished' in article:
            pubdate = response.xpath('//meta[@name="pubdate"]/@content').get()
            if pubdate is not None:  # guard: helper presumably expects a str
                article.update({'datePublished': time.Vnex_timestamp(pubdate)})
        if 'dateModified' in article:
            lastmod = response.xpath('//meta[@name="lastmod"]/@content').get()
            if lastmod is not None:
                article.update({'dateModified': time.Vnex_timestamp(lastmod)})

        # --- meta fields ----------------------------------------------
        article.update({
            'type': response.xpath("//head/meta[@property='og:type']/@content").get(),
            'description': response.xpath("//head/meta[@name='description']/@content").get(),
            'keywords': response.xpath("//head/meta[@name='keywords']/@content").get(),
            'category': response.xpath("//head/meta[@property='article:section']/@content").get(),
            'copyright': response.xpath("//head/meta[@name='copyright']/@content").get(),
            'language': response.xpath("//head/meta[@name='Language']/@content").get(),
            'geo_place_name': response.xpath("//meta[@name = 'geo.placename']/@content").get(),
            'geo_region': response.xpath("//meta[@name = 'geo.region']/@content").get(),
            'geo_position': response.xpath("//meta[@name = 'geo.position']/@content").get(),
            'organization': 'VTV',
        })
        # Prefer the OpenGraph title over the on-page heading.
        title = response.xpath('//meta[@property="og:title"]/@content').get()
        article.update({'title': title, 'link': response.url})

        # --- author and body (join instead of quadratic ``+=``) -------
        author = ''.join(
            text.strip()
            for text in response.xpath(
                '(//p[@class="news-info"]/b/text())|(//p[@class="author"]/text())'
            ).getall())
        article.update({'author': author})
        content = ''.join(
            text.strip()
            for text in response.xpath(
                '(//div[@id="entry-body"]/p/text())|(//div[@class="w638 mgl96"]/div[@class="ta-justify"]/p/text())'
            ).getall())
        article.update({'content_article': content})
        article.update({'word_count': len(content.split())})

        # --- thumbnails -----------------------------------------------
        thumbnail = response.xpath(
            '(//div[@class="infomationdetail clearfix"]/img/@src)|(//div[@class="noidung"]/img/@src)|(//div[@type="Photo"]/div/img/@src)|(//figure[@class="LayoutAlbumItem"]/a/img/@src)'
        ).getall()
        # BUGFIX: ``.getall()`` returns a list, never None, so the old
        # ``is not None`` test made the '-1' fallback unreachable.
        if thumbnail:
            article.update({'thumbnail': thumbnail})
        else:
            article.update({'thumbnail': '-1'})

        # --- related articles -----------------------------------------
        relate_url = []
        htags = response.xpath('//div[@class="clearfix pdb20"]/ul/li')
        for tag in htags:
            headline = tag.xpath('a/@title').get()
            # BUGFIX: ``headline is not []`` was always True; only keep
            # entries that actually have a title.
            if headline is not None:
                relate_url.append({
                    'headline': headline,
                    'url': "https://vtv.vn" + str(tag.xpath('a/@href').get()),
                })
        if htags:
            # Key is only present when the related-articles list exists,
            # matching the original conditional behaviour.
            article.update({"related_url": relate_url})

        # --- interactions (share/comment counts) ----------------------
        objectid = response.xpath(
            '//div[@class="aspNetHidden"]/input[@id="hdNewsId"]/@value'
        ).get()
        cmt_request = ('https://sharefb.cnnd.vn/?urls=http://vtv.vn/news-'
                       + str(objectid) + '.htm')
        yield scrapy.Request(
            cmt_request,
            callback=self.parse_comment,
            headers={
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Origin': 'https://vtv.vn',
                'Sec-Fetch-Mode': 'cors',
                'Referer': response.url
            },
            meta={'article': article})