Example no. 1
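# The imports and the helper sketch below are assumptions for these examples:
# the item classes (Posts, Stats, Comments) and helpers such as get_links,
# convert_date, process_comment, tags_to_json, get_author, get_comments,
# get_date, get_api_pages_wrapper, get_matching_links and get_reply_dic live
# in project modules that are not shown here.
import json
import re
import sqlite3
from urllib.parse import unquote

import dateutil.parser
import requests
import scrapy
from dateutil.parser import parse

# from .items import Posts, Stats, Comments   # hypothetical module path
# from .utils import get_links, convert_date  # hypothetical module path


def get_start_urls(domain):
    """Sketch of the shared helper, assuming it returns post URLs already
    stored for `domain` so known posts get re-crawled (the database file,
    table and column names are guesses)."""
    conn = sqlite3.connect('posts.db')
    try:
        rows = conn.execute('SELECT url FROM posts WHERE domain = ?',
                            (domain,)).fetchall()
        return [row[0] for row in rows]
    finally:
        conn.close()
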
class ZerohedgeSpider(scrapy.Spider):
    name = 'zerohedge'
    domain = 'zerohedge.com'
    allowed_domains = ['zerohedge.com']
    crawled_urls = get_start_urls(domain)
    start_urls = ['https://www.zerohedge.com/', 'https://www.zerohedge.com/commodities', 'https://www.zerohedge.com/crypto',
        'https://www.zerohedge.com/economics', 'https://www.zerohedge.com/energy', 'https://www.zerohedge.com/geopolitical',
        'https://www.zerohedge.com/health-0', 'https://www.zerohedge.com/markets', 'https://www.zerohedge.com/personal-finance',
        'https://www.zerohedge.com/political', 'https://www.zerohedge.com/technology'] + crawled_urls

    def parse(self, response):
        #Crawling blog posts (from db)
        if response.url in self.crawled_urls:
            yield scrapy.Request(response.url, callback=self.parse_blog)
        else:  #Going through the home pages and getting blogposts
            for article in response.css('article'):
                blogurl = response.urljoin(article.css('a::attr(href)')[0].get())
                yield scrapy.Request(blogurl, callback=self.parse_blog) 

    def parse_blog(self, response):
        #HTML Content
        blog_id = response.xpath('/html/head/link[@rel="shortlink"]/@href').get()
        blog = Posts()
        blog['domain'] = self.domain
        blog['url'] = response.url
        blog['title'] = response.xpath('//*[@id="block-zerohedge-page-title"]/h1/span/text()').get()
        blog['author'] = response.xpath('//*[@id="block-zerohedge-content"]/article/footer/div[1]/div/div[1]/span/a/text()').get()
        blog['published_date']= convert_date(response.xpath('//*[@id="block-zerohedge-content"]/article/footer/div[1]/div/div[2]/span/text()').get())
        blog['content'] = " ".join(response.xpath('//*[@id="block-zerohedge-content"]/article/div/div[1]/p//text()').getall())
        blog['content_html'] = "".join(response.xpath('//*[@id="block-zerohedge-content"]/article/div/div[1]/p').getall())
        blog['links'] = get_links(blog['content_html'])
        blog['tags'] = None
        yield blog
        
        #Stats requests
        stat = Stats()
        stat['domain'] = self.domain
        stat['url'] = response.url
        stat['views'] = requests.get('https://www.zerohedge.com/statistics-ajax?entity_ids={}'.format(blog_id)).json()[blog_id]
        stat['likes'] = None
        stat['comments'] = requests.get('https://www.zerohedge.com/coral-talk-comment-counts?nids={}'.format(blog_id)).json()[blog_id]
        yield stat

        #Comments requests
        payload = {"query":"query CoralEmbedStream_Embed($assetId: ID, $assetUrl: String, $commentId: ID!, $hasComment: Boolean!, $excludeIgnored: Boolean, $sortBy: SORT_COMMENTS_BY!, $sortOrder: SORT_ORDER!) {\n  me {\n    id\n    state {\n      status {\n        username {\n          status\n          __typename\n        }\n        banned {\n          status\n          __typename\n        }\n        suspension {\n          until\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n  asset(id: $assetId, url: $assetUrl) {\n    ...CoralEmbedStream_Configure_asset\n    ...CoralEmbedStream_Stream_asset\n    ...CoralEmbedStream_AutomaticAssetClosure_asset\n    __typename\n  }\n  ...CoralEmbedStream_Stream_root\n  ...CoralEmbedStream_Configure_root\n}\n\nfragment CoralEmbedStream_Stream_root on RootQuery {\n  me {\n    state {\n      status {\n        username {\n          status\n          __typename\n        }\n        banned {\n          status\n          __typename\n        }\n        suspension {\n          until\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    ignoredUsers {\n      id\n      __typename\n    }\n    role\n    __typename\n  }\n  settings {\n    organizationName\n    __typename\n  }\n  ...TalkSlot_StreamFilter_root\n  ...CoralEmbedStream_Comment_root\n  __typename\n}\n\nfragment CoralEmbedStream_Comment_root on RootQuery {\n  me {\n    ignoredUsers {\n      id\n      __typename\n    }\n    __typename\n  }\n  ...TalkSlot_CommentInfoBar_root\n  ...TalkSlot_CommentAuthorName_root\n  ...TalkEmbedStream_DraftArea_root\n  ...TalkEmbedStream_DraftArea_root\n  __typename\n}\n\nfragment TalkEmbedStream_DraftArea_root on RootQuery {\n  __typename\n}\n\nfragment CoralEmbedStream_Stream_asset on Asset {\n  comment(id: $commentId) @include(if: $hasComment) {\n    ...CoralEmbedStream_Stream_comment\n    parent {\n      ...CoralEmbedStream_Stream_singleComment\n      parent {\n        ...CoralEmbedStream_Stream_singleComment\n        parent {\n          ...CoralEmbedStream_Stream_singleComment\n          parent {\n            ...CoralEmbedStream_Stream_singleComment\n            parent {\n              ...CoralEmbedStream_Stream_singleComment\n              __typename\n            }\n            __typename\n          }\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n  id\n  title\n  url\n  isClosed\n  created_at\n  settings {\n    moderation\n    infoBoxEnable\n    infoBoxContent\n    premodLinksEnable\n    questionBoxEnable\n    questionBoxContent\n    questionBoxIcon\n    closedTimeout\n    closedMessage\n    disableCommenting\n    disableCommentingMessage\n    charCountEnable\n    charCount\n    requireEmailConfirmation\n    __typename\n  }\n  totalCommentCount @skip(if: $hasComment)\n  comments(query: {limit: 50000, excludeIgnored: $excludeIgnored, sortOrder: $sortOrder, sortBy: $sortBy}) @skip(if: $hasComment) {\n    nodes {\n      ...CoralEmbedStream_Stream_comment\n      __typename\n    }\n    hasNextPage\n    startCursor\n    endCursor\n    __typename\n  }\n  ...TalkSlot_StreamFilter_asset\n  ...CoralEmbedStream_Comment_asset\n  __typename\n}\n\nfragment CoralEmbedStream_Comment_asset on Asset {\n  __typename\n  id\n  ...TalkSlot_CommentInfoBar_asset\n  ...TalkSlot_CommentReactions_asset\n  ...TalkSlot_CommentAuthorName_asset\n}\n\nfragment CoralEmbedStream_Stream_comment on Comment {\n  id\n  status\n  user {\n    id\n    
__typename\n  }\n  ...CoralEmbedStream_Comment_comment\n  __typename\n}\n\nfragment CoralEmbedStream_Comment_comment on Comment {\n  ...CoralEmbedStream_Comment_SingleComment\n  replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n    nodes {\n      ...CoralEmbedStream_Comment_SingleComment\n      replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n        nodes {\n          ...CoralEmbedStream_Comment_SingleComment\n          replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n            nodes {\n              ...CoralEmbedStream_Comment_SingleComment\n              replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n                nodes {\n                  ...CoralEmbedStream_Comment_SingleComment\n                  replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n                    nodes {\n                      ...CoralEmbedStream_Comment_SingleComment\n                      __typename\n                    }\n                    hasNextPage\n                    startCursor\n                    endCursor\n                    __typename\n                  }\n                  __typename\n                }\n                hasNextPage\n                startCursor\n                endCursor\n                __typename\n              }\n              __typename\n            }\n            hasNextPage\n            startCursor\n            endCursor\n            __typename\n          }\n          __typename\n        }\n        hasNextPage\n        startCursor\n        endCursor\n        __typename\n      }\n      __typename\n    }\n    hasNextPage\n    startCursor\n    endCursor\n    __typename\n  }\n  __typename\n}\n\nfragment CoralEmbedStream_Comment_SingleComment on Comment {\n  id\n  body\n  created_at\n  status\n  replyCount\n  tags {\n    tag {\n      name\n      __typename\n    }\n    __typename\n  }\n  user {\n    id\n    username\n    __typename\n  }\n  status_history {\n    type\n    __typename\n  }\n  action_summaries {\n    __typename\n    count\n    current_user {\n      id\n      __typename\n    }\n  }\n  editing {\n    edited\n    editableUntil\n    __typename\n  }\n  ...TalkSlot_CommentInfoBar_comment\n  ...TalkSlot_CommentReactions_comment\n  ...TalkSlot_CommentAvatar_comment\n  ...TalkSlot_CommentAuthorName_comment\n  ...TalkSlot_CommentContent_comment\n  ...TalkEmbedStream_DraftArea_comment\n  ...TalkEmbedStream_DraftArea_comment\n  __typename\n}\n\nfragment TalkEmbedStream_DraftArea_comment on Comment {\n  __typename\n  ...TalkSlot_DraftArea_comment\n}\n\nfragment CoralEmbedStream_Stream_singleComment on Comment {\n  id\n  status\n  user {\n    id\n    __typename\n  }\n  ...CoralEmbedStream_Comment_SingleComment\n  __typename\n}\n\nfragment CoralEmbedStream_Configure_root on RootQuery {\n  __typename\n  ...CoralEmbedStream_Settings_root\n}\n\nfragment CoralEmbedStream_Settings_root on RootQuery {\n  __typename\n}\n\nfragment CoralEmbedStream_Configure_asset on Asset {\n  __typename\n  ...CoralEmbedStream_AssetStatusInfo_asset\n  ...CoralEmbedStream_Settings_asset\n}\n\nfragment CoralEmbedStream_AssetStatusInfo_asset on Asset {\n  id\n  closedAt\n  isClosed\n  __typename\n}\n\nfragment CoralEmbedStream_Settings_asset on Asset {\n  id\n  settings {\n    moderation\n    premodLinksEnable\n    questionBoxEnable\n    questionBoxIcon\n    questionBoxContent\n    __typename\n  }\n  __typename\n}\n\nfragment CoralEmbedStream_AutomaticAssetClosure_asset on Asset {\n  id\n  closedAt\n  
__typename\n}\n\nfragment TalkSlot_StreamFilter_root on RootQuery {\n  ...TalkViewingOptions_ViewingOptions_root\n  __typename\n}\n\nfragment TalkViewingOptions_ViewingOptions_root on RootQuery {\n  __typename\n}\n\nfragment TalkSlot_CommentInfoBar_root on RootQuery {\n  ...TalkModerationActions_root\n  __typename\n}\n\nfragment TalkModerationActions_root on RootQuery {\n  me {\n    id\n    __typename\n  }\n  __typename\n}\n\nfragment TalkSlot_CommentAuthorName_root on RootQuery {\n  ...TalkAuthorMenu_AuthorName_root\n  __typename\n}\n\nfragment TalkAuthorMenu_AuthorName_root on RootQuery {\n  __typename\n  ...TalkSlot_AuthorMenuActions_root\n}\n\nfragment TalkSlot_StreamFilter_asset on Asset {\n  ...TalkViewingOptions_ViewingOptions_asset\n  __typename\n}\n\nfragment TalkViewingOptions_ViewingOptions_asset on Asset {\n  __typename\n}\n\nfragment TalkSlot_CommentInfoBar_asset on Asset {\n  ...TalkModerationActions_asset\n  ...TalkPermalink_Button_asset\n  __typename\n}\n\nfragment TalkModerationActions_asset on Asset {\n  id\n  __typename\n}\n\nfragment TalkPermalink_Button_asset on Asset {\n  url\n  __typename\n}\n\nfragment TalkSlot_CommentReactions_asset on Asset {\n  ...VoteButton_asset\n  __typename\n}\n\nfragment VoteButton_asset on Asset {\n  id\n  __typename\n}\n\nfragment TalkSlot_CommentAuthorName_asset on Asset {\n  ...TalkAuthorMenu_AuthorName_asset\n  __typename\n}\n\nfragment TalkAuthorMenu_AuthorName_asset on Asset {\n  __typename\n}\n\nfragment TalkSlot_CommentInfoBar_comment on Comment {\n  ...CollapseCommentButton_comment\n  ...TalkModerationActions_comment\n  ...TalkPermalink_Button_comment\n  ...TalkInfoBar_moveReportButton_Comment\n  ...TalkInfoBar_addEdiableClass_Comment\n  __typename\n}\n\nfragment CollapseCommentButton_comment on Comment {\n  id\n  replyCount\n  __typename\n}\n\nfragment TalkModerationActions_comment on Comment {\n  id\n  status\n  user {\n    id\n    __typename\n  }\n  tags {\n    tag {\n      name\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n\nfragment TalkPermalink_Button_comment on Comment {\n  id\n  __typename\n}\n\nfragment TalkInfoBar_moveReportButton_Comment on Comment {\n  id\n  __typename\n}\n\nfragment TalkInfoBar_addEdiableClass_Comment on Comment {\n  id\n  editing {\n    __typename\n    editableUntil\n  }\n  __typename\n}\n\nfragment TalkSlot_CommentReactions_comment on Comment {\n  ...TalkDisableDeepReplies_disableDeepReplies_Comment\n  ...VoteButton_comment\n  __typename\n}\n\nfragment TalkDisableDeepReplies_disableDeepReplies_Comment on Comment {\n  id\n  __typename\n}\n\nfragment VoteButton_comment on Comment {\n  id\n  action_summaries {\n    __typename\n    ... on UpvoteActionSummary {\n      count\n      current_user {\n        id\n        __typename\n      }\n      __typename\n    }\n    ... 
on DownvoteActionSummary {\n      count\n      current_user {\n        id\n        __typename\n      }\n      __typename\n    }\n  }\n  __typename\n}\n\nfragment TalkSlot_CommentAvatar_comment on Comment {\n  ...UserAvatar_comment\n  __typename\n}\n\nfragment UserAvatar_comment on Comment {\n  user {\n    avatar\n    __typename\n  }\n  __typename\n}\n\nfragment TalkSlot_CommentAuthorName_comment on Comment {\n  ...TalkAuthorMenu_AuthorName_comment\n  __typename\n}\n\nfragment TalkAuthorMenu_AuthorName_comment on Comment {\n  __typename\n  id\n  user {\n    username\n    __typename\n  }\n  ...TalkSlot_AuthorMenuActions_comment\n}\n\nfragment TalkSlot_CommentContent_comment on Comment {\n  ...TalkPluginRichText_CommentContent_comment\n  __typename\n}\n\nfragment TalkPluginRichText_CommentContent_comment on Comment {\n  body\n  richTextBody\n  __typename\n}\n\nfragment TalkSlot_DraftArea_comment on Comment {\n  ...TalkPluginRichText_Editor_comment\n  __typename\n}\n\nfragment TalkPluginRichText_Editor_comment on Comment {\n  body\n  richTextBody\n  __typename\n}\n\nfragment TalkSlot_AuthorMenuActions_root on RootQuery {\n  ...TalkIgnoreUser_IgnoreUserAction_root\n  __typename\n}\n\nfragment TalkIgnoreUser_IgnoreUserAction_root on RootQuery {\n  me {\n    id\n    __typename\n  }\n  __typename\n}\n\nfragment TalkSlot_AuthorMenuActions_comment on Comment {\n  ...TalkIgnoreUser_IgnoreUserAction_comment\n  ...TalkDrupalUserId_DrupalProfile_comment\n  __typename\n}\n\nfragment TalkIgnoreUser_IgnoreUserAction_comment on Comment {\n  user {\n    id\n    __typename\n  }\n  ...TalkIgnoreUser_IgnoreUserConfirmation_comment\n  __typename\n}\n\nfragment TalkIgnoreUser_IgnoreUserConfirmation_comment on Comment {\n  user {\n    id\n    username\n    __typename\n  }\n  __typename\n}\n\nfragment TalkDrupalUserId_DrupalProfile_comment on Comment {\n  user {\n    id\n    __typename\n  }\n  __typename\n}\n","variables":{"assetId":"","assetUrl":blog['url'],"commentId":"","hasComment":False,"excludeIgnored":False,"sortBy":"CREATED_AT","sortOrder":"DESC"},"operationName":"CoralEmbedStream_Embed"}
        yield scrapy.Request('https://talk.zerohedge.com/api/v1/graph/ql',
                                    method = 'POST',
                                    body=json.dumps(payload), 
                                    headers={'Content-Type':'application/json'},
                                    callback=self.process_comments)

    def process_comments(self, response):
        #Processing responses
        post_comments = json.loads(response.body)
        data = post_comments['data']['asset']['comments']['nodes']
        comments = process_comment(data, post_comments['data']['asset']['url']) 
        for comment in comments:
            yield comment
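
# `process_comment` is project code that is not shown in these examples. A
# minimal sketch of what it might do, assuming it flattens the nested Coral
# Talk comment nodes returned by the GraphQL query above into the Comments
# items used by the other spiders (the `domain` default and the exact field
# mapping are assumptions):
from dateutil.parser import parse as parse_date


def process_comment(nodes, url, domain='zerohedge.com'):
    """Recursively flatten Coral Talk comment nodes into Comments items."""
    results = []
    for node in nodes:
        item = Comments()
        item['domain'] = domain
        item['url'] = url
        item['comment_id'] = node['id']
        item['username'] = node['user']['username'] if node.get('user') else None
        item['user_id'] = node['user']['id'] if node.get('user') else None
        item['comment'] = node.get('body')
        item['comment_original'] = node.get('richTextBody') or node.get('body')
        item['links'] = get_links(item['comment_original'] or '')
        item['upvotes'] = None
        item['downvotes'] = None
        item['published_date'] = parse_date(node['created_at'])
        item['reply_count'] = node.get('replyCount')
        item['reply_to'] = None
        results.append(item)
        #Replies are nested under node['replies']['nodes']; recurse and record
        #which comment they answer.
        child_nodes = (node.get('replies') or {}).get('nodes', [])
        for reply in process_comment(child_nodes, url, domain):
            if reply['reply_to'] is None:
                reply['reply_to'] = node['id']
            results.append(reply)
    return results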

Example no. 2
class AccidentaldeliberationsSpider(scrapy.Spider):
    name = 'accidentaldeliberations'
    domain = "accidentaldeliberations.blogspot.com"
    allowed_domains = ['accidentaldeliberations.blogspot.com']
    db_urls = get_start_urls(domain)
    start_urls = ['http://accidentaldeliberations.blogspot.com/'] + db_urls
    # user_agent = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    download_delay = 5

    def parse(self, response):
        #Running db urls
        if response.url in self.db_urls:
            yield scrapy.Request(response.url, self.parse_blog)
        else:  #Scraping from site
            links = response.xpath(
                "//h3[contains(@class, 'post-title entry-title')]/a/@href"
            ).getall()
            for link in links:
                yield scrapy.Request(link, self.parse_blog)
            #Getting blog archive links
            archive_links = response.xpath(
                "//a[contains(@class, 'post-count-link')]/@href").getall()
            for archive in archive_links:
                yield scrapy.Request(archive, self.parse)

    def parse_blog(self, response):
        blog = Posts()
        blog['domain'] = self.domain
        blog['url'] = response.url
        blog['title'] = response.xpath(
            '//*[@id="Blog1"]/div[1]/div/div/div/div[1]/h3/text()').get(
            ).strip()
        blog['author'] = response.xpath(
            '//*[@id="Blog1"]/div[1]/div/div/div/div[1]/div[3]/div[1]/span[1]/span/text()'
        ).get()
        blog['published_date'] = parse(
            response.xpath(
                '//*[@id="Blog1"]/div[1]/div/div/div/div[1]/div[3]/div[1]/span[2]/a/abbr/@title'
            ).get())
        blog['content'] = " ".join(
            response.xpath(
                "//div[contains(@class, 'post-body entry-content')]//text()").
            getall()).replace('\n', '')
        blog['content_html'] = response.xpath(
            "//div[contains(@class, 'post-body entry-content')]").get()
        blog['links'] = get_links(
            response.xpath(
                "//div[contains(@class, 'post-body entry-content')]").get())
        tags = response.xpath(
            '//*[contains(@id, "Blog1")]/div[1]/div/div/div/div[1]/div[3]/div[2]/span//text()'
        ).getall()
        blog['tags'] = tags_to_json(
            list(
                filter(
                    lambda a: a != ',\n' and a != '\n' and 'Labels:' not in a,
                    tags)))
        yield blog

        #Stats
        stat = Stats()
        stat['domain'] = self.domain
        stat['url'] = response.url
        stat['views'] = None
        stat['likes'] = None
        comment_num = response.xpath('//div[@id="comments"]/h4/text()').get()
        if comment_num and "No" not in comment_num:
            stat['comments'] = int(re.search(r'\d+', comment_num).group())
        else:
            stat['comments'] = None
        yield stat

        #Comments
        if "No" not in comment_num:
            for c in response.xpath('//*[@id="top-ra"]//li'):
                parsed_comment = Comments()
                parsed_comment['domain'] = self.domain
                parsed_comment['url'] = response.url
                parsed_comment['comment_id'] = c.xpath(
                    '//li[contains(@class, "comment")]/@id').get()
                if c.xpath(
                        '//li[contains(@class, "comment")]/div[2]/div/cite/a/text()'
                ).get() is not None:
                    parsed_comment['username'] = c.xpath(
                        '//li[contains(@class, "comment")]/div[2]/div/cite/a/text()'
                    ).get()
                    parsed_comment['user_id'] = c.xpath(
                        '//li[contains(@class, "comment")]/div[2]/div/cite/a/@href'
                    ).get().replace('https://www.blogger.com/profile/', "")
                else:
                    parsed_comment['username'] = c.xpath(
                        '//li[contains(@class, "comment")]/div[2]/div/cite/text()'
                    ).get()
                    parsed_comment['user_id'] = None
                parsed_comment['comment'] = " ".join(
                    c.xpath(
                        '//li[contains(@class, "comment")]/div[2]/p//text()').
                    getall())
                parsed_comment['comment_original'] = c.xpath(
                    '//li[contains(@class, "comment")]/div[2]/p').get()
                parsed_comment['links'] = get_links(
                    c.xpath(
                        '//li[contains(@class, "comment")]/div[2]/p').get())
                parsed_comment['upvotes'] = None
                parsed_comment['downvotes'] = None
                parsed_comment['published_date'] = None
                parsed_comment['reply_count'] = None
                parsed_comment['reply_to'] = None
                yield parsed_comment
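
# `get_links` and `tags_to_json` are project helpers that are not shown. Two
# minimal sketches, assuming `get_links` pulls href targets out of an HTML
# fragment and `tags_to_json` serializes the cleaned tag list (the exact
# stored formats are guesses):
import json

from parsel import Selector


def get_links(html):
    """Return the href targets found in an HTML fragment, or None if empty."""
    if not html:
        return None
    return Selector(text=html).xpath('//a/@href').getall()


def tags_to_json(tags):
    """Serialize a list of tag strings for storage."""
    return json.dumps(tags) if tags else None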
Example no. 3
class InfowarsSpider(scrapy.Spider):
    name = 'infowars'
    domain = 'infowars.com'
    allowed_domains = ['infowars.com']
    home_pages = [
        'https://www.infowars.com/category/8/',
        'https://www.infowars.com/category/5/',
        'https://www.infowars.com/category/2/',
        'https://www.infowars.com/category/18/',
        'https://www.infowars.com/category/10/',
        'https://www.infowars.com/category/3/',
        'https://www.infowars.com/category/11/',
        'https://www.infowars.com/category/14/',
        'https://www.infowars.com/category/4/'
    ]
    start_urls = home_pages + get_start_urls(domain)

    def parse(self, response):
        #Parsing home pages
        if any(x for x in self.home_pages if x in response.url):
            #Getting articles from page
            for blog_url in response.xpath(
                    '//a[contains(@class, "css-1xjmleq")]/@href').getall():
                yield scrapy.Request(
                    'https://www.infowars.com' + blog_url,
                    self.parse_blog,
                    meta={'dont_redirect': True},
                    headers={
                        'accept': "text/html",
                        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
                    })
        #Processing articles in db
        else:
            yield scrapy.Request(
                response.url,
                self.parse_blog,
                meta={'dont_redirect': True},
                headers={
                    'accept': "text/html",
                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
                })

    def parse_blog(self, response):
        url = unquote(response.url)
        blog = Posts()
        try:
            blog['domain'] = self.domain
            blog['url'] = url
            blog['title'] = response.xpath(
                '//h1[contains(@class, "css-1ln1egd")]/text()').get()
            blog['author'] = get_author(response)
            blog['published_date'] = parse(
                response.xpath(
                    '//*[contains(@class, "css-1fboxhy")]/a/text()').get())
            blog['content'] = " ".join(
                response.xpath('//*[contains(@class, "css-118w07p")]/p//text()'
                               ).getall()).strip().replace('\n', '')
            blog['content_html'] = response.xpath(
                '//*[contains(@class, "css-118w07p")]').get()
            blog['links'] = get_links(
                response.xpath('//*[contains(@class, "css-118w07p")]').get())
            blog['tags'] = None
            if blog['title']: yield blog
        except Exception as e:
            self.logger.error("Failed to parse %s: %s", url, e)

        #Stats
        stat = Stats()
        stat['domain'] = self.domain
        stat['url'] = url
        stat['views'] = None
        stat['likes'] = None
        comments = get_comments(response.url)
        stat['comments'] = comments['total'] if comments else None
        yield stat

        #Comments
        for comment in (comments['comments'] if comments else []):
            if comment['body'] is not None:  #sometimes there are empty reply comments
                parsed_comment = Comments()
                parsed_comment['domain'] = self.domain
                parsed_comment['url'] = url
                parsed_comment['comment_id'] = comment['id']
                parsed_comment['username'] = comment['user']['username']
                parsed_comment['user_id'] = comment['user']['id']
                parsed_comment['comment'] = comment['body']
                original_comment = (comment['richTextBody']
                                    if comment['richTextBody'] is not None
                                    else comment['body'])
                parsed_comment['comment_original'] = original_comment
                parsed_comment['links'] = get_links(original_comment)
                parsed_comment['upvotes'] = (
                    comment['action_summaries'][0]['count']
                    if comment['action_summaries'] else None)
                parsed_comment['downvotes'] = None
                parsed_comment['published_date'] = parse(comment['created_at'])
                parsed_comment['reply_count'] = comment['replyCount']
                parsed_comment['reply_to'] = comments['replies'].get(
                    parsed_comment['comment_id'])
                yield parsed_comment
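
# `get_comments` (and `get_author`) come from project code that is not shown.
# Judging from how the result is used above, `get_comments` returns None when
# the lookup fails, or a dict with 'total', 'comments' (a flat list of Coral
# Talk comment dicts) and 'replies' (a comment-id -> parent-id map). The
# sketch below only illustrates the assumed flattening of a nested Coral Talk
# reply tree into that shape; the HTTP request itself is omitted because its
# exact parameters are not shown in these examples.
def flatten_coral_comments(nodes, parent_id=None, acc=None, replies=None):
    """Walk nested Coral Talk comment nodes, recording who replies to whom."""
    if acc is None:
        acc, replies = [], {}
    for node in nodes:
        acc.append(node)
        if parent_id is not None:
            replies[node['id']] = parent_id
        child_nodes = (node.get('replies') or {}).get('nodes', [])
        flatten_coral_comments(child_nodes, node['id'], acc, replies)
    return {'total': len(acc), 'comments': acc, 'replies': replies}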
Example no. 4
class BuzzfeednewsSpider(scrapy.Spider):
    name = 'buzzfeednews'
    domain = 'buzzfeednews.com'
    allowed_domains = ['buzzfeednews.com']
    start_urls = [
        'https://www.buzzfeednews.com/',
        'https://www.buzzfeednews.com/section/arts-entertainment',
        'https://www.buzzfeednews.com/section/business',
        'https://www.buzzfeednews.com/investigations',
        'https://www.buzzfeednews.com/section/lgbtq',
        'https://www.buzzfeednews.com/collection/opinion',
        'https://www.buzzfeednews.com/section/politics',
        'https://www.buzzfeednews.com/section/reader',
        'https://www.buzzfeednews.com/section/science',
        'https://www.buzzfeednews.com/section/tech',
        'https://www.buzzfeednews.com/section/world'
    ]
    #Getting API pages
    base_urls = [
        'https://www.buzzfeednews.com/site-component/v1/en-us/trending-on-buzzfeednews'
    ]
    links = get_api_pages_wrapper(base_urls) + get_start_urls(domain)

    def parse(self, response):
        self.links += get_matching_links(
            response.body.decode('utf-8'),
            'https://www.buzzfeednews.com/article/')
        for url in self.links:
            yield scrapy.Request(url, self.parse_article)

    def parse_article(self, response):
        blog = Posts()
        blog['domain'] = self.domain
        blog['url'] = response.url
        blog['title'] = response.xpath(
            '//*[@id="js-post-container"]/div/div[1]/header/h1/text()').get()
        blog['author'] = response.xpath(
            '//*[@id="js-post-container"]/div/div[1]/header/div[2]/a/span/span[1]/text()'
        ).get()
        blog['published_date'] = get_date(
            response.xpath(
                '//*[@id="js-post-container"]/div/div[1]/header/div[3]/p/text()'
            ).get())
        blog['content'] = ' '.join(
            response.xpath(
                '//*[@id="js-post-container"]/div/div[1]/div[1]/div/p/text()').
            getall()).strip()
        blog['content_html'] = " ".join(
            response.xpath(
                '//*[@id="js-post-container"]/div/div[1]/div[1]/div').getall())
        blog['links'] = get_links(blog['content_html'])
        yield blog

        #Comments requests
        article_url = response.url.replace(
            'https://www.buzzfeednews.com/article/', '')
        comments, authors = get_comments(article_url)
        if comments:  #Catches no comments
            reply_dic = get_reply_dic(comments)
            for c in comments:
                parsed_comment = Comments()
                parsed_comment['domain'] = self.domain
                parsed_comment['url'] = response.url
                parsed_comment['comment_id'] = c['id']
                parsed_comment['username'] = [
                    x['name'] for x in authors if c['authorID'] == x['id']
                ][0]
                parsed_comment['user_id'] = c['authorID']
                parsed_comment['comment'] = c['body']['text']
                parsed_comment['comment_original'] = c['body']['text']
                parsed_comment['links'] = get_links(c['body']['text'])
                parsed_comment['upvotes'] = c['likeCount']
                parsed_comment['downvotes'] = None
                parsed_comment['published_date'] = dateutil.parser.parse(
                    c['timestamp']['text'])
                if 'public_replies' in c:
                    parsed_comment['reply_count'] = len([
                        x for x in comments
                        if 'targetID' in x and x['targetID'] == c['id']
                    ])
                else:
                    parsed_comment['reply_count'] = 0
                if c['id'] in reply_dic:
                    parsed_comment['reply_to'] = reply_dic[c['id']]
                else:
                    parsed_comment['reply_to'] = None
                yield parsed_comment

        #Stats
        stat = Stats()
        stat['domain'] = self.domain
        stat['url'] = response.url
        stat['views'] = None
        stat['likes'] = None
        if comments is None:
            stat['comments'] = 0
        else:
            stat['comments'] = len(comments)
        yield stat
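
# A minimal way to run one of these spiders outside a full Scrapy project
# (inside a project they would normally be launched with `scrapy crawl <name>`):
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
    process.crawl(BuzzfeednewsSpider)
    process.start()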