Ejemplo n.º 1
0
    def parse_post(self, response):
        """Scrape one post page into a ``Posts`` item and a ``Stats`` item.

        The like count is fetched from the public WordPress.com REST batch
        endpoint, using the blog id and post id found in the page markup.

        Args:
            response: page response exposing ``url`` and ``xpath``.

        Yields:
            Posts: domain, url, title, author, published date, content, links.
            Stats: domain, url and likes; views and comments are left ``None``.
        """
        import re  # local import: only the post-id extraction needs it

        paragraphs_xpath = '/html/body/div[1]/div[1]/div/div[2]/p'

        blog = Posts()
        blog['domain'] = get_domain(response.url)
        blog['url'] = response.url
        # .get() returns None when the node is missing, so guard before .strip().
        title = response.xpath('/html/body/div[1]/div[1]/div/h3/text()').get()
        blog['title'] = title.strip() if title else None
        blog['author'] = get_author(blog['domain'])
        date_text = response.xpath('/html/body/div[1]/div[1]/h2/text()').get()
        blog['published_date'] = dateutil.parser.parse(date_text) if date_text else None
        blog['content'] = "".join(response.xpath(paragraphs_xpath + '/text()').getall())
        blog['content_html'] = "".join(response.xpath(paragraphs_xpath).getall())
        # NOTE: language detection (get_language on the content) is currently disabled.
        blog['links'] = get_links(blog['content_html'])
        yield blog

        # Stats requests
        stat = Stats()
        stat['domain'] = get_domain(response.url)
        stat['url'] = response.url
        stat['views'] = None

        # Getting likes: the post id appears in the <body> class list as
        # "postid-<digits>". Match all the digits instead of slicing a fixed
        # five characters, which broke for ids of any other length.
        body_class = response.xpath('/html/body/@class').get() or ''
        id_match = re.search(r'postid-(\d+)', body_class)
        post_id = id_match.group(1) if id_match else None
        blog_id = response.xpath('//*[@id="subscribe-blog"]/p[4]/input[2]/@value').get()

        likes = None
        if post_id and blog_id:
            likes_path = f'/sites/{blog_id}/posts/{post_id}/likes'
            # NOTE(review): this is a synchronous HTTP call inside the callback;
            # the timeout keeps a stalled API from hanging the crawl forever.
            likes_response = requests.get(
                "https://public-api.wordpress.com/rest/v1/batch?http_envelope=1"
                f"&urls[]=/me&urls[]={likes_path}"
                f"&urls[]=/sites/{blog_id}/posts/{post_id}/reblogs/mine",
                timeout=30,
            ).json()
            likes = likes_response['body'][likes_path]['found']
        stat['likes'] = likes
        stat['comments'] = None
        yield stat

        # Comments (looks like they're turned off); flag any page where they are not.
        comments_state = response.xpath('/html/body/div[1]/div[1]/div/div[3]/span/text()').get()
        if comments_state != 'Comments Off':
            print("comments on??")
Ejemplo n.º 2
0
    def parse_blog(self, response):
        """Scrape one article page and schedule the request for its comments.

        Args:
            response: page response exposing ``url`` and ``xpath``.

        Yields:
            Posts: the article fields extracted from the page.
            Stats: view and comment counts fetched from two zerohedge.com
                JSON endpoints, keyed by ``blog_id``.
            scrapy.Request: a JSON POST to the talk.zerohedge.com GraphQL
                endpoint; its response is handled by ``self.process_comments``.
        """
        # HTML content. blog_id is the href of the page's shortlink <link>; it is
        # used both as the query value and as the key into the JSON replies below.
        blog_id = response.xpath('/html/head/link[@rel="shortlink"]/@href').get()
        blog = Posts()
        blog['domain'] = self.domain
        blog['url'] = response.url
        blog['title'] = response.xpath('//*[@id="block-zerohedge-page-title"]/h1/span/text()').get()
        blog['author'] = response.xpath('//*[@id="block-zerohedge-content"]/article/footer/div[1]/div/div[1]/span/a/text()').get()
        blog['published_date']= convert_date(response.xpath('//*[@id="block-zerohedge-content"]/article/footer/div[1]/div/div[2]/span/text()').get())
        blog['content'] = " ".join(response.xpath('//*[@id="block-zerohedge-content"]/article/div/div[1]/p//text()').getall())
        blog['content_html'] = "".join(response.xpath('//*[@id="block-zerohedge-content"]/article/div/div[1]/p').getall())
        blog['links'] = get_links(blog['content_html'])
        blog['tags'] = None
        yield blog
        
        # Stats requests
        # NOTE(review): these look like synchronous `requests` calls made inside
        # the callback, with no timeout - confirm this is acceptable.
        stat = Stats()
        stat['domain'] = self.domain
        stat['url'] = response.url
        stat['views'] = requests.get('https://www.zerohedge.com/statistics-ajax?entity_ids={}'.format(blog_id)).json()[blog_id]
        stat['likes'] = None
        stat['comments'] = requests.get('https://www.zerohedge.com/coral-talk-comment-counts?nids={}'.format(blog_id)).json()[blog_id]
        yield stat

        # Comments request: GraphQL operation "CoralEmbedStream_Embed", looked up
        # by the article URL (assetUrl), sorted by CREATED_AT descending, with a
        # limit of 50000 on the comment and reply queries.
        payload = {"query":"query CoralEmbedStream_Embed($assetId: ID, $assetUrl: String, $commentId: ID!, $hasComment: Boolean!, $excludeIgnored: Boolean, $sortBy: SORT_COMMENTS_BY!, $sortOrder: SORT_ORDER!) {\n  me {\n    id\n    state {\n      status {\n        username {\n          status\n          __typename\n        }\n        banned {\n          status\n          __typename\n        }\n        suspension {\n          until\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n  asset(id: $assetId, url: $assetUrl) {\n    ...CoralEmbedStream_Configure_asset\n    ...CoralEmbedStream_Stream_asset\n    ...CoralEmbedStream_AutomaticAssetClosure_asset\n    __typename\n  }\n  ...CoralEmbedStream_Stream_root\n  ...CoralEmbedStream_Configure_root\n}\n\nfragment CoralEmbedStream_Stream_root on RootQuery {\n  me {\n    state {\n      status {\n        username {\n          status\n          __typename\n        }\n        banned {\n          status\n          __typename\n        }\n        suspension {\n          until\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    ignoredUsers {\n      id\n      __typename\n    }\n    role\n    __typename\n  }\n  settings {\n    organizationName\n    __typename\n  }\n  ...TalkSlot_StreamFilter_root\n  ...CoralEmbedStream_Comment_root\n  __typename\n}\n\nfragment CoralEmbedStream_Comment_root on RootQuery {\n  me {\n    ignoredUsers {\n      id\n      __typename\n    }\n    __typename\n  }\n  ...TalkSlot_CommentInfoBar_root\n  ...TalkSlot_CommentAuthorName_root\n  ...TalkEmbedStream_DraftArea_root\n  ...TalkEmbedStream_DraftArea_root\n  __typename\n}\n\nfragment TalkEmbedStream_DraftArea_root on RootQuery {\n  __typename\n}\n\nfragment CoralEmbedStream_Stream_asset on Asset {\n  comment(id: $commentId) @include(if: $hasComment) {\n    ...CoralEmbedStream_Stream_comment\n    parent {\n      ...CoralEmbedStream_Stream_singleComment\n      
parent {\n        ...CoralEmbedStream_Stream_singleComment\n        parent {\n          ...CoralEmbedStream_Stream_singleComment\n          parent {\n            ...CoralEmbedStream_Stream_singleComment\n            parent {\n              ...CoralEmbedStream_Stream_singleComment\n              __typename\n            }\n            __typename\n          }\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n  id\n  title\n  url\n  isClosed\n  created_at\n  settings {\n    moderation\n    infoBoxEnable\n    infoBoxContent\n    premodLinksEnable\n    questionBoxEnable\n    questionBoxContent\n    questionBoxIcon\n    closedTimeout\n    closedMessage\n    disableCommenting\n    disableCommentingMessage\n    charCountEnable\n    charCount\n    requireEmailConfirmation\n    __typename\n  }\n  totalCommentCount @skip(if: $hasComment)\n  comments(query: {limit: 50000, excludeIgnored: $excludeIgnored, sortOrder: $sortOrder, sortBy: $sortBy}) @skip(if: $hasComment) {\n    nodes {\n      ...CoralEmbedStream_Stream_comment\n      __typename\n    }\n    hasNextPage\n    startCursor\n    endCursor\n    __typename\n  }\n  ...TalkSlot_StreamFilter_asset\n  ...CoralEmbedStream_Comment_asset\n  __typename\n}\n\nfragment CoralEmbedStream_Comment_asset on Asset {\n  __typename\n  id\n  ...TalkSlot_CommentInfoBar_asset\n  ...TalkSlot_CommentReactions_asset\n  ...TalkSlot_CommentAuthorName_asset\n}\n\nfragment CoralEmbedStream_Stream_comment on Comment {\n  id\n  status\n  user {\n    id\n    __typename\n  }\n  ...CoralEmbedStream_Comment_comment\n  __typename\n}\n\nfragment CoralEmbedStream_Comment_comment on Comment {\n  ...CoralEmbedStream_Comment_SingleComment\n  replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n    nodes {\n      ...CoralEmbedStream_Comment_SingleComment\n      replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n        nodes {\n          
...CoralEmbedStream_Comment_SingleComment\n          replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n            nodes {\n              ...CoralEmbedStream_Comment_SingleComment\n              replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n                nodes {\n                  ...CoralEmbedStream_Comment_SingleComment\n                  replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n                    nodes {\n                      ...CoralEmbedStream_Comment_SingleComment\n                      __typename\n                    }\n                    hasNextPage\n                    startCursor\n                    endCursor\n                    __typename\n                  }\n                  __typename\n                }\n                hasNextPage\n                startCursor\n                endCursor\n                __typename\n              }\n              __typename\n            }\n            hasNextPage\n            startCursor\n            endCursor\n            __typename\n          }\n          __typename\n        }\n        hasNextPage\n        startCursor\n        endCursor\n        __typename\n      }\n      __typename\n    }\n    hasNextPage\n    startCursor\n    endCursor\n    __typename\n  }\n  __typename\n}\n\nfragment CoralEmbedStream_Comment_SingleComment on Comment {\n  id\n  body\n  created_at\n  status\n  replyCount\n  tags {\n    tag {\n      name\n      __typename\n    }\n    __typename\n  }\n  user {\n    id\n    username\n    __typename\n  }\n  status_history {\n    type\n    __typename\n  }\n  action_summaries {\n    __typename\n    count\n    current_user {\n      id\n      __typename\n    }\n  }\n  editing {\n    edited\n    editableUntil\n    __typename\n  }\n  ...TalkSlot_CommentInfoBar_comment\n  ...TalkSlot_CommentReactions_comment\n  ...TalkSlot_CommentAvatar_comment\n  ...TalkSlot_CommentAuthorName_comment\n  ...TalkSlot_CommentContent_comment\n  
...TalkEmbedStream_DraftArea_comment\n  ...TalkEmbedStream_DraftArea_comment\n  __typename\n}\n\nfragment TalkEmbedStream_DraftArea_comment on Comment {\n  __typename\n  ...TalkSlot_DraftArea_comment\n}\n\nfragment CoralEmbedStream_Stream_singleComment on Comment {\n  id\n  status\n  user {\n    id\n    __typename\n  }\n  ...CoralEmbedStream_Comment_SingleComment\n  __typename\n}\n\nfragment CoralEmbedStream_Configure_root on RootQuery {\n  __typename\n  ...CoralEmbedStream_Settings_root\n}\n\nfragment CoralEmbedStream_Settings_root on RootQuery {\n  __typename\n}\n\nfragment CoralEmbedStream_Configure_asset on Asset {\n  __typename\n  ...CoralEmbedStream_AssetStatusInfo_asset\n  ...CoralEmbedStream_Settings_asset\n}\n\nfragment CoralEmbedStream_AssetStatusInfo_asset on Asset {\n  id\n  closedAt\n  isClosed\n  __typename\n}\n\nfragment CoralEmbedStream_Settings_asset on Asset {\n  id\n  settings {\n    moderation\n    premodLinksEnable\n    questionBoxEnable\n    questionBoxIcon\n    questionBoxContent\n    __typename\n  }\n  __typename\n}\n\nfragment CoralEmbedStream_AutomaticAssetClosure_asset on Asset {\n  id\n  closedAt\n  __typename\n}\n\nfragment TalkSlot_StreamFilter_root on RootQuery {\n  ...TalkViewingOptions_ViewingOptions_root\n  __typename\n}\n\nfragment TalkViewingOptions_ViewingOptions_root on RootQuery {\n  __typename\n}\n\nfragment TalkSlot_CommentInfoBar_root on RootQuery {\n  ...TalkModerationActions_root\n  __typename\n}\n\nfragment TalkModerationActions_root on RootQuery {\n  me {\n    id\n    __typename\n  }\n  __typename\n}\n\nfragment TalkSlot_CommentAuthorName_root on RootQuery {\n  ...TalkAuthorMenu_AuthorName_root\n  __typename\n}\n\nfragment TalkAuthorMenu_AuthorName_root on RootQuery {\n  __typename\n  ...TalkSlot_AuthorMenuActions_root\n}\n\nfragment TalkSlot_StreamFilter_asset on Asset {\n  ...TalkViewingOptions_ViewingOptions_asset\n  __typename\n}\n\nfragment TalkViewingOptions_ViewingOptions_asset on Asset {\n  
__typename\n}\n\nfragment TalkSlot_CommentInfoBar_asset on Asset {\n  ...TalkModerationActions_asset\n  ...TalkPermalink_Button_asset\n  __typename\n}\n\nfragment TalkModerationActions_asset on Asset {\n  id\n  __typename\n}\n\nfragment TalkPermalink_Button_asset on Asset {\n  url\n  __typename\n}\n\nfragment TalkSlot_CommentReactions_asset on Asset {\n  ...VoteButton_asset\n  __typename\n}\n\nfragment VoteButton_asset on Asset {\n  id\n  __typename\n}\n\nfragment TalkSlot_CommentAuthorName_asset on Asset {\n  ...TalkAuthorMenu_AuthorName_asset\n  __typename\n}\n\nfragment TalkAuthorMenu_AuthorName_asset on Asset {\n  __typename\n}\n\nfragment TalkSlot_CommentInfoBar_comment on Comment {\n  ...CollapseCommentButton_comment\n  ...TalkModerationActions_comment\n  ...TalkPermalink_Button_comment\n  ...TalkInfoBar_moveReportButton_Comment\n  ...TalkInfoBar_addEdiableClass_Comment\n  __typename\n}\n\nfragment CollapseCommentButton_comment on Comment {\n  id\n  replyCount\n  __typename\n}\n\nfragment TalkModerationActions_comment on Comment {\n  id\n  status\n  user {\n    id\n    __typename\n  }\n  tags {\n    tag {\n      name\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n\nfragment TalkPermalink_Button_comment on Comment {\n  id\n  __typename\n}\n\nfragment TalkInfoBar_moveReportButton_Comment on Comment {\n  id\n  __typename\n}\n\nfragment TalkInfoBar_addEdiableClass_Comment on Comment {\n  id\n  editing {\n    __typename\n    editableUntil\n  }\n  __typename\n}\n\nfragment TalkSlot_CommentReactions_comment on Comment {\n  ...TalkDisableDeepReplies_disableDeepReplies_Comment\n  ...VoteButton_comment\n  __typename\n}\n\nfragment TalkDisableDeepReplies_disableDeepReplies_Comment on Comment {\n  id\n  __typename\n}\n\nfragment VoteButton_comment on Comment {\n  id\n  action_summaries {\n    __typename\n    ... on UpvoteActionSummary {\n      count\n      current_user {\n        id\n        __typename\n      }\n      __typename\n    }\n    ... 
on DownvoteActionSummary {\n      count\n      current_user {\n        id\n        __typename\n      }\n      __typename\n    }\n  }\n  __typename\n}\n\nfragment TalkSlot_CommentAvatar_comment on Comment {\n  ...UserAvatar_comment\n  __typename\n}\n\nfragment UserAvatar_comment on Comment {\n  user {\n    avatar\n    __typename\n  }\n  __typename\n}\n\nfragment TalkSlot_CommentAuthorName_comment on Comment {\n  ...TalkAuthorMenu_AuthorName_comment\n  __typename\n}\n\nfragment TalkAuthorMenu_AuthorName_comment on Comment {\n  __typename\n  id\n  user {\n    username\n    __typename\n  }\n  ...TalkSlot_AuthorMenuActions_comment\n}\n\nfragment TalkSlot_CommentContent_comment on Comment {\n  ...TalkPluginRichText_CommentContent_comment\n  __typename\n}\n\nfragment TalkPluginRichText_CommentContent_comment on Comment {\n  body\n  richTextBody\n  __typename\n}\n\nfragment TalkSlot_DraftArea_comment on Comment {\n  ...TalkPluginRichText_Editor_comment\n  __typename\n}\n\nfragment TalkPluginRichText_Editor_comment on Comment {\n  body\n  richTextBody\n  __typename\n}\n\nfragment TalkSlot_AuthorMenuActions_root on RootQuery {\n  ...TalkIgnoreUser_IgnoreUserAction_root\n  __typename\n}\n\nfragment TalkIgnoreUser_IgnoreUserAction_root on RootQuery {\n  me {\n    id\n    __typename\n  }\n  __typename\n}\n\nfragment TalkSlot_AuthorMenuActions_comment on Comment {\n  ...TalkIgnoreUser_IgnoreUserAction_comment\n  ...TalkDrupalUserId_DrupalProfile_comment\n  __typename\n}\n\nfragment TalkIgnoreUser_IgnoreUserAction_comment on Comment {\n  user {\n    id\n    __typename\n  }\n  ...TalkIgnoreUser_IgnoreUserConfirmation_comment\n  __typename\n}\n\nfragment TalkIgnoreUser_IgnoreUserConfirmation_comment on Comment {\n  user {\n    id\n    username\n    __typename\n  }\n  __typename\n}\n\nfragment TalkDrupalUserId_DrupalProfile_comment on Comment {\n  user {\n    id\n    __typename\n  }\n  
__typename\n}\n","variables":{"assetId":"","assetUrl":blog['url'],"commentId":"","hasComment":False,"excludeIgnored":False,"sortBy":"CREATED_AT","sortOrder":"DESC"},"operationName":"CoralEmbedStream_Embed"}
        yield scrapy.Request('https://talk.zerohedge.com/api/v1/graph/ql',
                                    method = 'POST',
                                    body=json.dumps(payload), 
                                    headers={'Content-Type':'application/json'},
                                    callback=self.process_comments)
Ejemplo n.º 3
0
    def parse_blog(self, response):
        """Scrape one article page into post, stats and comment items.

        Pages that show the site's "cannot be found" notice are skipped and
        yield nothing.

        Args:
            response: page response exposing ``url``, ``text`` and ``xpath``.

        Yields:
            Posts: the article fields extracted from the page.
            Stats: views from ``get_views`` and the total from ``get_comments``.
            Comments: one item per entry in the fetched comment data.
        """
        from collections import Counter  # local import keeps the block self-contained

        # There are dead links on the pages that can't be processed
        if "The page you are looking for cannot be found or is no longer available." in response.text:
            return

        blog = Posts()
        blog['domain'] = self.domain
        blog['url'] = response.url
        blog['title'] = response.xpath('//*[@id="Headline"]/h1/text()').get()
        blog['author'] = response.xpath(
            '//*[@id="Headline"]/p/span/a/text()').get()
        # .get() returns None when the byline is missing; guard before .replace().
        date_text = response.xpath('//*[@id="Headline"]/p/span/text()').get()
        blog['published_date'] = (
            parse(date_text.replace(' by: ', '')) if date_text else None)
        body_text = response.xpath(
            '//*[@id="Col2"]/article/div[2]//text()').getall()
        blog['content'] = "".join(body_text).strip().replace('\n', ' ')
        blog['content_html'] = response.xpath(
            '//*[@id="Col2"]/article/div[2]').get()
        blog['links'] = get_links(blog['content_html'])
        blog['tags'] = tags_to_json(
            response.xpath('//*[@id="Headline"]/p/span/i/a/text()').getall())
        yield blog

        # Stats
        stat = Stats()
        stat['domain'] = self.domain
        stat['url'] = response.url
        stat['views'] = get_views(response.url)
        stat['likes'] = None
        comment_data = get_comments(response.url)
        stat['comments'] = comment_data['total']
        yield stat

        # Comments
        all_comments = comment_data['comments']
        # Count the replies per parent id once, instead of rescanning the whole
        # comment list for every comment. Ids are compared as strings, as before.
        reply_counts = Counter(str(entry['parent']) for entry in all_comments)
        for comment in all_comments:
            author = comment['author']
            parsed_comment = Comments()
            parsed_comment['domain'] = self.domain
            parsed_comment['url'] = response.url
            parsed_comment['comment_id'] = comment['id']
            # Fall back to the display name when the author has no username.
            parsed_comment['username'] = (
                author['username'] if 'username' in author else author['name'])
            parsed_comment['user_id'] = author['id'] if 'id' in author else None
            parsed_comment['comment'] = comment['raw_message']
            parsed_comment['comment_original'] = comment['message']
            parsed_comment['links'] = get_links(comment['message'])
            parsed_comment['upvotes'] = comment['likes']
            parsed_comment['downvotes'] = comment['dislikes']
            parsed_comment['published_date'] = parse(comment['createdAt'])
            parsed_comment['reply_count'] = reply_counts[str(comment['id'])]
            parsed_comment['reply_to'] = comment['parent']
            yield parsed_comment
Ejemplo n.º 4
0
    def parse_blog(self, response):
        """Scrape one article page into post, stats and comment items.

        Post extraction is best-effort: if it raises, the error is logged and
        the stats and comments are still produced.

        Args:
            response: page response exposing ``url`` and ``xpath``.

        Yields:
            Posts: the article, only when it was parsed and has a title.
            Stats: the comment total (``None`` when no comment data came back).
            Comments: one item per fetched comment that has a body.
        """
        import logging  # local import: only the failure path needs it

        url = unquote(response.url)
        blog = Posts()
        post_ready = False
        try:
            blog['domain'] = self.domain
            blog['url'] = url
            title = response.xpath(
                '//h1[contains(@class, "css-1ln1egd")]/text()').get()
            blog['title'] = title
            blog['author'] = get_author(response)
            blog['published_date'] = parse(
                response.xpath(
                    '//*[contains(@class, "css-1fboxhy")]/a/text()').get())
            paragraphs = response.xpath(
                '//*[contains(@class, "css-118w07p")]/p//text()').getall()
            blog['content'] = " ".join(paragraphs).strip().replace('\n', '')
            # Evaluate the content XPath once and reuse it for the links.
            content_html = response.xpath(
                '//*[contains(@class, "css-118w07p")]').get()
            blog['content_html'] = content_html
            blog['links'] = get_links(content_html)
            blog['tags'] = None
            post_ready = bool(title)
        except Exception:
            # Deliberately broad: one malformed page must not abort the stats
            # and comments below, but the failure is recorded with its traceback.
            logging.getLogger(__name__).exception(
                "Failed to parse post at %s", url)
        if post_ready:
            yield blog

        # Stats
        stat = Stats()
        stat['domain'] = self.domain
        stat['url'] = url
        stat['views'] = None
        stat['likes'] = None
        comments = get_comments(response.url)
        stat['comments'] = comments['total'] if comments else None
        yield stat

        # Comments
        if not comments:
            # The stats above already tolerate missing comment data; indexing
            # it here used to raise instead.
            return
        replies = comments['replies']
        for comment in comments['comments']:
            if comment['body'] is None:  # sometimes there are empty reply comments
                continue
            parsed_comment = Comments()
            comment_id = comment['id']
            parsed_comment['domain'] = self.domain
            parsed_comment['url'] = url
            parsed_comment['comment_id'] = comment_id
            parsed_comment['username'] = comment['user']['username']
            parsed_comment['user_id'] = comment['user']['id']
            parsed_comment['comment'] = comment['body']
            # Prefer the rich-text version of the body when there is one.
            original_comment = comment['richTextBody'] if comment[
                'richTextBody'] is not None else comment['body']
            parsed_comment['comment_original'] = original_comment
            parsed_comment['links'] = get_links(original_comment)
            parsed_comment['upvotes'] = comment['action_summaries'][0][
                'count'] if comment['action_summaries'] else None
            parsed_comment['downvotes'] = None
            parsed_comment['published_date'] = parse(comment['created_at'])
            parsed_comment['reply_count'] = comment['replyCount']
            parsed_comment['reply_to'] = (
                replies[comment_id] if comment_id in replies else None)
            yield parsed_comment
Ejemplo n.º 5
0
    def parse_blog(self, response):
        """Scrape one blog post page into post, stats and comment items.

        Args:
            response: page response exposing ``url`` and ``xpath``.

        Yields:
            Posts: the post fields extracted from the page.
            Stats: the comment count parsed from the comments header, or
                ``None`` when the header is missing, contains "No" or has no
                number.
            Comments: one item per comment ``<li>`` under ``#top-ra``.
        """
        blog = Posts()
        blog['domain'] = self.domain
        blog['url'] = response.url
        # .get() returns None when a node is missing, so guard before using it.
        title = response.xpath(
            '//*[@id="Blog1"]/div[1]/div/div/div/div[1]/h3/text()').get()
        blog['title'] = title.strip() if title else None
        blog['author'] = response.xpath(
            '//*[@id="Blog1"]/div[1]/div/div/div/div[1]/div[3]/div[1]/span[1]/span/text()'
        ).get()
        date_text = response.xpath(
            '//*[@id="Blog1"]/div[1]/div/div/div/div[1]/div[3]/div[1]/span[2]/a/abbr/@title'
        ).get()
        blog['published_date'] = parse(date_text) if date_text else None
        blog['content'] = " ".join(
            response.xpath(
                "//div[contains(@class, 'post-body entry-content')]//text()"
            ).getall()).replace('\n', '')
        # Evaluate the content XPath once and reuse it for the links.
        content_html = response.xpath(
            "//div[contains(@class, 'post-body entry-content')]").get()
        blog['content_html'] = content_html
        blog['links'] = get_links(content_html)
        tags = response.xpath(
            '//*[contains(@id, "Blog1")]/div[1]/div/div/div/div[1]/div[3]/div[2]/span//text()'
        ).getall()
        # Drop separator text nodes and the "Labels:" caption.
        blog['tags'] = tags_to_json([
            tag for tag in tags
            if tag != ',\n' and tag != '\n' and 'Labels:' not in tag
        ])
        yield blog

        # Stats
        stat = Stats()
        stat['domain'] = self.domain
        stat['url'] = response.url
        stat['views'] = None
        stat['likes'] = None
        comment_num = response.xpath('//div[@id="comments"]/h4/text()').get()
        # A missing header is treated the same as a header containing "No".
        has_comments = comment_num is not None and "No" not in comment_num
        count_match = re.search(r'\d+', comment_num) if has_comments else None
        stat['comments'] = int(count_match.group()) if count_match else None
        yield stat

        # Comments
        if not has_comments:
            return
        # The per-comment paths must be relative ("./"). A path starting with
        # "//" searches the whole document from any node, so every item used to
        # receive the first comment's data.
        for c in response.xpath(
                '//*[@id="top-ra"]//li[contains(@class, "comment")]'):
            parsed_comment = Comments()
            parsed_comment['domain'] = self.domain
            parsed_comment['url'] = response.url
            parsed_comment['comment_id'] = c.xpath('./@id').get()
            username = c.xpath('./div[2]/div/cite/a/text()').get()
            if username is not None:
                # The name is a link: take the user id from the profile URL.
                profile_url = c.xpath('./div[2]/div/cite/a/@href').get()
                user_id = profile_url.replace(
                    'https://www.blogger.com/profile/',
                    "") if profile_url else None
            else:
                # No link: the name is plain text and there is no user id.
                username = c.xpath('./div[2]/div/cite/text()').get()
                user_id = None
            parsed_comment['username'] = username
            parsed_comment['user_id'] = user_id
            parsed_comment['comment'] = " ".join(
                c.xpath('./div[2]/p//text()').getall())
            comment_html = c.xpath('./div[2]/p').get()
            parsed_comment['comment_original'] = comment_html
            parsed_comment['links'] = get_links(comment_html)
            parsed_comment['upvotes'] = None
            parsed_comment['downvotes'] = None
            parsed_comment['published_date'] = None
            parsed_comment['reply_count'] = None
            parsed_comment['reply_to'] = None
            yield parsed_comment
Ejemplo n.º 6
0
    def parse_article(self, response):
        """Scrape one article page into post, comment and stats items.

        Args:
            response: page response exposing ``url`` and ``xpath``.

        Yields:
            Posts: the article fields extracted from the page.
            Comments: one item per comment returned by ``get_comments``.
            Stats: the number of comments (0 when ``get_comments`` gave None).
        """
        from collections import Counter  # local import keeps the block self-contained

        blog = Posts()
        blog['domain'] = self.domain
        blog['url'] = response.url
        blog['title'] = response.xpath(
            '//*[@id="js-post-container"]/div/div[1]/header/h1/text()').get()
        blog['author'] = response.xpath(
            '//*[@id="js-post-container"]/div/div[1]/header/div[2]/a/span/span[1]/text()'
        ).get()
        blog['published_date'] = get_date(
            response.xpath(
                '//*[@id="js-post-container"]/div/div[1]/header/div[3]/p/text()'
            ).get())
        blog['content'] = ' '.join(
            response.xpath(
                '//*[@id="js-post-container"]/div/div[1]/div[1]/div/p/text()'
            ).getall()).strip()
        blog['content_html'] = " ".join(
            response.xpath(
                '//*[@id="js-post-container"]/div/div[1]/div[1]/div').getall())
        blog['links'] = get_links(blog['content_html'])
        yield blog

        # Comments requests: the comment helper takes the URL without its
        # article prefix.
        article_url = response.url.replace(
            'https://www.buzzfeednews.com/article/', '')
        comments, authors = get_comments(article_url)
        # NOTE(review): get_reply_dic runs even when comments is None/empty,
        # as in the original - confirm the helper tolerates that.
        reply_dic = get_reply_dic(comments)
        if comments:  # Catches no comments
            # Build both lookups once instead of rescanning the author and
            # comment lists for every comment. setdefault keeps the first
            # author entry for an id, as the old "[...][0]" did.
            author_names = {}
            for author in authors:
                author_names.setdefault(author['id'], author.get('name'))
            reply_counts = Counter(
                entry['targetID'] for entry in comments if 'targetID' in entry)

            for c in comments:
                comment_id = c['id']
                text = c['body']['text']
                parsed_comment = Comments()
                parsed_comment['domain'] = self.domain
                parsed_comment['url'] = response.url
                parsed_comment['comment_id'] = comment_id
                # An author missing from the list now gives None instead of
                # raising IndexError.
                parsed_comment['username'] = author_names.get(c['authorID'])
                parsed_comment['user_id'] = c['authorID']
                parsed_comment['comment'] = text
                parsed_comment['comment_original'] = text
                parsed_comment['links'] = get_links(text)
                parsed_comment['upvotes'] = c['likeCount']
                parsed_comment['downvotes'] = None
                parsed_comment['published_date'] = dateutil.parser.parse(
                    c['timestamp']['text'])
                # Replies are only counted for comments with 'public_replies'.
                parsed_comment['reply_count'] = (
                    reply_counts[comment_id] if 'public_replies' in c else 0)
                parsed_comment['reply_to'] = (
                    reply_dic[comment_id] if comment_id in reply_dic else None)
                yield parsed_comment

        # Stats
        stat = Stats()
        stat['domain'] = self.domain
        stat['url'] = response.url
        stat['views'] = None
        stat['likes'] = None
        stat['comments'] = 0 if comments is None else len(comments)
        yield stat
Ejemplo n.º 7
0
    def parse_blog(self, response):
        """Scrape one article page into post, comment and stats items.

        Nothing is yielded when ``parse_content`` returns no HTML content.

        Args:
            response: page response exposing ``url`` and ``xpath``.

        Yields:
            Posts: the article fields extracted from the page.
            Comments: one item per comment returned by ``facebook_comments``.
            Stats: the number of comments (0 when the comment list is None).
        """
        from collections import Counter  # local import keeps the block self-contained

        # Posts
        blog = Posts()
        blog['domain'] = self.domain
        blog['url'] = response.url
        blog['title'] = parse_title(response)

        # Only the date is used from parse_author_date; the author comes from
        # parse_author (the original overwrote the unpacked author immediately).
        date, _ = parse_author_date(response)
        author = parse_author(response)
        # The author is cut to its first 99 characters.
        blog['author'] = str(author)[0:99] if author else None
        blog['published_date'] = parse_datetime(date) if date else None

        content, content_html = parse_content(response)
        if not content_html:
            return

        blog['content'] = content
        blog['content_html'] = content_html
        blog['links'] = get_links(content_html)
        tags = response.xpath('//*[@id="article-tags"]/div//text()').extract()
        # Drop text nodes that contain newlines or tabs.
        blog['tags'] = tags_to_json(
            [tag for tag in tags if '\n' not in tag and '\t' not in tag]
        ) if tags else None
        yield blog

        # Comments
        # NOTE(review): 318812448281278 is presumably the site's Facebook app
        # id - confirm against facebook_comments.
        comment_data = facebook_comments(response.url, 318812448281278)
        comments = comment_data['comments']
        authors = comment_data['authors']
        reply_dic = comment_data['reply_dic']
        if comments:  # Catches no comments
            # Build both lookups once instead of rescanning the author and
            # comment lists for every comment. setdefault keeps the first
            # author entry for an id, as the old "[...][0]" did.
            author_names = {}
            for entry in authors:
                author_names.setdefault(entry['id'], entry.get('name'))
            reply_counts = Counter(
                item['targetID'] for item in comments if 'targetID' in item)

            for c in comments:
                comment_id = c['id']
                text = c['body']['text']
                parsed_comment = Comments()
                parsed_comment['domain'] = self.domain
                parsed_comment['url'] = response.url
                parsed_comment['comment_id'] = comment_id
                # An author missing from the list now gives None instead of
                # raising IndexError.
                parsed_comment['username'] = author_names.get(c['authorID'])
                parsed_comment['user_id'] = c['authorID']
                parsed_comment['comment'] = text
                parsed_comment['comment_original'] = None
                parsed_comment['links'] = get_links(text)
                parsed_comment['upvotes'] = c['likeCount']
                parsed_comment['downvotes'] = None
                parsed_comment['published_date'] = parse_datetime(
                    c['timestamp']['text'])
                # Replies are only counted for comments with 'public_replies'.
                parsed_comment['reply_count'] = (
                    reply_counts[comment_id] if 'public_replies' in c else 0)
                parsed_comment['reply_to'] = (
                    reply_dic[comment_id] if comment_id in reply_dic else None)
                yield parsed_comment

        # Stats
        stat = Stats()
        stat['domain'] = self.domain
        stat['url'] = response.url
        stat['views'] = None
        stat['likes'] = None
        stat['comments'] = 0 if comments is None else len(comments)
        yield stat
Ejemplo n.º 8
0
    def parse_blog(self, response):
        """Parse an article page and yield its post, comment and stats items.

        Article metadata is read from the JSON text of the ``<script>`` tag
        whose content mentions ``identity``. A ``Posts`` item is yielded only
        when that metadata is a non-empty JSON object. Comments are requested
        through ``get_torontosun_comments`` and a ``Stats`` item is always
        yielded last.

        Args:
            response: The response for the article page.

        Yields:
            At most one ``Posts`` item, one ``Comments`` item per comment
            with non-empty content, then one ``Stats`` item.
        """
        # Posts
        script = response.xpath(
            "//script[contains(., 'identity')]/text()").extract_first()
        try:
            data = json.loads(script)
        except (TypeError, ValueError) as e:
            # TypeError: no matching script tag was found (script is None).
            # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
            print(f"{e}\n{response.url}")
            data = {}
        if not isinstance(data, dict):
            # Valid JSON that is not an object (null, a list, a number, ...)
            # cannot be queried by key below, so treat it as missing metadata.
            data = {}

        has_page = 'page' in data
        if data:
            blog = Posts()
            blog['domain'] = get_domain(response.url)
            blog['url'] = response.url
            blog['title'] = response.css(
                '.article-title::text').extract_first()
            author, date = parse_authors_date(data)
            # NOTE(review): this removes every space inside the author
            # string, not only the surrounding ones -- confirm it is intended.
            blog['author'] = author.replace(" ",
                                            "").strip() if author else None
            blog['published_date'] = date if date else None
            blog['tags'] = tags_to_json(
                data['page']['tags']) if has_page else None
            blog['content'] = get_content(response)
            # Extract the article markup once and reuse it for the links.
            content_html = " ".join(
                response.xpath('//*[@class="article-content"]').extract())
            blog['content_html'] = content_html
            blog['links'] = get_links(content_html)
            yield blog

        # Comments
        article_id = data['page']['articleId'] if has_page else None
        comments = get_torontosun_comments(article_id)
        for c in comments or ():  #Catches no comments
            content = c.get('content')
            if not content:  #Skipping empty comments
                continue
            parsed_comment = Comments()
            parsed_comment['domain'] = self.domain
            parsed_comment['url'] = response.url
            parsed_comment['comment_id'] = c['content_uuid']
            parsed_comment['username'] = None  #Could not find API to get this
            parsed_comment['user_id'] = c['actor_uuid']
            parsed_comment['comment'] = content
            parsed_comment['comment_original'] = None
            parsed_comment['links'] = get_links(content)
            parsed_comment['upvotes'] = c['total_likes']
            parsed_comment['downvotes'] = None if False else c['total_dislikes']
            # date_created is divided by 1000 before time.gmtime, i.e. it is
            # handled as a millisecond epoch timestamp.
            parsed_comment['published_date'] = parse_datetime(
                time.strftime('%m/%d/%Y %H:%M:%S',
                              time.gmtime(c['date_created'] / 1000.)))
            # Never report a negative reply count.
            parsed_comment['reply_count'] = max(c['total_replies'], 0)
            container = c['content_container_uuid']
            is_reply = (container != c['thread_uuid']
                        and container != c['parent_uuid'])
            parsed_comment['reply_to'] = c['thread_uuid'] if is_reply else None
            yield parsed_comment

        #Stats
        stat = Stats()
        stat['domain'] = self.domain
        stat['url'] = response.url
        stat['views'] = None
        stat['likes'] = None
        # Counts every returned comment, including the empty ones skipped
        # above; any falsy result (None, empty list) counts as zero.
        stat['comments'] = len(comments) if comments else 0
        yield stat
Ejemplo n.º 9
0
    def parse_blog(self, response):
        """Yield the post, its stats and its comments for one blog page.

        The post id is the second ``-``-separated piece of the ``<article>``
        element's ``id`` attribute; it is handed to ``get_stats_data`` to
        obtain tags and like/comment counts. Comment ids are taken the same
        way from every ``<li>`` ``id`` attribute and resolved in one batch
        API request.

        Args:
            response: The response for the blog post page.

        Yields:
            One ``Posts`` item, one ``Stats`` item, then one ``Comments``
            item per entry in the batch API result.
        """
        # NOTE(review): presumably the WordPress.com site id -- confirm.
        site_id = 70000375

        # Posts
        article_dom_id = response.css('article').xpath("@id").extract_first()
        post_id = article_dom_id.strip().replace(' ', '').split('-')[1]
        data = get_stats_data(post_id)

        post = Posts()
        post['domain'] = self.domain
        post['url'] = response.url
        post['title'] = response.css("h1.entry-title::text").extract_first()
        post['author'] = parse_author(data, response)
        date_text = response.css(
            "span.entry-meta-date.updated a::text").extract_first()
        post['published_date'] = parse(date_text)
        text_nodes = response.xpath(
            "//div[contains(@class, 'entry-content')]//text()").extract()
        flat_text = "".join(text_nodes).strip()
        post['content'] = flat_text.replace('\n', ' ').replace('\t', ' ')
        # The same markup feeds both the stored HTML and the link extraction.
        entry_html = "".join(
            response.xpath(
                "//div[contains(@class, 'entry-content')]").extract())
        post['content_html'] = entry_html
        post['links'] = get_links(entry_html)
        if data['tags']:
            post['tags'] = tags_to_json(list(data['tags'].keys()))
        else:
            post['tags'] = None
        yield post

        #Stats
        stats_item = Stats()
        stats_item['domain'] = self.domain
        stats_item['url'] = response.url
        stats_item['views'] = None
        stats_item['likes'] = data['like_count']
        stats_item['comments'] = data['comment_count']
        yield stats_item

        #Comments
        comment_ids = []
        for dom_id in response.css("li").xpath("@id").extract():
            comment_ids.append(dom_id.strip().replace(' ', '').split('-')[1])
        batch_request = build_batch(comment_ids, site_id)
        res = make_api_request(batch_request)
        if not res:
            return

        for endpoint in res:
            entry = res[endpoint]
            #-Parsing comments
            item = Comments()
            item['domain'] = self.domain
            item['url'] = response.url
            item['comment_id'] = f"bc_comment_{entry['ID']}"
            raw_name = entry['author']['name']
            usr_name = raw_name.lower() if raw_name else None
            item['username'] = usr_name
            item['user_id'] = get_user_id(self.comment_users, usr_name)
            item['comment'] = entry['raw_content']
            item['comment_original'] = entry['content']
            item['links'] = get_links(entry['content'])
            meta_links = entry['meta']['links']
            item['upvotes'] = parse_comments(meta_links['likes'])
            item['downvotes'] = None
            item['published_date'] = entry['date']
            #-Getting replies
            reply_info = get_wordpress_replies(res, entry['ID'], site_id)
            item['reply_count'] = reply_info[0]
            item['reply_to'] = reply_info[1]
            yield item
Ejemplo n.º 10
0
    def parse_article(self, response):
        """Parse a news article page and yield post, comment and stats items.

        The post fields are scraped from the page with XPath. Comments,
        their authors and the reply mapping come from ``facebook_comments``,
        called with the URL produced by ``format_comment_url``.

        Args:
            response: The response for the article page.

        Yields:
            One ``Posts`` item, one ``Comments`` item per returned comment,
            then one ``Stats`` item carrying the comment count.
        """
        blog = Posts()
        blog['domain'] = self.domain
        blog['url'] = response.url
        blog['title'] = response.xpath(
            '//*[@class="news-article-header__title"]/text()').get()
        blog['author'] = response.xpath(
            '//*[@class="news-byline-full__info-wrapper"]/span/text()').get()
        blog['published_date'] = get_date(
            response.xpath(
                '//*[@class="news-article-header__timestamps-posted"]/text()').
            get())
        blog['content'] = " ".join(
            response.xpath(
                '//*[@id="js-post-container"]/div/div[1]/div[1]/div/p/text()').
            getall()).strip()
        blog['content_html'] = " ".join(
            response.xpath(
                '//*[@id="js-post-container"]/div/div[1]/div[1]/div').getall())
        blog['links'] = get_links(blog['content_html'])
        blog['tags'] = tags_to_json(parse_tags(response))
        yield blog

        #Comments requests
        article_url = format_comment_url(response.url)
        # NOTE(review): the second argument is presumably the site's Facebook
        # app id -- confirm against facebook_comments.
        comment_data = facebook_comments(article_url, '162111247988300')
        comments = comment_data['comments']
        authors = comment_data['authors']
        reply_dic = comment_data['reply_dic']
        if comments:  #Catches no comments
            # Index author names by id once instead of rescanning the author
            # list per comment; setdefault keeps the first entry for an id,
            # matching the previous "first match wins" lookup.
            author_names = {}
            for author in authors or ():
                author_names.setdefault(author['id'], author.get('name'))

            # Count replies per target comment in a single pass.
            reply_totals = {}
            for candidate in comments:
                if 'targetID' in candidate:
                    target = candidate['targetID']
                    reply_totals[target] = reply_totals.get(target, 0) + 1

            for c in comments:
                parsed_comment = Comments()
                parsed_comment['domain'] = self.domain
                parsed_comment['url'] = response.url
                parsed_comment['comment_id'] = c['id']
                # None when the author is absent from the authors list,
                # rather than an IndexError that would abort the remaining
                # comments and the stats item.
                parsed_comment['username'] = author_names.get(c['authorID'])
                parsed_comment['user_id'] = c['authorID']
                parsed_comment['comment'] = c['body']['text']
                parsed_comment['comment_original'] = None
                parsed_comment['links'] = get_links(c['body']['text'])
                parsed_comment['upvotes'] = c['likeCount']
                parsed_comment['downvotes'] = None
                parsed_comment['published_date'] = parse_datetime(
                    c['timestamp']['text'])
                # Replies are only counted for comments that carry a
                # 'public_replies' entry, as before.
                if 'public_replies' in c:
                    parsed_comment['reply_count'] = reply_totals.get(
                        c['id'], 0)
                else:
                    parsed_comment['reply_count'] = 0
                parsed_comment['reply_to'] = reply_dic.get(c['id'])
                yield parsed_comment

        #Stats
        stat = Stats()
        stat['domain'] = self.domain
        stat['url'] = response.url
        stat['views'] = None
        stat['likes'] = None
        # Any falsy result (None, empty list) counts as zero comments.
        stat['comments'] = len(comments) if comments else 0
        yield stat