def parse_blog(self, response):
    """Parse a blog post page into Posts, Stats and Comments items.

    Yields a Posts item (only when a title was found), one Stats item,
    and one Comments item per comment with a non-empty body.
    """
    url = unquote(response.url)
    blog = Posts()
    try:
        blog['domain'] = self.domain
        blog['url'] = url
        blog['title'] = response.xpath(
            '//h1[contains(@class, "css-1ln1egd")]/text()').get()
        blog['author'] = get_author(response)
        blog['published_date'] = parse(
            response.xpath(
                '//*[contains(@class, "css-1fboxhy")]/a/text()').get())
        blog['content'] = " ".join(
            response.xpath(
                '//*[contains(@class, "css-118w07p")]/p//text()').getall()
        ).strip().replace('\n', '')
        blog['content_html'] = response.xpath(
            '//*[contains(@class, "css-118w07p")]').get()
        blog['links'] = get_links(blog['content_html'])
        blog['tags'] = None
        if blog['title']:
            yield blog
    except Exception as e:
        # BUG FIX: the original printed the placeholder ":here", hiding the
        # actual parsing failure. Surface the failing URL and the exception.
        print("parse_blog failed for {}: {}".format(url, e))

    #Stats
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = url
    stat['views'] = None
    stat['likes'] = None
    comments = get_comments(response.url)
    stat['comments'] = comments['total'] if comments else None
    yield stat

    #Comments
    # BUG FIX: the original iterated comments['comments'] unconditionally,
    # raising TypeError whenever get_comments() returned a falsy value.
    if not comments:
        return
    for comment in comments['comments']:
        if comment['body'] is not None:  #sometimes there are empty reply comments
            parsed_comment = Comments()
            parsed_comment['domain'] = self.domain
            parsed_comment['url'] = url
            parsed_comment['comment_id'] = comment['id']
            parsed_comment['username'] = comment['user']['username']
            parsed_comment['user_id'] = comment['user']['id']
            parsed_comment['comment'] = comment['body']
            # Prefer the rich-text body; fall back to the plain body.
            original_comment = (comment['richTextBody']
                                if comment['richTextBody'] is not None
                                else comment['body'])
            parsed_comment['comment_original'] = original_comment
            parsed_comment['links'] = get_links(original_comment)
            parsed_comment['upvotes'] = (
                comment['action_summaries'][0]['count']
                if comment['action_summaries'] else None)
            parsed_comment['downvotes'] = None
            parsed_comment['published_date'] = parse(comment['created_at'])
            parsed_comment['reply_count'] = comment['replyCount']
            parsed_comment['reply_to'] = (
                comments['replies'][parsed_comment['comment_id']]
                if parsed_comment['comment_id'] in comments['replies']
                else None)
            yield parsed_comment
def parse_blog(self, response):
    """Parse a blog post page (skipping dead links) into Posts, Stats and
    Comments items."""
    #There are dead links on the pages that can't be processed
    if "The page you are looking for cannot be found or is no longer available." not in response.text:
        blog = Posts()
        blog['domain'] = self.domain
        blog['url'] = response.url
        blog['title'] = response.xpath(
            '//*[@id="Headline"]/h1/text()').get()
        blog['author'] = response.xpath(
            '//*[@id="Headline"]/p/span/a/text()').get()
        blog['published_date'] = parse(
            response.xpath(
                '//*[@id="Headline"]/p/span/text()').get().replace(' by: ', ''))
        blog['content'] = "".join(
            response.xpath('//*[@id="Col2"]/article/div[2]//text()').getall()
        ).strip().replace('\n', ' ')
        blog['content_html'] = response.xpath(
            '//*[@id="Col2"]/article/div[2]').get()
        blog['links'] = get_links(blog['content_html'])
        blog['tags'] = tags_to_json(
            response.xpath('//*[@id="Headline"]/p/span/i/a/text()').getall())
        yield blog

        #Stats
        stat = Stats()
        stat['domain'] = self.domain
        stat['url'] = response.url
        stat['views'] = get_views(response.url)
        stat['likes'] = None
        comment_data = get_comments(response.url)
        stat['comments'] = comment_data['total']
        yield stat

        # PERF FIX: precompute how many comments point at each parent once.
        # The original re-scanned the whole comment list for every comment
        # (O(n^2) for n comments).
        reply_counts = {}
        for x in comment_data['comments']:
            key = str(x['parent'])
            reply_counts[key] = reply_counts.get(key, 0) + 1

        for comment in comment_data['comments']:
            #Comments
            parsed_comment = Comments()
            parsed_comment['domain'] = self.domain
            parsed_comment['url'] = response.url
            parsed_comment['comment_id'] = comment['id']
            author = comment['author']
            # Some comment authors only expose a display name, no username.
            parsed_comment['username'] = (author['username']
                                          if 'username' in author
                                          else author['name'])
            parsed_comment['user_id'] = author['id'] if 'id' in author else None
            parsed_comment['comment'] = comment['raw_message']
            parsed_comment['comment_original'] = comment['message']
            parsed_comment['links'] = get_links(comment['message'])
            parsed_comment['upvotes'] = comment['likes']
            parsed_comment['downvotes'] = comment['dislikes']
            parsed_comment['published_date'] = parse(comment['createdAt'])
            parsed_comment['reply_count'] = reply_counts.get(str(comment['id']), 0)
            parsed_comment['reply_to'] = comment['parent']
            yield parsed_comment
def parse_blog(self, response):
    """Parse a zerohedge.com article page.

    Yields a Posts item and a Stats item, then POSTs the Coral Talk
    embed-stream GraphQL query to fetch the article's comments
    (handled by self.process_comments).
    """
    #HTML Content
    # NOTE(review): blog_id is the shortlink href and is also used below as
    # the JSON key into the stats endpoints' responses -- confirm those
    # endpoints really key their payloads by this value.
    blog_id = response.xpath('/html/head/link[@rel="shortlink"]/@href').get()
    blog = Posts()
    blog['domain'] = self.domain
    blog['url'] = response.url
    blog['title'] = response.xpath('//*[@id="block-zerohedge-page-title"]/h1/span/text()').get()
    blog['author'] = response.xpath('//*[@id="block-zerohedge-content"]/article/footer/div[1]/div/div[1]/span/a/text()').get()
    blog['published_date'] = convert_date(response.xpath('//*[@id="block-zerohedge-content"]/article/footer/div[1]/div/div[2]/span/text()').get())
    blog['content'] = " ".join(response.xpath('//*[@id="block-zerohedge-content"]/article/div/div[1]/p//text()').getall())
    blog['content_html'] = "".join(response.xpath('//*[@id="block-zerohedge-content"]/article/div/div[1]/p').getall())
    blog['links'] = get_links(blog['content_html'])
    blog['tags'] = None
    yield blog
    #Stats requests
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    # Synchronous (blocking) HTTP calls for view and comment counts.
    stat['views'] = requests.get('https://www.zerohedge.com/statistics-ajax?entity_ids={}'.format(blog_id)).json()[blog_id]
    stat['likes'] = None
    stat['comments'] = requests.get('https://www.zerohedge.com/coral-talk-comment-counts?nids={}'.format(blog_id)).json()[blog_id]
    yield stat
    #Comments requests
    # The GraphQL query below was captured verbatim from the site's Coral
    # Talk comment widget; only the "variables" values are set by us.
    payload = {"query":"query CoralEmbedStream_Embed($assetId: ID, $assetUrl: String, $commentId: ID!, $hasComment: Boolean!, $excludeIgnored: Boolean, $sortBy: SORT_COMMENTS_BY!, $sortOrder: SORT_ORDER!) {\n me {\n id\n state {\n status {\n username {\n status\n __typename\n }\n banned {\n status\n __typename\n }\n suspension {\n until\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n asset(id: $assetId, url: $assetUrl) {\n ...CoralEmbedStream_Configure_asset\n ...CoralEmbedStream_Stream_asset\n ...CoralEmbedStream_AutomaticAssetClosure_asset\n __typename\n }\n ...CoralEmbedStream_Stream_root\n ...CoralEmbedStream_Configure_root\n}\n\nfragment CoralEmbedStream_Stream_root on RootQuery {\n me {\n state {\n status {\n username {\n status\n __typename\n }\n banned {\n status\n __typename\n }\n suspension {\n until\n __typename\n }\n __typename\n }\n __typename\n }\n ignoredUsers {\n id\n __typename\n }\n role\n __typename\n }\n settings {\n organizationName\n __typename\n }\n ...TalkSlot_StreamFilter_root\n ...CoralEmbedStream_Comment_root\n __typename\n}\n\nfragment CoralEmbedStream_Comment_root on RootQuery {\n me {\n ignoredUsers {\n id\n __typename\n }\n __typename\n }\n ...TalkSlot_CommentInfoBar_root\n ...TalkSlot_CommentAuthorName_root\n ...TalkEmbedStream_DraftArea_root\n ...TalkEmbedStream_DraftArea_root\n __typename\n}\n\nfragment TalkEmbedStream_DraftArea_root on RootQuery {\n __typename\n}\n\nfragment CoralEmbedStream_Stream_asset on Asset {\n comment(id: $commentId) @include(if: $hasComment) {\n ...CoralEmbedStream_Stream_comment\n parent {\n ...CoralEmbedStream_Stream_singleComment\n parent {\n ...CoralEmbedStream_Stream_singleComment\n parent {\n ...CoralEmbedStream_Stream_singleComment\n parent {\n ...CoralEmbedStream_Stream_singleComment\n parent {\n ...CoralEmbedStream_Stream_singleComment\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n id\n title\n url\n isClosed\n created_at\n settings {\n moderation\n infoBoxEnable\n infoBoxContent\n premodLinksEnable\n questionBoxEnable\n questionBoxContent\n questionBoxIcon\n closedTimeout\n closedMessage\n disableCommenting\n disableCommentingMessage\n charCountEnable\n charCount\n requireEmailConfirmation\n __typename\n }\n totalCommentCount @skip(if: $hasComment)\n comments(query: {limit: 50000, excludeIgnored: $excludeIgnored, sortOrder: $sortOrder, sortBy: $sortBy}) @skip(if: $hasComment) {\n nodes {\n ...CoralEmbedStream_Stream_comment\n __typename\n }\n hasNextPage\n startCursor\n endCursor\n __typename\n }\n ...TalkSlot_StreamFilter_asset\n ...CoralEmbedStream_Comment_asset\n __typename\n}\n\nfragment CoralEmbedStream_Comment_asset on Asset {\n __typename\n id\n ...TalkSlot_CommentInfoBar_asset\n ...TalkSlot_CommentReactions_asset\n ...TalkSlot_CommentAuthorName_asset\n}\n\nfragment CoralEmbedStream_Stream_comment on Comment {\n id\n status\n user {\n id\n __typename\n }\n ...CoralEmbedStream_Comment_comment\n __typename\n}\n\nfragment CoralEmbedStream_Comment_comment on Comment {\n ...CoralEmbedStream_Comment_SingleComment\n replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n nodes {\n ...CoralEmbedStream_Comment_SingleComment\n replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n nodes {\n ...CoralEmbedStream_Comment_SingleComment\n replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n nodes {\n ...CoralEmbedStream_Comment_SingleComment\n replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n nodes {\n ...CoralEmbedStream_Comment_SingleComment\n replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n nodes {\n ...CoralEmbedStream_Comment_SingleComment\n __typename\n }\n hasNextPage\n startCursor\n endCursor\n __typename\n }\n __typename\n }\n hasNextPage\n startCursor\n endCursor\n __typename\n }\n __typename\n }\n hasNextPage\n startCursor\n endCursor\n __typename\n }\n __typename\n }\n hasNextPage\n startCursor\n endCursor\n __typename\n }\n __typename\n }\n hasNextPage\n startCursor\n endCursor\n __typename\n }\n __typename\n}\n\nfragment CoralEmbedStream_Comment_SingleComment on Comment {\n id\n body\n created_at\n status\n replyCount\n tags {\n tag {\n name\n __typename\n }\n __typename\n }\n user {\n id\n username\n __typename\n }\n status_history {\n type\n __typename\n }\n action_summaries {\n __typename\n count\n current_user {\n id\n __typename\n }\n }\n editing {\n edited\n editableUntil\n __typename\n }\n ...TalkSlot_CommentInfoBar_comment\n ...TalkSlot_CommentReactions_comment\n ...TalkSlot_CommentAvatar_comment\n ...TalkSlot_CommentAuthorName_comment\n ...TalkSlot_CommentContent_comment\n ...TalkEmbedStream_DraftArea_comment\n ...TalkEmbedStream_DraftArea_comment\n __typename\n}\n\nfragment TalkEmbedStream_DraftArea_comment on Comment {\n __typename\n ...TalkSlot_DraftArea_comment\n}\n\nfragment CoralEmbedStream_Stream_singleComment on Comment {\n id\n status\n user {\n id\n __typename\n }\n ...CoralEmbedStream_Comment_SingleComment\n __typename\n}\n\nfragment CoralEmbedStream_Configure_root on RootQuery {\n __typename\n ...CoralEmbedStream_Settings_root\n}\n\nfragment CoralEmbedStream_Settings_root on RootQuery {\n __typename\n}\n\nfragment CoralEmbedStream_Configure_asset on Asset {\n __typename\n ...CoralEmbedStream_AssetStatusInfo_asset\n ...CoralEmbedStream_Settings_asset\n}\n\nfragment CoralEmbedStream_AssetStatusInfo_asset on Asset {\n id\n closedAt\n isClosed\n __typename\n}\n\nfragment CoralEmbedStream_Settings_asset on Asset {\n id\n settings {\n moderation\n premodLinksEnable\n questionBoxEnable\n questionBoxIcon\n questionBoxContent\n __typename\n }\n __typename\n}\n\nfragment CoralEmbedStream_AutomaticAssetClosure_asset on Asset {\n id\n closedAt\n __typename\n}\n\nfragment TalkSlot_StreamFilter_root on RootQuery {\n ...TalkViewingOptions_ViewingOptions_root\n __typename\n}\n\nfragment TalkViewingOptions_ViewingOptions_root on RootQuery {\n __typename\n}\n\nfragment TalkSlot_CommentInfoBar_root on RootQuery {\n ...TalkModerationActions_root\n __typename\n}\n\nfragment TalkModerationActions_root on RootQuery {\n me {\n id\n __typename\n }\n __typename\n}\n\nfragment TalkSlot_CommentAuthorName_root on RootQuery {\n ...TalkAuthorMenu_AuthorName_root\n __typename\n}\n\nfragment TalkAuthorMenu_AuthorName_root on RootQuery {\n __typename\n ...TalkSlot_AuthorMenuActions_root\n}\n\nfragment TalkSlot_StreamFilter_asset on Asset {\n ...TalkViewingOptions_ViewingOptions_asset\n __typename\n}\n\nfragment TalkViewingOptions_ViewingOptions_asset on Asset {\n __typename\n}\n\nfragment TalkSlot_CommentInfoBar_asset on Asset {\n ...TalkModerationActions_asset\n ...TalkPermalink_Button_asset\n __typename\n}\n\nfragment TalkModerationActions_asset on Asset {\n id\n __typename\n}\n\nfragment TalkPermalink_Button_asset on Asset {\n url\n __typename\n}\n\nfragment TalkSlot_CommentReactions_asset on Asset {\n ...VoteButton_asset\n __typename\n}\n\nfragment VoteButton_asset on Asset {\n id\n __typename\n}\n\nfragment TalkSlot_CommentAuthorName_asset on Asset {\n ...TalkAuthorMenu_AuthorName_asset\n __typename\n}\n\nfragment TalkAuthorMenu_AuthorName_asset on Asset {\n __typename\n}\n\nfragment TalkSlot_CommentInfoBar_comment on Comment {\n ...CollapseCommentButton_comment\n ...TalkModerationActions_comment\n ...TalkPermalink_Button_comment\n ...TalkInfoBar_moveReportButton_Comment\n ...TalkInfoBar_addEdiableClass_Comment\n __typename\n}\n\nfragment CollapseCommentButton_comment on Comment {\n id\n replyCount\n __typename\n}\n\nfragment TalkModerationActions_comment on Comment {\n id\n status\n user {\n id\n __typename\n }\n tags {\n tag {\n name\n __typename\n }\n __typename\n }\n __typename\n}\n\nfragment TalkPermalink_Button_comment on Comment {\n id\n __typename\n}\n\nfragment TalkInfoBar_moveReportButton_Comment on Comment {\n id\n __typename\n}\n\nfragment TalkInfoBar_addEdiableClass_Comment on Comment {\n id\n editing {\n __typename\n editableUntil\n }\n __typename\n}\n\nfragment TalkSlot_CommentReactions_comment on Comment {\n ...TalkDisableDeepReplies_disableDeepReplies_Comment\n ...VoteButton_comment\n __typename\n}\n\nfragment TalkDisableDeepReplies_disableDeepReplies_Comment on Comment {\n id\n __typename\n}\n\nfragment VoteButton_comment on Comment {\n id\n action_summaries {\n __typename\n ... on UpvoteActionSummary {\n count\n current_user {\n id\n __typename\n }\n __typename\n }\n ... on DownvoteActionSummary {\n count\n current_user {\n id\n __typename\n }\n __typename\n }\n }\n __typename\n}\n\nfragment TalkSlot_CommentAvatar_comment on Comment {\n ...UserAvatar_comment\n __typename\n}\n\nfragment UserAvatar_comment on Comment {\n user {\n avatar\n __typename\n }\n __typename\n}\n\nfragment TalkSlot_CommentAuthorName_comment on Comment {\n ...TalkAuthorMenu_AuthorName_comment\n __typename\n}\n\nfragment TalkAuthorMenu_AuthorName_comment on Comment {\n __typename\n id\n user {\n username\n __typename\n }\n ...TalkSlot_AuthorMenuActions_comment\n}\n\nfragment TalkSlot_CommentContent_comment on Comment {\n ...TalkPluginRichText_CommentContent_comment\n __typename\n}\n\nfragment TalkPluginRichText_CommentContent_comment on Comment {\n body\n richTextBody\n __typename\n}\n\nfragment TalkSlot_DraftArea_comment on Comment {\n ...TalkPluginRichText_Editor_comment\n __typename\n}\n\nfragment TalkPluginRichText_Editor_comment on Comment {\n body\n richTextBody\n __typename\n}\n\nfragment TalkSlot_AuthorMenuActions_root on RootQuery {\n ...TalkIgnoreUser_IgnoreUserAction_root\n __typename\n}\n\nfragment TalkIgnoreUser_IgnoreUserAction_root on RootQuery {\n me {\n id\n __typename\n }\n __typename\n}\n\nfragment TalkSlot_AuthorMenuActions_comment on Comment {\n ...TalkIgnoreUser_IgnoreUserAction_comment\n ...TalkDrupalUserId_DrupalProfile_comment\n __typename\n}\n\nfragment TalkIgnoreUser_IgnoreUserAction_comment on Comment {\n user {\n id\n __typename\n }\n ...TalkIgnoreUser_IgnoreUserConfirmation_comment\n __typename\n}\n\nfragment TalkIgnoreUser_IgnoreUserConfirmation_comment on Comment {\n user {\n id\n username\n __typename\n }\n __typename\n}\n\nfragment TalkDrupalUserId_DrupalProfile_comment on Comment {\n user {\n id\n __typename\n }\n __typename\n}\n","variables":{"assetId":"","assetUrl":blog['url'],"commentId":"","hasComment":False,"excludeIgnored":False,"sortBy":"CREATED_AT","sortOrder":"DESC"},"operationName":"CoralEmbedStream_Embed"}
    yield scrapy.Request('https://talk.zerohedge.com/api/v1/graph/ql', method = 'POST', body=json.dumps(payload), headers={'Content-Type':'application/json'}, callback=self.process_comments)
def parse_blog(self, response): # Post blog = Posts() blog['domain'] = self.domain blog['url'] = response.url blog['title'] = response.css('.entry-title::text').extract_first() blog['author'] = response.xpath( '//*[contains(@id,"post-")]/div[1]/div/div/header/div/div/a//text()' ).extract_first() date = response.css('.updated::text').extract_first() blog['published_date'] = parse_datetime(date) if date else None #-Cleaning Post posts = "".join( response.xpath("//div[contains(@class, 'td-post-content')]//text()" ).extract()).strip().replace('\n', ' ').replace( '\t', ' ') posts = str(re.sub(' +', ' ', posts)) blog['content'] = posts.replace( 'Facebook Twitter reddit LinkedIn', '' ).replace( 'We’re asking readers, like you, to make a contribution in support of True North’s fact-based, independent journalism.\r Unlike the mainstream media, True North isn’t getting a government bailout. Instead, we depend on the generosity of Canadians like you.\r How can a media outlet be trusted to remain neutral and fair if they’re beneficiaries of a government handout? We don’t think they can.\r This is why independent media in Canada is more important than ever. If you’re able, please make a tax-deductible donation to True North today. Thank you so much.', '').replace('\r', ' ') blog['content_html'] = "".join( response.xpath( "//div[contains(@class, 'td-post-content')]").extract()) blog['links'] = get_links("".join( response.xpath( "//div[contains(@class, 'td-post-content')]").extract())) blog['tags'] = None yield blog
def parse_blog(self, response): blog = Posts() blog['domain'] = self.domain blog['url'] = response.url blog['title'] = response.xpath( "//*[contains(@class, 'BlogItem-title')]/text()").get() blog['author'] = response.xpath( "//*[contains(@class, 'Blog-meta-item Blog-meta-item--author')]/text()" ).get() blog['published_date'] = parse( response.xpath( "//*[contains(@class, 'Blog-meta-item Blog-meta-item--date')]/text()" ).get()) content = " ".join( response.xpath( '//*[contains(@class, "col sqs-col-12 span-12")]//p//text()'). getall() ).replace( 'More Articles Subscribe to receive exclusive news and content from The Canada Files! Enter your email address powered by TinyLetter', '' ).replace( 'Subscribe to receive exclusive news and content from The Canada Files! Enter your email address powered by TinyLetter', '') blog['content'] = ' '.join(content.split()) blog['content_html'] = response.xpath( '//*[contains(@class, "col sqs-col-12 span-12")]').get() blog['links'] = get_links( response.xpath( '//*[contains(@class, "col sqs-col-12 span-12")]').get()) blog['tags'] = None yield blog
def parse_blog(self, response): blog = Posts() blog['domain'] = self.domain blog['url'] = response.url blog['title'] = response.xpath( '//*[contains(@class, "post-title entry-title")]/a/text()').get() blog['author'] = None blog['published_date'] = None blog['content'] = "".join( response.xpath('//*[contains(@class, "entry-content")]//text()'). getall()).replace("\n", " ") blog['content_html'] = response.xpath( '//*[contains(@class, "entry-content")]').get() blog['links'] = get_links( response.xpath('//*[contains(@class, "entry-content")]').get()) yield blog
def process_comment(data, blog_url, parent_comment=None): parsed_comments = [] #Parser for response & replies for comment in data: c = Comments() c['domain'] = 'zerohedge.com' c['url'] = blog_url c['comment_id'] = comment['id'] if comment['user']: c['username'] = comment['user']['username'] c['user_id'] = comment['user']['id'] else: c['username'] = None c['user_id'] = None c['comment'] = comment['body'] c['comment_original'] = comment['richTextBody'] c['links'] = get_links(comment['richTextBody']) #Setting defulat values for votes c['upvotes'] = 0 c['downvotes'] = 0 #Seeing if there are any votes, overwritting defaults for action in comment['action_summaries']: if action['__typename'] == 'UpvoteActionSummary': c['upvotes'] = action['count'] if action['__typename'] == 'DownvoteActionSummary': c['downvotes'] = action['count'] c['published_date'] = convert_date(comment['created_at']) c['reply_count'] = comment['replyCount'] c['reply_to'] = parent_comment parsed_comments.append(c) #Managing replies try: replies = comment['replies']['nodes'] except: #Sometimes with no replies, there is no replies field replies = None if replies: parsed_comments += process_comment(replies, blog_url, parent_comment=comment['id']) #Finished return parsed_comments
def parse_blog(self, response):
    """Parse a Blogger-hosted post into Posts, Stats and Comments items."""
    blog = Posts()
    blog['domain'] = self.domain
    blog['url'] = response.url
    blog['title'] = response.xpath(
        '//*[@id="Blog1"]/div[1]/div/div/div/div[1]/h3/text()').get().strip()
    blog['author'] = response.xpath(
        '//*[@id="Blog1"]/div[1]/div/div/div/div[1]/div[3]/div[1]/span[1]/span/text()'
    ).get()
    blog['published_date'] = parse(
        response.xpath(
            '//*[@id="Blog1"]/div[1]/div/div/div/div[1]/div[3]/div[1]/span[2]/a/abbr/@title'
        ).get())
    blog['content'] = " ".join(
        response.xpath(
            "//div[contains(@class, 'post-body entry-content')]//text()"
        ).getall()).replace('\n', '')
    blog['content_html'] = response.xpath(
        "//div[contains(@class, 'post-body entry-content')]").get()
    blog['links'] = get_links(blog['content_html'])
    tags = response.xpath(
        '//*[contains(@id, "Blog1")]/div[1]/div/div/div/div[1]/div[3]/div[2]/span//text()'
    ).getall()
    # Drop the "Labels:" prefix and separator noise before serializing.
    blog['tags'] = tags_to_json(
        list(
            filter(lambda a: a != ',\n' and a != '\n' and 'Labels:' not in a,
                   tags)))
    yield blog

    #Stats
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    stat['views'] = None
    stat['likes'] = None
    comment_num = response.xpath('//div[@id="comments"]/h4/text()').get()
    # ROBUSTNESS: comment_num is None when the comments header is absent;
    # the original crashed on `"No" not in None`.
    if comment_num and "No" not in comment_num:
        stat['comments'] = int(re.search(r'\d+', comment_num).group())
    else:
        stat['comments'] = None
    yield stat

    #Comments
    if comment_num and "No" not in comment_num:
        # BUG FIX: the original used absolute '//li[...]' XPaths inside the
        # loop. On a scrapy sub-selector those search the whole document, so
        # every iteration re-read the FIRST comment's fields. Select the
        # comment <li> nodes directly and use paths relative to each node.
        for c in response.xpath('//*[@id="top-ra"]//li[contains(@class, "comment")]'):
            parsed_comment = Comments()
            parsed_comment['domain'] = self.domain
            parsed_comment['url'] = response.url
            parsed_comment['comment_id'] = c.xpath('./@id').get()
            username = c.xpath('./div[2]/div/cite/a/text()').get()
            if username is not None:
                # Linked Blogger profile: id is the tail of the profile URL.
                parsed_comment['username'] = username
                parsed_comment['user_id'] = c.xpath(
                    './div[2]/div/cite/a/@href').get().replace(
                        'https://www.blogger.com/profile/', "")
            else:
                # Anonymous / unlinked commenter.
                parsed_comment['username'] = c.xpath(
                    './div[2]/div/cite/text()').get()
                parsed_comment['user_id'] = None
            parsed_comment['comment'] = " ".join(
                c.xpath('./div[2]/p//text()').getall())
            parsed_comment['comment_original'] = c.xpath('./div[2]/p').get()
            parsed_comment['links'] = get_links(c.xpath('./div[2]/p').get())
            parsed_comment['upvotes'] = None
            parsed_comment['downvotes'] = None
            parsed_comment['published_date'] = None
            parsed_comment['reply_count'] = None
            parsed_comment['reply_to'] = None
            yield parsed_comment
def parse_article(self, response):
    """Parse a BuzzFeed News article into Posts, Comments and Stats items."""
    blog = Posts()
    blog['domain'] = self.domain
    blog['url'] = response.url
    blog['title'] = response.xpath(
        '//*[@id="js-post-container"]/div/div[1]/header/h1/text()').get()
    blog['author'] = response.xpath(
        '//*[@id="js-post-container"]/div/div[1]/header/div[2]/a/span/span[1]/text()'
    ).get()
    blog['published_date'] = get_date(
        response.xpath(
            '//*[@id="js-post-container"]/div/div[1]/header/div[3]/p/text()'
        ).get())
    blog['content'] = ' '.join(
        response.xpath(
            '//*[@id="js-post-container"]/div/div[1]/div[1]/div/p/text()'
        ).getall()).strip()
    blog['content_html'] = " ".join(
        response.xpath(
            '//*[@id="js-post-container"]/div/div[1]/div[1]/div').getall())
    blog['links'] = get_links(blog['content_html'])
    yield blog

    #Comments requests
    article_url = response.url.replace('https://www.buzzfeednews.com/article/',
                                       '')
    comments, authors = get_comments(article_url)
    reply_dic = get_reply_dic(comments)
    if comments:  #Catches no comments
        # PERF FIX: build lookup tables once instead of scanning the author
        # and comment lists for every comment (was O(n^2)).
        # setdefault keeps the FIRST entry per id, matching the original
        # `[... for x in authors ...][0]` behavior on duplicate ids.
        author_names = {}
        for x in authors:
            author_names.setdefault(x['id'], x['name'])
        reply_counts = {}
        for x in comments:
            if 'targetID' in x:
                reply_counts[x['targetID']] = reply_counts.get(x['targetID'], 0) + 1

        for c in comments:
            parsed_comment = Comments()
            parsed_comment['domain'] = self.domain
            parsed_comment['url'] = response.url
            parsed_comment['comment_id'] = c['id']
            parsed_comment['username'] = author_names[c['authorID']]
            parsed_comment['user_id'] = c['authorID']
            parsed_comment['comment'] = c['body']['text']
            parsed_comment['comment_original'] = c['body']['text']
            parsed_comment['links'] = get_links(c['body']['text'])
            parsed_comment['upvotes'] = c['likeCount']
            parsed_comment['downvotes'] = None
            parsed_comment['published_date'] = dateutil.parser.parse(
                c['timestamp']['text'])
            if 'public_replies' in c:
                parsed_comment['reply_count'] = reply_counts.get(c['id'], 0)
            else:
                parsed_comment['reply_count'] = 0
            if c['id'] in reply_dic:
                parsed_comment['reply_to'] = reply_dic[c['id']]
            else:
                parsed_comment['reply_to'] = None
            yield parsed_comment

    #Stats
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    stat['views'] = None
    stat['likes'] = None
    if comments is None:
        stat['comments'] = 0
    else:
        stat['comments'] = len(comments)
    yield stat
def parse_blog(self, response):
    """Parse an article into Posts, Comments (via Facebook) and Stats items."""
    # Posts
    blog = Posts()
    blog['domain'] = self.domain
    blog['url'] = response.url
    blog['title'] = parse_title(response)
    # NOTE(review): parse_author_date() also returns an author which is
    # immediately overwritten by parse_author() below -- confirm intentional
    # (only the date from the first call is actually used).
    date, author = parse_author_date(response)
    author = parse_author(response)
    blog['author'] = str(author)[0:99] if author else None
    blog['published_date'] = parse_datetime(date) if date else None
    content, content_html = parse_content(response)
    if content_html:
        blog['content'] = content
        blog['content_html'] = content_html
        blog['links'] = get_links(content_html)
    tags = response.xpath('//*[@id="article-tags"]/div//text()').extract()
    blog['tags'] = tags_to_json(
        list(filter(lambda x: '\n' not in x and '\t' not in x,
                    tags))) if tags else None
    yield blog

    # Comments (fetched through the site's Facebook comments widget)
    comment_data = facebook_comments(response.url, 318812448281278)
    comments = comment_data['comments']
    authors = comment_data['authors']
    reply_dic = comment_data['reply_dic']
    if comments:  #Catches no comments
        # PERF FIX: build lookup tables once instead of scanning the author
        # and comment lists per comment (was O(n^2)). setdefault keeps the
        # first entry per id, matching the original `[...][0]` behavior.
        author_names = {}
        for x in authors:
            author_names.setdefault(x['id'], x['name'])
        reply_counts = {}
        for x in comments:
            if 'targetID' in x:
                reply_counts[x['targetID']] = reply_counts.get(x['targetID'], 0) + 1

        for c in comments:
            parsed_comment = Comments()
            parsed_comment['domain'] = self.domain
            parsed_comment['url'] = response.url
            parsed_comment['comment_id'] = c['id']
            parsed_comment['username'] = author_names[c['authorID']]
            parsed_comment['user_id'] = c['authorID']
            parsed_comment['comment'] = c['body']['text']
            parsed_comment['comment_original'] = None
            parsed_comment['links'] = get_links(c['body']['text'])
            parsed_comment['upvotes'] = c['likeCount']
            parsed_comment['downvotes'] = None
            parsed_comment['published_date'] = parse_datetime(
                c['timestamp']['text'])
            if 'public_replies' in c:
                parsed_comment['reply_count'] = reply_counts.get(c['id'], 0)
            else:
                parsed_comment['reply_count'] = 0
            if c['id'] in reply_dic:
                parsed_comment['reply_to'] = reply_dic[c['id']]
            else:
                parsed_comment['reply_to'] = None
            yield parsed_comment

    # Stats
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    stat['views'] = None
    stat['likes'] = None
    if comments is None:
        stat['comments'] = 0
    else:
        stat['comments'] = len(comments)
    yield stat
def parse_blog(self, response):
    """Parse a Toronto Sun article (embedded JSON) into Posts, Comments and
    Stats items."""
    # Posts: article metadata lives in an inline <script> JSON blob.
    script = response.xpath(
        "//script[contains(., 'identity')]/text()").extract_first()
    try:
        data = json.loads(script)
    except Exception as e:
        print(str(e) + f"\n{str(response.url)}")
        data = {}
    if data:
        blog = Posts()
        blog['domain'] = get_domain(response.url)
        blog['url'] = response.url
        blog['title'] = response.css('.article-title::text').extract_first()
        # FIX: the original `parse_authors_date(data) if data else None` was
        # dead code inside `if data:` (and unpacking None would raise);
        # call the helper directly.
        author, date = parse_authors_date(data)
        blog['author'] = author.replace(" ", "").strip() if author else None
        blog['published_date'] = date if date else None
        blog['tags'] = tags_to_json(
            data['page']['tags']) if 'page' in data else None
        blog['content'] = get_content(response)
        blog['content_html'] = " ".join(
            response.xpath('//*[@class="article-content"]').extract())
        # Reuse the HTML already extracted above instead of re-querying.
        blog['links'] = get_links(blog['content_html'])
        yield blog
    # else: nothing to parse (the original left a stray debug print('here'))

    # Comments
    article_id = data['page']['articleId'] if 'page' in data else None
    comments = get_torontosun_comments(article_id)
    if comments:  #Catches no comments
        for c in comments:
            if 'content' in c and c['content']:  #Skipping empty comments
                parsed_comment = Comments()
                parsed_comment['domain'] = self.domain
                parsed_comment['url'] = response.url
                parsed_comment['comment_id'] = c['content_uuid']
                parsed_comment['username'] = None  #Could not find API to get this
                parsed_comment['user_id'] = c['actor_uuid']
                # 'content' is guaranteed present and truthy by the guard above.
                parsed_comment['comment'] = c['content']
                parsed_comment['comment_original'] = None
                parsed_comment['links'] = get_links(c['content'])
                parsed_comment['upvotes'] = c['total_likes']
                parsed_comment['downvotes'] = c['total_dislikes']
                # date_created is epoch milliseconds.
                parsed_comment['published_date'] = parse_datetime(
                    time.strftime('%m/%d/%Y %H:%M:%S',
                                  time.gmtime(c['date_created'] / 1000.)))
                if c['total_replies'] > 0:
                    parsed_comment['reply_count'] = c['total_replies']
                else:
                    parsed_comment['reply_count'] = 0
                if (c['content_container_uuid'] != c['thread_uuid']
                        and c['content_container_uuid'] != c['parent_uuid']):
                    parsed_comment['reply_to'] = c['thread_uuid']
                else:
                    parsed_comment['reply_to'] = None
                yield parsed_comment

    #Stats
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    stat['views'] = None
    stat['likes'] = None
    if comments is None:
        stat['comments'] = 0
    else:
        stat['comments'] = len(comments)
    yield stat
def parse_blog(self, response):
    """Parse a WordPress-hosted post into Posts, Stats and Comments items,
    fetching comment details through the WordPress.com batch API."""
    # WordPress.com numeric site id for this blog; the original repeated the
    # magic number 70000375 in two API calls.
    site_id = 70000375

    # Posts: the article element id encodes the post id as "post-<pid>".
    pid = response.css('article').xpath(
        "@id").extract_first().strip().replace(' ', '').split('-')[1]
    data = get_stats_data(pid)
    blog = Posts()
    blog['domain'] = self.domain
    blog['url'] = response.url
    blog['title'] = response.css("h1.entry-title::text").extract_first()
    blog['author'] = parse_author(data, response)
    blog['published_date'] = parse(
        response.css("span.entry-meta-date.updated a::text").extract_first())
    blog['content'] = "".join(
        response.xpath("//div[contains(@class, 'entry-content')]//text()"
                       ).extract()).strip().replace('\n', ' ').replace('\t', ' ')
    blog['content_html'] = "".join(
        response.xpath("//div[contains(@class, 'entry-content')]").extract())
    # Reuse the HTML already extracted above instead of re-querying the DOM.
    blog['links'] = get_links(blog['content_html'])
    blog['tags'] = tags_to_json(list(
        data['tags'].keys())) if data['tags'] else None
    yield blog

    #Stats
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    stat['views'] = None
    stat['likes'] = data['like_count']
    stat['comments'] = data['comment_count']
    yield stat

    #Comments: each <li> id encodes a comment id as "comment-<cid>".
    comment_pids = [
        comment.strip().replace(' ', '').split('-')[1]
        for comment in response.css("li").xpath("@id").extract()
    ]
    req = build_batch(comment_pids, site_id)
    res = make_api_request(req)
    if res:
        for res_url in res:
            res_values = res[res_url]
            #-Parsing comments
            parsed_comment = Comments()
            parsed_comment['domain'] = self.domain
            parsed_comment['url'] = response.url
            parsed_comment['comment_id'] = f"bc_comment_{res_values['ID']}"
            usr_name = res_values['author']['name'].lower(
            ) if res_values['author']['name'] else None
            parsed_comment['username'] = usr_name
            parsed_comment['user_id'] = get_user_id(self.comment_users,
                                                    usr_name)
            parsed_comment['comment'] = res_values['raw_content']
            parsed_comment['comment_original'] = res_values['content']
            parsed_comment['links'] = get_links(res_values['content'])
            stats_links = res_values['meta']['links']
            parsed_comment['upvotes'] = parse_comments(stats_links['likes'])
            parsed_comment['downvotes'] = None
            parsed_comment['published_date'] = res_values['date']
            #-Getting replies: returns (reply_count, reply_to).
            replies = get_wordpress_replies(res, res_values['ID'], site_id)
            parsed_comment['reply_count'] = replies[0]
            parsed_comment['reply_to'] = replies[1]
            yield parsed_comment
def parse_article(self, response):
    """Parse a news article into Posts, Comments (via Facebook) and Stats
    items."""
    blog = Posts()
    blog['domain'] = self.domain
    blog['url'] = response.url
    blog['title'] = response.xpath(
        '//*[@class="news-article-header__title"]/text()').get()
    blog['author'] = response.xpath(
        '//*[@class="news-byline-full__info-wrapper"]/span/text()').get()
    blog['published_date'] = get_date(
        response.xpath(
            '//*[@class="news-article-header__timestamps-posted"]/text()'
        ).get())
    blog['content'] = " ".join(
        response.xpath(
            '//*[@id="js-post-container"]/div/div[1]/div[1]/div/p/text()'
        ).getall()).strip()
    blog['content_html'] = " ".join(
        response.xpath(
            '//*[@id="js-post-container"]/div/div[1]/div[1]/div').getall())
    blog['links'] = get_links(blog['content_html'])
    blog['tags'] = tags_to_json(parse_tags(response))
    yield blog

    #Comments requests (fetched through the Facebook comments widget)
    article_url = format_comment_url(response.url)
    comment_data = facebook_comments(article_url, '162111247988300')
    comments = comment_data['comments']
    authors = comment_data['authors']
    reply_dic = comment_data['reply_dic']
    if comments:  #Catches no comments
        # PERF FIX: build lookup tables once instead of scanning the author
        # and comment lists per comment (was O(n^2)). setdefault keeps the
        # first entry per id, matching the original `[...][0]` behavior.
        author_names = {}
        for x in authors:
            author_names.setdefault(x['id'], x['name'])
        reply_counts = {}
        for x in comments:
            if 'targetID' in x:
                reply_counts[x['targetID']] = reply_counts.get(x['targetID'], 0) + 1

        for c in comments:
            parsed_comment = Comments()
            parsed_comment['domain'] = self.domain
            parsed_comment['url'] = response.url
            parsed_comment['comment_id'] = c['id']
            parsed_comment['username'] = author_names[c['authorID']]
            parsed_comment['user_id'] = c['authorID']
            parsed_comment['comment'] = c['body']['text']
            parsed_comment['comment_original'] = None
            parsed_comment['links'] = get_links(c['body']['text'])
            parsed_comment['upvotes'] = c['likeCount']
            parsed_comment['downvotes'] = None
            parsed_comment['published_date'] = parse_datetime(
                c['timestamp']['text'])
            if 'public_replies' in c:
                parsed_comment['reply_count'] = reply_counts.get(c['id'], 0)
            else:
                parsed_comment['reply_count'] = 0
            if c['id'] in reply_dic:
                parsed_comment['reply_to'] = reply_dic[c['id']]
            else:
                parsed_comment['reply_to'] = None
            yield parsed_comment

    #Stats
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    stat['views'] = None
    stat['likes'] = None
    if comments is None:
        stat['comments'] = 0
    else:
        stat['comments'] = len(comments)
    yield stat