def parse_post(self, response):
    """Scrape a post page into a ``Posts`` item followed by a ``Stats`` item.

    The like count is fetched from the public WordPress.com batch API using
    the post id found in the ``<body>`` class attribute and the blog id found
    in the subscribe form.

    Args:
        response: Scrapy response for the post page.

    Yields:
        The ``Posts`` item, then the ``Stats`` item. ``views`` and
        ``comments`` are always ``None``; ``likes`` is ``None`` when the post
        id or blog id cannot be found on the page.
    """
    import re  # local import: the file's import block is not visible here

    blog = Posts()
    blog['domain'] = get_domain(response.url)
    blog['url'] = response.url
    blog['title'] = response.xpath(
        '/html/body/div[1]/div[1]/div/h3/text()').get().strip()
    blog['author'] = get_author(blog['domain'])
    blog['published_date'] = dateutil.parser.parse(
        response.xpath('/html/body/div[1]/div[1]/h2/text()').get())
    blog['content'] = "".join(
        response.xpath(
            '/html/body/div[1]/div[1]/div/div[2]/p/text()').getall())
    blog['content_html'] = "".join(
        response.xpath('/html/body/div[1]/div[1]/div/div[2]/p').getall())
    # blog['language'] = get_language(blog['content'])
    blog['links'] = get_links(blog['content_html'])
    yield blog

    # Stats requests
    stat = Stats()
    stat['domain'] = get_domain(response.url)
    stat['url'] = response.url
    stat['views'] = None

    # Getting likes.
    # The body class carries a "postid-<digits>" token. Match the digits
    # explicitly instead of slicing a fixed five characters, which broke for
    # ids of any other length and produced garbage when the token was absent.
    body_class = response.xpath('/html/body/@class').get() or ''
    post_id_match = re.search(r'postid-(\d+)', body_class)
    blog_id = response.xpath(
        '//*[@id="subscribe-blog"]/p[4]/input[2]/@value').get()
    if post_id_match and blog_id:
        post_id = post_id_match.group(1)
        likes_response = requests.get(
            f"https://public-api.wordpress.com/rest/v1/batch?http_envelope=1&urls[]=/me&urls[]=/sites/{blog_id}/posts/{post_id}/likes&urls[]=/sites/{blog_id}/posts/{post_id}/reblogs/mine"
        ).json()
        stat['likes'] = likes_response['body'][
            f'/sites/{blog_id}/posts/{post_id}/likes']['found']
    else:
        # Without both ids there is no valid API path to query.
        stat['likes'] = None
    stat['comments'] = None
    yield stat

    # Comments (Looks like they're turned off)
    if response.xpath(
            '/html/body/div[1]/div[1]/div/div[3]/span/text()'
    ).get() != 'Comments Off':
        print("comments on??")
def parse_blog(self, response):
    """Scrape an article page: post item, stats item, then a comments request.

    Args:
        response: Scrapy response for the article page.

    Yields:
        A ``Posts`` item, a ``Stats`` item (views and comment count fetched
        with ``requests``), and finally a POST ``scrapy.Request`` carrying the
        Coral "CoralEmbedStream_Embed" GraphQL query, handled by
        ``self.process_comments``.
    """
    # HTML content
    # The shortlink href is used verbatim as the id for the stats endpoints.
    node_ref = response.xpath(
        '/html/head/link[@rel="shortlink"]/@href').get()

    post = Posts()
    post['domain'] = self.domain
    post['url'] = response.url
    post['title'] = response.xpath(
        '//*[@id="block-zerohedge-page-title"]/h1/span/text()').get()
    post['author'] = response.xpath(
        '//*[@id="block-zerohedge-content"]/article/footer/div[1]/div/div[1]/span/a/text()'
    ).get()
    raw_date = response.xpath(
        '//*[@id="block-zerohedge-content"]/article/footer/div[1]/div/div[2]/span/text()'
    ).get()
    post['published_date'] = convert_date(raw_date)
    paragraph_texts = response.xpath(
        '//*[@id="block-zerohedge-content"]/article/div/div[1]/p//text()'
    ).getall()
    post['content'] = " ".join(paragraph_texts)
    paragraph_nodes = response.xpath(
        '//*[@id="block-zerohedge-content"]/article/div/div[1]/p').getall()
    post['content_html'] = "".join(paragraph_nodes)
    post['links'] = get_links(post['content_html'])
    post['tags'] = None
    yield post

    # Stats requests
    stats = Stats()
    stats['domain'] = self.domain
    stats['url'] = response.url
    views_by_id = requests.get(
        'https://www.zerohedge.com/statistics-ajax?entity_ids={}'.format(
            node_ref)).json()
    stats['views'] = views_by_id[node_ref]
    stats['likes'] = None
    counts_by_id = requests.get(
        'https://www.zerohedge.com/coral-talk-comment-counts?nids={}'.format(
            node_ref)).json()
    stats['comments'] = counts_by_id[node_ref]
    yield stats

    # Comments requests
    # The query text is kept exactly as captured; it is only split at the
    # fragment boundaries (adjacent literals are concatenated by Python).
    graphql_query = (
        "query CoralEmbedStream_Embed($assetId: ID, $assetUrl: String, $commentId: ID!, $hasComment: Boolean!, $excludeIgnored: Boolean, $sortBy: SORT_COMMENTS_BY!, $sortOrder: SORT_ORDER!) {\n me {\n id\n state {\n status {\n username {\n status\n __typename\n }\n banned {\n status\n __typename\n }\n suspension {\n until\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n asset(id: $assetId, url: $assetUrl) {\n ...CoralEmbedStream_Configure_asset\n ...CoralEmbedStream_Stream_asset\n ...CoralEmbedStream_AutomaticAssetClosure_asset\n __typename\n }\n ...CoralEmbedStream_Stream_root\n ...CoralEmbedStream_Configure_root\n}\n\n"
        "fragment CoralEmbedStream_Stream_root on RootQuery {\n me {\n state {\n status {\n username {\n status\n __typename\n }\n banned {\n status\n __typename\n }\n suspension {\n until\n __typename\n }\n __typename\n }\n __typename\n }\n ignoredUsers {\n id\n __typename\n }\n role\n __typename\n }\n settings {\n organizationName\n __typename\n }\n ...TalkSlot_StreamFilter_root\n ...CoralEmbedStream_Comment_root\n __typename\n}\n\n"
        "fragment CoralEmbedStream_Comment_root on RootQuery {\n me {\n ignoredUsers {\n id\n __typename\n }\n __typename\n }\n ...TalkSlot_CommentInfoBar_root\n ...TalkSlot_CommentAuthorName_root\n ...TalkEmbedStream_DraftArea_root\n ...TalkEmbedStream_DraftArea_root\n __typename\n}\n\n"
        "fragment TalkEmbedStream_DraftArea_root on RootQuery {\n __typename\n}\n\n"
        "fragment CoralEmbedStream_Stream_asset on Asset {\n comment(id: $commentId) @include(if: $hasComment) {\n ...CoralEmbedStream_Stream_comment\n parent {\n ...CoralEmbedStream_Stream_singleComment\n parent {\n ...CoralEmbedStream_Stream_singleComment\n parent {\n ...CoralEmbedStream_Stream_singleComment\n parent {\n ...CoralEmbedStream_Stream_singleComment\n parent {\n ...CoralEmbedStream_Stream_singleComment\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n id\n title\n url\n isClosed\n created_at\n settings {\n moderation\n infoBoxEnable\n infoBoxContent\n premodLinksEnable\n questionBoxEnable\n questionBoxContent\n questionBoxIcon\n closedTimeout\n closedMessage\n disableCommenting\n disableCommentingMessage\n charCountEnable\n charCount\n requireEmailConfirmation\n __typename\n }\n totalCommentCount @skip(if: $hasComment)\n comments(query: {limit: 50000, excludeIgnored: $excludeIgnored, sortOrder: $sortOrder, sortBy: $sortBy}) @skip(if: $hasComment) {\n nodes {\n ...CoralEmbedStream_Stream_comment\n __typename\n }\n hasNextPage\n startCursor\n endCursor\n __typename\n }\n ...TalkSlot_StreamFilter_asset\n ...CoralEmbedStream_Comment_asset\n __typename\n}\n\n"
        "fragment CoralEmbedStream_Comment_asset on Asset {\n __typename\n id\n ...TalkSlot_CommentInfoBar_asset\n ...TalkSlot_CommentReactions_asset\n ...TalkSlot_CommentAuthorName_asset\n}\n\n"
        "fragment CoralEmbedStream_Stream_comment on Comment {\n id\n status\n user {\n id\n __typename\n }\n ...CoralEmbedStream_Comment_comment\n __typename\n}\n\n"
        "fragment CoralEmbedStream_Comment_comment on Comment {\n ...CoralEmbedStream_Comment_SingleComment\n replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n nodes {\n ...CoralEmbedStream_Comment_SingleComment\n replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n nodes {\n ...CoralEmbedStream_Comment_SingleComment\n replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n nodes {\n ...CoralEmbedStream_Comment_SingleComment\n replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n nodes {\n ...CoralEmbedStream_Comment_SingleComment\n replies(query: {limit : 50000, excludeIgnored: $excludeIgnored}) {\n nodes {\n ...CoralEmbedStream_Comment_SingleComment\n __typename\n }\n hasNextPage\n startCursor\n endCursor\n __typename\n }\n __typename\n }\n hasNextPage\n startCursor\n endCursor\n __typename\n }\n __typename\n }\n hasNextPage\n startCursor\n endCursor\n __typename\n }\n __typename\n }\n hasNextPage\n startCursor\n endCursor\n __typename\n }\n __typename\n }\n hasNextPage\n startCursor\n endCursor\n __typename\n }\n __typename\n}\n\n"
        "fragment CoralEmbedStream_Comment_SingleComment on Comment {\n id\n body\n created_at\n status\n replyCount\n tags {\n tag {\n name\n __typename\n }\n __typename\n }\n user {\n id\n username\n __typename\n }\n status_history {\n type\n __typename\n }\n action_summaries {\n __typename\n count\n current_user {\n id\n __typename\n }\n }\n editing {\n edited\n editableUntil\n __typename\n }\n ...TalkSlot_CommentInfoBar_comment\n ...TalkSlot_CommentReactions_comment\n ...TalkSlot_CommentAvatar_comment\n ...TalkSlot_CommentAuthorName_comment\n ...TalkSlot_CommentContent_comment\n ...TalkEmbedStream_DraftArea_comment\n ...TalkEmbedStream_DraftArea_comment\n __typename\n}\n\n"
        "fragment TalkEmbedStream_DraftArea_comment on Comment {\n __typename\n ...TalkSlot_DraftArea_comment\n}\n\n"
        "fragment CoralEmbedStream_Stream_singleComment on Comment {\n id\n status\n user {\n id\n __typename\n }\n ...CoralEmbedStream_Comment_SingleComment\n __typename\n}\n\n"
        "fragment CoralEmbedStream_Configure_root on RootQuery {\n __typename\n ...CoralEmbedStream_Settings_root\n}\n\n"
        "fragment CoralEmbedStream_Settings_root on RootQuery {\n __typename\n}\n\n"
        "fragment CoralEmbedStream_Configure_asset on Asset {\n __typename\n ...CoralEmbedStream_AssetStatusInfo_asset\n ...CoralEmbedStream_Settings_asset\n}\n\n"
        "fragment CoralEmbedStream_AssetStatusInfo_asset on Asset {\n id\n closedAt\n isClosed\n __typename\n}\n\n"
        "fragment CoralEmbedStream_Settings_asset on Asset {\n id\n settings {\n moderation\n premodLinksEnable\n questionBoxEnable\n questionBoxIcon\n questionBoxContent\n __typename\n }\n __typename\n}\n\n"
        "fragment CoralEmbedStream_AutomaticAssetClosure_asset on Asset {\n id\n closedAt\n __typename\n}\n\n"
        "fragment TalkSlot_StreamFilter_root on RootQuery {\n ...TalkViewingOptions_ViewingOptions_root\n __typename\n}\n\n"
        "fragment TalkViewingOptions_ViewingOptions_root on RootQuery {\n __typename\n}\n\n"
        "fragment TalkSlot_CommentInfoBar_root on RootQuery {\n ...TalkModerationActions_root\n __typename\n}\n\n"
        "fragment TalkModerationActions_root on RootQuery {\n me {\n id\n __typename\n }\n __typename\n}\n\n"
        "fragment TalkSlot_CommentAuthorName_root on RootQuery {\n ...TalkAuthorMenu_AuthorName_root\n __typename\n}\n\n"
        "fragment TalkAuthorMenu_AuthorName_root on RootQuery {\n __typename\n ...TalkSlot_AuthorMenuActions_root\n}\n\n"
        "fragment TalkSlot_StreamFilter_asset on Asset {\n ...TalkViewingOptions_ViewingOptions_asset\n __typename\n}\n\n"
        "fragment TalkViewingOptions_ViewingOptions_asset on Asset {\n __typename\n}\n\n"
        "fragment TalkSlot_CommentInfoBar_asset on Asset {\n ...TalkModerationActions_asset\n ...TalkPermalink_Button_asset\n __typename\n}\n\n"
        "fragment TalkModerationActions_asset on Asset {\n id\n __typename\n}\n\n"
        "fragment TalkPermalink_Button_asset on Asset {\n url\n __typename\n}\n\n"
        "fragment TalkSlot_CommentReactions_asset on Asset {\n ...VoteButton_asset\n __typename\n}\n\n"
        "fragment VoteButton_asset on Asset {\n id\n __typename\n}\n\n"
        "fragment TalkSlot_CommentAuthorName_asset on Asset {\n ...TalkAuthorMenu_AuthorName_asset\n __typename\n}\n\n"
        "fragment TalkAuthorMenu_AuthorName_asset on Asset {\n __typename\n}\n\n"
        "fragment TalkSlot_CommentInfoBar_comment on Comment {\n ...CollapseCommentButton_comment\n ...TalkModerationActions_comment\n ...TalkPermalink_Button_comment\n ...TalkInfoBar_moveReportButton_Comment\n ...TalkInfoBar_addEdiableClass_Comment\n __typename\n}\n\n"
        "fragment CollapseCommentButton_comment on Comment {\n id\n replyCount\n __typename\n}\n\n"
        "fragment TalkModerationActions_comment on Comment {\n id\n status\n user {\n id\n __typename\n }\n tags {\n tag {\n name\n __typename\n }\n __typename\n }\n __typename\n}\n\n"
        "fragment TalkPermalink_Button_comment on Comment {\n id\n __typename\n}\n\n"
        "fragment TalkInfoBar_moveReportButton_Comment on Comment {\n id\n __typename\n}\n\n"
        "fragment TalkInfoBar_addEdiableClass_Comment on Comment {\n id\n editing {\n __typename\n editableUntil\n }\n __typename\n}\n\n"
        "fragment TalkSlot_CommentReactions_comment on Comment {\n ...TalkDisableDeepReplies_disableDeepReplies_Comment\n ...VoteButton_comment\n __typename\n}\n\n"
        "fragment TalkDisableDeepReplies_disableDeepReplies_Comment on Comment {\n id\n __typename\n}\n\n"
        "fragment VoteButton_comment on Comment {\n id\n action_summaries {\n __typename\n ... on UpvoteActionSummary {\n count\n current_user {\n id\n __typename\n }\n __typename\n }\n ... on DownvoteActionSummary {\n count\n current_user {\n id\n __typename\n }\n __typename\n }\n }\n __typename\n}\n\n"
        "fragment TalkSlot_CommentAvatar_comment on Comment {\n ...UserAvatar_comment\n __typename\n}\n\n"
        "fragment UserAvatar_comment on Comment {\n user {\n avatar\n __typename\n }\n __typename\n}\n\n"
        "fragment TalkSlot_CommentAuthorName_comment on Comment {\n ...TalkAuthorMenu_AuthorName_comment\n __typename\n}\n\n"
        "fragment TalkAuthorMenu_AuthorName_comment on Comment {\n __typename\n id\n user {\n username\n __typename\n }\n ...TalkSlot_AuthorMenuActions_comment\n}\n\n"
        "fragment TalkSlot_CommentContent_comment on Comment {\n ...TalkPluginRichText_CommentContent_comment\n __typename\n}\n\n"
        "fragment TalkPluginRichText_CommentContent_comment on Comment {\n body\n richTextBody\n __typename\n}\n\n"
        "fragment TalkSlot_DraftArea_comment on Comment {\n ...TalkPluginRichText_Editor_comment\n __typename\n}\n\n"
        "fragment TalkPluginRichText_Editor_comment on Comment {\n body\n richTextBody\n __typename\n}\n\n"
        "fragment TalkSlot_AuthorMenuActions_root on RootQuery {\n ...TalkIgnoreUser_IgnoreUserAction_root\n __typename\n}\n\n"
        "fragment TalkIgnoreUser_IgnoreUserAction_root on RootQuery {\n me {\n id\n __typename\n }\n __typename\n}\n\n"
        "fragment TalkSlot_AuthorMenuActions_comment on Comment {\n ...TalkIgnoreUser_IgnoreUserAction_comment\n ...TalkDrupalUserId_DrupalProfile_comment\n __typename\n}\n\n"
        "fragment TalkIgnoreUser_IgnoreUserAction_comment on Comment {\n user {\n id\n __typename\n }\n ...TalkIgnoreUser_IgnoreUserConfirmation_comment\n __typename\n}\n\n"
        "fragment TalkIgnoreUser_IgnoreUserConfirmation_comment on Comment {\n user {\n id\n username\n __typename\n }\n __typename\n}\n\n"
        "fragment TalkDrupalUserId_DrupalProfile_comment on Comment {\n user {\n id\n __typename\n }\n __typename\n}\n"
    )
    query_variables = {
        "assetId": "",
        "assetUrl": post['url'],
        "commentId": "",
        "hasComment": False,
        "excludeIgnored": False,
        "sortBy": "CREATED_AT",
        "sortOrder": "DESC",
    }
    # Key order matches the original payload so the serialized body is the same.
    payload = {
        "query": graphql_query,
        "variables": query_variables,
        "operationName": "CoralEmbedStream_Embed",
    }
    yield scrapy.Request(
        'https://talk.zerohedge.com/api/v1/graph/ql',
        method='POST',
        body=json.dumps(payload),
        headers={'Content-Type': 'application/json'},
        callback=self.process_comments,
    )
def parse_blog(self, response):
    """Scrape an article page into post, stats and per-comment items.

    Pages whose body contains the site's "cannot be found" notice are dead
    links and produce nothing.

    Args:
        response: Scrapy response for the article page.

    Yields:
        A ``Posts`` item, a ``Stats`` item, then one ``Comments`` item per
        entry in ``get_comments(response.url)['comments']``.
    """
    from collections import Counter  # local: file import block not visible

    # There are dead links on the pages that can't be processed
    if ("The page you are looking for cannot be found or is no longer "
            "available.") in response.text:
        return

    blog = Posts()
    blog['domain'] = self.domain
    blog['url'] = response.url
    blog['title'] = response.xpath('//*[@id="Headline"]/h1/text()').get()
    blog['author'] = response.xpath(
        '//*[@id="Headline"]/p/span/a/text()').get()
    blog['published_date'] = parse(
        response.xpath('//*[@id="Headline"]/p/span/text()').get().replace(
            ' by: ', ''))
    blog['content'] = "".join(
        response.xpath('//*[@id="Col2"]/article/div[2]//text()').getall()
    ).strip().replace('\n', ' ')
    blog['content_html'] = response.xpath(
        '//*[@id="Col2"]/article/div[2]').get()
    blog['links'] = get_links(blog['content_html'])
    blog['tags'] = tags_to_json(
        response.xpath('//*[@id="Headline"]/p/span/i/a/text()').getall())
    yield blog

    # Stats
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    stat['views'] = get_views(response.url)
    stat['likes'] = None
    comment_data = get_comments(response.url)
    stat['comments'] = comment_data['total']
    yield stat

    # Comments
    all_comments = comment_data['comments']
    # Count children per parent id once, instead of rescanning the whole
    # comment list for every comment. Ids are compared as strings, exactly
    # as the per-comment scan did.
    reply_counts = Counter(str(x['parent']) for x in all_comments)
    for comment in all_comments:
        author = comment['author']
        parsed_comment = Comments()
        parsed_comment['domain'] = self.domain
        parsed_comment['url'] = response.url
        parsed_comment['comment_id'] = comment['id']
        # Some authors only carry a display name, without a username/id.
        parsed_comment['username'] = (
            author['username'] if 'username' in author else author['name'])
        parsed_comment['user_id'] = author['id'] if 'id' in author else None
        parsed_comment['comment'] = comment['raw_message']
        parsed_comment['comment_original'] = comment['message']
        parsed_comment['links'] = get_links(comment['message'])
        parsed_comment['upvotes'] = comment['likes']
        parsed_comment['downvotes'] = comment['dislikes']
        parsed_comment['published_date'] = parse(comment['createdAt'])
        parsed_comment['reply_count'] = reply_counts[str(comment['id'])]
        parsed_comment['reply_to'] = comment['parent']
        yield parsed_comment
def parse_blog(self, response):
    """Scrape an article page into post, stats and per-comment items.

    Building the post is best-effort: if any field extraction raises, the
    post is skipped (and the failure printed) but stats and comments are
    still emitted. The post is also skipped when no title is found.

    Args:
        response: Scrapy response for the article page.

    Yields:
        Optionally a ``Posts`` item, then a ``Stats`` item, then one
        ``Comments`` item per comment with a non-``None`` body.
    """
    url = unquote(response.url)
    blog = Posts()
    try:
        blog['domain'] = self.domain
        blog['url'] = url
        blog['title'] = response.xpath(
            '//h1[contains(@class, "css-1ln1egd")]/text()').get()
        blog['author'] = get_author(response)
        blog['published_date'] = parse(
            response.xpath(
                '//*[contains(@class, "css-1fboxhy")]/a/text()').get())
        blog['content'] = " ".join(
            response.xpath(
                '//*[contains(@class, "css-118w07p")]/p//text()').getall()
        ).strip().replace('\n', '')
        blog['content_html'] = response.xpath(
            '//*[contains(@class, "css-118w07p")]').get()
        blog['links'] = get_links(blog['content_html'])
        blog['tags'] = None
    except Exception as e:
        # Deliberately best-effort, but say what failed and where instead of
        # printing an anonymous marker.
        print(f":here {url}: {e!r}")
    else:
        if blog['title']:
            yield blog

    # Stats
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = url
    stat['views'] = None
    stat['likes'] = None
    comments = get_comments(response.url)
    stat['comments'] = comments['total'] if comments else None
    yield stat

    # Comments
    # get_comments() may return a falsy value (handled for the stat above);
    # subscripting it here used to raise, so stop once the stat is out.
    if not comments:
        return
    for comment in comments['comments']:
        if comment['body'] is None:  # sometimes there are empty reply comments
            continue
        parsed_comment = Comments()
        parsed_comment['domain'] = self.domain
        parsed_comment['url'] = url
        parsed_comment['comment_id'] = comment['id']
        parsed_comment['username'] = comment['user']['username']
        parsed_comment['user_id'] = comment['user']['id']
        parsed_comment['comment'] = comment['body']
        # Prefer the rich-text body when the API supplies one.
        original_comment = (comment['richTextBody']
                            if comment['richTextBody'] is not None else
                            comment['body'])
        parsed_comment['comment_original'] = original_comment
        parsed_comment['links'] = get_links(original_comment)
        parsed_comment['upvotes'] = (comment['action_summaries'][0]['count']
                                     if comment['action_summaries'] else None)
        parsed_comment['downvotes'] = None
        parsed_comment['published_date'] = parse(comment['created_at'])
        parsed_comment['reply_count'] = comment['replyCount']
        parsed_comment['reply_to'] = comments['replies'].get(comment['id'])
        yield parsed_comment
def parse_blog(self, response):
    """Scrape a post page into post, stats and per-comment items.

    Args:
        response: Scrapy response for the post page.

    Yields:
        A ``Posts`` item, a ``Stats`` item (``comments`` is ``None`` when the
        comments heading is missing, contains "No", or has no number), then
        one ``Comments`` item per comment ``<li>`` under ``#top-ra``.
    """
    blog = Posts()
    blog['domain'] = self.domain
    blog['url'] = response.url
    blog['title'] = response.xpath(
        '//*[@id="Blog1"]/div[1]/div/div/div/div[1]/h3/text()').get().strip()
    blog['author'] = response.xpath(
        '//*[@id="Blog1"]/div[1]/div/div/div/div[1]/div[3]/div[1]/span[1]/span/text()'
    ).get()
    blog['published_date'] = parse(
        response.xpath(
            '//*[@id="Blog1"]/div[1]/div/div/div/div[1]/div[3]/div[1]/span[2]/a/abbr/@title'
        ).get())
    blog['content'] = " ".join(
        response.xpath(
            "//div[contains(@class, 'post-body entry-content')]//text()"
        ).getall()).replace('\n', '')
    blog['content_html'] = response.xpath(
        "//div[contains(@class, 'post-body entry-content')]").get()
    blog['links'] = get_links(blog['content_html'])
    tags = response.xpath(
        '//*[contains(@id, "Blog1")]/div[1]/div/div/div/div[1]/div[3]/div[2]/span//text()'
    ).getall()
    # Drop separators and the "Labels:" caption, keeping only tag names.
    blog['tags'] = tags_to_json([
        t for t in tags
        if t != ',\n' and t != '\n' and 'Labels:' not in t
    ])
    yield blog

    # Stats
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    stat['views'] = None
    stat['likes'] = None
    comment_num = response.xpath('//div[@id="comments"]/h4/text()').get()
    # A missing heading is treated like "No comments" instead of raising.
    has_comments = comment_num is not None and "No" not in comment_num
    count_match = re.search(r'\d+', comment_num) if has_comments else None
    stat['comments'] = int(count_match.group()) if count_match else None
    yield stat

    # Comments
    if not has_comments:
        return
    # Every XPath inside the loop is relative ("./") to the current <li>.
    # Expressions starting with "//" search the whole document regardless of
    # the node they are called on, which made every yielded item a copy of
    # the first comment on the page.
    for c in response.xpath(
            '//*[@id="top-ra"]//li[contains(@class, "comment")]'):
        parsed_comment = Comments()
        parsed_comment['domain'] = self.domain
        parsed_comment['url'] = response.url
        parsed_comment['comment_id'] = c.xpath('./@id').get()
        linked_name = c.xpath('./div[2]/div/cite/a/text()').get()
        if linked_name is not None:
            # Author name links to a Blogger profile: keep the profile id.
            parsed_comment['username'] = linked_name
            profile_url = c.xpath('./div[2]/div/cite/a/@href').get()
            parsed_comment['user_id'] = profile_url.replace(
                'https://www.blogger.com/profile/',
                "") if profile_url else None
        else:
            parsed_comment['username'] = c.xpath(
                './div[2]/div/cite/text()').get()
            parsed_comment['user_id'] = None
        parsed_comment['comment'] = " ".join(
            c.xpath('./div[2]/p//text()').getall())
        comment_html = c.xpath('./div[2]/p').get()
        parsed_comment['comment_original'] = comment_html
        parsed_comment['links'] = get_links(comment_html)
        parsed_comment['upvotes'] = None
        parsed_comment['downvotes'] = None
        parsed_comment['published_date'] = None
        parsed_comment['reply_count'] = None
        parsed_comment['reply_to'] = None
        yield parsed_comment
def parse_article(self, response):
    """Scrape an article page into post, per-comment and stats items.

    Args:
        response: Scrapy response for the article page.

    Yields:
        A ``Posts`` item, one ``Comments`` item per comment returned by
        ``get_comments``, then a ``Stats`` item whose ``comments`` is the
        number of comments (0 when ``get_comments`` returned ``None``).
    """
    from collections import Counter  # local: file import block not visible

    blog = Posts()
    blog['domain'] = self.domain
    blog['url'] = response.url
    blog['title'] = response.xpath(
        '//*[@id="js-post-container"]/div/div[1]/header/h1/text()').get()
    blog['author'] = response.xpath(
        '//*[@id="js-post-container"]/div/div[1]/header/div[2]/a/span/span[1]/text()'
    ).get()
    blog['published_date'] = get_date(
        response.xpath(
            '//*[@id="js-post-container"]/div/div[1]/header/div[3]/p/text()'
        ).get())
    blog['content'] = ' '.join(
        response.xpath(
            '//*[@id="js-post-container"]/div/div[1]/div[1]/div/p/text()'
        ).getall()).strip()
    blog['content_html'] = " ".join(
        response.xpath(
            '//*[@id="js-post-container"]/div/div[1]/div[1]/div').getall())
    blog['links'] = get_links(blog['content_html'])
    yield blog

    # Comments requests
    article_url = response.url.replace(
        'https://www.buzzfeednews.com/article/', '')
    comments, authors = get_comments(article_url)
    if comments:  # Catches no comments
        reply_dic = get_reply_dic(comments)
        # Build both lookups once rather than rescanning `authors` and
        # `comments` for every comment. setdefault keeps the first author
        # entry per id, matching the previous first-match behaviour.
        author_names = {}
        for a in authors:
            author_names.setdefault(a['id'], a['name'])
        reply_counts = Counter(
            x['targetID'] for x in comments if 'targetID' in x)
        for c in comments:
            parsed_comment = Comments()
            parsed_comment['domain'] = self.domain
            parsed_comment['url'] = response.url
            parsed_comment['comment_id'] = c['id']
            # None when the author list has no entry for this comment; this
            # used to raise and abort before the Stats item was emitted.
            parsed_comment['username'] = author_names.get(c['authorID'])
            parsed_comment['user_id'] = c['authorID']
            parsed_comment['comment'] = c['body']['text']
            parsed_comment['comment_original'] = c['body']['text']
            parsed_comment['links'] = get_links(c['body']['text'])
            parsed_comment['upvotes'] = c['likeCount']
            parsed_comment['downvotes'] = None
            parsed_comment['published_date'] = dateutil.parser.parse(
                c['timestamp']['text'])
            if 'public_replies' in c:
                parsed_comment['reply_count'] = reply_counts[c['id']]
            else:
                parsed_comment['reply_count'] = 0
            parsed_comment['reply_to'] = reply_dic.get(c['id'])
            yield parsed_comment

    # Stats
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    stat['views'] = None
    stat['likes'] = None
    stat['comments'] = 0 if comments is None else len(comments)
    yield stat
def parse_blog(self, response):
    """Scrape an article page into post, per-comment and stats items.

    Args:
        response: Scrapy response for the article page.

    Yields:
        A ``Posts`` item (``content``, ``content_html`` and ``links`` are
        only set when ``parse_content`` returns HTML), one ``Comments`` item
        per comment from ``facebook_comments``, then a ``Stats`` item.
    """
    from collections import Counter  # local: file import block not visible

    # Posts
    blog = Posts()
    blog['domain'] = self.domain
    blog['url'] = response.url
    blog['title'] = parse_title(response)
    # Only the date is taken from parse_author_date; the author always comes
    # from parse_author (the first value was immediately overwritten before).
    date, _ = parse_author_date(response)
    author = parse_author(response)
    blog['author'] = str(author)[0:99] if author else None
    blog['published_date'] = parse_datetime(date) if date else None
    content, content_html = parse_content(response)
    if content_html:
        blog['content'] = content
        blog['content_html'] = content_html
        blog['links'] = get_links(content_html)
    tags = response.xpath('//*[@id="article-tags"]/div//text()').extract()
    blog['tags'] = tags_to_json(
        [t for t in tags if '\n' not in t and '\t' not in t]
    ) if tags else None
    yield blog

    # Comments
    comment_data = facebook_comments(response.url, 318812448281278)
    comments = comment_data['comments']
    authors = comment_data['authors']
    reply_dic = comment_data['reply_dic']
    if comments:  # Catches no comments
        # Build both lookups once rather than rescanning `authors` and
        # `comments` for every comment. setdefault keeps the first author
        # entry per id, matching the previous first-match behaviour.
        author_names = {}
        for a in authors:
            author_names.setdefault(a['id'], a['name'])
        reply_counts = Counter(
            x['targetID'] for x in comments if 'targetID' in x)
        for c in comments:
            parsed_comment = Comments()
            parsed_comment['domain'] = self.domain
            parsed_comment['url'] = response.url
            parsed_comment['comment_id'] = c['id']
            # None when the author list has no entry for this comment; this
            # used to raise and abort before the Stats item was emitted.
            parsed_comment['username'] = author_names.get(c['authorID'])
            parsed_comment['user_id'] = c['authorID']
            parsed_comment['comment'] = c['body']['text']
            parsed_comment['comment_original'] = None
            parsed_comment['links'] = get_links(c['body']['text'])
            parsed_comment['upvotes'] = c['likeCount']
            parsed_comment['downvotes'] = None
            parsed_comment['published_date'] = parse_datetime(
                c['timestamp']['text'])
            if 'public_replies' in c:
                parsed_comment['reply_count'] = reply_counts[c['id']]
            else:
                parsed_comment['reply_count'] = 0
            parsed_comment['reply_to'] = (reply_dic[c['id']]
                                          if c['id'] in reply_dic else None)
            yield parsed_comment

    # Stats
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    stat['views'] = None
    stat['likes'] = None
    stat['comments'] = 0 if comments is None else len(comments)
    yield stat
def parse_blog(self, response):
    """Scrape an article page into post, per-comment and stats items.

    Page metadata is read from the JSON inside the ``<script>`` element whose
    text contains "identity". If that JSON cannot be loaded the post is
    skipped, but comments and stats are still processed.

    Args:
        response: Scrapy response for the article page.

    Yields:
        Optionally a ``Posts`` item, one ``Comments`` item per comment with
        non-empty content, then a ``Stats`` item.
    """
    # Posts
    raw_json = response.xpath(
        "//script[contains(., 'identity')]/text()").extract_first()
    try:
        data = json.loads(raw_json)
    except Exception as e:
        print(str(e) + f"\n{str(response.url)}")
        data = {}

    if not data:
        print('here')
    else:
        post = Posts()
        post['domain'] = get_domain(response.url)
        post['url'] = response.url
        post['title'] = response.css('.article-title::text').extract_first()
        author, date = parse_authors_date(data)
        # NOTE(review): the replaced literal is reproduced as captured; it
        # may originally have been a different whitespace sequence - confirm.
        post['author'] = author.replace(" ", "").strip() if author else None
        post['published_date'] = date or None
        if 'page' in data:
            post['tags'] = tags_to_json(data['page']['tags'])
        else:
            post['tags'] = None
        post['content'] = get_content(response)
        post['content_html'] = " ".join(
            response.xpath('//*[@class="article-content"]').extract())
        links_source = " ".join(
            response.xpath('//*[@class="article-content"]').extract())
        post['links'] = get_links(links_source)
        yield post

    # Comments
    article_id = data['page']['articleId'] if 'page' in data else None
    comments = get_torontosun_comments(article_id)
    if comments:  # Catches no comments
        for c in comments:
            if not ('content' in c and c['content']):
                continue  # Skipping empty comments
            item = Comments()
            item['domain'] = self.domain
            item['url'] = response.url
            item['comment_id'] = c['content_uuid']
            item['username'] = None  # Could not find API to get this
            item['user_id'] = c['actor_uuid']
            item['comment'] = c['content']
            item['comment_original'] = None
            item['links'] = get_links(c['content'])
            item['upvotes'] = c['total_likes']
            item['downvotes'] = c['total_dislikes']
            # date_created is divided by 1000 before being formatted as a
            # UTC timestamp string for parse_datetime.
            created = time.gmtime(c['date_created'] / 1000.)
            item['published_date'] = parse_datetime(
                time.strftime('%m/%d/%Y %H:%M:%S', created))
            replies = c['total_replies']
            item['reply_count'] = replies if replies > 0 else 0
            container = c['content_container_uuid']
            is_reply = (container != c['thread_uuid']
                        and container != c['parent_uuid'])
            item['reply_to'] = c['thread_uuid'] if is_reply else None
            yield item

    # Stats
    stats = Stats()
    stats['domain'] = self.domain
    stats['url'] = response.url
    stats['views'] = None
    stats['likes'] = None
    stats['comments'] = 0 if comments is None else len(comments)
    yield stats
def parse_blog(self, response):
    """Scrape a post page into post, stats and per-comment items.

    The post id is the second dash-separated token of the ``<article>``
    element's ``id``; it is passed to ``get_stats_data`` for author, tag,
    like and comment-count data. Comment details come from a batch API
    request built from the ids of every ``<li>`` on the page.

    Args:
        response: Scrapy response for the post page.

    Yields:
        A ``Posts`` item, a ``Stats`` item, then one ``Comments`` item per
        entry in the batch API response (none if the response is falsy).
    """
    site_id = 70000375  # passed to build_batch and get_wordpress_replies

    # Posts
    article_dom_id = response.css('article').xpath("@id").extract_first()
    pid = article_dom_id.strip().replace(' ', '').split('-')[1]
    data = get_stats_data(pid)

    post = Posts()
    post['domain'] = self.domain
    post['url'] = response.url
    post['title'] = response.css("h1.entry-title::text").extract_first()
    post['author'] = parse_author(data, response)
    post['published_date'] = parse(
        response.css("span.entry-meta-date.updated a::text").extract_first())
    body_text = "".join(
        response.xpath(
            "//div[contains(@class, 'entry-content')]//text()").extract())
    post['content'] = body_text.strip().replace('\n', ' ').replace('\t', ' ')
    body_html = "".join(
        response.xpath("//div[contains(@class, 'entry-content')]").extract())
    post['content_html'] = body_html
    post['links'] = get_links(body_html)
    if data['tags']:
        post['tags'] = tags_to_json(list(data['tags'].keys()))
    else:
        post['tags'] = None
    yield post

    # Stats
    stats = Stats()
    stats['domain'] = self.domain
    stats['url'] = response.url
    stats['views'] = None
    stats['likes'] = data['like_count']
    stats['comments'] = data['comment_count']
    yield stats

    # Comments
    comment_pids = []
    for li_id in response.css("li").xpath("@id").extract():
        comment_pids.append(li_id.strip().replace(' ', '').split('-')[1])
    res = make_api_request(build_batch(comment_pids, site_id))
    if not res:
        return
    for res_url in res:
        res_values = res[res_url]
        # -Parsing comments
        item = Comments()
        item['domain'] = self.domain
        item['url'] = response.url
        item['comment_id'] = f"bc_comment_{res_values['ID']}"
        author_name = res_values['author']['name']
        usr_name = author_name.lower() if author_name else None
        item['username'] = usr_name
        item['user_id'] = get_user_id(self.comment_users, usr_name)
        item['comment'] = res_values['raw_content']
        item['comment_original'] = res_values['content']
        item['links'] = get_links(res_values['content'])
        item['upvotes'] = parse_comments(res_values['meta']['links']['likes'])
        item['downvotes'] = None
        item['published_date'] = res_values['date']
        # -Getting replies
        replies = get_wordpress_replies(res, res_values['ID'], site_id)
        item['reply_count'] = replies[0]
        item['reply_to'] = replies[1]
        yield item
def parse_article(self, response):
    """Scrape an article page into post, per-comment and stats items.

    Args:
        response: Scrapy response for the article page.

    Yields:
        A ``Posts`` item, one ``Comments`` item per comment from
        ``facebook_comments``, then a ``Stats`` item whose ``comments`` is
        the number of comments (0 when the comment list is ``None``).
    """
    from collections import Counter  # local: file import block not visible

    blog = Posts()
    blog['domain'] = self.domain
    blog['url'] = response.url
    blog['title'] = response.xpath(
        '//*[@class="news-article-header__title"]/text()').get()
    blog['author'] = response.xpath(
        '//*[@class="news-byline-full__info-wrapper"]/span/text()').get()
    blog['published_date'] = get_date(
        response.xpath(
            '//*[@class="news-article-header__timestamps-posted"]/text()'
        ).get())
    blog['content'] = " ".join(
        response.xpath(
            '//*[@id="js-post-container"]/div/div[1]/div[1]/div/p/text()'
        ).getall()).strip()
    blog['content_html'] = " ".join(
        response.xpath(
            '//*[@id="js-post-container"]/div/div[1]/div[1]/div').getall())
    blog['links'] = get_links(blog['content_html'])
    blog['tags'] = tags_to_json(parse_tags(response))
    yield blog

    # Comments requests
    article_url = format_comment_url(response.url)
    comment_data = facebook_comments(article_url, '162111247988300')
    comments = comment_data['comments']
    authors = comment_data['authors']
    reply_dic = comment_data['reply_dic']
    if comments:  # Catches no comments
        # Build both lookups once rather than rescanning `authors` and
        # `comments` for every comment. setdefault keeps the first author
        # entry per id, matching the previous first-match behaviour.
        author_names = {}
        for a in authors:
            author_names.setdefault(a['id'], a['name'])
        reply_counts = Counter(
            x['targetID'] for x in comments if 'targetID' in x)
        for c in comments:
            parsed_comment = Comments()
            parsed_comment['domain'] = self.domain
            parsed_comment['url'] = response.url
            parsed_comment['comment_id'] = c['id']
            # None when the author list has no entry for this comment; this
            # used to raise and abort before the Stats item was emitted.
            parsed_comment['username'] = author_names.get(c['authorID'])
            parsed_comment['user_id'] = c['authorID']
            parsed_comment['comment'] = c['body']['text']
            parsed_comment['comment_original'] = None
            parsed_comment['links'] = get_links(c['body']['text'])
            parsed_comment['upvotes'] = c['likeCount']
            parsed_comment['downvotes'] = None
            parsed_comment['published_date'] = parse_datetime(
                c['timestamp']['text'])
            if 'public_replies' in c:
                parsed_comment['reply_count'] = reply_counts[c['id']]
            else:
                parsed_comment['reply_count'] = 0
            parsed_comment['reply_to'] = (reply_dic[c['id']]
                                          if c['id'] in reply_dic else None)
            yield parsed_comment

    # Stats
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    stat['views'] = None
    stat['likes'] = None
    stat['comments'] = 0 if comments is None else len(comments)
    yield stat