def parse_authors_date(data):
    """Extract (author, publication date) from a parsed article JSON payload.

    Two payload shapes are supported:
      * fields nested under ``data['page']`` — ``authors`` (list) and
        ``date['pub']``;
      * top-level ``author['name']`` and ``datePublished``.

    Returns:
        tuple: ``(author, date)`` where ``author`` has been passed through
        ``author_title()`` and ``date`` is a parsed datetime (or ``None`` if
        the date string could not be parsed).  Returns ``(None, None)`` when
        neither payload shape matches, so callers can always unpack safely.
    """

    def _clean_byline(author):
        # Strip "by"/"Special to" boilerplate that some bylines carry.
        if 'by' in author.lower() or 'special to' in author.lower():
            author = ''.join(
                author.lower().split("special to")).replace(
                    'by', '').strip().title()
        return author

    if 'page' in data:
        page = data['page']
        # Both keys must exist and be truthy before we attempt extraction.
        if page.get('authors') and page.get('date'):
            try:
                date = parse_datetime(page['date']['pub'])
            except Exception:
                # Unparseable/missing publication date — keep the author.
                date = None
            author = _clean_byline(' and '.join(page['authors']))
            return author_title(author), date
    elif 'author' in data and 'datePublished' in data:
        try:
            date = parse_datetime(data['datePublished'])
        except Exception:
            date = None
        author = _clean_byline(data['author']['name'])
        return author_title(author), date

    # Neither shape matched: explicit two-tuple so unpacking never fails.
    return None, None
def parse_blog(self, response):
    """Yield a Posts item scraped from one blog-post page."""
    content_xpath = "//div[contains(@class, 'td-post-content')]"
    content_html = "".join(response.xpath(content_xpath).extract())

    item = Posts()
    item['domain'] = self.domain
    item['url'] = response.url
    item['title'] = response.css('.entry-title::text').extract_first()
    item['author'] = response.xpath(
        '//*[contains(@id,"post-")]/div[1]/div/div/header/div/div/a//text()'
    ).extract_first()

    raw_date = response.css('.updated::text').extract_first()
    item['published_date'] = parse_datetime(raw_date) if raw_date else None

    # Flatten the post text and normalise whitespace runs to single spaces.
    text = "".join(
        response.xpath(content_xpath + "//text()").extract()
    ).strip().replace('\n', ' ').replace('\t', ' ')
    text = str(re.sub(' +', ' ', text))
    # Remove the share bar and the site's fundraising boilerplate.
    text = text.replace('Facebook Twitter reddit LinkedIn', '')
    text = text.replace(
        'We’re asking readers, like you, to make a contribution in support of True North’s fact-based, independent journalism.\r Unlike the mainstream media, True North isn’t getting a government bailout. Instead, we depend on the generosity of Canadians like you.\r How can a media outlet be trusted to remain neutral and fair if they’re beneficiaries of a government handout? We don’t think they can.\r This is why independent media in Canada is more important than ever. If you’re able, please make a tax-deductible donation to True North today. Thank you so much.',
        '')
    item['content'] = text.replace('\r', ' ')

    item['content_html'] = content_html
    item['links'] = get_links(content_html)
    item['tags'] = None
    yield item
def parse_blog(self, response):
    """Yield a Posts item for the page, one Comments item per Facebook
    comment, and a final Stats item summarising the comment count."""
    # Posts
    blog = Posts()
    blog['domain'] = self.domain
    blog['url'] = response.url
    blog['title'] = parse_title(response)
    date, author = parse_author_date(response)
    # NOTE(review): the `author` unpacked above is immediately overwritten
    # by parse_author(response) — confirm the first result is intentionally
    # used only for `date`.
    author = parse_author(response)
    # Author is truncated to 99 characters (presumably a DB column limit —
    # TODO confirm).
    blog['author'] = str(author)[0:99] if author else None
    blog['published_date'] = parse_datetime(date) if date else None
    content, content_html = parse_content(response)
    if content_html:
        blog['content'] = content
        blog['content_html'] = content_html
        blog['links'] = get_links(content_html)
    tags = response.xpath('//*[@id="article-tags"]/div//text()').extract()
    # Drop whitespace-only tag fragments before serialising.
    blog['tags'] = tags_to_json(
        list(filter(lambda x: '\n' not in x and '\t' not in x,
                    tags))) if tags else None
    yield blog

    # Comments — fetched from the Facebook comments plugin for this URL.
    comment_data = facebook_comments(response.url, 318812448281278)
    comments = comment_data['comments']
    authors = comment_data['authors']
    reply_dic = comment_data['reply_dic']
    if comments:  # Catches pages with no comments
        for c in comments:
            parsed_comment = Comments()
            parsed_comment['domain'] = self.domain
            parsed_comment['url'] = response.url
            parsed_comment['comment_id'] = c['id']
            # Resolve the display name from the authors list by id.
            parsed_comment['username'] = [
                x['name'] for x in authors if c['authorID'] == x['id']
            ][0]
            parsed_comment['user_id'] = c['authorID']
            parsed_comment['comment'] = c['body']['text']
            parsed_comment['comment_original'] = None
            parsed_comment['links'] = get_links(c['body']['text'])
            parsed_comment['upvotes'] = c['likeCount']
            parsed_comment['downvotes'] = None
            parsed_comment['published_date'] = parse_datetime(
                c['timestamp']['text'])
            if 'public_replies' in c:
                # Count replies by scanning for comments targeting this one.
                parsed_comment['reply_count'] = len([
                    x for x in comments
                    if 'targetID' in x and x['targetID'] == c['id']
                ])
            else:
                parsed_comment['reply_count'] = 0
            if c['id'] in reply_dic:
                parsed_comment['reply_to'] = reply_dic[c['id']]
            else:
                parsed_comment['reply_to'] = None
            yield parsed_comment

    # Stats
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    stat['views'] = None
    stat['likes'] = None
    if comments is None:
        stat['comments'] = 0
    else:
        stat['comments'] = len(comments)
    yield stat
def parse_blog(self, response):
    """Yield a Posts item parsed from the page's embedded JSON, Comments
    items from the site's comment API, and a closing Stats item."""
    # Posts — article metadata lives in an inline <script> containing
    # 'identity'; parse it as JSON.
    script = response.xpath(
        "//script[contains(., 'identity')]/text()").extract_first()
    try:
        data = json.loads(script)
    except Exception as e:
        # Missing/malformed script tag: log and fall back to an empty dict
        # so the rest of the callback degrades gracefully.
        print(str(e) + f"\n{str(response.url)}")
        data = {}
    if data:
        blog = Posts()
        blog['domain'] = get_domain(response.url)
        blog['url'] = response.url
        blog['title'] = response.css(
            '.article-title::text').extract_first()
        # NOTE(review): the `if data else None` branch is dead — this line
        # only runs inside `if data:`; a falsy `data` could never reach it.
        author, date = parse_authors_date(data) if data else None
        blog['author'] = author.replace(" ", "").strip() if author else None
        blog['published_date'] = date if date else None
        blog['tags'] = tags_to_json(
            data['page']['tags']) if 'page' in data else None
        blog['content'] = get_content(response)
        blog['content_html'] = " ".join(
            response.xpath('//*[@class="article-content"]').extract())
        blog['links'] = get_links(" ".join(
            response.xpath('//*[@class="article-content"]').extract()))
        yield blog
    else:
        # NOTE(review): leftover debug print — consider proper logging.
        print('here')
        pass

    # Comments — fetched from the site's comment API by article id.
    article_id = data['page']['articleId'] if 'page' in data else None
    comments = get_torontosun_comments(article_id)
    if comments:  # Catches pages with no comments
        for c in comments:
            if 'content' in c and c['content']:  # Skipping empty comments
                parsed_comment = Comments()
                parsed_comment['domain'] = self.domain
                parsed_comment['url'] = response.url
                parsed_comment['comment_id'] = c['content_uuid']
                parsed_comment[
                    'username'] = None  # Could not find API to get this
                parsed_comment['user_id'] = c['actor_uuid']
                parsed_comment[
                    'comment'] = c['content'] if 'content' in c else None
                parsed_comment['comment_original'] = None
                parsed_comment['links'] = get_links(
                    c['content']) if 'content' in c else None
                parsed_comment['upvotes'] = c['total_likes']
                parsed_comment['downvotes'] = c['total_dislikes']
                # date_created appears to be epoch milliseconds — converted
                # to a UTC string then parsed.
                parsed_comment['published_date'] = parse_datetime(
                    time.strftime('%m/%d/%Y %H:%M:%S',
                                  time.gmtime(c['date_created'] / 1000.)))
                if c['total_replies'] > 0:
                    parsed_comment['reply_count'] = c['total_replies']
                else:
                    parsed_comment['reply_count'] = 0
                # NOTE(review): when the container matches neither the
                # thread nor the parent, the comment is treated as a reply
                # to its thread — confirm this matches the API's semantics.
                if c['content_container_uuid'] != c['thread_uuid'] and c[
                        'content_container_uuid'] != c['parent_uuid']:
                    parsed_comment['reply_to'] = c['thread_uuid']
                else:
                    parsed_comment['reply_to'] = None
                yield parsed_comment

    # Stats
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    stat['views'] = None
    stat['likes'] = None
    if comments is None:
        stat['comments'] = 0
    else:
        stat['comments'] = len(comments)
    yield stat
def parse_article(self, response):
    """Yield a Posts item for the article, one Comments item per Facebook
    comment, and a final Stats item with the comment count."""
    post = Posts()
    post['domain'] = self.domain
    post['url'] = response.url
    post['title'] = response.xpath(
        '//*[@class="news-article-header__title"]/text()').get()
    post['author'] = response.xpath(
        '//*[@class="news-byline-full__info-wrapper"]/span/text()').get()
    posted_text = response.xpath(
        '//*[@class="news-article-header__timestamps-posted"]/text()').get()
    post['published_date'] = get_date(posted_text)
    post['content'] = " ".join(
        response.xpath(
            '//*[@id="js-post-container"]/div/div[1]/div[1]/div/p/text()'
        ).getall()).strip()
    post['content_html'] = " ".join(
        response.xpath(
            '//*[@id="js-post-container"]/div/div[1]/div[1]/div').getall())
    post['links'] = get_links(post['content_html'])
    post['tags'] = tags_to_json(parse_tags(response))
    yield post

    # Comments come from the Facebook comments plugin for this article URL.
    article_url = format_comment_url(response.url)
    comment_data = facebook_comments(article_url, '162111247988300')
    comments = comment_data['comments']
    authors = comment_data['authors']
    reply_dic = comment_data['reply_dic']
    if comments:  # Skip everything below when the page has no comments.
        for fb_comment in comments:
            item = Comments()
            item['domain'] = self.domain
            item['url'] = response.url
            item['comment_id'] = fb_comment['id']
            # Resolve the commenter's display name from the authors list.
            names = [
                author['name'] for author in authors
                if fb_comment['authorID'] == author['id']
            ]
            item['username'] = names[0]
            item['user_id'] = fb_comment['authorID']
            body_text = fb_comment['body']['text']
            item['comment'] = body_text
            item['comment_original'] = None
            item['links'] = get_links(body_text)
            item['upvotes'] = fb_comment['likeCount']
            item['downvotes'] = None
            item['published_date'] = parse_datetime(
                fb_comment['timestamp']['text'])
            if 'public_replies' in fb_comment:
                # Count sibling comments that target this one as replies.
                item['reply_count'] = len([
                    other for other in comments
                    if 'targetID' in other
                    and other['targetID'] == fb_comment['id']
                ])
            else:
                item['reply_count'] = 0
            # reply_dic maps comment id -> parent; None when top-level.
            item['reply_to'] = reply_dic.get(fb_comment['id'])
            yield item

    # Stats
    stat = Stats()
    stat['domain'] = self.domain
    stat['url'] = response.url
    stat['views'] = None
    stat['likes'] = None
    stat['comments'] = 0 if comments is None else len(comments)
    yield stat
def get_date(date_string):
    """Strip 'Posted on' / 'Last updated on' labels and parse the rest
    as a datetime."""
    cleaned = date_string.strip()
    for label in ('Posted on ', 'Last updated on '):
        cleaned = cleaned.replace(label, '')
    return parse_datetime(cleaned)