コード例 #1
0
def parse_authors_date(data):
    """Extract ``(author, date)`` from a parsed article-metadata dict.

    Two layouts are supported: a nested ``data['page']`` dict carrying an
    ``authors`` list and a ``date['pub']`` string, or a flat dict carrying
    ``author['name']`` and ``datePublished``.

    Args:
        data: Decoded JSON metadata for one article.

    Returns:
        tuple: ``(author, date)`` where either element may be ``None``.
        Always a 2-tuple — previously the function fell off the end and
        returned ``None`` when the expected keys were missing, which broke
        callers that unpack the result.
    """
    if 'page' in data:
        page = data['page']
        # .get() covers both "key present" and "value truthy" checks.
        if page.get('authors') and page.get('date'):
            try:
                date = parse_datetime(page['date']['pub'])
            except Exception:  # malformed or missing publication date
                date = None

            author = ' and '.join(page['authors'])
            # Strip boilerplate prefixes such as "By ..." / "Special to ...".
            if 'by' in author.lower() or 'special to' in author.lower():
                author = ''.join(
                    author.lower().split("special to")).replace(
                        'by', '').strip().title()
            return author_title(author), date
    elif 'author' in data and 'datePublished' in data:
        try:
            date = parse_datetime(data['datePublished'])
        except Exception:  # unparseable datePublished string
            date = None

        author = data['author']['name']
        if 'by' in author.lower() or 'special to' in author.lower():
            author = ''.join(author.lower().split("special to")).replace(
                'by', '').strip().title()

        return author_title(author), date
    # Required keys absent: keep the unpackable 2-tuple contract.
    return None, None
コード例 #2
0
ファイル: tnc.py プロジェクト: leosj1/Blogcrawler
 def parse_blog(self, response):
     """Scrape one True North post page into a Posts item."""
     blog = Posts()
     blog['domain'] = self.domain
     blog['url'] = response.url
     blog['title'] = response.css('.entry-title::text').extract_first()
     blog['author'] = response.xpath(
         '//*[contains(@id,"post-")]/div[1]/div/div/header/div/div/a//text()'
     ).extract_first()
     raw_date = response.css('.updated::text').extract_first()
     blog['published_date'] = parse_datetime(raw_date) if raw_date else None

     # Collect the post body text and collapse all whitespace runs.
     body_text = "".join(
         response.xpath(
             "//div[contains(@class, 'td-post-content')]//text()").extract())
     body_text = body_text.strip().replace('\n', ' ').replace('\t', ' ')
     body_text = str(re.sub(' +', ' ', body_text))
     # Remove the share bar and the recurring donation-appeal boilerplate.
     body_text = body_text.replace('Facebook Twitter reddit LinkedIn', '')
     body_text = body_text.replace(
         'We’re asking readers, like you, to make a contribution in support of True North’s fact-based, independent journalism.\r Unlike the mainstream media, True North isn’t getting a government bailout. Instead, we depend on the generosity of Canadians like you.\r How can a media outlet be trusted to remain neutral and fair if they’re beneficiaries of a government handout? We don’t think they can.\r This is why independent media in Canada is more important than ever. If you’re able, please make a tax-deductible donation to True North today. Thank you so much.',
         '')
     body_text = body_text.replace('\r', ' ')
     blog['content'] = body_text

     # Extract the HTML fragment once and reuse it for both fields.
     html_fragment = "".join(
         response.xpath(
             "//div[contains(@class, 'td-post-content')]").extract())
     blog['content_html'] = html_fragment
     blog['links'] = get_links(html_fragment)
     blog['tags'] = None
     yield blog
コード例 #3
0
ファイル: globalnews.py プロジェクト: leosj1/Blogcrawler
    def parse_blog(self, response):
        """Parse a Global News article into Posts, Comments and Stats items.

        Yields a Posts item only when article content was found; in that
        case also yields one Comments item per Facebook comment and one
        Stats summary item.
        """
        # Posts
        blog = Posts()
        blog['domain'] = self.domain
        blog['url'] = response.url
        blog['title'] = parse_title(response)

        # Only the date is needed here: the author value unpacked from
        # parse_author_date() was previously overwritten on the very next
        # line by parse_author(), so discard it explicitly.
        date, _ = parse_author_date(response)
        author = parse_author(response)
        blog['author'] = str(author)[0:99] if author else None  # column-width cap
        blog['published_date'] = parse_datetime(date) if date else None

        content, content_html = parse_content(response)
        if content_html:
            blog['content'] = content
            blog['content_html'] = content_html
            blog['links'] = get_links(content_html)
            tags = response.xpath('//*[@id="article-tags"]/div//text()').extract()
            # Drop whitespace-only tag entries before serializing.
            blog['tags'] = tags_to_json(
                list(filter(lambda x: '\n' not in x and '\t' not in x,
                            tags))) if tags else None
            yield blog

            # Comments (fetched via the Facebook comments plugin)
            comment_data = facebook_comments(response.url, 318812448281278)
            comments = comment_data['comments']
            authors = comment_data['authors']
            reply_dic = comment_data['reply_dic']
            if comments:  # Catches no comments
                for c in comments:
                    parsed_comment = Comments()
                    parsed_comment['domain'] = self.domain
                    parsed_comment['url'] = response.url
                    parsed_comment['comment_id'] = c['id']
                    parsed_comment['username'] = [
                        x['name'] for x in authors if c['authorID'] == x['id']
                    ][0]
                    parsed_comment['user_id'] = c['authorID']
                    parsed_comment['comment'] = c['body']['text']
                    parsed_comment['comment_original'] = None
                    parsed_comment['links'] = get_links(c['body']['text'])
                    parsed_comment['upvotes'] = c['likeCount']
                    parsed_comment['downvotes'] = None
                    parsed_comment['published_date'] = parse_datetime(
                        c['timestamp']['text'])
                    if 'public_replies' in c:
                        # Replies reference their parent through targetID.
                        parsed_comment['reply_count'] = len([
                            x for x in comments
                            if 'targetID' in x and x['targetID'] == c['id']
                        ])
                    else:
                        parsed_comment['reply_count'] = 0
                    # None when this comment is not a reply to anything.
                    parsed_comment['reply_to'] = reply_dic.get(c['id'])
                    yield parsed_comment

            # Stats
            stat = Stats()
            stat['domain'] = self.domain
            stat['url'] = response.url
            stat['views'] = None
            stat['likes'] = None
            stat['comments'] = len(comments) if comments else 0
            yield stat
コード例 #4
0
    def parse_blog(self, response):
        """Parse a Toronto Sun article into Posts, Comments and Stats items.

        Article metadata lives in an inline JSON ``<script>`` tag; comments
        come from a separate comment API keyed by the article id found in
        that metadata.
        """
        # Posts
        script = response.xpath(
            "//script[contains(., 'identity')]/text()").extract_first()
        try:
            data = json.loads(script)
        except Exception as e:
            print(str(e) + f"\n{str(response.url)}")
            data = {}

        if data:
            blog = Posts()
            blog['domain'] = get_domain(response.url)
            blog['url'] = response.url
            blog['title'] = response.css(
                '.article-title::text').extract_first()
            # parse_authors_date() may return None when the expected keys
            # are missing; fall back to (None, None) so unpacking is safe.
            # (The old `if data else None` guard was redundant — we are
            # already inside `if data:` — and crashed on unpack anyway.)
            parsed = parse_authors_date(data)
            author, date = parsed if parsed else (None, None)
            blog['author'] = author.replace(" ",
                                            "").strip() if author else None
            blog['published_date'] = date if date else None
            blog['tags'] = tags_to_json(
                data['page']['tags']) if 'page' in data else None
            blog['content'] = get_content(response)
            blog['content_html'] = " ".join(
                response.xpath('//*[@class="article-content"]').extract())
            blog['links'] = get_links(" ".join(
                response.xpath('//*[@class="article-content"]').extract()))
            yield blog

        # Comments: skip the API call entirely when no article id is known.
        article_id = data['page']['articleId'] if 'page' in data else None
        comments = get_torontosun_comments(article_id) if article_id else None
        if comments:  #Catches no comments
            for c in comments:
                if 'content' in c and c['content']:  #Skipping empty comments
                    parsed_comment = Comments()
                    parsed_comment['domain'] = self.domain
                    parsed_comment['url'] = response.url
                    parsed_comment['comment_id'] = c['content_uuid']
                    parsed_comment[
                        'username'] = None  #Could not find API to get this
                    parsed_comment['user_id'] = c['actor_uuid']
                    # 'content' is guaranteed by the guard above.
                    parsed_comment['comment'] = c['content']
                    parsed_comment['comment_original'] = None
                    parsed_comment['links'] = get_links(c['content'])
                    parsed_comment['upvotes'] = c['total_likes']
                    parsed_comment['downvotes'] = c['total_dislikes']
                    # date_created is epoch milliseconds.
                    parsed_comment['published_date'] = parse_datetime(
                        time.strftime('%m/%d/%Y %H:%M:%S',
                                      time.gmtime(c['date_created'] / 1000.)))
                    parsed_comment['reply_count'] = max(c['total_replies'], 0)
                    # A comment is a reply when its container differs from
                    # both the thread root and its direct parent.
                    if c['content_container_uuid'] != c['thread_uuid'] and c[
                            'content_container_uuid'] != c['parent_uuid']:
                        parsed_comment['reply_to'] = c['thread_uuid']
                    else:
                        parsed_comment['reply_to'] = None
                    yield parsed_comment

        #Stats
        stat = Stats()
        stat['domain'] = self.domain
        stat['url'] = response.url
        stat['views'] = None
        stat['likes'] = None
        stat['comments'] = 0 if comments is None else len(comments)
        yield stat
コード例 #5
0
    def parse_article(self, response):
        """Build a Posts item for one article, then yield its Facebook
        comments and a Stats summary."""
        post = Posts()
        post['domain'] = self.domain
        post['url'] = response.url
        post['title'] = response.xpath(
            '//*[@class="news-article-header__title"]/text()').get()
        post['author'] = response.xpath(
            '//*[@class="news-byline-full__info-wrapper"]/span/text()').get()
        posted_text = response.xpath(
            '//*[@class="news-article-header__timestamps-posted"]/text()').get()
        post['published_date'] = get_date(posted_text)
        paragraphs = response.xpath(
            '//*[@id="js-post-container"]/div/div[1]/div[1]/div/p/text()').getall()
        post['content'] = " ".join(paragraphs).strip()
        post['content_html'] = " ".join(
            response.xpath(
                '//*[@id="js-post-container"]/div/div[1]/div[1]/div').getall())
        post['links'] = get_links(post['content_html'])
        post['tags'] = tags_to_json(parse_tags(response))
        yield post

        #Comments requests
        article_url = format_comment_url(response.url)
        fb_data = facebook_comments(article_url, '162111247988300')
        comments = fb_data['comments']
        authors = fb_data['authors']
        reply_dic = fb_data['reply_dic']
        if comments:  #Catches no comments
            for raw in comments:
                item = Comments()
                item['domain'] = self.domain
                item['url'] = response.url
                item['comment_id'] = raw['id']
                item['username'] = [
                    a['name'] for a in authors if raw['authorID'] == a['id']
                ][0]
                item['user_id'] = raw['authorID']
                item['comment'] = raw['body']['text']
                item['comment_original'] = None
                item['links'] = get_links(raw['body']['text'])
                item['upvotes'] = raw['likeCount']
                item['downvotes'] = None
                item['published_date'] = parse_datetime(raw['timestamp']['text'])
                if 'public_replies' in raw:
                    # Count the comments that point back at this one.
                    item['reply_count'] = len([
                        r for r in comments
                        if 'targetID' in r and r['targetID'] == raw['id']
                    ])
                else:
                    item['reply_count'] = 0
                item['reply_to'] = (reply_dic[raw['id']]
                                    if raw['id'] in reply_dic else None)
                yield item

        #Stats
        stat = Stats()
        stat['domain'] = self.domain
        stat['url'] = response.url
        stat['views'] = None
        stat['likes'] = None
        stat['comments'] = 0 if comments is None else len(comments)
        yield stat
コード例 #6
0
def get_date(date_string):
    """Normalize a scraped timestamp string and parse it into a datetime.

    Strips the "Posted on " / "Last updated on " prefixes the site
    prepends, then delegates to ``parse_datetime``.

    Args:
        date_string: Raw timestamp text scraped from the page, or ``None``
            when the xpath matched nothing.

    Returns:
        The parsed datetime, or ``None`` for empty/missing input
        (previously ``None`` input raised ``AttributeError``).
    """
    if not date_string:
        return None
    date_string = date_string.strip()
    date_string = date_string.replace('Posted on ', '')
    date_string = date_string.replace('Last updated on ', '')
    return parse_datetime(date_string)