Ejemplo n.º 1
0
def _parse_comment(soup):
    c = {
        'author': '',
        'authorLink': '',
        'score': 0,
        'date': 0,
        'text': '',
        'comments': [],
    }
    
    if soup.find('link'):
        title = _soup_get_text(soup.find('link'))
        if title and 'By:' in title:
            c['author'] = title.strip('By:').strip()
    if soup.find('dc:creator'):
        c['author'] = _soup_get_text(soup.find('dc:creator'))
    if soup.find('link'):
        c['authorLink'] = _soup_get_text(soup.find('link'))
    if soup.find('description'):
        c['text'] = clean(_soup_get_text(soup.find('description')))
    if soup.find('pubdate'):
        c['date'] = unix(soup.find('pubdate').text)
    elif soup.find('pubDate'):
        c['date'] = unix(soup.find('pubDate').text)

    return c
Ejemplo n.º 2
0
    def feed(self):
        """Return prefixed refs of recent public stories, newest first.

        Fetches the story list for ``self.BASE_DOMAIN``, keeps entries
        whose ``audience`` is "everyone" and whose ``post_date`` is newer
        than ``settings.MAX_STORY_AGE`` seconds ago, and orders them by
        ``post_date`` descending. Returns ``[]`` when the fetch yields
        nothing.
        """
        cutoff = datetime.now().timestamp() - settings.MAX_STORY_AGE
        fetched = json(lambda x: api_stories(x, self.BASE_DOMAIN),
                       headers={'Referer': self.BASE_DOMAIN})
        if not fetched:
            return []

        # Two separate passes, in the same order as before; the trailing
        # truthiness check mirrors the former filter(None, ...).
        public = [
            post for post in fetched
            if post.get("audience") == "everyone" and post
        ]
        recent = [
            post for post in public
            if unix(post.get('post_date')) > cutoff and post
        ]
        ordered = sorted(recent,
                         key=lambda post: unix(post.get('post_date')),
                         reverse=True)

        return [self.ref_prefix(str(post.get("id"))) for post in ordered]
Ejemplo n.º 3
0
def _filter_links(links, excludes=None):
    """Reduce sitemap entries to recent, unique URLs, newest first.

    Args:
        links: Iterable of sitemap entries; each must expose
            ``find('loc').text`` and be accepted by ``_get_sitemap_date``.
        excludes: Optional iterable of substrings; any URL containing one
            of them is dropped.

    Returns:
        A list of URL strings ordered by sitemap date, newest first, with
        duplicates removed (the first, i.e. newest, occurrence is kept).
    """
    too_old = datetime.now().timestamp() - settings.MAX_STORY_AGE

    # Resolve each entry's date once instead of re-parsing it for the
    # filter and again for the sort.
    dated = []
    for entry in links:
        if not entry:
            continue
        raw_date = _get_sitemap_date(entry)
        if not raw_date:
            continue
        stamp = unix(raw_date)
        if stamp > too_old:
            dated.append((stamp, entry))
    dated.sort(key=lambda pair: pair[0], reverse=True)

    urls = [entry.find('loc').text for _, entry in dated]
    # De-duplicate while keeping the newest-first order; going through a
    # set() here used to scramble the ordering produced by the sort above.
    urls = list(dict.fromkeys(urls))

    if excludes:
        urls = [
            url for url in urls
            if url and not any(e in url for e in excludes)
        ]
    return urls
Ejemplo n.º 4
0
    def story(self, ref, urlref, is_manual=False):
        """Assemble a story dict for the page at ``urlref``.

        Args:
            ref: Story reference; not used by this method body.
            urlref: URL of the page to fetch. ``None`` yields ``False``.
            is_manual: When true, a story without a date is still returned.

        Returns:
            The story dict, or ``False`` when ``urlref`` is ``None``, the
            fetch returns nothing, or no date was found and ``is_manual``
            is false.
        """
        if urlref is None:
            return False

        markup = xml(lambda x: urlref)
        if not markup:
            return False

        s = {
            'author': '',
            'author_link': '',
            'score': 0,
            'comments': [],
            'num_comments': 0,
            'link': urlref,
            'url': urlref,
            'date': 0,
            'title': '',
        }

        found_icons = get_icons(markup, url=urlref)
        if found_icons:
            s['icon'] = found_icons[0]

        # Structured-data extraction is best effort: failures are logged
        # and the defaults above are kept.
        try:
            s = parse_extruct(s, extruct.extract(markup))
        except KeyboardInterrupt:
            raise
        except Exception as err:
            logging.error(err)

        if s['title']:
            s['title'] = clean(s['title'])
        if s['date']:
            s['date'] = unix(s['date'], tz=self.tz)

        if 'disqus' in markup:
            try:
                s['comments'] = headless.get_comments(urlref)
            except KeyboardInterrupt:
                raise
            except Exception as err:
                logging.error(err)

        # For this site the comments come from its own JSON source and
        # replace whatever was gathered above.
        if urlref.startswith('https://www.stuff.co.nz'):
            s['comments'] = stuff.get_json_comments(urlref, markup)

        if s['comments']:
            cleaned = [clean_comment(c) for c in s['comments']]
            s['comments'] = [c for c in cleaned if c]
            s['num_comments'] = comment_count(s) - 1

        if is_manual or s['date']:
            return s
        return False
Ejemplo n.º 5
0
 def feed(self):
     """Return prefixed refs of recent public top posts, newest first.

     Fetches ``SUBSTACK_API_TOP_POSTS``, keeps entries whose ``audience``
     is "everyone" and whose ``post_date`` is newer than
     ``settings.MAX_STORY_AGE`` seconds ago, and orders them by
     ``post_date`` descending. Each ref is built from the publication's
     ``base_url`` and the post ``id``. Returns ``[]`` when the fetch
     yields nothing.
     """
     cutoff = datetime.now().timestamp() - settings.MAX_STORY_AGE
     fetched = json(SUBSTACK_API_TOP_POSTS,
                    headers={'Referer': SUBSTACK_REFERER})
     if not fetched:
         return []

     # Two separate passes, in the same order as before; the trailing
     # truthiness check mirrors the former filter(None, ...).
     public = [
         post for post in fetched
         if post.get("audience") == "everyone" and post
     ]
     recent = [
         post for post in public
         if unix(post.get('post_date')) > cutoff and post
     ]
     ordered = sorted(recent,
                      key=lambda post: unix(post.get('post_date')),
                      reverse=True)

     refs = []
     for post in ordered:
         base_url = str(post.get("pub").get("base_url"))
         refs.append(self.ref_prefix(base_url, str(post.get("id"))))
     return refs
Ejemplo n.º 6
0
    def story(self, ref):
        """Fetch one public story of ``self.BASE_DOMAIN`` by its ref.

        Args:
            ref: Prefixed story reference; the prefix is removed with
                ``self.strip_ref_prefix`` and the remainder is compared
                with each story's ``id`` as a string.

        Returns:
            A story dict (author, author_link, date, score, title, link,
            url, comments, num_comments and, when one is found, icon), or
            ``False`` when the story list is empty or no public story
            matches.
        """
        ref = self.strip_ref_prefix(ref)
        stories = json(lambda x: api_stories(x, self.BASE_DOMAIN),
                       headers={'Referer': self.BASE_DOMAIN})
        if not stories:
            return False

        # First public story whose id matches the requested ref.
        r = next((i for i in stories
                  if i and i.get("audience") == "everyone"
                  and str(i.get('id')) == ref), None)
        if not r:
            return False

        s = {}
        s['author'] = ''
        s['author_link'] = ''

        s['date'] = unix(r.get('post_date'))
        # A missing or null reactions map must not crash the whole story;
        # the score is then None, the same as when the key has no entry.
        s['score'] = (r.get('reactions') or {}).get('❤')
        s['title'] = r.get('title', '')
        s['link'] = r.get('canonical_url', '')
        s['url'] = r.get('canonical_url', '')

        comments = json(lambda x: api_comments(x, self.BASE_DOMAIN),
                        r.get('id'),
                        headers={'Referer': self.BASE_DOMAIN})
        # Tolerate a response without (or with a null) 'comments' list.
        raw_comments = (comments.get('comments') or []) if comments else []
        s['comments'] = [c for c in map(comment, raw_comments) if c]
        s['num_comments'] = r.get('comment_count', 0)

        # The bylines list may be absent; the first usable byline wins.
        bylines = r.get('publishedBylines') or []
        authors = [a for a in map(self._bylines, bylines) if a]
        if authors:
            s['author'] = authors[0].get('name')
            s['author_link'] = authors[0].get('link')

        markup = xml(lambda x: s['link'])
        if markup:
            icons = get_icons(markup, url=s['link'])
            if icons:
                s['icon'] = icons[0]

        return s
Ejemplo n.º 7
0
def comment(i):
    """Convert one raw comment mapping into a comment dict, recursively.

    Args:
        i: Raw comment mapping. Keys read: ``body``, ``date``, ``name``,
            ``reactions`` and ``children``.

    Returns:
        ``False`` when ``i`` has no ``body`` key; otherwise a dict with
        ``date``, ``author``, ``score``, ``text`` and ``comments`` (the
        converted children, with body-less ones dropped).
    """
    if 'body' not in i:
        return False

    # Missing or null reactions/children used to raise AttributeError /
    # KeyError and abort the whole comment tree; treat them as empty.
    reactions = i.get('reactions') or {}
    children = i.get('children') or []

    c = {}
    c['date'] = unix(i.get('date'))
    c['author'] = i.get('name', '')
    c['score'] = reactions.get('❤')
    c['text'] = clean(i.get('body', '') or '')
    c['comments'] = [child for child in map(comment, children) if child]

    return c
Ejemplo n.º 8
0
    def story(self, ref):
        """Fetch one public top post by its ref.

        The ref prefix is removed with ``self.strip_ref_prefix`` and the
        remainder is compared with each post's ``id`` as a string. Author
        details and the comments endpoint are derived from the post's
        ``pub`` entry.

        Returns:
            A story dict (author, author_link, date, score, title, link,
            url, comments, num_comments), or ``False`` when the post list
            is empty or no public post matches.
        """
        ref = self.strip_ref_prefix(ref)
        fetched = json(SUBSTACK_API_TOP_POSTS,
                       headers={'Referer': SUBSTACK_REFERER})
        if not fetched:
            return False

        # Two separate passes, in the same order as before; the trailing
        # truthiness check mirrors the former filter(None, ...).
        public = [
            post for post in fetched
            if post.get("audience") == "everyone" and post
        ]
        matches = [
            post for post in public if str(post.get('id')) == ref and post
        ]
        if not matches or not matches[0]:
            return False
        r = matches[0]

        pub = r.get('pub')
        base_url = pub.get('base_url')
        canonical = r.get('canonical_url', '')

        s = {
            'author': pub.get('author_name'),
            'author_link': author_link(pub.get('author_id'), base_url),
            'date': unix(r.get('post_date')),
            'score': r.get('score'),
            'title': r.get('title', ''),
            'link': canonical,
            'url': canonical,
        }

        fetched_comments = json(lambda x: api_comments(x, base_url),
                                r.get('id'),
                                headers={'Referer': base_url})
        if fetched_comments:
            converted = [
                comment(raw) for raw in fetched_comments.get('comments')
            ]
        else:
            converted = []
        s['comments'] = [c for c in converted if c]
        s['num_comments'] = r.get('comment_count', 0)

        return s
Ejemplo n.º 9
0
def clean_comment(comment):
    """Normalise a comment dict in place, recursively, and return it.

    The ``text`` field is passed through ``clean``; a ``date`` given as a
    string is converted with ``unix``; every entry of ``comments`` is
    processed the same way.
    """
    comment['text'] = clean(comment['text'])

    when = comment['date']
    if isinstance(when, str):
        comment['date'] = unix(when)

    comment['comments'] = list(map(clean_comment, comment['comments']))
    return comment