Code example #1
File: stuff.py  Project: master5o1/qotnews
def get_json_comments(url, markup=None):
    regex = r"https:\/\/www\.stuff\.co\.nz\/(.*\/\d+)/[^\/]+"
    p = re.compile(regex).match(url)
    if not p: return []
    path = p.groups()[0]
    if not markup:
        markup = xml(lambda x: url)
    soup = BeautifulSoup(markup, features='html.parser')
    # find the Gigya loader <script> tag and keep only those whose src carries an apiKey
    scripts = soup.find_all('script', src=True)
    scripts = list(filter(None, [s if s['src'].startswith("https://cdns.gigya.com/JS/gigya.js?apiKey=") else None for s in scripts]))
    if not scripts: return []
    script = scripts[0]
    if not script: return []
    meh, params = script['src'].split('?', maxsplit=1)
    params = params.split('&')
    params = [p.split('=') for p in params]
    params = list(filter(None, [value if name.lower() == 'apikey' else None for name, value in params]))
    if not params: return []
    apiKey = params[0]
    if not apiKey: return []
    url = f"https://comments.us1.gigya.com/comments.getComments?threaded=true&format=json&categoryID=Stuff&streamID=stuff/{path}&APIKey={apiKey}"
    data = json(lambda x: url)
    comments = data.get('comments', [])
    comments = [_parse_json_comment(c) for c in comments]
    return comments
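
For reference, a minimal standalone sketch of the path-extraction regex used above; the article URL below is hypothetical and only illustrates what the captured group (later used as the Gigya streamID path) looks like.

import re

regex = r"https:\/\/www\.stuff\.co\.nz\/(.*\/\d+)/[^\/]+"
# hypothetical Stuff article URL, purely for illustration
url = "https://www.stuff.co.nz/national/politics/123456789/some-headline"
m = re.compile(regex).match(url)
if m:
    print(m.groups()[0])  # national/politics/123456789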
Code example #2
def _get_category(category_url, excludes=None):
    base_url = '/'.join(category_url.split('/')[:3])
    markup = xml(lambda x: category_url)
    if not markup: return []
    soup = BeautifulSoup(markup, features='html.parser')
    links = soup.find_all('a', href=True)
    links = [link.get('href') for link in links]
    links = [f"{base_url}{link}" if link.startswith('/') else link for link in links]
    links = _filter_links(links, category_url, excludes)
    return links
Code example #3
File: news.py  Project: master5o1/qotnews
    def story(self, ref, urlref, is_manual=False):
        if urlref is None:
            return False
        markup = xml(lambda x: urlref)
        if not markup:
            return False

        s = {}
        s['author'] = ''
        s['author_link'] = ''
        s['score'] = 0
        s['comments'] = []
        s['num_comments'] = 0
        s['link'] = urlref
        s['url'] = urlref
        s['date'] = 0
        s['title'] = ''

        icons = get_icons(markup, url=urlref)
        if icons:
            s['icon'] = icons[0]

        try:
            data = extruct.extract(markup)
            s = parse_extruct(s, data)
        except KeyboardInterrupt:
            raise
        except Exception as e:
            logging.error(e)

        if s['title']:
            s['title'] = clean(s['title'])
        if s['date']:
            s['date'] = unix(s['date'], tz=self.tz)

        # pages that embed a Disqus thread have their comments fetched via the headless module
        if 'disqus' in markup:
            try:
                s['comments'] = headless.get_comments(urlref)
            except KeyboardInterrupt:
                raise
            except Exception as e:
                logging.error(e)

        if urlref.startswith('https://www.stuff.co.nz'):
            s['comments'] = stuff.get_json_comments(urlref, markup)

        if s['comments']:
            s['comments'] = [clean_comment(c) for c in s['comments']]
            s['comments'] = list(filter(bool, s['comments']))
            s['num_comments'] = comment_count(s) - 1

        if not is_manual and not s['date']:
            return False
        return s
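
As a rough illustration of the extruct.extract() call inside story() above, here is a minimal sketch run on an inline, hypothetical HTML snippet; parse_extruct() in the project is what actually maps this structure onto the story fields.

import extruct

# hypothetical page markup with a JSON-LD block, for illustration only
html = """
<html><head>
<script type="application/ld+json">
{"@context": "https://schema.org", "@type": "NewsArticle",
 "headline": "Example headline", "datePublished": "2021-01-01T00:00:00Z"}
</script>
</head><body></body></html>
"""
data = extruct.extract(html)
# data is keyed by syntax ('json-ld', 'opengraph', 'microdata', ...);
# each value is a list of the metadata objects found in the page
print(data['json-ld'][0]['headline'])  # Example headline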
Code example #4
File: stuff.py  Project: master5o1/qotnews
def get_rss_comments(url):
    regex = r"https:\/\/www\.stuff\.co\.nz\/(.*\/\d+)/[^\/]+"
    p = re.compile(regex).match(url)
    # guard against URLs that don't match the article pattern, as in get_json_comments
    if not p: return []
    path = p.groups()[0]
    comment_url = f'https://comments.us1.gigya.com/comments/rss/6201101/Stuff/stuff/{path}'
    markup = xml(lambda x: comment_url)
    if not markup: return []
    soup = BeautifulSoup(markup, features='html.parser')
    comments = soup.find_all('item')
    if not comments: return []
    comments = [_parse_comment(c) for c in comments]
    return comments
Code example #5
    def story(self, ref):
        ref = self.strip_ref_prefix(ref)
        stories = json(lambda x: api_stories(x, self.BASE_DOMAIN),
                       headers={'Referer': self.BASE_DOMAIN})
        if not stories: return False
        stories = list(
            filter(None, [
                i if i.get("audience") == "everyone" else None for i in stories
            ]))
        stories = list(
            filter(None,
                   [i if str(i.get('id')) == ref else None for i in stories]))

        if len(stories) == 0:
            return False

        r = stories[0]
        if not r:
            return False

        s = {}
        s['author'] = ''
        s['author_link'] = ''

        s['date'] = unix(r.get('post_date'))
        s['score'] = r.get('reactions', {}).get('❤', 0)
        s['title'] = r.get('title', '')
        s['link'] = r.get('canonical_url', '')
        s['url'] = r.get('canonical_url', '')
        comments = json(lambda x: api_comments(x, self.BASE_DOMAIN),
                        r.get('id'),
                        headers={'Referer': self.BASE_DOMAIN})
        s['comments'] = [] if not comments else [
            comment(i) for i in comments.get('comments')
        ]
        s['comments'] = list(filter(bool, s['comments']))
        s['num_comments'] = r.get('comment_count', 0)

        authors = list(
            filter(None, [
                self._bylines(byline) for byline in r.get('publishedBylines') or []
            ]))
        if len(authors):
            s['author'] = authors[0].get('name')
            s['author_link'] = authors[0].get('link')

        markup = xml(lambda x: s['link'])
        if markup:
            icons = get_icons(markup, url=s['link'])
            if icons:
                s['icon'] = icons[0]

        return s
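
The two filter(None, ...) passes near the top of story() amount to a single selection step; a self-contained sketch of that selection, using hypothetical sample data:

# hypothetical story records, for illustration only
stories = [
    {'id': 101, 'audience': 'everyone', 'title': 'A public post'},
    {'id': 102, 'audience': 'only_paid', 'title': 'A paywalled post'},
]
ref = '101'
# keep public stories whose id matches the requested ref
matches = [i for i in stories
           if i.get('audience') == 'everyone' and str(i.get('id')) == ref]
print(matches[0]['title'])  # A public post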
Code example #6
def _get_sitemap(feed_url, excludes=None):
    markup = xml(lambda x: feed_url)
    if not markup: return []
    soup = BeautifulSoup(markup, features='lxml')
    links = []
    feed_urls = []
    if soup.find('sitemapindex'):
        # a sitemap index lists further sitemaps; their <loc> entries feed the recursion below
        sitemap = soup.find('sitemapindex').findAll('sitemap')
        feed_urls = list(
            filter(None, [a if a.find('loc') else None for a in sitemap]))
    if soup.find('urlset'):
        # a urlset lists the article URLs themselves
        sitemap = soup.find('urlset').findAll('url')
        links = list(
            filter(None, [a if a.find('loc') else None for a in sitemap]))

    feed_urls = _filter_links(feed_urls, excludes)
    links = _filter_links(links, excludes)

    for url in feed_urls:
        links += _get_sitemap(url, excludes)
    return list(set(links))
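
A standalone illustration of the urlset branch above, run against a small hypothetical sitemap string rather than a fetched feed_url:

from bs4 import BeautifulSoup

# hypothetical sitemap document, for illustration only
sitemap_xml = """
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/story/1</loc></url>
  <url><loc>https://example.com/story/2</loc></url>
</urlset>
"""
soup = BeautifulSoup(sitemap_xml, features='lxml')
urls = soup.find('urlset').findAll('url')
print([u.find('loc').text for u in urls])
# ['https://example.com/story/1', 'https://example.com/story/2']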