def get_json_comments(url, markup=None):
    # Extract the story path (the part ending in the numeric story ID) from the URL.
    regex = r"https:\/\/www\.stuff\.co\.nz\/(.*\/\d+)\/[^\/]+"
    p = re.compile(regex).match(url)
    if not p:
        return []
    path = p.groups()[0]
    if not markup:
        markup = xml(lambda x: url)
    soup = BeautifulSoup(markup, features='html.parser')

    # The Gigya API key is embedded in the src of the gigya.js script tag.
    scripts = soup.find_all('script', src=True)
    scripts = [s for s in scripts if s['src'].startswith("https://cdns.gigya.com/JS/gigya.js?apiKey=")]
    if not scripts:
        return []
    script = scripts[0]

    # Parse the query string and pull out the apiKey parameter.
    _, query = script['src'].split('?', maxsplit=1)
    params = [param.split('=', maxsplit=1) for param in query.split('&')]
    keys = [value for name, value in params if name.lower() == 'apikey']
    if not keys:
        return []
    apiKey = keys[0]

    # Fetch the threaded comments as JSON from the Gigya comments API.
    url = f"https://comments.us1.gigya.com/comments.getComments?threaded=true&format=json&categoryID=Stuff&streamID=stuff/{path}&APIKey={apiKey}"
    data = json(lambda x: url)
    if not data:
        return []
    comments = data.get('comments', [])
    return [_parse_json_comment(c) for c in comments]
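# A minimal sketch of the _parse_json_comment helper used above, assuming the
# shape of Gigya's getComments payload (sender/commentText/timestamp/replies)
# and that unix() accepts an epoch value; these are assumptions, not confirmed
# against the live API or the repo's actual helper.
def _parse_json_comment(comment):
    c = {}
    c['author'] = comment.get('sender', {}).get('name', '')
    c['score'] = comment.get('posVotes', 0)
    c['date'] = unix(comment.get('timestamp', 0) / 1000)  # Gigya timestamps are milliseconds
    c['text'] = clean(comment.get('commentText', ''))
    # Replies nest recursively under the same schema.
    c['comments'] = [_parse_json_comment(r) for r in comment.get('replies', [])]
    return c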
def _get_category(category_url, excludes=None):
    # Base URL is the scheme + host, e.g. "https://www.stuff.co.nz".
    base_url = '/'.join(category_url.split('/')[:3])
    markup = xml(lambda x: category_url)
    if not markup:
        return []
    soup = BeautifulSoup(markup, features='html.parser')

    # Collect every anchor on the category page, making relative links absolute.
    links = soup.find_all('a', href=True)
    links = [link.get('href') for link in links]
    links = [f"{base_url}{link}" if link.startswith('/') else link for link in links]
    return _filter_links(links, category_url, excludes)
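# _filter_links is not shown in this section; a plausible sketch for the
# category case, assuming it dedupes, keeps only links under the category
# path, and drops anything matching an exclude pattern. Hypothetical, for
# illustration only.
def _filter_links(links, category_url, excludes=None):
    links = list(set(links))
    links = [l for l in links if l.startswith(category_url) and l != category_url]
    if excludes:
        links = [l for l in links if not any(x in l for x in excludes)]
    return links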
def story(self, ref, urlref, is_manual=False):
    if urlref is None:
        return False
    markup = xml(lambda x: urlref)
    if not markup:
        return False

    s = {}
    s['author'] = ''
    s['author_link'] = ''
    s['score'] = 0
    s['comments'] = []
    s['num_comments'] = 0
    s['link'] = urlref
    s['url'] = urlref
    s['date'] = 0
    s['title'] = ''

    icons = get_icons(markup, url=urlref)
    if icons:
        s['icon'] = icons[0]

    # Pull structured metadata (JSON-LD, microdata, etc.) out of the page.
    try:
        data = extruct.extract(markup)
        s = parse_extruct(s, data)
    except KeyboardInterrupt:
        raise
    except Exception as e:
        logging.error(e)

    if s['title']:
        s['title'] = clean(s['title'])
    if s['date']:
        s['date'] = unix(s['date'], tz=self.tz)

    # Disqus comments need a headless browser; Stuff exposes them as JSON.
    if 'disqus' in markup:
        try:
            s['comments'] = headless.get_comments(urlref)
        except KeyboardInterrupt:
            raise
        except Exception as e:
            logging.error(e)
    if urlref.startswith('https://www.stuff.co.nz'):
        s['comments'] = stuff.get_json_comments(urlref, markup)

    if s['comments']:
        s['comments'] = [clean_comment(c) for c in s['comments']]
        s['comments'] = list(filter(bool, s['comments']))
        s['num_comments'] = comment_count(s) - 1

    if not is_manual and not s['date']:
        return False
    return s
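# comment_count is not defined in this section; a plausible sketch, assuming
# it recursively counts a node plus all of its nested replies. That would
# explain the "- 1" above: the story dict itself is counted, then excluded.
def comment_count(node):
    return 1 + sum(comment_count(c) for c in node.get('comments', []))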
def get_rss_comments(url):
    regex = r"https:\/\/www\.stuff\.co\.nz\/(.*\/\d+)\/[^\/]+"
    p = re.compile(regex).match(url)
    if not p:
        return []
    path = p.groups()[0]
    comment_url = f'https://comments.us1.gigya.com/comments/rss/6201101/Stuff/stuff/{path}'
    markup = xml(lambda x: comment_url)
    if not markup:
        return []
    soup = BeautifulSoup(markup, features='html.parser')
    comments = soup.find_all('item')
    if not comments:
        return []
    return [_parse_comment(c) for c in comments]
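# A minimal sketch of the _parse_comment helper used above, assuming each RSS
# <item> carries standard author/pubDate/description children and that unix()
# parses date strings; the field mapping is hypothetical, not the repo's
# actual code.
def _parse_comment(item):
    def text(tag):
        node = item.find(tag)
        return node.text.strip() if node else ''
    return {
        'author': text('author'),
        'date': unix(text('pubdate')),  # html.parser lowercases tag names
        'text': clean(text('description')),
        'comments': [],  # the RSS feed is flat, so no nested replies
    }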
def story(self, ref):
    ref = self.strip_ref_prefix(ref)
    stories = json(
        lambda x: api_stories(x, self.BASE_DOMAIN),
        headers={'Referer': self.BASE_DOMAIN})
    if not stories:
        return False

    # Keep only public stories, then find the one matching this ref.
    stories = [i for i in stories if i.get('audience') == 'everyone']
    stories = [i for i in stories if str(i.get('id')) == ref]
    if not stories:
        return False
    r = stories[0]
    if not r:
        return False

    s = {}
    s['author'] = ''
    s['author_link'] = ''
    s['date'] = unix(r.get('post_date'))
    s['score'] = (r.get('reactions') or {}).get('❤', 0)
    s['title'] = r.get('title', '')
    s['link'] = r.get('canonical_url', '')
    s['url'] = r.get('canonical_url', '')

    comments = json(
        lambda x: api_comments(x, self.BASE_DOMAIN),
        r.get('id'),
        headers={'Referer': self.BASE_DOMAIN})
    s['comments'] = [] if not comments else [
        comment(i) for i in comments.get('comments', [])]
    s['comments'] = list(filter(bool, s['comments']))
    s['num_comments'] = r.get('comment_count', 0)

    authors = list(filter(None, [
        self._bylines(byline) for byline in r.get('publishedBylines') or []]))
    if authors:
        s['author'] = authors[0].get('name')
        s['author_link'] = authors[0].get('link')

    markup = xml(lambda x: s['link'])
    if markup:
        icons = get_icons(markup, url=s['link'])
        if icons:
            s['icon'] = icons[0]
    return s
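# A plausible sketch of the comment() mapper used above, assuming the Substack
# API's comment shape (name/date/body/children plus a ❤ reaction count); the
# field names are assumptions for illustration, not confirmed against the API.
def comment(i):
    if 'body' not in i:
        return False  # deleted/empty nodes get dropped by the filter above
    c = {}
    c['author'] = i.get('name', '')
    c['date'] = unix(i.get('date'))
    c['score'] = (i.get('reactions') or {}).get('❤', 0)
    c['text'] = clean(i.get('body') or '')
    c['comments'] = list(filter(bool, [comment(j) for j in i.get('children', [])]))
    return c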
def _get_sitemap(feed_url, excludes=None):
    markup = xml(lambda x: feed_url)
    if not markup:
        return []
    soup = BeautifulSoup(markup, features='lxml')
    links = []
    feed_urls = []

    # A sitemap index points at further sitemaps; a urlset lists actual pages.
    if soup.find('sitemapindex'):
        sitemap = soup.find('sitemapindex').findAll('sitemap')
        feed_urls = [a for a in sitemap if a.find('loc')]
    if soup.find('urlset'):
        sitemap = soup.find('urlset').findAll('url')
        links = [a for a in sitemap if a.find('loc')]

    feed_urls = _filter_links(feed_urls, excludes)
    links = _filter_links(links, excludes)

    # Recurse into nested sitemaps and deduplicate the combined link list.
    for url in feed_urls:
        links += _get_sitemap(url, excludes)
    return list(set(links))
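# Example usage, assuming the sitemap URL and exclude patterns below; both
# are illustrative, not taken from the repo's config.
if __name__ == '__main__':
    links = _get_sitemap('https://www.stuff.co.nz/sitemap.xml',
                         excludes=['/video/', '/quizzes/'])
    print(f'found {len(links)} story links')
    for link in links[:5]:
        print(link)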