def _parse_comment(soup):
    """Parse one RSS comment element into a plain comment dict.

    Pulls the author from a "By: ..." <link> title or <dc:creator>, the
    author link from <link>, the cleaned body from <description>, and the
    date from <pubdate>/<pubDate>. Child comments are left empty for the
    caller to nest.
    """
    c = {
        'author': '',
        'authorLink': '',
        'score': 0,
        'date': 0,
        'text': '',
        'comments': [],
    }
    if soup.find('link'):
        title = _soup_get_text(soup.find('link'))
        if title and 'By:' in title:
            # Take everything after the 'By:' marker. The previous
            # title.strip('By:') stripped the *character set* {B, y, :}
            # from both ends, corrupting names that start or end with
            # those letters (e.g. 'By: Bobby' -> 'obb').
            c['author'] = title.split('By:', 1)[1].strip()
    if soup.find('dc:creator'):
        c['author'] = _soup_get_text(soup.find('dc:creator'))
    if soup.find('link'):
        c['authorLink'] = _soup_get_text(soup.find('link'))
    if soup.find('description'):
        c['text'] = clean(_soup_get_text(soup.find('description')))
    if soup.find('pubdate'):
        c['date'] = unix(soup.find('pubdate').text)
    elif soup.find('pubDate'):
        c['date'] = unix(soup.find('pubDate').text)
    return c
def feed(self):
    """Return refs for recent, publicly readable stories, newest first."""
    cutoff = datetime.now().timestamp() - settings.MAX_STORY_AGE
    stories = json(lambda x: api_stories(x, self.BASE_DOMAIN),
                   headers={'Referer': self.BASE_DOMAIN})
    if not stories:
        return []
    # Keep only posts everyone can read, then only recent ones.
    stories = [s for s in stories if s.get("audience") == "everyone"]
    stories = [s for s in stories if unix(s.get('post_date')) > cutoff]
    # Newest first.
    stories.sort(key=lambda s: unix(s.get('post_date')), reverse=True)
    return [self.ref_prefix(str(s.get("id"))) for s in stories]
def _filter_links(links, excludes=None):
    """Filter sitemap <url> entries down to recent, unique page URLs.

    Drops entries without a parseable date or older than MAX_STORY_AGE,
    orders newest first, extracts each <loc> URL, de-duplicates, and
    finally removes any URL containing one of the *excludes* substrings.
    """
    too_old = datetime.now().timestamp() - settings.MAX_STORY_AGE
    # Keep only entries carrying a parseable date, then only recent ones.
    links = [a for a in links if _get_sitemap_date(a)]
    links = [a for a in links if unix(_get_sitemap_date(a)) > too_old]
    # Newest first.
    links.sort(key=lambda a: unix(_get_sitemap_date(a)), reverse=True)
    links = [x.find('loc').text for x in links]
    # De-duplicate while preserving order; the previous list(set(...))
    # silently discarded the newest-first sort performed just above.
    links = list(dict.fromkeys(links))
    if excludes:
        links = [
            link for link in links
            if link and not any(e in link for e in excludes)
        ]
    return links
def story(self, ref, urlref, is_manual=False):
    """Scrape a story page and return its metadata dict, or False.

    Fetches the page at *urlref*, extracts structured metadata via
    extruct, grabs a favicon, and collects comments for known comment
    systems (Disqus, stuff.co.nz). Stories without a parseable date are
    rejected unless *is_manual* is set.
    """
    if urlref is None:
        return False
    markup = xml(lambda x: urlref)
    if not markup:
        return False

    s = {
        'author': '',
        'author_link': '',
        'score': 0,
        'comments': [],
        'num_comments': 0,
        'link': urlref,
        'url': urlref,
        'date': 0,
        'title': '',
    }

    icons = get_icons(markup, url=urlref)
    if icons:
        s['icon'] = icons[0]

    # Structured-data extraction is best-effort; never let it abort the
    # scrape (but always let Ctrl-C through).
    try:
        s = parse_extruct(s, extruct.extract(markup))
    except KeyboardInterrupt:
        raise
    except Exception as e:
        logging.error(e)

    if s['title']:
        s['title'] = clean(s['title'])
    if s['date']:
        s['date'] = unix(s['date'], tz=self.tz)

    if 'disqus' in markup:
        try:
            s['comments'] = headless.get_comments(urlref)
        except KeyboardInterrupt:
            raise
        except Exception as e:
            logging.error(e)

    if urlref.startswith('https://www.stuff.co.nz'):
        s['comments'] = stuff.get_json_comments(urlref, markup)

    if s['comments']:
        cleaned = [clean_comment(c) for c in s['comments']]
        s['comments'] = [c for c in cleaned if c]
        s['num_comments'] = comment_count(s) - 1

    if not is_manual and not s['date']:
        return False
    return s
def feed(self):
    """Return refs for recent public Substack top posts, newest first."""
    cutoff = datetime.now().timestamp() - settings.MAX_STORY_AGE
    stories = json(SUBSTACK_API_TOP_POSTS,
                   headers={'Referer': SUBSTACK_REFERER})
    if not stories:
        return []
    # Public posts only, then recency filter, then newest first.
    stories = [s for s in stories if s.get("audience") == "everyone"]
    stories = [s for s in stories if unix(s.get('post_date')) > cutoff]
    stories.sort(key=lambda s: unix(s.get('post_date')), reverse=True)
    return [
        self.ref_prefix(str(s.get("pub").get("base_url")), str(s.get("id")))
        for s in stories
    ]
def story(self, ref):
    """Fetch one story by ref from this publication's API.

    Returns a story dict (author, date, score, comments, icon, ...) or
    False when the story is missing or not publicly readable.
    """
    ref = self.strip_ref_prefix(ref)
    stories = json(lambda x: api_stories(x, self.BASE_DOMAIN),
                   headers={'Referer': self.BASE_DOMAIN})
    if not stories:
        return False
    stories = [i for i in stories if i.get("audience") == "everyone"]
    stories = [i for i in stories if str(i.get('id')) == ref]
    if len(stories) == 0:
        return False
    r = stories[0]
    if not r:
        return False

    s = {}
    s['author'] = ''
    s['author_link'] = ''
    s['date'] = unix(r.get('post_date'))
    # 'reactions' may be absent/None in the API payload; guard before
    # the nested .get so a missing dict doesn't raise AttributeError.
    s['score'] = (r.get('reactions') or {}).get('❤')
    s['title'] = r.get('title', '')
    s['link'] = r.get('canonical_url', '')
    s['url'] = r.get('canonical_url', '')

    comments = json(lambda x: api_comments(x, self.BASE_DOMAIN),
                    r.get('id'),
                    headers={'Referer': self.BASE_DOMAIN})
    # The payload may be missing entirely, or lack the 'comments' key.
    s['comments'] = [
        comment(i) for i in (comments.get('comments') or [])
    ] if comments else []
    s['comments'] = list(filter(bool, s['comments']))
    s['num_comments'] = r.get('comment_count', 0)

    # 'publishedBylines' may be absent (None); treat as no authors.
    authors = list(
        filter(None, [
            self._bylines(byline)
            for byline in (r.get('publishedBylines') or [])
        ]))
    if len(authors):
        s['author'] = authors[0].get('name')
        s['author_link'] = authors[0].get('link')

    markup = xml(lambda x: s['link'])
    if markup:
        icons = get_icons(markup, url=s['link'])
        if icons:
            s['icon'] = icons[0]
    return s
def comment(i):
    """Convert an API comment dict into the local comment shape.

    Returns False for bodyless (deleted) comments so callers can filter
    them out with filter(bool, ...).
    """
    if 'body' not in i:
        return False
    c = {}
    c['date'] = unix(i.get('date'))
    c['author'] = i.get('name', '')
    # 'reactions' can be missing/None; guard before .get('❤') so a
    # reaction-less comment doesn't raise AttributeError.
    c['score'] = (i.get('reactions') or {}).get('❤')
    c['text'] = clean(i.get('body', '') or '')
    # 'children' may be absent; the old i['children'] raised KeyError.
    c['comments'] = [comment(j) for j in i.get('children') or []]
    c['comments'] = list(filter(bool, c['comments']))
    return c
def story(self, ref):
    """Fetch one top-posts story by ref from the Substack API.

    Returns a story dict (author, date, score, comments, ...) or False
    when the story is missing or not publicly readable.
    """
    ref = self.strip_ref_prefix(ref)
    stories = json(SUBSTACK_API_TOP_POSTS,
                   headers={'Referer': SUBSTACK_REFERER})
    if not stories:
        return False
    stories = [i for i in stories if i.get("audience") == "everyone"]
    stories = [i for i in stories if str(i.get('id')) == ref]
    if len(stories) == 0:
        return False
    r = stories[0]
    if not r:
        return False

    s = {}
    # 'pub' may be absent/None in the payload; guard the nested .get
    # calls so a malformed item doesn't raise AttributeError.
    pub = r.get('pub') or {}
    base_url = pub.get('base_url')
    s['author'] = pub.get('author_name')
    s['author_link'] = author_link(pub.get('author_id'), base_url)
    s['date'] = unix(r.get('post_date'))
    s['score'] = r.get('score')
    s['title'] = r.get('title', '')
    s['link'] = r.get('canonical_url', '')
    s['url'] = r.get('canonical_url', '')

    comments = json(lambda x: api_comments(x, base_url),
                    r.get('id'),
                    headers={'Referer': base_url})
    # The payload may be missing entirely, or lack the 'comments' key.
    s['comments'] = [
        comment(i) for i in (comments.get('comments') or [])
    ] if comments else []
    s['comments'] = list(filter(bool, s['comments']))
    s['num_comments'] = r.get('comment_count', 0)
    return s
def clean_comment(comment):
    """Normalise a comment dict in place and return it.

    Cleans the body text, converts string dates to unix timestamps, and
    recurses through nested replies.
    """
    comment['text'] = clean(comment['text'])
    if isinstance(comment['date'], str):
        comment['date'] = unix(comment['date'])
    children = []
    for child in comment['comments']:
        children.append(clean_comment(child))
    comment['comments'] = children
    return comment