def render(self, src, container='div'):
    can_upvote = False
    can_downvote = False
    if self.current_user is not None:
        prev_vote = self.handler.session.query(Action).filter_by(
            what='src_vote',
            user_id=self.current_user.id,
            source=src).first()
        if prev_vote is None or prev_vote.value > 0:
            can_downvote = True
        if prev_vote is None or prev_vote.value < 0:
            can_upvote = True

    title = src.title
    if not title:
        if src.kind == SourceKind.PR:
            title = "Press release (%s)" % (util.domain(src.url),)
        if src.kind == SourceKind.OTHER:
            title = "Other link (%s)" % (util.domain(src.url),)

    return self.render_string("modules/source.html",
        id=src.id,
        title=title,
        kind=src.kind,
        url=src.url,
        publication=src.publication,
        pubdate=src.pubdate,
        doi=src.doi,
        creator=src.creator,
        score=src.score,
        can_upvote=can_upvote,
        can_downvote=can_downvote,
        container=container)
def render(self, src, container='div'):
    can_upvote = False
    can_downvote = False
    if self.current_user is not None:
        prev_vote = self.handler.session.query(Action).filter_by(
            what='src_vote',
            user_id=self.current_user.id,
            source=src).first()
        if prev_vote is None or prev_vote.value > 0:
            can_downvote = True
        if prev_vote is None or prev_vote.value < 0:
            can_upvote = True

    title = src.title
    if not title:
        if src.kind == SourceKind.PR:
            title = "Press release (%s)" % (util.domain(src.url),)
        if src.kind == SourceKind.OTHER:
            title = "Other link (%s)" % (util.domain(src.url),)

    return self.render_string("modules/source.html",
        id=src.id,
        title=title,
        kind=src.kind,
        url=src.url,
        publication=src.publication,
        pubdate=src.pubdate,
        doi=src.doi,
        creator=src.creator,
        score=src.score,
        can_upvote=can_upvote,
        can_downvote=can_downvote,
        container=container,
        kind_desc=source_presentation[src.kind]['desc'],
        kind_icon='/static/' + source_presentation[src.kind]['icon'])
def parse_bookmarks_export(html_file):
    """Parse netscape-format bookmarks export files (produced by all browsers)"""

    html_file.seek(0)
    pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
    for line in html_file:
        # example line
        # <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
        match = pattern.search(line)
        if match:
            url = match.group(1)
            time = datetime.fromtimestamp(float(match.group(2)))

            info = {
                'url': url,
                'domain': domain(url),
                'base_url': base_url(url),
                'timestamp': str(time.timestamp()),
                'tags': "",
                'title': match.group(3),
                'sources': [html_file.name],
            }
            info['type'] = get_link_type(info)

            yield info
def parse_pinboard_rss_feed(rss_file):
    """Parse Pinboard RSS feed files into links"""

    rss_file.seek(0)
    root = etree.parse(rss_file).getroot()
    items = root.findall("{http://purl.org/rss/1.0/}item")
    for item in items:
        url = item.find("{http://purl.org/rss/1.0/}link").text
        tags = item.find("{http://purl.org/dc/elements/1.1/}subject").text
        title = item.find("{http://purl.org/rss/1.0/}title").text
        ts_str = item.find("{http://purl.org/dc/elements/1.1/}date").text

        # Pinboard includes a colon in its date stamp timezone offsets, which
        # Python can't parse. Remove it:
        if ":" == ts_str[-3:-2]:
            ts_str = ts_str[:-3] + ts_str[-2:]

        time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")

        info = {
            'url': url,
            'domain': domain(url),
            'base_url': base_url(url),
            'timestamp': str(time.timestamp()),
            'tags': tags,
            'title': title,
            'sources': [rss_file.name],
        }
        info['type'] = get_link_type(info)

        yield info
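# Illustrative sketch (standalone, not part of the parser above): why the colon is
# stripped from the timezone offset before calling strptime. On Python versions
# before 3.7, %z only accepts offsets like "+0000", not "+00:00". The date string
# below is a made-up example in the Pinboard RSS style.
from datetime import datetime

ts_str = "2019-01-30T06:06:01+00:00"        # hypothetical <dc:date> value
if ":" == ts_str[-3:-2]:
    ts_str = ts_str[:-3] + ts_str[-2:]      # "+00:00" -> "+0000"

print(datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z"))
# 2019-01-30 06:06:01+00:00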
def parse_pocket_export(html_file):
    """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""

    html_file.seek(0)
    pattern = re.compile(
        "^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>",
        re.UNICODE)
    for line in html_file:
        # example line
        # <li><a href="http://example.com/" time_added="1478739709" tags="tag1,tag2">example title</a></li>
        match = pattern.search(line)
        if match:
            # remove old readability prefixes to get the original url
            fixed_url = match.group(1).replace(
                'http://www.readability.com/read?url=', '')
            time = datetime.fromtimestamp(float(match.group(2)))

            info = {
                'url': fixed_url,
                'domain': domain(fixed_url),
                'base_url': base_url(fixed_url),
                'timestamp': str(time.timestamp()),
                'tags': match.group(3),
                'title': match.group(4).replace(' — Readability', '').replace(
                    'http://www.readability.com/read?url=', '') or base_url(fixed_url),
                'sources': [html_file.name],
            }
            info['type'] = get_link_type(info)

            yield info
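# Illustrative sketch (standalone, not part of the parser above): what the Pocket
# export regex captures from one export line. The sample line is a made-up example
# matching the comment above; group 1 = url, 2 = unix timestamp, 3 = tags, 4 = title.
import re

pattern = re.compile(
    "^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>",
    re.UNICODE)
line = '<li><a href="http://example.com/" time_added="1478739709" tags="tag1,tag2">example title</a></li>'
match = pattern.search(line)
print(match.groups())
# ('http://example.com/', '1478739709', 'tag1,tag2', 'example title')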
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
    """download site favicon from google's favicon api"""

    output = 'favicon.ico'
    cmd = [
        CURL_BINARY,
        '--max-time', str(timeout),
        '--location',
        '--output', output,
        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
        'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        chmod_file(output, cwd=link_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
    """download site favicon from google's favicon api"""

    output = 'favicon.ico'
    if os.path.exists(os.path.join(link_dir, output)):
        return {'output': output, 'status': 'skipped'}

    CMD = [
        CURL_BINARY,
        '--max-time', str(timeout),
        '--location',
        '--output', output,
        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
        'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
    ]
    end = progress(timeout, prefix=' ')
    try:
        run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        end()
        chmod_file('favicon.ico', cwd=link_dir)
        output = 'favicon.ico'
    except Exception as e:
        end()
        output = e
        print_error_hints(cmd=CMD, pwd=link_dir, err=e)

    return {
        'cmd': CMD,
        'output': output,
    }
def fetch_git(link_dir, link, timeout=TIMEOUT):
    """download full site using git"""

    url_is_clonable = (domain(link['url']) in GIT_DOMAINS
                       or link['url'].endswith('.git')
                       or link['type'] == 'git')
    if not url_is_clonable:
        return {'output': None, 'status': 'skipped'}

    git_dir = os.path.join(link_dir, 'git')
    if os.path.exists(git_dir):
        return {'output': 'git', 'status': 'skipped'}

    os.makedirs(git_dir, exist_ok=True)
    output = 'git'
    CMD = [
        GIT_BINARY,
        'clone',
        '--mirror',
        '--recursive',
        *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
        without_query(without_fragment(link['url'])),
    ]
    end = progress(timeout, prefix=' ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=git_dir, timeout=timeout + 1)  # git/<reponame>
        end()

        if result.returncode == 128:
            # ignore failed re-download when the folder already exists
            pass
        elif result.returncode > 0:
            print(' got git response code {}:'.format(result.returncode))
            raise Exception('Failed git download')
    except Exception as e:
        end()
        print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        print(' Run to see full output:')
        print(' cd {};'.format(link_dir))
        print(' {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }
def basic_link_info(url, f, title=None, time=None, tags=""):
    # avoid the mutable-default pitfall: datetime.now() in the signature would be
    # evaluated once at import time, not once per call
    time = time if time is not None else datetime.now()

    info = {
        'url': url,
        'domain': domain(url),
        'base_url': base_url(url),
        'timestamp': str(time.timestamp()),
        'tags': tags,
        'title': title,
        'sources': [f.name],
    }
    info['type'] = get_link_type(info)

    return info
def should_fetch_git(link_dir, link):
    if is_static_file(link['url']):
        return False

    if os.path.exists(os.path.join(link_dir, 'git')):
        return False

    is_clonable_url = ((domain(link['url']) in GIT_DOMAINS)
                       or (extension(link['url']) == 'git'))
    if not is_clonable_url:
        return False

    return FETCH_GIT
def parse_atom_export(rss_file):
    """Parse Atom XML-format files into links"""

    rss_file.seek(0)
    rss_data = rss_file.read()
    d = feedparser.parse(rss_data)
    for item in d.entries:
        info = {
            'url': item.link,
            'domain': domain(item.link),
            'base_url': base_url(item.link),
            'timestamp': str(mktime(item.published_parsed)),
            'tags': '',
            'title': item.title,
            'sources': [rss_file.name],
        }
        info['type'] = get_link_type(info)

        yield info
def parse_shaarli_rss_export(rss_file):
    """Parse Shaarli-specific RSS XML-format files into links"""

    rss_file.seek(0)
    entries = rss_file.read().split('<entry>')[1:]
    for entry in entries:
        # example entry:
        # <entry>
        #   <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
        #   <link href="https://www.heise.de/security/meldung/Aktuelle-Trojaner-Welle-Emotet-lauert-in-gefaelschten-Rechnungsmails-4291268.html" />
        #   <id>https://demo.shaarli.org/?cEV4vw</id>
        #   <published>2019-01-30T06:06:01+00:00</published>
        #   <updated>2019-01-30T06:06:01+00:00</updated>
        #   <content type="html" xml:lang="en"><![CDATA[<div class="markdown"><p>— <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a></p></div>]]></content>
        # </entry>
        trailing_removed = entry.split('</entry>', 1)[0]
        leading_removed = trailing_removed.strip()
        rows = leading_removed.split('\n')

        def get_row(key):
            return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]

        title = str_between(get_row('title'), '<title>', '</title>').strip()
        url = str_between(get_row('link'), '<link href="', '" />')
        ts_str = str_between(get_row('published'), '<published>', '</published>')
        time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")

        info = {
            'url': url,
            'domain': domain(url),
            'base_url': base_url(url),
            'timestamp': str(time.timestamp()),
            'tags': '',
            'title': title or fetch_page_title(url),
            'sources': [rss_file.name],
        }
        info['type'] = get_link_type(info)

        yield info
def parse_json_export(json_file):
    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""

    json_file.seek(0)
    json_content = json.load(json_file)
    for line in json_content:
        # example line
        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
        if line:
            erg = line
            if erg.get('timestamp'):
                # chrome/ff histories use a very precise timestamp
                timestamp = str(erg['timestamp'] / 10000000)
            elif erg.get('time'):
                timestamp = str(datetime.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ').timestamp())
            elif erg.get('created_at'):
                timestamp = str(datetime.strptime(erg['created_at'], '%Y-%m-%dT%H:%M:%S%z').timestamp())
            else:
                timestamp = str(datetime.now().timestamp())

            if erg.get('href'):
                url = erg['href']
            else:
                url = erg['url']

            if erg.get('description'):
                title = (erg.get('description') or '').replace(' — Readability', '')
            else:
                title = erg.get('title') or base_url(url)

            info = {
                'url': url,
                'domain': domain(url),
                'base_url': base_url(url),
                'timestamp': timestamp,
                'tags': erg.get('tags') or '',
                'title': title,
                'sources': [json_file.name],
            }
            info['type'] = get_link_type(info)

            yield info
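# Illustrative sketch (standalone): the three timestamp shapes handled by the
# branches above, on made-up values. The 'created_at' example assumes Python 3.7+
# is not required because the offset is written without a colon ("+0000").
from datetime import datetime

print(13099257986679750 / 10000000)                                                      # browser-history integer 'timestamp'
print(datetime.strptime("2014-06-14T15:51:42Z", '%Y-%m-%dT%H:%M:%SZ').timestamp())       # pinboard-style 'time'
print(datetime.strptime("2014-06-14T15:51:42+0000", '%Y-%m-%dT%H:%M:%S%z').timestamp())  # wallabag-style 'created_at'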
def parse_rss_export(rss_file):
    """Parse RSS XML-format files into links"""

    rss_file.seek(0)
    items = rss_file.read().split('</item>\n<item>')
    for item in items:
        # example item:
        # <item>
        #   <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
        #   <category>Unread</category>
        #   <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
        #   <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
        #   <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
        # </item>
        trailing_removed = item.split('</item>', 1)[0]
        leading_removed = trailing_removed.split('<item>', 1)[-1]
        rows = leading_removed.split('\n')

        def get_row(key):
            return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]

        title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
        url = str_between(get_row('link'), '<link>', '</link>')
        ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")

        info = {
            'url': url,
            'domain': domain(url),
            'base_url': base_url(url),
            'timestamp': str(time.timestamp()),
            'tags': '',
            'title': title or fetch_page_title(url),
            'sources': [rss_file.name],
        }
        info['type'] = get_link_type(info)

        yield info
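# Illustrative sketch (standalone): the RFC 822-style <pubDate> format parsed above,
# using the value from the example item comment.
from datetime import datetime

print(datetime.strptime("Mon, 21 Aug 2017 14:21:58 -0500", "%a, %d %b %Y %H:%M:%S %z"))
# 2017-08-21 14:21:58-05:00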
def parse_plain_text(text_file):
    """Parse raw links from each line in a text file"""

    text_file.seek(0)
    text_content = text_file.readlines()
    for line in text_content:
        if line:
            urls = re.findall(URL_REGEX, line)
            for url in urls:
                info = {
                    'url': url,
                    'domain': domain(url),
                    'base_url': base_url(url),
                    'timestamp': str(datetime.now().timestamp()),
                    'tags': '',
                    'title': fetch_page_title(url),
                    'sources': [text_file.name],
                }
                info['type'] = get_link_type(info)

                yield info
def parse_json_export(json_file):
    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/)"""

    json_file.seek(0)
    json_content = json.load(json_file)
    for line in json_content:
        # example line
        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
        if line:
            erg = line
            time = datetime.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ')

            info = {
                'url': erg['href'],
                'domain': domain(erg['href']),
                'base_url': base_url(erg['href']),
                'timestamp': str(time.timestamp()),
                'tags': erg['tags'],
                'title': erg['description'].replace(' — Readability', ''),
                'sources': [json_file.name],
            }
            info['type'] = get_link_type(info)

            yield info
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
    """download site favicon from google's favicon api"""

    if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
        return {'output': 'favicon.ico', 'status': 'skipped'}

    CMD = [
        CURL_BINARY,
        '--max-time', str(timeout),
        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
        'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
    ]
    fout = open('{}/favicon.ico'.format(link_dir), 'w')
    end = progress(timeout, prefix=' ')
    try:
        run(CMD, stdout=fout, stderr=DEVNULL, cwd=link_dir, timeout=timeout)  # favicon.ico
        fout.close()
        end()
        chmod_file('favicon.ico', cwd=link_dir)
        output = 'favicon.ico'
    except Exception as e:
        fout.close()
        end()
        print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        print(' Run to see full output:')
        print(' {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }
def parse_medium_rss_feed(rss_file):
    """Parse Medium RSS feed files into links"""

    rss_file.seek(0)
    root = etree.parse(rss_file).getroot()
    items = root.find("channel").findall("item")
    for item in items:
        # for child in item:
        #     print(child.tag, child.text)
        url = item.find("link").text
        title = item.find("title").text
        ts_str = item.find("pubDate").text
        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z")

        info = {
            'url': url,
            'domain': domain(url),
            'base_url': base_url(url),
            'timestamp': str(time.timestamp()),
            'tags': "",
            'title': title,
            'sources': [rss_file.name],
        }
        info['type'] = get_link_type(info)

        yield info
def render(self, art):
    return '<a href="/art/%s">%s</a> (%s)' % (art.id, art.headline, util.domain(art.permalink))
def render(self, url):
    return util.domain(url)
def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC, timeout=TIMEOUT):
    """download full site using wget"""

    domain_dir = os.path.join(link_dir, domain(link['url']))
    existing_file = wget_output_path(link)
    if os.path.exists(domain_dir) and existing_file:
        return {'output': existing_file, 'status': 'skipped'}

    if warc:
        warc_dir = os.path.join(link_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    CMD = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
        '--restrict-file-names=unix',
        '--timeout={}'.format(timeout),
        *(() if warc else ('--timestamping',)),
        *(('--warc-file={}'.format(warc_path),) if warc else ()),
        *(('--page-requisites',) if requisites else ()),
        *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
        *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate', '--no-hsts')),
        link['url'],
    ]
    end = progress(timeout, prefix=' ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        end()
        output = wget_output_path(link, look_in=domain_dir)

        output_tail = [
            ' ' + line
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
            if line.strip()
        ]

        # parse out number of files downloaded from "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        files_downloaded = (int(output_tail[-1].strip().split(' ', 2)[1] or 0)
                            if 'Downloaded:' in output_tail[-1]
                            else 0)

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            print(' Got wget response code {}:'.format(result.returncode))
            print('\n'.join(output_tail))
            if b'403: Forbidden' in result.stderr:
                raise Exception('403 Forbidden (try changing WGET_USER_AGENT)')
            if b'404: Not Found' in result.stderr:
                raise Exception('404 Not Found')
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise Exception('500 Internal Server Error')
            raise Exception('Got an error from the server')
    except Exception as e:
        end()
        output = e
        print_error_hints(cmd=CMD, pwd=link_dir, err=e)

    return {
        'cmd': CMD,
        'output': output,
    }