def parse_pocket_export(html_file):
    """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
    html_file.seek(0)
    link_pattern = re.compile(
        "^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>",
        re.UNICODE)
    for line in html_file:
        # example line
        # <li><a href="http://example.com/" time_added="1478739709" tags="tag1,tag2">example title</a></li>
        match = link_pattern.search(line)
        if not match:
            continue
        # strip old readability prefixes to recover the original url
        cleaned_url = match.group(1).replace(
            'http://www.readability.com/read?url=', '')
        added = datetime.fromtimestamp(float(match.group(2)))
        title = match.group(4).replace(' — Readability', '').replace(
            'http://www.readability.com/read?url=', '')
        info = {
            'url': cleaned_url,
            'domain': domain(cleaned_url),
            'base_url': base_url(cleaned_url),
            'timestamp': str(added.timestamp()),
            'tags': match.group(3),
            # fall back to the base url when the anchor text is empty after cleanup
            'title': title or base_url(cleaned_url),
            'sources': [html_file.name],
        }
        info['type'] = get_link_type(info)
        yield info
def parse_bookmarks_export(html_file):
    """Parse netscape-format bookmarks export files (produced by all browsers)"""
    html_file.seek(0)
    anchor_pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>",
                                re.UNICODE | re.IGNORECASE)
    for line in html_file:
        # example line
        # <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
        match = anchor_pattern.search(line)
        if not match:
            continue
        bookmark_url = match.group(1)
        added = datetime.fromtimestamp(float(match.group(2)))
        info = {
            'url': bookmark_url,
            'domain': domain(bookmark_url),
            'base_url': base_url(bookmark_url),
            'timestamp': str(added.timestamp()),
            'tags': "",
            'title': match.group(3),
            'sources': [html_file.name],
        }
        info['type'] = get_link_type(info)
        yield info
def parse_pinboard_rss_feed(rss_file):
    """Parse Pinboard RSS feed files into links.

    Yields one info dict per RSS 1.0 <item> in the feed.
    """
    rss_file.seek(0)
    root = etree.parse(rss_file).getroot()
    items = root.findall("{http://purl.org/rss/1.0/}item")
    for item in items:
        url = item.find("{http://purl.org/rss/1.0/}link").text
        # <dc:subject> (tags) and <title> are optional in Pinboard feeds;
        # calling .text on a missing element raised AttributeError before —
        # findtext() returns None instead when the element is absent.
        tags = item.findtext("{http://purl.org/dc/elements/1.1/}subject") or ''
        title = item.findtext("{http://purl.org/rss/1.0/}title")
        ts_str = item.find("{http://purl.org/dc/elements/1.1/}date").text
        # Pinboard includes a colon in its date stamp timezone offsets, which
        # Python can't parse. Remove it:
        if ":" == ts_str[-3:-2]:
            ts_str = ts_str[:-3] + ts_str[-2:]
        time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
        info = {
            'url': url,
            'domain': domain(url),
            'base_url': base_url(url),
            'timestamp': str(time.timestamp()),
            'tags': tags,
            'title': title,
            'sources': [rss_file.name],
        }
        info['type'] = get_link_type(info)
        yield info
def test_base_url(self):
    """base_url() should reduce a URL to its containing 'directory' path."""
    for expected, url in (
        ('http://site/', 'http://site/'),
        ('http://site/path/', 'http://site/path/'),
        ('http://site/path/', 'http://site/path/leaf'),
        ('http://site/path/', 'http://site/path/leaf?query#frag'),
    ):
        # assertEquals is a deprecated alias (removed in Python 3.12);
        # use the canonical assertEqual.
        self.assertEqual(expected, util.base_url(url))
def parse_json_export(json_file):
    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
    json_file.seek(0)
    json_content = json.load(json_file)
    for line in json_content:
        # example line
        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
        if not line:
            continue
        erg = line
        # pick whichever timestamp field this export format provides
        if erg.get('timestamp'):
            # chrome/ff histories use a very precise timestamp
            timestamp = str(erg['timestamp'] / 10000000)
        elif erg.get('time'):
            parsed = datetime.strptime(erg['time'].split(',', 1)[0],
                                       '%Y-%m-%dT%H:%M:%SZ')
            timestamp = str(parsed.timestamp())
        elif erg.get('created_at'):
            parsed = datetime.strptime(erg['created_at'], '%Y-%m-%dT%H:%M:%S%z')
            timestamp = str(parsed.timestamp())
        else:
            timestamp = str(datetime.now().timestamp())
        # pinboard uses 'href', wallabag uses 'url'
        url = erg['href'] if erg.get('href') else erg['url']
        if erg.get('description'):
            title = (erg.get('description') or '').replace(' — Readability', '')
        else:
            title = erg.get('title') or base_url(url)
        info = {
            'url': url,
            'domain': domain(url),
            'base_url': base_url(url),
            'timestamp': timestamp,
            'tags': erg.get('tags') or '',
            'title': title,
            'sources': [json_file.name],
        }
        info['type'] = get_link_type(info)
        yield info
def basic_link_info(url, f, title=None, time=None, tags=""):
    """Build the standard link-info dict for a single url.

    url: the link itself; f: the source file object (only f.name is read);
    title/tags: optional metadata; time: a datetime, defaulting to "now".
    """
    # BUG FIX: the original default `time=datetime.now()` was evaluated once
    # at function-definition time, so every call without an explicit `time`
    # got the import-time timestamp.  Resolve the default per call instead.
    if time is None:
        time = datetime.now()
    info = {
        'url': url,
        'domain': domain(url),
        'base_url': base_url(url),
        'timestamp': str(time.timestamp()),
        'tags': tags,
        'title': title,
        'sources': [f.name],
    }
    info['type'] = get_link_type(info)
    return info
def parse_atom_export(rss_file):
    """Parse Atom XML-format files into links"""
    rss_file.seek(0)
    feed = feedparser.parse(rss_file.read())
    for entry in feed.entries:
        link = entry.link
        info = {
            'url': link,
            'domain': domain(link),
            'base_url': base_url(link),
            'timestamp': str(mktime(entry.published_parsed)),
            'tags': '',
            'title': entry.title,
            'sources': [rss_file.name],
        }
        info['type'] = get_link_type(info)
        yield info
def parse_shaarli_rss_export(rss_file):
    """Parse Shaarli-specific RSS XML-format files into links"""
    rss_file.seek(0)
    entries = rss_file.read().split('<entry>')[1:]
    for entry in entries:
        # example entry:
        # <entry>
        #   <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
        #   <link href="https://www.heise.de/security/meldung/Aktuelle-Trojaner-Welle-Emotet-lauert-in-gefaelschten-Rechnungsmails-4291268.html" />
        #   <id>https://demo.shaarli.org/?cEV4vw</id>
        #   <published>2019-01-30T06:06:01+00:00</published>
        #   <updated>2019-01-30T06:06:01+00:00</updated>
        #   <content type="html" xml:lang="en"><![CDATA[...]]></content>
        # </entry>
        body = entry.split('</entry>', 1)[0].strip()
        rows = body.split('\n')

        def get_row(key):
            # first row whose stripped text opens the given tag
            candidates = [
                r.strip() for r in rows
                if r.strip().startswith('<{}'.format(key))
            ]
            return candidates[0]

        title = str_between(get_row('title'), '<title>', '</title>').strip()
        url = str_between(get_row('link'), '<link href="', '" />')
        ts_str = str_between(get_row('published'), '<published>', '</published>')
        published = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
        info = {
            'url': url,
            'domain': domain(url),
            'base_url': base_url(url),
            'timestamp': str(published.timestamp()),
            'tags': '',
            # fall back to fetching the page when the feed title is empty
            'title': title or fetch_page_title(url),
            'sources': [rss_file.name],
        }
        info['type'] = get_link_type(info)
        yield info
def parse_rss_export(rss_file):
    """Parse RSS XML-format files into links"""
    rss_file.seek(0)
    items = rss_file.read().split('</item>\n<item>')
    for item in items:
        # example item:
        # <item>
        #   <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
        #   <category>Unread</category>
        #   <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
        #   <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
        #   <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
        # </item>
        body = item.split('</item>', 1)[0].split('<item>', 1)[-1]
        rows = body.split('\n')

        def get_row(key):
            # first row opening the given tag, e.g. '<title>'
            candidates = [
                r for r in rows
                if r.strip().startswith('<{}>'.format(key))
            ]
            return candidates[0]

        title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
        url = str_between(get_row('link'), '<link>', '</link>')
        ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
        published = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
        info = {
            'url': url,
            'domain': domain(url),
            'base_url': base_url(url),
            'timestamp': str(published.timestamp()),
            'tags': '',
            'title': title or fetch_page_title(url),
            'sources': [rss_file.name],
        }
        info['type'] = get_link_type(info)
        yield info
def parse_plain_text(text_file):
    """Parse raw links from each line in a text file"""
    text_file.seek(0)
    for line in text_file.readlines():
        if not line:
            continue
        # a single line may contain several urls
        for url in re.findall(URL_REGEX, line):
            info = {
                'url': url,
                'domain': domain(url),
                'base_url': base_url(url),
                'timestamp': str(datetime.now().timestamp()),
                'tags': '',
                'title': fetch_page_title(url),
                'sources': [text_file.name],
            }
            info['type'] = get_link_type(info)
            yield info
def parse_json_export(json_file):
    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/)"""
    # NOTE(review): a second function of the same name appears earlier in this
    # file; the later definition wins at import time — confirm which is intended.
    json_file.seek(0)
    for entry in json.load(json_file):
        # example line
        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
        if not entry:
            continue
        url = entry['href']
        added = datetime.strptime(entry['time'].split(',', 1)[0],
                                  '%Y-%m-%dT%H:%M:%SZ')
        info = {
            'url': url,
            'domain': domain(url),
            'base_url': base_url(url),
            'timestamp': str(added.timestamp()),
            'tags': entry['tags'],
            'title': entry['description'].replace(' — Readability', ''),
            'sources': [json_file.name],
        }
        info['type'] = get_link_type(info)
        yield info
def parse_medium_rss_feed(rss_file):
    """Parse Medium RSS feed files into links"""
    rss_file.seek(0)
    channel = etree.parse(rss_file).getroot().find("channel")
    for item in channel.findall("item"):
        url = item.find("link").text
        title = item.find("title").text
        published = datetime.strptime(item.find("pubDate").text,
                                      "%a, %d %b %Y %H:%M:%S %Z")
        info = {
            'url': url,
            'domain': domain(url),
            'base_url': base_url(url),
            'timestamp': str(published.timestamp()),
            'tags': "",
            'title': title,
            'sources': [rss_file.name],
        }
        info['type'] = get_link_type(info)
        yield info
""" Start """ import os, json, socket, sys, util path = os.path.split(os.path.realpath(__file__))[0]; os.chdir(path); sys.path.insert(0, path) os.environ['DEVEL']='yes' os.environ['PGHOST']=os.path.join(path, 'postgres_data/socket') util.chdir() ports = util.get_ports() base_url = util.base_url(ports['hub-share-2']) print('''\n\nBASE URL: {}\n\n'''.format(base_url)) share_path= os.path.join(os.environ['SMC_ROOT'], 'data/projects/[project_id]') cmd = "cd ../../ && . smc-env && service_hub.py --share_path={share_path} --foreground --hostname=0.0.0.0 --port=0 --share_port={share_port} --proxy_port=0 --gap=0 --base_url={base_url} start".format( base_url = base_url, share_port = ports['hub-share-2'], share_path = share_path) util.cmd(cmd)
#!/usr/bin/env python import os, json, socket, sys, util path = os.path.split(os.path.realpath(__file__))[0] os.chdir(path) sys.path.insert(0, path) os.environ['DEVEL'] = 'yes' util.chdir() ports = util.get_ports() base_url = util.base_url() hostname = socket.gethostname() cmd = "service_hub.py --dev --foreground --db=localhost:{db_port} --db_concurrent_warn=100 --db_pool=10 --hostname={hostname} --port={hub_port} --proxy_port=0 --gap=0 --base_url={base_url} start".format( hostname=hostname, base_url=base_url, db_port=ports['rethinkdb'], hub_port=ports['hub']) util.cmd(cmd)
#!/usr/bin/env python import time, util while True: try: print "Visit https://cloud.sagemath.com" + util.base_url() + '/\n' except Exception, mesg: print mesg print "waiting..." time.sleep(15)
#!/usr/bin/env python import os, json, socket, sys, util path = os.path.split(os.path.realpath(__file__))[0]; os.chdir(path); sys.path.insert(0, path) os.environ['DEVEL']='yes' os.environ['PGHOST']=os.path.join(path, 'postgres_data/socket') if 'TMUX' in os.environ: # see https://github.com/sagemathinc/cocalc/issues/563 del os.environ['TMUX'] util.chdir() ports = util.get_ports() base_url = util.base_url() hostname = 'localhost' cmd = "service_hub.py --dev --foreground --hostname={hostname} --port={hub_port} --proxy_port=0 --gap=0 --base_url={base_url} start".format( hostname = hostname, base_url = base_url, hub_port = ports['hub']) util.cmd(cmd)
""" Start """ import os, json, socket, sys, util path = os.path.split(os.path.realpath(__file__))[0]; os.chdir(path); sys.path.insert(0, path) os.environ['DEVEL']='yes' os.environ['PGHOST']=os.path.join(path, 'postgres_data/socket') util.chdir() ports = util.get_ports() base_url = util.base_url(ports['hub-share-2'], write=False) print('''\n\nBASE URL: {}\n\n'''.format(base_url)) if 'COCALC_PROJECT_PATH' in os.environ: share_path = os.environ['COCALC_PROJECT_PATH'] + '[project_id]' else: share_path= os.path.join(os.environ['SMC_ROOT'], 'data/projects/[project_id]') cmd = "unset NODE_ENV; cd ../../ && . smc-env && service_hub.py --share_path={share_path} --foreground --hostname=0.0.0.0 --port=0 --share_port={share_port} --proxy_port=0 --gap=0 --base_url={base_url} {test} start".format( base_url = base_url, share_port = ports['hub-share-2'], share_path = share_path, test=util.test()) util.cmd(cmd)
#!/usr/bin/env python import os, sys, time path = os.path.split(os.path.realpath(__file__))[0] os.chdir(path) sys.path.insert(0, path) import util while True: try: print "Visit https://cocalc.com" + util.base_url() + '/\n' except Exception, mesg: print mesg print "waiting..." time.sleep(15)
#!/usr/bin/env python """ Start """ import os, json, socket, sys, util path = os.path.split(os.path.realpath(__file__))[0] os.chdir(path) sys.path.insert(0, path) os.environ['DEVEL'] = 'yes' os.environ['PGHOST'] = os.path.join(path, 'postgres_data/socket') util.chdir() ports = util.get_ports() base_url = util.base_url(ports['hub-share-2'], write=False) print('''\n\nBASE URL: {}\n\n'''.format(base_url)) share_path = os.path.join(os.environ['SMC_ROOT'], 'data/projects/[project_id]') cmd = "unset NODE_ENV; cd ../../ && . smc-env && service_hub.py --share_path={share_path} --foreground --hostname=0.0.0.0 --port=0 --share_port={share_port} --proxy_port=0 --gap=0 --base_url={base_url} start".format( base_url=base_url, share_port=ports['hub-share-2'], share_path=share_path) util.cmd(cmd)
#!/usr/bin/env python import time, util while True: try: print "Visit https://cloud.sagemath.com" + util.base_url( ) + '/\n' except Exception, mesg: print mesg print "waiting..." time.sleep(15)
#!/usr/bin/env python import util print "Point your browser at\n\n https://cloud.sagemath.com" + util.base_url() + '\n\n'