def write_html_link_index(out_dir, link):
    """write a static html index.html page into out_dir for a single link"""
    check_link_structure(link)
    with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
        link_html = f.read()

    path = os.path.join(out_dir, 'index.html')

    link = derived_link_info(link)

    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(link_html).substitute({
            **link,
            'title': (
                link['title']
                or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
            ),
            'archive_url': urlencode(
                wget_output_path(link)
                or (link['domain'] if link['is_archived'] else 'about:blank')
            ),
            'extension': link['extension'] or 'html',
            'tags': link['tags'].strip() or 'untagged',
            'status': 'Archived' if link['is_archived'] else 'Not yet archived',
            'status_color': 'success' if link['is_archived'] else 'danger',
        }))

    chmod_file(path)
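# Illustrative sketch, not part of the module: string.Template fills $-style
# placeholders from the dict passed to substitute(), which is how the index
# page above is rendered. The placeholder names used here ($title,
# $archive_url, $status) are assumptions about what link_index.html contains.
from string import Template

example_html = '<h1>$title</h1> <a href="$archive_url">$status</a>'
print(Template(example_html).substitute({
    'title': 'Example Domain',
    'archive_url': 'example.com/index.html',
    'status': 'Archived',
}))
# -> <h1>Example Domain</h1> <a href="example.com/index.html">Archived</a>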
def write_html_link_index(out_dir, link):
    check_link_structure(link)
    with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
        link_html = f.read()

    path = os.path.join(out_dir, 'index.html')
    print(' √ index.html')

    link = derived_link_info(link)

    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(link_html).substitute({
            **link,
            'title': (
                link['title']
                or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
            ),
            'archive_url': (
                wget_output_path(link)
                or (link['domain'] if link['is_archived'] else 'about:blank')
            ),
        }))

    chmod_file(path)
def parse_json_link_index(out_dir):
    """load the json link index from a given directory"""
    existing_index = os.path.join(out_dir, 'index.json')
    if os.path.exists(existing_index):
        with open(existing_index, 'r', encoding='utf-8') as f:
            link_json = json.load(f)
            check_link_structure(link_json)
            return link_json
    return {}
def write_json_link_index(out_dir, link):
    """write a json file with some info about the link"""
    check_link_structure(link)
    path = os.path.join(out_dir, 'index.json')

    with open(path, 'w', encoding='utf-8') as f:
        json.dump(link, f, indent=4, default=str)

    chmod_file(path)
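# Why default=str is passed above: json.dump() raises TypeError on values it
# cannot serialize, and default=str coerces them to strings instead. Which
# values in a link dict actually need this (e.g. datetimes in its history) is
# an assumption; the coercion itself is shown standalone below.
import json
from datetime import datetime

print(json.dumps({'fetched': datetime(2019, 1, 1, 12, 0)}, default=str))
# -> {"fetched": "2019-01-01 12:00:00"}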
def parse_json_links_index(out_dir=OUTPUT_DIR):
    """parse an archive index json file and return the list of links"""
    index_path = os.path.join(out_dir, 'index.json')
    if os.path.exists(index_path):
        with open(index_path, 'r', encoding='utf-8') as f:
            links = json.load(f)['links']
            check_links_structure(links)
            return links
    return []
def archive_link(link_dir, link, overwrite=True):
    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""

    check_link_structure(link)

    try:
        update_existing = os.path.exists(link_dir)
        if update_existing:
            link = {
                **parse_json_link_index(link_dir),
                **link,
            }
        else:
            os.makedirs(link_dir)

        print_link_status_line(link_dir, link, update_existing)

        if FETCH_FAVICON:
            link = fetch_favicon(link_dir, link, overwrite=overwrite)

        if FETCH_TITLE:
            link = fetch_title(link_dir, link, overwrite=overwrite)

        if FETCH_WGET:
            link = fetch_wget(link_dir, link, overwrite=overwrite)

        if FETCH_PDF:
            link = fetch_pdf(link_dir, link, overwrite=overwrite)

        if FETCH_SCREENSHOT:
            link = fetch_screenshot(link_dir, link, overwrite=overwrite)

        if FETCH_DOM:
            link = fetch_dom(link_dir, link, overwrite=overwrite)

        if SUBMIT_ARCHIVE_DOT_ORG:
            link = archive_dot_org(link_dir, link, overwrite=overwrite)

        if FETCH_GIT:
            link = fetch_git(link_dir, link, overwrite=overwrite)

        if FETCH_MEDIA:
            link = fetch_media(link_dir, link, overwrite=overwrite)

        write_link_index(link_dir, link)

    except Exception as err:
        print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))

    return link
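# Illustrative usage sketch, not part of the module. ARCHIVE_DIR and the
# example link fields are assumptions; archive_link() itself only needs a
# folder path plus a link dict that passes check_link_structure().
#
#   link = {
#       'url': 'https://example.com',
#       'timestamp': '1546297200',
#       'title': 'Example Domain',
#   }
#   link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])  # folder named after the timestamp
#   link = archive_link(link_dir, link, overwrite=False)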
def load_json_link_index(out_dir, link):
    """check for an existing link archive in the given directory,
       and load+merge it into the given link dict
    """
    link = {
        **parse_json_link_index(out_dir),
        **link,
    }
    link.update({
        'history': link.get('history') or {},
    })

    check_link_structure(link)
    return link
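# Note on the merge order above: because **link is unpacked last, values passed
# in by the caller override whatever was previously saved in index.json. A
# standalone demonstration of the same dict-merge precedence:
on_disk = {'title': 'Old Title', 'tags': 'news'}
incoming = {'title': 'New Title'}
merged = {**on_disk, **incoming}
print(merged)  # {'title': 'New Title', 'tags': 'news'} -- the caller's value wins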
def write_html_link_index(out_dir, link):
    check_link_structure(link)
    with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
        link_html = f.read()

    path = os.path.join(out_dir, 'index.html')
    print(' √ index.html')

    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(link_html).substitute({
            **derived_link_info(link),
            # **link['latest'],
        }))

    chmod_file(path)
def load_link_index(link_dir, link):
    """check for an existing link archive in the given directory,
       and load+merge it into the given link dict
    """
    is_new = not os.path.exists(link_dir)
    if is_new:
        os.makedirs(link_dir)
    else:
        link = {
            **parse_json_link_index(link_dir),
            **link,
        }

    check_link_structure(link)
    print_link_status_line(link_dir, link, is_new)

    return link
def validate_links(links):
    check_links_structure(links)
    links = archivable_links(links)   # remove chrome://, about:, mailto: etc.
    links = uniquefied_links(links)   # merge/dedupe duplicate timestamps & urls
    links = sorted_links(links)       # deterministically sort the links based on timestamp, url

    if not links:
        print('[X] No links found :(')
        raise SystemExit(1)

    for link in links:
        link['title'] = unescape(link['title'].strip()) if link['title'] else None
        check_link_structure(link)

    return list(links)
def write_html_link_index(out_dir, link):
    """write a static html index page for the given link using the fancy template"""
    check_link_structure(link)
    with open(os.path.join(TEMPLATES_DIR, 'link_index_fancy.html'), 'r', encoding='utf-8') as f:
        link_html = f.read()

    path = os.path.join(out_dir, 'index.html')
    print(' √ index.html')

    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(link_html).substitute({
            **link,
            **link['latest'],
            'title': link['title'] or link['url'],
            'type': link['type'] or 'website',
            'tags': link['tags'] or 'untagged',
            'bookmarked': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
            'updated': datetime.fromtimestamp(float(link['updated'])).strftime('%Y-%m-%d %H:%M'),
            'bookmarked_ts': link['timestamp'],
            'updated_ts': link['updated'],
            'archive_org': link['latest'].get('archive_org') or 'https://web.archive.org/save/{}'.format(link['url']),
            'wget': link['latest'].get('wget') or wget_output_path(link),
        }))

    chmod_file(path)
def validate_links(links):
    check_links_structure(links)
    links = archivable_links(links)   # remove chrome://, about:, mailto: etc.
    links = uniquefied_links(links)   # merge/dedupe duplicate timestamps & urls
    links = sorted_links(links)       # deterministically sort the links based on timestamp, url

    if not links:
        print('[X] No links found :(')
        raise SystemExit(1)

    for link in links:
        check_link_structure(link)
        link['title'] = unescape(link['title']) if link['title'] else None
        link['latest'] = link.get('latest') or {}

        latest = link['latest']
        if not latest.get('wget'):
            latest['wget'] = wget_output_path(link)

        if not latest.get('pdf'):
            latest['pdf'] = None

        if not latest.get('screenshot'):
            latest['screenshot'] = None

        if not latest.get('dom'):
            latest['dom'] = None

        if not latest.get('favicon'):
            latest['favicon'] = None

        if not latest.get('title'):
            latest['title'] = link['title']

    return list(links)
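# Illustrative pipeline sketch, not part of the module: parsed links are run
# through validate_links() before anything is archived or written back out.
# Where the raw links come from is an assumption; any parser output with the
# fields check_links_structure() expects would do.
#
#   links = validate_links(parse_json_links_index(OUTPUT_DIR))
#   # every link now has a 'latest' dict with wget/pdf/screenshot/dom/favicon/title keys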