def parse_json_links_index(out_dir=OUTPUT_DIR):
    """parse an archive index json file and return the list of links"""
    index_path = os.path.join(out_dir, 'index.json')
    if os.path.exists(index_path):
        with open(index_path, 'r', encoding='utf-8') as f:
            links = json.load(f)['links']
            check_links_structure(links)
            return links
    return []

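# Hypothetical usage sketch (not part of the original module): load the JSON
# index and report its size. Assumes OUTPUT_DIR contains an index.json.
def _example_load_index(out_dir=OUTPUT_DIR):
    links = parse_json_links_index(out_dir)
    print('loaded {} links from {}'.format(len(links), os.path.join(out_dir, 'index.json')))
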
def write_links_index(out_dir, links, finished=False):
    """create index.html file for a given list of links"""
    log_indexing_started()
    check_links_structure(links)
    write_json_links_index(out_dir, links)
    log_indexing_finished(out_dir, 'index.json')
    write_html_links_index(out_dir, links, finished=finished)
    log_indexing_finished(out_dir, 'index.html')

def write_html_links_index(out_dir, links, finished=False):
    """write the html link index to a given path"""
    check_links_structure(links)
    path = os.path.join(out_dir, 'index.html')

    copy_tree(os.path.join(TEMPLATES_DIR, 'static'), os.path.join(out_dir, 'static'))

    with open(os.path.join(out_dir, 'robots.txt'), 'w+') as f:
        f.write('User-agent: *\nDisallow: /')

    with open(os.path.join(TEMPLATES_DIR, 'index.html'), 'r', encoding='utf-8') as f:
        index_html = f.read()

    with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as f:
        link_row_html = f.read()

    full_links_info = (derived_link_info(link) for link in links)

    link_rows = '\n'.join(
        Template(link_row_html).substitute(**{
            **link,
            'title': (
                link['title']
                or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
            ),
            'favicon_url': (
                os.path.join('archive', link['timestamp'], 'favicon.ico')
                # if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs='
            ),
            'archive_url': urlencode(
                wget_output_path(link) or 'index.html'
            ),
        })
        for link in full_links_info
    )

    template_vars = {
        'num_links': len(links),
        'date_updated': datetime.now().strftime('%Y-%m-%d'),
        'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
        'footer_info': FOOTER_INFO,
        'git_sha': GIT_SHA,
        'short_git_sha': GIT_SHA[:8],
        'rows': link_rows,
        'status': 'finished' if finished else 'running',
    }

    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(index_html).substitute(**template_vars))

    chmod_file(path)

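# Minimal sketch of the string.Template substitution used above, shown with a
# made-up one-column row template (the real index_row.html ships in
# TEMPLATES_DIR and has many more placeholders):
def _example_render_row():
    row = Template('<tr><td>$title</td><td>$archive_url</td></tr>')
    return row.substitute(title='Example', archive_url='archive/1546300800/index.html')
    # -> '<tr><td>Example</td><td>archive/1546300800/index.html</td></tr>'
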
def write_links_index(out_dir, links):
    """create index.html file for a given list of links"""
    check_links_structure(links)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    print('{green}[*] [{}] Updating main index files...{reset}'.format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        **ANSI,
    ))
    write_json_links_index(out_dir, links)
    print('    > {}/index.json'.format(pretty_path(out_dir)))
    write_html_links_index(out_dir, links)
    print('    > {}/index.html'.format(pretty_path(out_dir)))

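# Hypothetical usage sketch: rebuild both index files for the main output
# folder (load_links is defined further down in this module):
def _example_rebuild_index():
    all_links, _ = load_links(archive_path=OUTPUT_DIR)
    write_links_index(OUTPUT_DIR, all_links)
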
def validate_links(links):
    check_links_structure(links)
    links = archivable_links(links)   # remove chrome://, about:, mailto: etc.
    links = uniquefied_links(links)   # merge/dedupe duplicate timestamps & urls
    links = sorted_links(links)       # deterministically sort the links based on timestamp, url

    if not links:
        print('[X] No links found :(')
        raise SystemExit(1)

    for link in links:
        link['title'] = unescape(link['title'].strip()) if link['title'] else None
        check_link_structure(link)

    return list(links)

def write_json_links_index(out_dir, links):
    """write the json link index to a given path"""
    check_links_structure(links)
    path = os.path.join(out_dir, 'index.json')

    index_json = {
        'info': 'ArchiveBox Index',
        'help': 'https://github.com/pirate/ArchiveBox',
        'version': GIT_SHA,
        'num_links': len(links),
        'updated': str(datetime.now().timestamp()),
        'links': links,
    }

    with open(path, 'w', encoding='utf-8') as f:
        json.dump(index_json, f, indent=4, default=str)

    chmod_file(path)

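# The resulting index.json is shaped like the index_json dict above; the
# values below are illustrative only:
#
#     {
#         "info": "ArchiveBox Index",
#         "help": "https://github.com/pirate/ArchiveBox",
#         "version": "<GIT_SHA>",
#         "num_links": 1,
#         "updated": "1546300800.0",
#         "links": [{"url": "...", "timestamp": "...", ...}]
#     }
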
def load_links_index(out_dir=OUTPUT_DIR, import_path=None):
    """parse and load existing index with any new links from import_path merged in"""
    existing_links = []
    if out_dir:
        existing_links = parse_json_links_index(out_dir)
        check_links_structure(existing_links)

    new_links = []
    if import_path:
        # parse and validate the import file
        log_parsing_started(import_path)
        raw_links, parser_name = parse_links(import_path)
        new_links = validate_links(raw_links)
        check_links_structure(new_links)

    # merge existing links in out_dir and new links
    all_links = validate_links(existing_links + new_links)
    check_links_structure(all_links)
    num_new_links = len(all_links) - len(existing_links)

    if import_path and parser_name:
        log_parsing_finished(num_new_links, parser_name)

    return all_links, new_links

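# Hypothetical usage sketch ('bookmarks.html' is a made-up import file in any
# format parse_links understands):
def _example_import_bookmarks():
    all_links, new_links = load_links_index(out_dir=OUTPUT_DIR,
                                            import_path='bookmarks.html')
    write_links_index(OUTPUT_DIR, all_links)
    return new_links
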
def write_html_links_index(out_dir, links):
    """write the html link index to a given path"""
    check_links_structure(links)
    path = os.path.join(out_dir, 'index.html')

    copy_tree(os.path.join(TEMPLATES_DIR, 'static'), os.path.join(out_dir, 'static'))

    with open(os.path.join(out_dir, 'robots.txt'), 'w+') as f:
        f.write('User-agent: *\nDisallow: /')

    with open(os.path.join(TEMPLATES_DIR, 'index.html'), 'r', encoding='utf-8') as f:
        index_html = f.read()

    with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as f:
        link_row_html = f.read()

    link_rows = '\n'.join(
        Template(link_row_html).substitute(**derived_link_info(link))
        for link in links
    )

    template_vars = {
        'num_links': len(links),
        'date_updated': datetime.now().strftime('%Y-%m-%d'),
        'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
        'footer_info': FOOTER_INFO,
        'git_sha': GIT_SHA,
        'short_git_sha': GIT_SHA[:8],
        'rows': link_rows,
    }

    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(index_html).substitute(**template_vars))

    chmod_file(path)

def load_links(archive_path=OUTPUT_DIR, import_path=None):
    """get new links from file and optionally append them to links in existing archive"""
    existing_links = []
    if archive_path:
        existing_links = parse_json_links_index(archive_path)
        check_links_structure(existing_links)

    new_links = []
    if import_path:
        # parse and validate the import file
        raw_links, parser_name = parse_links(import_path)
        new_links = validate_links(raw_links)
        check_links_structure(new_links)

    # merge existing links in archive_path and new links
    all_links = validate_links(existing_links + new_links)
    check_links_structure(all_links)
    num_new_links = len(all_links) - len(existing_links)

    if import_path and parser_name:
        print('    > Adding {} new links to index (parsed import as {})'.format(
            num_new_links,
            parser_name,
        ))

    return all_links, new_links

def validate_links(links):
    check_links_structure(links)
    links = archivable_links(links)   # remove chrome://, about:, mailto: etc.
    links = uniquefied_links(links)   # merge/dedupe duplicate timestamps & urls
    links = sorted_links(links)       # deterministically sort the links based on timestamp, url

    if not links:
        print('[X] No links found :(')
        raise SystemExit(1)

    for link in links:
        check_link_structure(link)
        link['title'] = unescape(link['title']) if link['title'] else None
        link['latest'] = link.get('latest') or {}

        latest = link['latest']
        if not latest.get('wget'):
            latest['wget'] = wget_output_path(link)

        if not latest.get('pdf'):
            latest['pdf'] = None

        if not latest.get('screenshot'):
            latest['screenshot'] = None

        if not latest.get('dom'):
            latest['dom'] = None

        if not latest.get('favicon'):
            latest['favicon'] = None

        if not latest.get('title'):
            latest['title'] = link['title']

    return list(links)

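# After validate_links() runs, every link dict has at least the fields set in
# the loop above ('url' and 'timestamp' come from the parsers; values here
# are illustrative):
#
#     {
#         'timestamp': '1546300800',
#         'url': 'https://example.com',
#         'title': 'Example Page',               # or None if never fetched
#         'latest': {
#             'wget': 'example.com/index.html',  # from wget_output_path()
#             'pdf': None,
#             'screenshot': None,
#             'dom': None,
#             'favicon': None,
#             'title': 'Example Page',
#         },
#     }
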
def update_archive(archive_path, links, source=None, resume=None, append=True):
    """update or create index.html+json given a path to an export file containing new links"""
    start_ts = datetime.now().timestamp()

    if resume:
        print('{green}[▶] [{}] Resuming archive downloading from {}...{reset}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            resume,
            **ANSI,
        ))
    else:
        print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            len(links),
            **ANSI,
        ))

    check_links_structure(links)

    # prefetch the first link off the generator so that if we pause or fail
    # immediately we can show that we paused on the first link and not just None
    to_archive = Peekable(links_after_timestamp(links, resume))
    idx, link = 0, to_archive.peek(0)

    # loop over links and archive them
    try:
        check_dependencies()
        for idx, link in enumerate(to_archive):
            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
            archive_link(link_dir, link)

    except (KeyboardInterrupt, SystemExit, Exception) as e:
        print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
            **ANSI,
            now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            idx=idx + 1,
            timestamp=link['timestamp'],
            total=len(links),
        ))
        print('    To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
        print('    Continue where you left off by running:')
        print('        {} {}'.format(
            pretty_path(sys.argv[0]),
            link['timestamp'],
        ))
        if not isinstance(e, KeyboardInterrupt):
            print()
            raise e
        raise SystemExit(1)

    # print timing information & summary
    end_ts = datetime.now().timestamp()
    seconds = end_ts - start_ts
    if seconds > 60:
        duration = '{0:.2f} min'.format(seconds / 60)
    else:
        duration = '{0:.2f} sec'.format(seconds)

    print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
        ANSI['green'],
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        len(links),
        duration,
        ANSI['reset'],
    ))
    print('    - {} entries skipped'.format(_RESULTS_TOTALS['skipped']))
    print('    - {} entries updated'.format(_RESULTS_TOTALS['succeded']))  # key is spelled 'succeded' in _RESULTS_TOTALS
    print('    - {} errors'.format(_RESULTS_TOTALS['failed']))
    print('    To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))

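# Hypothetical usage sketch: import a file and archive everything in it
# (resume takes a timestamp string; links archived before it are skipped by
# links_after_timestamp):
def _example_run_update(source='bookmarks.html'):
    all_links, _ = load_links(archive_path=OUTPUT_DIR, import_path=source)
    update_archive(OUTPUT_DIR, all_links, source=source, resume=None)
    write_links_index(OUTPUT_DIR, all_links)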