def archive_links(archive_path, links, source=None, resume=None):
    check_dependencies()

    to_archive = Peekable(links_after_timestamp(links, resume))
    idx, link = 0, to_archive.peek(0)

    try:
        for idx, link in enumerate(to_archive):
            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
            archive_link(link_dir, link)

    except (KeyboardInterrupt, SystemExit, Exception) as e:
        print('{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
            **ANSI,
            now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            idx=idx + 1,
            timestamp=link['timestamp'],
            total=len(links),
        ))
        print('    Continue where you left off by running:')
        print('        {} {}'.format(
            pretty_path(sys.argv[0]),
            link['timestamp'],
        ))
        if not isinstance(e, KeyboardInterrupt):
            raise e
        raise SystemExit(1)

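# `Peekable` is used above (and in update_archive() below) but never defined
# in this file; a minimal sketch of such a wrapper, assuming it only needs to
# support iteration plus peek() — the buffering approach here is an
# assumption, not necessarily the real implementation:
from itertools import chain

class Peekable:
    """Iterator wrapper that can look at upcoming items without consuming them."""

    def __init__(self, iterable):
        self.iterator = iter(iterable)

    def __iter__(self):
        return self

    def __next__(self):
        return next(self.iterator)

    def peek(self, n=0):
        # pull n+1 items off the iterator, then push them back onto the front
        # so a later loop still sees them
        buffered = [next(self.iterator) for _ in range(n + 1)]
        self.iterator = chain(buffered, self.iterator)
        return buffered[-1]
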
def update_archive_data(import_path=None, resume=None):
    """The main ArchiveBox entry point. Everything starts here."""

    check_dependencies()

    # Step 1: Load list of links from the existing index,
    #         merge in and dedupe new links from import_path
    all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)

    # Step 2: Write updated index with deduped old and new links back to disk
    write_links_index(out_dir=OUTPUT_DIR, links=all_links)

    # Step 3: Run the archive methods for each link
    links = new_links if ONLY_NEW else all_links
    log_archiving_started(len(links), resume)
    idx, link = 0, 0
    try:
        for idx, link in enumerate(links_after_timestamp(links, resume)):
            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
            archive_link(link_dir, link)
    except KeyboardInterrupt:
        log_archiving_paused(len(links), idx, link and link['timestamp'])
        raise SystemExit(0)
    except:
        print()
        raise
    log_archiving_finished(len(links))

    # Step 4: Re-write links index with updated titles, icons, and resources
    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
    write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)

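# `links_after_timestamp` is assumed throughout this file but not defined in
# it; a minimal sketch of the helper: a generator that skips links until it
# reaches the given resume timestamp, so an interrupted run can pick up where
# it left off (whether the boundary is inclusive is an assumption here):
def links_after_timestamp(links, timestamp=None):
    if not timestamp:
        yield from links
        return

    for link in links:
        try:
            if float(link['timestamp']) >= float(timestamp):
                yield link
        except (ValueError, TypeError):
            print('Resume value and all timestamp values must be valid numbers.')
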
def archive_links(archive_path, links, source=None, resume=None):
    check_dependencies()

    to_archive = links_after_timestamp(links, resume)
    # initialize these up front so the except handler below can't hit a
    # NameError if we fail before the first iteration
    idx, link = 0, None
    try:
        for idx, link in enumerate(to_archive):
            link_dir = os.path.join(archive_path, link['timestamp'])
            archive_link(link_dir, link)

    except (KeyboardInterrupt, SystemExit, Exception) as e:
        # the total comes from len(links): to_archive is a generator that has
        # already been partially consumed, so measuring its remaining length
        # would undercount the total
        print('{red}[X] Index is up-to-date, archive update paused on link {idx}/{total}{reset}'.format(
            **ANSI,
            idx=idx,
            total=len(links),
        ))
        print('    Continue where you left off by running:')
        print('        ./archive.py {} {}'.format(
            source,
            link and link['timestamp'],
        ))
        if not isinstance(e, KeyboardInterrupt):
            raise e
        raise SystemExit(1)

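# `check_dependencies` is also assumed but undefined here; a hedged sketch of
# what it likely does — verify that the external tools the archive methods
# shell out to are installed before starting (the exact tool list below is a
# hypothetical example, not the project's actual dependency set):
import shutil

def check_dependencies():
    required = ('wget', 'curl', 'chromium-browser')  # hypothetical tool list
    missing = [cmd for cmd in required if shutil.which(cmd) is None]
    if missing:
        print('{red}[X] Missing dependencies: {}{reset}'.format(', '.join(missing), **ANSI))
        raise SystemExit(1)
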
def update_archive_data(import_path=None, resume=None):
    """Main entry point"""

    # Step 1: Load list of links from the existing index,
    #         merge in and dedupe new links from the import path
    all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)

    # Step 2: Write updated index with deduped old and new links back to disk
    write_links_index(out_dir=OUTPUT_DIR, links=all_links)

    # Step 3: Run the archive methods for each link, logging where we paused
    # if the loop is interrupted so the run can be resumed from that timestamp
    links = new_links if ONLY_NEW else all_links
    log_archiving_started(len(links), resume)
    idx, link = 0, 0
    try:
        for idx, link in enumerate(links_after_timestamp(links, resume)):
            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
            archive_link(link_dir, link)
    except KeyboardInterrupt:
        log_archiving_paused(len(links), idx, link and link['timestamp'])
        raise SystemExit(0)
    except:
        print()
        raise
    log_archiving_finished(len(links))

    # Step 4: Re-write links index with updated titles, icons, and resources
    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
    write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)

def main(url):
    ### Handle CLI arguments
    #     ./archive bookmarks.html
    #     ./archive 1523422111.234
    resume = None

    ### Set up output folder
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    ### Handle ingesting urls from a remote file/feed
    # (e.g. if an RSS feed URL is used as the import path)
    import_path = None
    if url and any(url.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        import_path = save_remote_source(url)

    ### Run the main archive update process
    links = [{
        'url': url,
        'timestamp': str(datetime.now().timestamp()),
        'title': None,
        'tags': '',
        'sources': [import_path],
    }]

    log_archiving_started(len(links), resume)
    idx, link = 0, 0
    try:
        for idx, link in enumerate(links_after_timestamp(links, resume)):
            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
            archive_link(link_dir, link)
    except KeyboardInterrupt:
        log_archiving_paused(len(links), idx, link and link['timestamp'])
        raise SystemExit(0)
    except:
        print()
        raise
    log_archiving_finished(len(links))

    # Re-write links index with updated titles, icons, and resources
    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
    write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)

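# Hypothetical CLI wiring for main(), matching the usage shown in its
# comments (./archive bookmarks.html, ./archive 1523422111.234); this
# argument handling is a sketch, not the project's actual entrypoint:
if __name__ == '__main__':
    import sys
    main(sys.argv[1] if len(sys.argv) > 1 else None)
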
def update_archive(archive_path, links, source=None, resume=None, append=True):
    """update or create index.html+json given a path to an export file containing new links"""

    start_ts = datetime.now().timestamp()

    if resume:
        print('{green}[▶] [{}] Resuming archive downloading from {}...{reset}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            resume,
            **ANSI,
        ))
    else:
        print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            len(links),
            **ANSI,
        ))

    check_links_structure(links)

    # prefetch the first link off the generator so that if we pause or fail
    # immediately we can show that we paused on the first link and not just None
    to_archive = Peekable(links_after_timestamp(links, resume))
    idx, link = 0, to_archive.peek(0)

    # loop over links and archive them
    try:
        check_dependencies()
        for idx, link in enumerate(to_archive):
            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
            archive_link(link_dir, link)

    except (KeyboardInterrupt, SystemExit, Exception) as e:
        print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
            **ANSI,
            now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            idx=idx + 1,
            timestamp=link['timestamp'],
            total=len(links),
        ))
        print('    To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
        print('    Continue where you left off by running:')
        print('        {} {}'.format(
            pretty_path(sys.argv[0]),
            link['timestamp'],
        ))
        if not isinstance(e, KeyboardInterrupt):
            print()
            raise e
        raise SystemExit(1)

    # print timing information & summary
    end_ts = datetime.now().timestamp()
    seconds = end_ts - start_ts
    if seconds > 60:
        duration = '{:.2f} min'.format(seconds / 60)
    else:
        duration = '{:.2f} sec'.format(seconds)

    print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
        ANSI['green'],
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        len(links),
        duration,
        ANSI['reset'],
    ))
    print('    - {} entries skipped'.format(_RESULTS_TOTALS['skipped']))
    print('    - {} entries updated'.format(_RESULTS_TOTALS['succeded']))
    print('    - {} errors'.format(_RESULTS_TOTALS['failed']))
    print('    To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))

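# `pretty_path` is called above but defined elsewhere; a minimal sketch,
# inferred from how OUTPUT_DIR is shortened for display in update_archive():
def pretty_path(path):
    """convert absolute paths inside the repo to short relative ones for display"""
    return path.replace(REPO_DIR + '/', '')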