Esempio n. 1
0
def update_archive_data(import_path=None, resume=None):
    """Run a full archive update pass.

    Loads the links index (merging in links from ``import_path``), writes it
    back out, archives each selected link, and finally rewrites the index
    with ``finished=True``.

    Args:
        import_path: Forwarded to ``load_links_index`` as the source of new
            links; ``None`` means only the existing index is loaded.
        resume: Forwarded to ``links_after_timestamp`` and to the "started"
            log call to select where archiving resumes.

    Raises:
        SystemExit: With code 0 when archiving is interrupted by Ctrl-C.
    """
    check_dependencies()

    # Load the existing index, merged and deduplicated with any new links.
    all_links, new_links = load_links_index(
        out_dir=OUTPUT_DIR,
        import_path=import_path,
    )

    # Persist the merged index before any archiving work begins.
    write_links_index(out_dir=OUTPUT_DIR, links=all_links)

    # Choose which links get archived on this run.
    if ONLY_NEW:
        links = new_links
    else:
        links = all_links
    log_archiving_started(len(links), resume)

    # Placeholders so the pause handler has values even if iteration never
    # yields a single link.
    position, current = 0, 0
    try:
        pending = links_after_timestamp(links, resume)
        for position, current in enumerate(pending):
            target_dir = os.path.join(ARCHIVE_DIR, current['timestamp'])
            archive_link(target_dir, current)
    except KeyboardInterrupt:
        paused_at = current and current['timestamp']
        log_archiving_paused(len(links), position, paused_at)
        raise SystemExit(0)
    except BaseException:
        # Emit a newline before the traceback, then propagate unchanged.
        print()
        raise

    log_archiving_finished(len(links))

    # Reload and rewrite the index so it reflects what archiving updated
    # (titles, icons, resources).
    refreshed_links, _ = load_links_index(out_dir=OUTPUT_DIR)
    write_links_index(out_dir=OUTPUT_DIR, links=refreshed_links, finished=True)
Esempio n. 2
0
def update_archive_data(import_path=None, resume=None):
    """Run a full archive update pass.

    Loads the links index (merging in links from ``import_path``), writes it
    back to disk, archives each selected link, and finally rewrites the index
    with ``finished=True``.

    Args:
        import_path: Forwarded to ``load_links_index`` as the source of new
            links; ``None`` means only the existing index is loaded.
        resume: Forwarded to ``links_after_timestamp`` and to the "started"
            log call to select where archiving resumes.

    Raises:
        SystemExit: With code 0 when archiving is interrupted by Ctrl-C.
    """
    # Step 1: Load list of links from the existing index,
    #         merge in and dedupe new links from import_path
    all_links, new_links = load_links_index(out_dir=OUTPUT_DIR,
                                            import_path=import_path)

    # Step 2: Write updated index with deduped old and new links back to disk
    write_links_index(out_dir=OUTPUT_DIR, links=all_links)

    # Step 3: Run the archive methods for each link
    links = new_links if ONLY_NEW else all_links
    log_archiving_started(len(links), resume)

    # Placeholders so the KeyboardInterrupt handler below has values to report
    # even when the interrupt arrives before the first link is yielded.
    idx, link = 0, 0
    try:
        for idx, link in enumerate(links_after_timestamp(links, resume)):
            # The directory must be bound to the same name that is passed to
            # archive_link; a mismatched name here raises NameError.
            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
            archive_link(link_dir, link)

    except KeyboardInterrupt:
        log_archiving_paused(len(links), idx, link and link['timestamp'])
        raise SystemExit(0)

    except BaseException:
        # Emit a newline before the traceback, then propagate unchanged.
        print()
        raise

    log_archiving_finished(len(links))

    # Step 4: Re-write links index with updated titles, icons, and resources
    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
    write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)
Esempio n. 3
0
def y_main(url):
    """Archive a single URL, then rewrite the links index.

    Builds a one-element links list for ``url``, runs ``archive_link`` on it,
    and finally reloads and rewrites the index with ``finished=True``.

    Args:
        url: The address to archive. When it starts with ``http://``,
            ``https://`` or ``ftp://`` it is first passed to
            ``save_remote_source`` and the returned value is recorded as the
            link's source; otherwise ``url`` itself is recorded.

    Raises:
        SystemExit: With code 0 when archiving is interrupted by Ctrl-C.
    """
    resume = None

    # Set up the output folder. exist_ok avoids the check-then-create race of
    # a separate os.path.exists() test.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Default the recorded source to the url itself so the name is always
    # bound; previously a non-remote url left it undefined and building the
    # link dict below raised UnboundLocalError.
    import_path = url

    # Handle ingesting urls from a remote file/feed
    # (e.g. if an RSS feed URL is used as the import path)
    if url and url.startswith(('http://', 'https://', 'ftp://')):
        import_path = save_remote_source(url)

    links = [{
        'url': url,
        'timestamp': str(datetime.now().timestamp()),
        'title': None,
        'tags': '',
        'sources': [import_path],
    }]
    log_archiving_started(len(links), resume)

    # Placeholders so the KeyboardInterrupt handler below has values to report
    # even when the interrupt arrives before the first link is yielded.
    idx, link = 0, 0
    try:
        for idx, link in enumerate(links_after_timestamp(links, resume)):
            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
            archive_link(link_dir, link)

    except KeyboardInterrupt:
        log_archiving_paused(len(links), idx, link and link['timestamp'])
        raise SystemExit(0)

    except BaseException:
        # Emit a newline before the traceback, then propagate unchanged.
        print()
        raise

    log_archiving_finished(len(links))

    # Re-write links index with updated titles, icons, and resources
    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
    write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)
Esempio n. 4
0
def update_archive(archive_path, links, source=None, resume=None, append=True):
    """Archive each link in ``links``, printing progress and a final summary.

    Prints a start banner, validates ``links`` with ``check_links_structure``,
    archives every link yielded by ``links_after_timestamp(links, resume)``,
    and prints timing plus the counters held in ``_RESULTS_TOTALS``. If
    archiving is interrupted or fails, prints where it stopped and how to
    resume before exiting or re-raising.

    Args:
        archive_path: Not used by this function body.
        links: Collection of link dicts; each must have a ``'timestamp'`` key.
        source: Not used by this function body.
        resume: Timestamp to resume from; forwarded to
            ``links_after_timestamp`` and selects which banner is printed.
        append: Not used by this function body.

    Raises:
        SystemExit: With code 1 when interrupted by Ctrl-C.
        Exception: Any other error (including ``SystemExit``) raised while
            archiving is re-raised after the pause message is printed.
    """
    time_format = '%Y-%m-%d %H:%M:%S'
    start_ts = datetime.now().timestamp()

    if resume:
        print('{green}[▶] [{}] Resuming archive downloading from {}...{reset}'.
              format(
                  datetime.now().strftime(time_format),
                  resume,
                  **ANSI,
              ))
    else:
        print(
            '{green}[▶] [{}] Updating content for {} pages in archive...{reset}'
            .format(
                datetime.now().strftime(time_format),
                len(links),
                **ANSI,
            ))

    check_links_structure(links)

    # prefetch the first link off the generator so that if we pause or fail
    # immediately we can show that we paused on the first link and not just None
    # NOTE(review): what peek(0) does when there are no links to archive is
    # not visible here; if it yields None, the handler's link['timestamp']
    # below would itself fail - confirm against Peekable.
    to_archive = Peekable(links_after_timestamp(links, resume))
    idx, link = 0, to_archive.peek(0)

    # loop over links and archive them
    try:
        check_dependencies()
        for idx, link in enumerate(to_archive):
            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
            archive_link(link_dir, link)

    except (KeyboardInterrupt, SystemExit, Exception) as e:
        print(
            '\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'
            .format(
                **ANSI,
                now=datetime.now().strftime(time_format),
                idx=idx + 1,
                timestamp=link['timestamp'],
                total=len(links),
            ))
        print('    To view your archive, open: {}/index.html'.format(
            OUTPUT_DIR.replace(REPO_DIR + '/', '')))
        print('    Continue where you left off by running:')
        print('        {} {}'.format(
            pretty_path(sys.argv[0]),
            link['timestamp'],
        ))
        if not isinstance(e, KeyboardInterrupt):
            print()
            # Bare raise re-raises the active exception as-is.
            raise
        raise SystemExit(1)

    # print timing information & summary
    end_ts = datetime.now().timestamp()
    seconds = end_ts - start_ts
    if seconds > 60:
        duration = '{0:.2f} min'.format(seconds / 60)
    else:
        duration = '{0:.2f} sec'.format(seconds)

    print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
        ANSI['green'],
        datetime.now().strftime(time_format),
        len(links),
        duration,
        ANSI['reset'],
    ))
    print('    - {} entries skipped'.format(_RESULTS_TOTALS['skipped']))
    print('    - {} entries updated'.format(_RESULTS_TOTALS['succeded']))
    print('    - {} errors'.format(_RESULTS_TOTALS['failed']))
    print('    To view your archive, open: {}/index.html'.format(
        OUTPUT_DIR.replace(REPO_DIR + '/', '')))