Code Example #1
File: index.py Project: yyniu/ArchiveBox
def parse_json_links_index(out_dir):
    """load the index in a given directory and merge it with the given link"""
    index_path = os.path.join(out_dir, 'index.json')
    if os.path.exists(index_path):
        with open(index_path, 'r', encoding='utf-8') as f:
            links = json.load(f)['links']
            check_links_structure(links)
            return links

    return []
Code Example #2
File: index.py Project: XIONGJUNHAN/ArchiveBox
def parse_json_links_index(out_dir=OUTPUT_DIR):
    """parse a archive index json file and return the list of links"""
    index_path = os.path.join(out_dir, 'index.json')
    if os.path.exists(index_path):
        with open(index_path, 'r', encoding='utf-8') as f:
            links = json.load(f)['links']
            check_links_structure(links)
            return links

    return []
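
A minimal usage sketch for the function above: the 'output' directory name is just a placeholder (the real default is OUTPUT_DIR), and the index.json it reads is the file produced by write_json_links_index in Code Example #7 below; the exact keys each link carries are enforced by check_links_structure, which is not shown in these examples.

from index import parse_json_links_index  # assuming the function lives in index.py as listed

links = parse_json_links_index('output')  # placeholder directory; returns [] if output/index.json is missing
print('{} links loaded from the existing index'.format(len(links)))
for link in links[:3]:
    print(link.get('timestamp'), link.get('url'))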
Code Example #3
File: index.py Project: XIONGJUNHAN/ArchiveBox
def write_links_index(out_dir, links, finished=False):
    """create index.html file for a given list of links"""

    log_indexing_started()
    check_links_structure(links)

    write_json_links_index(out_dir, links)
    log_indexing_finished(out_dir, 'index.json')
    
    write_html_links_index(out_dir, links, finished=finished)
    log_indexing_finished(out_dir, 'index.html')
Code Example #4
File: index.py Project: yokumaz/ArchiveBox
def write_html_links_index(out_dir, links, finished=False):
    """write the html link index to a given path"""

    check_links_structure(links)

    path = os.path.join(out_dir, 'index.html')

    copy_tree(os.path.join(TEMPLATES_DIR, 'static'),
              os.path.join(out_dir, 'static'))

    with open(os.path.join(out_dir, 'robots.txt'), 'w+') as f:
        f.write('User-agent: *\nDisallow: /')

    with open(os.path.join(TEMPLATES_DIR, 'index.html'), 'r',
              encoding='utf-8') as f:
        index_html = f.read()

    with open(os.path.join(TEMPLATES_DIR, 'index_row.html'),
              'r',
              encoding='utf-8') as f:
        link_row_html = f.read()

    full_links_info = (derived_link_info(link) for link in links)

    link_rows = '\n'.join(
        Template(link_row_html).substitute(
            **{
                **link,
                'title': (
                    link['title'] or (link['base_url'] if link['is_archived']
                                      else TITLE_LOADING_MSG)),
                'favicon_url': (
                    os.path.join('archive', link['timestamp'], 'favicon.ico')
                    # if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs='
                ),
                'archive_url': urlencode(
                    wget_output_path(link) or 'index.html'),
            }) for link in full_links_info)

    template_vars = {
        'num_links': len(links),
        'date_updated': datetime.now().strftime('%Y-%m-%d'),
        'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
        'footer_info': FOOTER_INFO,
        'git_sha': GIT_SHA,
        'short_git_sha': GIT_SHA[:8],
        'rows': link_rows,
        'status': 'finished' if finished else 'running',
    }

    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(index_html).substitute(**template_vars))

    chmod_file(path)
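
The row rendering above relies on Python's string.Template, which swaps $name placeholders in index_row.html for the keys passed to substitute(). The real template ships in TEMPLATES_DIR and is not included in these examples, so the snippet below uses a hypothetical two-column stand-in purely to illustrate the mechanism.

from string import Template

# Hypothetical miniature of index_row.html; the shipped template has more columns.
row_template = '<tr><td>$title</td><td><a href="$archive_url">$timestamp</a></td></tr>'

print(Template(row_template).substitute(
    title='Example Domain',
    archive_url='archive/1556908440/index.html',
    timestamp='1556908440',
))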
Code Example #5
File: index.py Project: yyniu/ArchiveBox
def write_links_index(out_dir, links):
    """create index.html file for a given list of links"""

    check_links_structure(links)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    print('{green}[*] [{}] Updating main index files...{reset}'.format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        **ANSI,
    ))
    write_json_links_index(out_dir, links)
    print('    > {}/index.json'.format(pretty_path(out_dir)))

    write_html_links_index(out_dir, links)
    print('    > {}/index.html'.format(pretty_path(out_dir)))
Code Example #6
def validate_links(links):
    check_links_structure(links)
    links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
    links = uniquefied_links(links)  # merge/dedupe duplicate timestamps & urls
    links = sorted_links(links)  # deterministically sort the links based on timestamp, url

    if not links:
        print('[X] No links found :(')
        raise SystemExit(1)

    for link in links:
        link['title'] = unescape(
            link['title'].strip()) if link['title'] else None
        check_link_structure(link)

    return list(links)
Code Example #7
File: index.py Project: XIONGJUNHAN/ArchiveBox
def write_json_links_index(out_dir, links):
    """write the json link index to a given path"""

    check_links_structure(links)

    path = os.path.join(out_dir, 'index.json')

    index_json = {
        'info': 'ArchiveBox Index',
        'help': 'https://github.com/pirate/ArchiveBox',
        'version': GIT_SHA,
        'num_links': len(links),
        'updated': str(datetime.now().timestamp()),
        'links': links,
    }

    with open(path, 'w', encoding='utf-8') as f:
        json.dump(index_json, f, indent=4, default=str)

    chmod_file(path)
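
For reference, the file written above looks roughly like this (the values are illustrative, and each entry under "links" carries whatever keys check_links_structure requires):

{
    "info": "ArchiveBox Index",
    "help": "https://github.com/pirate/ArchiveBox",
    "version": "<git sha>",
    "num_links": 1,
    "updated": "1556908440.0",
    "links": [
        {"timestamp": "1556908440", "url": "https://example.com", "title": "Example Domain"}
    ]
}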
Code Example #8
File: index.py Project: maziesmith/python-resources
def load_links_index(out_dir=OUTPUT_DIR, import_path=None):
    """parse and load existing index with any new links form import_path merged in"""
    
    existing_links = []
    if out_dir:
        existing_links = parse_json_links_index(out_dir)
        check_links_structure(existing_links)

    new_links = []
    if import_path:
        # log the start of parsing, then parse and validate the import file
        log_parsing_started(import_path)
        raw_links, parser_name = parse_links(import_path)
        new_links = validate_links(raw_links)
        check_links_structure(new_links)

    # merge existing links in out_dir and new links
    all_links = validate_links(existing_links + new_links)
    
    return all_links, new_links
Code Example #9
File: index.py Project: XIONGJUNHAN/ArchiveBox
def load_links_index(out_dir=OUTPUT_DIR, import_path=None):
    """parse and load existing index with any new links from import_path merged in"""

    existing_links = []
    if out_dir:
        existing_links = parse_json_links_index(out_dir)
        check_links_structure(existing_links)

    new_links = []
    if import_path:
        # parse and validate the import file
        log_parsing_started(import_path)
        raw_links, parser_name = parse_links(import_path)
        new_links = validate_links(raw_links)
        check_links_structure(new_links)

    # merge existing links in out_dir and new links
    all_links = validate_links(existing_links + new_links)
    check_links_structure(all_links)
    num_new_links = len(all_links) - len(existing_links)

    if import_path and parser_name:
        log_parsing_finished(num_new_links, parser_name)

    return all_links, new_links
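
A usage sketch for this version, assuming an existing archive in OUTPUT_DIR and a browser bookmarks export to merge in; the export filename is a placeholder, and write_links_index here refers to the variant with a finished flag from Code Example #3.

# Hypothetical call: merge a bookmarks export into the existing index, then rewrite it.
all_links, new_links = load_links_index(out_dir=OUTPUT_DIR,
                                        import_path='bookmarks_export.html')
print('{} total links, {} newly imported'.format(len(all_links), len(new_links)))
write_links_index(OUTPUT_DIR, all_links, finished=True)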
Code Example #10
File: index.py Project: yyniu/ArchiveBox
def write_html_links_index(out_dir, links):
    """write the html link index to a given path"""

    check_links_structure(links)

    path = os.path.join(out_dir, 'index.html')

    copy_tree(os.path.join(TEMPLATES_DIR, 'static'),
              os.path.join(out_dir, 'static'))

    with open(os.path.join(out_dir, 'robots.txt'), 'w+') as f:
        f.write('User-agent: *\nDisallow: /')

    with open(os.path.join(TEMPLATES_DIR, 'index.html'), 'r',
              encoding='utf-8') as f:
        index_html = f.read()

    with open(os.path.join(TEMPLATES_DIR, 'index_row.html'),
              'r',
              encoding='utf-8') as f:
        link_row_html = f.read()

    link_rows = '\n'.join(
        Template(link_row_html).substitute(**derived_link_info(link))
        for link in links)

    template_vars = {
        'num_links': len(links),
        'date_updated': datetime.now().strftime('%Y-%m-%d'),
        'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
        'footer_info': FOOTER_INFO,
        'git_sha': GIT_SHA,
        'short_git_sha': GIT_SHA[:8],
        'rows': link_rows,
    }

    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(index_html).substitute(**template_vars))

    chmod_file(path)
Code Example #11
File: archive.py Project: yyniu/ArchiveBox
def load_links(archive_path=OUTPUT_DIR, import_path=None):
    """get new links from file and optionally append them to links in existing archive"""

    existing_links = []
    if archive_path:
        existing_links = parse_json_links_index(archive_path)
        check_links_structure(existing_links)

    new_links = []
    if import_path:
        # parse and validate the import file
        raw_links, parser_name = parse_links(import_path)
        new_links = validate_links(raw_links)
        check_links_structure(new_links)

    # merge existing links in archive_path and new links
    all_links = validate_links(existing_links + new_links)
    check_links_structure(all_links)
    num_new_links = len(all_links) - len(existing_links)

    if import_path and parser_name:
        print(
            '    > Adding {} new links to index (parsed import as {})'.format(
                num_new_links,
                parser_name,
            ))

    return all_links, new_links
Code Example #12
File: links.py Project: yyniu/ArchiveBox
def validate_links(links):
    check_links_structure(links)
    links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
    links = uniquefied_links(links)  # merge/dedupe duplicate timestamps & urls
    links = sorted_links(links)  # deterministically sort the links based on timestamp, url

    if not links:
        print('[X] No links found :(')
        raise SystemExit(1)

    for link in links:
        check_link_structure(link)

        link['title'] = unescape(link['title']) if link['title'] else None
        link['latest'] = link.get('latest') or {}

        latest = link['latest']
        if not latest.get('wget'):
            latest['wget'] = wget_output_path(link)

        if not latest.get('pdf'):
            latest['pdf'] = None

        if not latest.get('screenshot'):
            latest['screenshot'] = None

        if not latest.get('dom'):
            latest['dom'] = None

        if not latest.get('favicon'):
            latest['favicon'] = None

        if not latest.get('title'):
            latest['title'] = link['title']

    return list(links)
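
To make the normalization above concrete, here is a hedged before/after sketch; the input dict is hypothetical, since check_link_structure (which decides the required keys) is not shown in these examples.

# Hypothetical minimal link record; real records may need more keys to satisfy check_link_structure.
raw = [{
    'url': 'https://example.com',
    'timestamp': '1556908440',
    'title': 'Example &amp; Domain',   # HTML entities are unescaped by validate_links
    'tags': '',
    'sources': ['bookmarks_export.html'],
}]

cleaned = validate_links(raw)
print(cleaned[0]['title'])           # 'Example & Domain'
print(cleaned[0]['latest']['pdf'])   # None until the PDF archive method has produced output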
Code Example #13
File: archive.py Project: yyniu/ArchiveBox
def update_archive(archive_path, links, source=None, resume=None, append=True):
    """update or create index.html+json given a path to an export file containing new links"""

    start_ts = datetime.now().timestamp()

    if resume:
        print('{green}[▶] [{}] Resuming archive downloading from {}...{reset}'.
              format(
                  datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                  resume,
                  **ANSI,
              ))
    else:
        print(
            '{green}[▶] [{}] Updating content for {} pages in archive...{reset}'
            .format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                len(links),
                **ANSI,
            ))

    check_links_structure(links)

    # prefetch the first link off the generator so that if we pause or fail
    # immediately we can show that we paused on the first link and not just None
    to_archive = Peekable(links_after_timestamp(links, resume))
    idx, link = 0, to_archive.peek(0)

    # loop over links and archive them
    try:
        check_dependencies()
        for idx, link in enumerate(to_archive):
            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
            archive_link(link_dir, link)

    except (KeyboardInterrupt, SystemExit, Exception) as e:
        print(
            '\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'
            .format(
                **ANSI,
                now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                idx=idx + 1,
                timestamp=link['timestamp'],
                total=len(links),
            ))
        print('    To view your archive, open: {}/index.html'.format(
            OUTPUT_DIR.replace(REPO_DIR + '/', '')))
        print('    Continue where you left off by running:')
        print('        {} {}'.format(
            pretty_path(sys.argv[0]),
            link['timestamp'],
        ))
        if not isinstance(e, KeyboardInterrupt):
            print()
            raise e
        raise SystemExit(1)

    # print timing information & summary
    end_ts = datetime.now().timestamp()
    seconds = end_ts - start_ts
    if seconds > 60:
        duration = '{0:.2f} min'.format(seconds / 60)
    else:
        duration = '{0:.2f} sec'.format(seconds)

    print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
        ANSI['green'],
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        len(links),
        duration,
        ANSI['reset'],
    ))
    print('    - {} entries skipped'.format(_RESULTS_TOTALS['skipped']))
    print('    - {} entries updated'.format(_RESULTS_TOTALS['succeded']))
    print('    - {} errors'.format(_RESULTS_TOTALS['failed']))
    print('    To view your archive, open: {}/index.html'.format(
        OUTPUT_DIR.replace(REPO_DIR + '/', '')))
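
The main() that ties these pieces together is not included in these examples, so the following is only a plausible wiring of the functions shown above, based on their signatures; in the real tool the resume timestamp comes from the command line.

# Hypothetical driver: load old + new links, refresh the indexes, then archive each page.
def run(import_path=None, resume=None):
    all_links, new_links = load_links(archive_path=OUTPUT_DIR, import_path=import_path)
    write_links_index(OUTPUT_DIR, all_links)
    update_archive(OUTPUT_DIR, all_links, source=import_path, resume=resume, append=True)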