Example 1
def write_html_link_index(out_dir, link):
    check_link_structure(link)
    with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
        link_html = f.read()

    path = os.path.join(out_dir, 'index.html')

    link = derived_link_info(link)

    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(link_html).substitute({
            **link,
            'title': (
                link['title']
                or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
            ),
            'archive_url': urlencode(
                wget_output_path(link)
                or (link['domain'] if link['is_archived'] else 'about:blank')
            ),
            'extension': link['extension'] or 'html',
            'tags': link['tags'].strip() or 'untagged',
            'status': 'Archived' if link['is_archived'] else 'Not yet archived',
            'status_color': 'success' if link['is_archived'] else 'danger',
        }))

    chmod_file(path)
Example 2
def validate_links(links):
    links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
    links = uniquefied_links(links)  # merge/dedupe duplicate timestamps & urls
    links = sorted_links(links)      # deterministically sort the links based on timestamp, url

    if not links:
        print('[X] No links found :(')
        raise SystemExit(1)

    for link in links:
        link['title'] = unescape(link['title'])
        link['latest'] = link.get('latest') or {}

        if not link['latest'].get('wget'):
            link['latest']['wget'] = wget_output_path(link)

        if not link['latest'].get('pdf'):
            link['latest']['pdf'] = None

        if not link['latest'].get('screenshot'):
            link['latest']['screenshot'] = None

        if not link['latest'].get('dom'):
            link['latest']['dom'] = None

    return list(links)
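
For reference, a hedged sketch of the minimal link dict these helpers appear to operate on, inferred only from the fields read and written in these excerpts (the authoritative schema and helpers such as wget_output_path live elsewhere in the project, so treat the exact field list as an assumption):

# Hypothetical example record, inferred from the fields accessed in these excerpts.
# The authoritative schema is defined elsewhere in the project.
example_link = {
    'url': 'https://example.com/some/page',
    'timestamp': '1544666239',      # used as the per-link archive folder name
    'title': 'Some Page',           # may be empty until a title is fetched
    'tags': 'example,docs',
    'type': None,                   # e.g. 'PDF' or 'image' for direct file links
    'latest': {                     # per-method outputs, normalized by validate_links()
        'wget': None,
        'pdf': None,
        'screenshot': None,
        'dom': None,
    },
}
# e.g. validate_links([example_link]) would fill in the 'latest' sub-keys shown above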
Example 3
def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
    """print PDF of site to file using chrome --headless"""

    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}
    
    if os.path.exists(os.path.join(link_dir, 'output.pdf')):
        return {'output': 'output.pdf', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--print-to-pdf',
        '--hide-scrollbars',
        '--timeout={}'.format(timeout * 1000),
        *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        end()
        if result.returncode:
            print('     ', (result.stderr or result.stdout).decode())
            raise Exception('Failed to print PDF')
        chmod_file('output.pdf', cwd=link_dir)
        output = 'output.pdf'
    except Exception as e:
        end()
        output = e
        print_error_hints(cmd=CMD, pwd=link_dir, err=e)

    return {
        'cmd': CMD,
        'output': output,
    }
Example 4
def write_html_link_index(out_dir, link):
    check_link_structure(link)
    with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
        link_html = f.read()

    path = os.path.join(out_dir, 'index.html')

    print('      √ index.html')

    link = derived_link_info(link)

    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(link_html).substitute({
            **link,
            'title': (
                link['title']
                or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
            ),
            'archive_url': (
                wget_output_path(link)
                or (link['domain'] if link['is_archived'] else 'about:blank')
            ),
        }))

    chmod_file(path)
Example 5
def write_html_links_index(out_dir, links, finished=False):
    """write the html link index to a given path"""

    check_links_structure(links)

    path = os.path.join(out_dir, 'index.html')

    copy_tree(os.path.join(TEMPLATES_DIR, 'static'),
              os.path.join(out_dir, 'static'))

    with open(os.path.join(out_dir, 'robots.txt'), 'w+') as f:
        f.write('User-agent: *\nDisallow: /')

    with open(os.path.join(TEMPLATES_DIR, 'index.html'), 'r', encoding='utf-8') as f:
        index_html = f.read()

    with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as f:
        link_row_html = f.read()

    full_links_info = (derived_link_info(link) for link in links)

    link_rows = '\n'.join(
        Template(link_row_html).substitute(**{
            **link,
            'title': (
                link['title']
                or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
            ),
            'favicon_url': (
                os.path.join('archive', link['timestamp'], 'favicon.ico')
                # if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs='
            ),
            'archive_url': urlencode(wget_output_path(link) or 'index.html'),
        })
        for link in full_links_info
    )

    template_vars = {
        'num_links': len(links),
        'date_updated': datetime.now().strftime('%Y-%m-%d'),
        'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
        'footer_info': FOOTER_INFO,
        'git_sha': GIT_SHA,
        'short_git_sha': GIT_SHA[:8],
        'rows': link_rows,
        'status': 'finished' if finished else 'running',
    }

    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(index_html).substitute(**template_vars))

    chmod_file(path)
Example 6
def fetch_screenshot(link_dir,
                     link,
                     timeout=TIMEOUT,
                     user_data_dir=CHROME_USER_DATA_DIR,
                     resolution=RESOLUTION):
    """take screenshot of site using chrome --headless"""

    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
        return {'output': 'screenshot.png', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--screenshot',
        '--window-size={}'.format(resolution),
        '--hide-scrollbars',
        '--timeout={}'.format(timeout * 1000),
        *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
        # '--full-page',   # TODO: make this actually work using ./bin/screenshot fullPage: true
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        end()
        if result.returncode:
            print('     ', (result.stderr or result.stdout).decode())
            raise Exception('Failed to take screenshot')
        chmod_file('screenshot.png', cwd=link_dir)
        output = 'screenshot.png'
    except Exception as e:
        end()
        output = e
        print_error_hints(cmd=CMD, pwd=link_dir, err=e)

    return {
        'cmd': CMD,
        'output': output,
    }
Example 7
def fetch_dom(link_dir,
              link,
              timeout=TIMEOUT,
              user_data_dir=CHROME_USER_DATA_DIR):
    """print HTML of site to file using chrome --dump-html"""

    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    output_path = os.path.join(link_dir, 'output.html')

    if os.path.exists(output_path):
        return {'output': 'output.html', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--dump-dom',
        '--timeout={}'.format(timeout * 1000),
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        with open(output_path, 'w+') as f:
            result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout)
        end()
        if result.returncode:
            print('     ', (result.stderr).decode())
            raise Exception('Failed to fetch DOM')
        chmod_file('output.html', cwd=link_dir)
        output = 'output.html'
    except Exception as e:
        end()
        output = e
        print_error_hints(cmd=CMD, pwd=link_dir, err=e)

    return {
        'cmd': CMD,
        'output': output,
    }
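
All three Chrome-based fetchers above (fetch_pdf, fetch_screenshot, fetch_dom) return a dict whose 'output' is either a relative filename on success or the raised exception on failure. A minimal illustration of how a caller could tell the two apart; the real call sites are not part of these excerpts, and link_dir/link are assumed to be defined:

# Illustrative only: inspects the 'output' field returned by the fetchers above.
result = fetch_pdf(link_dir, link)

if isinstance(result['output'], Exception):
    print('PDF archiving failed:', result['output'])
else:
    print('PDF saved to', os.path.join(link_dir, result['output']))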
Example 8
def write_html_link_index(out_dir, link):
    with open(LINK_INDEX_TEMPLATE, 'r', encoding='utf-8') as f:
        link_html = f.read()

    path = os.path.join(out_dir, 'index.html')

    print('      √ index.html')

    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(link_html).substitute({
            **link,
            **link['latest'],
            'type': link['type'] or 'website',
            'tags': link['tags'] or 'untagged',
            'bookmarked': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
            'updated': datetime.fromtimestamp(float(link['updated'])).strftime('%Y-%m-%d %H:%M'),
            'archive_org': link['latest'].get('archive_org') or 'https://web.archive.org/save/{}'.format(link['url']),
            'wget': link['latest'].get('wget') or wget_output_path(link),
        }))

    chmod_file(path)
Example 9
def fetch_wget(link_dir, link, timeout=TIMEOUT):
    """download full site using wget"""

    if FETCH_WARC:
        warc_dir = os.path.join(link_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    output = None
    cmd = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
        *(('--restrict-file-names={}'.format(RESTRICT_FILE_NAMES),) if RESTRICT_FILE_NAMES else ()),
        '--timeout={}'.format(timeout),
        *(('--compression=auto',) if WGET_AUTO_COMPRESSION else ()),
        *(() if FETCH_WARC else ('--timestamping',)),
        *(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()),
        *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
        *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
        *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
        *((() if CHECK_SSL_VALIDITY else ('--no-check-certificate', '--no-hsts'))),
        link['url'],
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        output = wget_output_path(link)

        # parse out number of files downloaded from last line of stderr:
        #  "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
            if line.strip()
        ]
        files_downloaded = (
            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
            if 'Downloaded:' in output_tail[-1]
            else 0
        )

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            hints = (
                'Got wget response code: {}.'.format(result.returncode),
                *output_tail,
            )
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Got an error from the server', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
Example 10
def should_fetch_wget(link_dir, link):
    output_path = wget_output_path(link)
    if output_path and os.path.exists(os.path.join(link_dir, output_path)):
        return False

    return FETCH_WGET
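
A hedged sketch of how this gate pairs with fetch_wget; the project's actual orchestration code is not included in these excerpts, so this is only an assumed usage pattern:

# Assumed usage: skip the download if it already exists or FETCH_WGET is disabled.
if should_fetch_wget(link_dir, link):
    result = fetch_wget(link_dir, link)
    print('      wget output:', result['output'])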
Example 11
def fetch_wget(link_dir,
               link,
               requisites=FETCH_WGET_REQUISITES,
               warc=FETCH_WARC,
               timeout=TIMEOUT):
    """download full site using wget"""

    domain_dir = os.path.join(link_dir, domain(link['url']))
    existing_file = wget_output_path(link)
    if os.path.exists(domain_dir) and existing_file:
        return {'output': existing_file, 'status': 'skipped'}

    if warc:
        warc_dir = os.path.join(link_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    CMD = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
        '--restrict-file-names=unix',
        '--timeout={}'.format(timeout),
        *(() if warc else ('--timestamping',)),
        *(('--warc-file={}'.format(warc_path),) if warc else ()),
        *(('--page-requisites',) if requisites else ()),
        *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
        *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate', '--no-hsts')),
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        end()
        output = wget_output_path(link, look_in=domain_dir)

        output_tail = [
            '          ' + line
            for line in (result.stdout +
                         result.stderr).decode().rsplit('\n', 3)[-3:]
            if line.strip()
        ]

        # parse out number of files downloaded from "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        files_downloaded = (int(output_tail[-1].strip().split(' ', 2)[1] or 0)
                            if 'Downloaded:' in output_tail[-1] else 0)

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            print('        Got wget response code {}:'.format(
                result.returncode))
            print('\n'.join(output_tail))
            if b'403: Forbidden' in result.stderr:
                raise Exception('403 Forbidden (try changing WGET_USER_AGENT)')
            if b'404: Not Found' in result.stderr:
                raise Exception('404 Not Found')
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise Exception('500 Internal Server Error')
            raise Exception('Got an error from the server')
    except Exception as e:
        end()
        output = e
        print_error_hints(cmd=CMD, pwd=link_dir, err=e)

    return {
        'cmd': CMD,
        'output': output,
    }