def write_html_link_index(out_dir, link):
    check_link_structure(link)
    with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
        link_html = f.read()

    path = os.path.join(out_dir, 'index.html')

    link = derived_link_info(link)

    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(link_html).substitute({
            **link,
            'title': (
                link['title']
                or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
            ),
            'archive_url': urlencode(
                wget_output_path(link)
                or (link['domain'] if link['is_archived'] else 'about:blank')
            ),
            'extension': link['extension'] or 'html',
            'tags': link['tags'].strip() or 'untagged',
            'status': 'Archived' if link['is_archived'] else 'Not yet archived',
            'status_color': 'success' if link['is_archived'] else 'danger',
        }))

    chmod_file(path)
def validate_links(links):
    links = archivable_links(links)   # remove chrome://, about:, mailto: etc.
    links = uniquefied_links(links)   # merge/dedupe duplicate timestamps & urls
    links = sorted_links(links)       # deterministically sort the links by timestamp, url

    if not links:
        print('[X] No links found :(')
        raise SystemExit(1)

    for link in links:
        link['title'] = unescape(link['title'])
        link['latest'] = link.get('latest') or {}

        if not link['latest'].get('wget'):
            link['latest']['wget'] = wget_output_path(link)

        if not link['latest'].get('pdf'):
            link['latest']['pdf'] = None

        if not link['latest'].get('screenshot'):
            link['latest']['screenshot'] = None

        if not link['latest'].get('dom'):
            link['latest']['dom'] = None

    return list(links)
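# Illustrative sketch only: where validate_links() sits in a typical import pipeline.
# `parse_links` is assumed to exist elsewhere in this codebase with roughly this
# signature; the function below is an example, not part of the module above.
def import_bookmarks_sketch(source_file, out_dir):
    """Parse a bookmarks export, clean up the links, and write the HTML index (sketch)."""
    links, parser_name = parse_links(source_file)   # assumed helper: returns (links, parser name)
    links = validate_links(links)                   # filter, dedupe, sort, fill in 'latest' defaults
    write_html_links_index(out_dir, links, finished=True)
    return links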
def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
    """print PDF of site to file using chrome --headless"""
    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    if os.path.exists(os.path.join(link_dir, 'output.pdf')):
        return {'output': 'output.pdf', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--print-to-pdf',
        '--hide-scrollbars',
        '--timeout={}'.format(timeout * 1000),
        *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
        link['url'],
    ]
    end = progress(timeout, prefix=' ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        end()
        if result.returncode:
            print(' ', (result.stderr or result.stdout).decode())
            raise Exception('Failed to print PDF')
        chmod_file('output.pdf', cwd=link_dir)
        output = 'output.pdf'
    except Exception as e:
        end()
        output = e
        print_error_hints(cmd=CMD, pwd=link_dir, err=e)

    return {
        'cmd': CMD,
        'output': output,
    }
def write_html_link_index(out_dir, link):
    check_link_structure(link)
    with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
        link_html = f.read()

    path = os.path.join(out_dir, 'index.html')

    print(' √ index.html')

    link = derived_link_info(link)

    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(link_html).substitute({
            **link,
            'title': (
                link['title']
                or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
            ),
            'archive_url': (
                wget_output_path(link)
                or (link['domain'] if link['is_archived'] else 'about:blank')
            ),
        }))

    chmod_file(path)
def write_html_links_index(out_dir, links, finished=False):
    """write the html link index to a given path"""
    check_links_structure(links)

    path = os.path.join(out_dir, 'index.html')

    copy_tree(os.path.join(TEMPLATES_DIR, 'static'), os.path.join(out_dir, 'static'))

    with open(os.path.join(out_dir, 'robots.txt'), 'w+') as f:
        f.write('User-agent: *\nDisallow: /')

    with open(os.path.join(TEMPLATES_DIR, 'index.html'), 'r', encoding='utf-8') as f:
        index_html = f.read()

    with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as f:
        link_row_html = f.read()

    full_links_info = (derived_link_info(link) for link in links)

    link_rows = '\n'.join(
        Template(link_row_html).substitute(**{
            **link,
            'title': (
                link['title']
                or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
            ),
            'favicon_url': (
                os.path.join('archive', link['timestamp'], 'favicon.ico')
                # if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs='
            ),
            'archive_url': urlencode(
                wget_output_path(link) or 'index.html'
            ),
        })
        for link in full_links_info
    )

    template_vars = {
        'num_links': len(links),
        'date_updated': datetime.now().strftime('%Y-%m-%d'),
        'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
        'footer_info': FOOTER_INFO,
        'git_sha': GIT_SHA,
        'short_git_sha': GIT_SHA[:8],
        'rows': link_rows,
        'status': 'finished' if finished else 'running',
    }

    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(index_html).substitute(**template_vars))

    chmod_file(path)
def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR, resolution=RESOLUTION):
    """take screenshot of site using chrome --headless"""
    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
        return {'output': 'screenshot.png', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--screenshot',
        '--window-size={}'.format(resolution),
        '--hide-scrollbars',
        '--timeout={}'.format(timeout * 1000),
        *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
        # '--full-page',   # TODO: make this actually work using ./bin/screenshot fullPage: true
        link['url'],
    ]
    end = progress(timeout, prefix=' ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        end()
        if result.returncode:
            print(' ', (result.stderr or result.stdout).decode())
            raise Exception('Failed to take screenshot')
        chmod_file('screenshot.png', cwd=link_dir)
        output = 'screenshot.png'
    except Exception as e:
        end()
        output = e
        print_error_hints(cmd=CMD, pwd=link_dir, err=e)

    return {
        'cmd': CMD,
        'output': output,
    }
def fetch_dom(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
    """print HTML of site to file using chrome --dump-dom"""
    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    output_path = os.path.join(link_dir, 'output.html')

    if os.path.exists(output_path):
        return {'output': 'output.html', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--dump-dom',
        '--timeout={}'.format(timeout * 1000),
        link['url'],
    ]
    end = progress(timeout, prefix=' ')
    try:
        with open(output_path, 'w+') as f:
            result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout)
        end()
        if result.returncode:
            print(' ', result.stderr.decode())
            raise Exception('Failed to fetch DOM')
        chmod_file('output.html', cwd=link_dir)
        output = 'output.html'
    except Exception as e:
        end()
        output = e
        print_error_hints(cmd=CMD, pwd=link_dir, err=e)

    return {
        'cmd': CMD,
        'output': output,
    }
def write_html_link_index(out_dir, link):
    with open(LINK_INDEX_TEMPLATE, 'r', encoding='utf-8') as f:
        link_html = f.read()

    path = os.path.join(out_dir, 'index.html')

    print(' √ index.html')

    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(link_html).substitute({
            **link,
            **link['latest'],
            'type': link['type'] or 'website',
            'tags': link['tags'] or 'untagged',
            'bookmarked': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
            'updated': datetime.fromtimestamp(float(link['updated'])).strftime('%Y-%m-%d %H:%M'),
            'archive_org': link['latest'].get('archive_org') or 'https://web.archive.org/save/{}'.format(link['url']),
            'wget': link['latest'].get('wget') or wget_output_path(link),
        }))

    chmod_file(path)
def fetch_wget(link_dir, link, timeout=TIMEOUT):
    """download full site using wget"""

    if FETCH_WARC:
        warc_dir = os.path.join(link_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    output = None
    cmd = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
        *(('--restrict-file-names={}'.format(RESTRICT_FILE_NAMES),) if RESTRICT_FILE_NAMES else ()),
        '--timeout={}'.format(timeout),
        *(('--compression=auto',) if WGET_AUTO_COMPRESSION else ()),
        *(() if FETCH_WARC else ('--timestamping',)),
        *(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()),
        *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
        *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
        *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate', '--no-hsts')),
        link['url'],
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        output = wget_output_path(link)

        # parse out number of files downloaded from last line of stderr:
        #   "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
            if line.strip()
        ]
        files_downloaded = (
            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
            if 'Downloaded:' in output_tail[-1] else
            0
        )

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            hints = (
                'Got wget response code: {}.'.format(result.returncode),
                *output_tail,
            )
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Got an error from the server', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
def should_fetch_wget(link_dir, link):
    output_path = wget_output_path(link)
    if output_path and os.path.exists(os.path.join(link_dir, output_path)):
        return False

    return FETCH_WGET
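# Illustrative sketch only: how a should_fetch_*/fetch_* pair can be wired into a
# dispatch loop. `ARCHIVE_METHODS_SKETCH` and `archive_link_sketch` are assumptions
# made for this example, not part of the functions above.
ARCHIVE_METHODS_SKETCH = (
    ('wget', should_fetch_wget, fetch_wget),
)

def archive_link_sketch(link_dir, link):
    """Run each archive method whose should_fetch_* check passes, collecting results (sketch)."""
    results = {}
    for method_name, should_run, run_method in ARCHIVE_METHODS_SKETCH:
        if should_run(link_dir, link):
            results[method_name] = run_method(link_dir, link)
    return results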
def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC, timeout=TIMEOUT):
    """download full site using wget"""

    domain_dir = os.path.join(link_dir, domain(link['url']))
    existing_file = wget_output_path(link)
    if os.path.exists(domain_dir) and existing_file:
        return {'output': existing_file, 'status': 'skipped'}

    if warc:
        warc_dir = os.path.join(link_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    CMD = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
        '--restrict-file-names=unix',
        '--timeout={}'.format(timeout),
        *(() if warc else ('--timestamping',)),
        *(('--warc-file={}'.format(warc_path),) if warc else ()),
        *(('--page-requisites',) if requisites else ()),
        *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
        *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate', '--no-hsts')),
        link['url'],
    ]
    end = progress(timeout, prefix=' ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        end()
        output = wget_output_path(link, look_in=domain_dir)

        output_tail = [
            ' ' + line
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
            if line.strip()
        ]
        # parse out number of files downloaded from "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        files_downloaded = (
            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
            if 'Downloaded:' in output_tail[-1] else
            0
        )

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            print(' Got wget response code {}:'.format(result.returncode))
            print('\n'.join(output_tail))
            if b'403: Forbidden' in result.stderr:
                raise Exception('403 Forbidden (try changing WGET_USER_AGENT)')
            if b'404: Not Found' in result.stderr:
                raise Exception('404 Not Found')
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise Exception('500 Internal Server Error')
            raise Exception('Got an error from the server')
    except Exception as e:
        end()
        output = e
        print_error_hints(cmd=CMD, pwd=link_dir, err=e)

    return {
        'cmd': CMD,
        'output': output,
    }