Exemple #1
0
def write_json_link_index(out_dir, link):
    """Dump a single link's info as pretty-printed JSON to <out_dir>/index.json"""

    # validate first so a malformed link never reaches the disk
    check_link_structure(link)
    index_path = os.path.join(out_dir, 'index.json')

    print('      √ index.json')

    with open(index_path, mode='w', encoding='utf-8') as index_file:
        # default=str stringifies any value json cannot serialize natively
        json.dump(link, index_file, indent=4, default=str)

    chmod_file(index_path)
Exemple #2
0
def fetch_screenshot(link_dir,
                     link,
                     timeout=TIMEOUT,
                     user_data_dir=CHROME_USER_DATA_DIR,
                     resolution=RESOLUTION):
    """Capture screenshot.png of the link's url with headless chrome.

    Returns a dict whose 'output' is 'screenshot.png' on success, or the
    exception that was caught on failure. PDF/image links and links that
    already have a screenshot return early without a 'cmd' entry.
    """

    # PDFs and images are not rendered; report the wget output path instead
    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    already_saved = os.path.exists(os.path.join(link_dir, 'screenshot.png'))
    if already_saved:
        return {'output': 'screenshot.png', 'status': 'skipped'}

    cmd = list(chrome_headless(user_data_dir=user_data_dir))
    cmd += [
        '--screenshot',
        '--window-size={}'.format(resolution),
        '--hide-scrollbars',
        '--timeout={}'.format(timeout * 1000),  # seconds -> milliseconds
    ]
    if not CHECK_SSL_VALIDITY:
        cmd += ['--disable-web-security', '--ignore-certificate-errors']
    # '--full-page',   # TODO: make this actually work using ./bin/screenshot fullPage: true
    cmd.append(link['url'])

    stop_progress = progress(timeout, prefix='      ')
    try:
        # chrome writes screenshot.png into its working directory (link_dir)
        proc = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        stop_progress()
        if proc.returncode:
            print('     ', (proc.stderr or proc.stdout).decode())
            raise Exception('Failed to take screenshot')
        chmod_file('screenshot.png', cwd=link_dir)
        output = 'screenshot.png'
    except Exception as err:
        stop_progress()
        print('        {}Failed: {} {}{}'.format(
            ANSI['red'], type(err).__name__, err, ANSI['reset']))
        print('        Run to see full output:')
        print('            cd {};'.format(link_dir))
        print('            {}'.format(' '.join(cmd)))
        output = err

    return {'cmd': cmd, 'output': output}
Exemple #3
0
def archive_dot_org(link_dir, link, timeout=TIMEOUT):
    """submit site to archive.org for archiving via their service, save returned archive url

    link_dir: folder of the link; the saved url is cached in archive.org.txt there
    link:     link dict, only link['url'] is read
    timeout:  seconds shown on the progress bar (curl gets one extra second)

    Returns {'output': <cached url>, 'status': 'skipped'} when archive.org.txt
    already exists, otherwise {'cmd': ..., 'output': ...} where output is the
    saved archive url, the submit url (robots.txt denial), or the caught exception.
    """

    path = os.path.join(link_dir, 'archive.org.txt')
    if os.path.exists(path):
        # use a context manager so the handle is closed, and read with the
        # same encoding the file is written with further down
        with open(path, 'r', encoding='utf-8') as f:
            archive_org_url = f.read().strip()
        return {'output': archive_org_url, 'status': 'skipped'}

    # the query string is dropped from the url before submitting it
    submit_url = 'https://web.archive.org/save/{}'.format(link['url'].split('?', 1)[0])

    success = False
    CMD = ['curl', '-I', submit_url]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1)  # archive.org.txt
        end()

        # Parse archive.org response headers
        headers = result.stdout.splitlines()
        content_location = [h for h in headers if b'Content-Location: ' in h]
        errors = [h for h in headers if h and b'X-Archive-Wayback-Runtime-Error: ' in h]

        if content_location:
            archive_path = content_location[0].split(b'Content-Location: ', 1)[-1].decode('utf-8')
            saved_url = 'https://web.archive.org{}'.format(archive_path)
            success = True

        elif len(errors) == 1 and b'RobotAccessControlException' in errors[0]:
            # denied by robots.txt: not treated as a failure, report the submit url
            output = submit_url
            # raise Exception('Archive.org denied by {}/robots.txt'.format(link['domain']))
        elif errors:
            raise Exception(', '.join(e.decode() for e in errors))
        else:
            raise Exception('Failed to find "Content-Location" URL header in Archive.org response.')
    except Exception as e:
        end()
        print('       Visit url to see output:', ' '.join(CMD))
        print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        output = e

    if success:
        # cache the archive url so later runs are skipped
        with open(path, 'w', encoding='utf-8') as f:
            f.write(saved_url)
        chmod_file('archive.org.txt', cwd=link_dir)
        output = saved_url

    return {
        'cmd': CMD,
        'output': output,
    }
Exemple #4
0
def write_html_link_index(out_dir, link):
    """Render the link_index.html template for one link into <out_dir>/index.html"""

    check_link_structure(link)

    template_path = os.path.join(TEMPLATES_DIR, 'link_index.html')
    with open(template_path, mode='r', encoding='utf-8') as template_file:
        template_text = template_file.read()

    index_path = os.path.join(out_dir, 'index.html')

    print('      √ index.html')

    with open(index_path, mode='w', encoding='utf-8') as index_file:
        # substitution values come from the derived link info only
        # (link['latest'] is deliberately not merged in)
        template_vars = dict(derived_link_info(link))
        rendered = Template(template_text).substitute(template_vars)
        index_file.write(rendered)

    chmod_file(index_path)
Exemple #5
0
def fetch_pdf(link_dir,
              link,
              timeout=TIMEOUT,
              user_data_dir=CHROME_USER_DATA_DIR):
    """Print the link's page to output.pdf with headless chrome.

    Returns a dict whose 'output' is 'output.pdf' on success, or the
    exception that was caught on failure. PDF/image links and links that
    already have an output.pdf return early without a 'cmd' entry.
    """

    # PDFs and images are not printed; report the wget output path instead
    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    already_saved = os.path.exists(os.path.join(link_dir, 'output.pdf'))
    if already_saved:
        return {'output': 'output.pdf', 'status': 'skipped'}

    cmd = list(chrome_headless(user_data_dir=user_data_dir))
    cmd += [
        '--print-to-pdf',
        '--hide-scrollbars',
        '--timeout={}'.format(timeout * 1000),  # seconds -> milliseconds
    ]
    if not CHECK_SSL_VALIDITY:
        cmd += ['--disable-web-security', '--ignore-certificate-errors']
    cmd.append(link['url'])

    stop_progress = progress(timeout, prefix='      ')
    try:
        # chrome writes output.pdf into its working directory (link_dir)
        proc = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        stop_progress()
        if proc.returncode:
            print('     ', (proc.stderr or proc.stdout).decode())
            raise Exception('Failed to print PDF')
        chmod_file('output.pdf', cwd=link_dir)
        output = 'output.pdf'
    except Exception as err:
        stop_progress()
        print('        {}Failed: {} {}{}'.format(
            ANSI['red'], type(err).__name__, err, ANSI['reset']))
        print('        Run to see full output:')
        print('            cd {};'.format(link_dir))
        print('            {}'.format(' '.join(cmd)))
        output = err

    return {'cmd': cmd, 'output': output}
def fetch_screenshot(link_dir,
                     link,
                     timeout=TIMEOUT,
                     user_data_dir=CHROME_USER_DATA_DIR,
                     resolution=RESOLUTION,
                     firefox_profile=FIREFOX_PROFILE):
    """Capture screenshot.png of the link's url with headless firefox.

    user_data_dir and resolution are accepted but never read by this
    function; only firefox_profile is passed on to the browser command.

    Returns a dict whose 'output' is 'screenshot.png' on success, or the
    exception that was caught on failure. PDF/image links and links that
    already have a screenshot return early without a 'cmd' entry.
    """

    # PDFs and images are not rendered; report the wget output path instead
    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    already_saved = os.path.exists(os.path.join(link_dir, 'screenshot.png'))
    if already_saved:
        return {'output': 'screenshot.png', 'status': 'skipped'}

    cmd = list(firefox_headless(profile=firefox_profile))
    cmd.extend(['--screenshot', link['url']])

    stop_progress = progress(timeout, prefix='      ')
    try:
        # the subprocess gets one second more than the progress bar shows
        proc = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)
        stop_progress()
        if proc.returncode:
            print('     ', (proc.stderr or proc.stdout).decode())
            raise Exception('Failed to take screenshot')
        chmod_file('screenshot.png', cwd=link_dir)
        output = 'screenshot.png'
    except Exception as err:
        stop_progress()
        rerun_hint = 'cd {}; {}'.format(link_dir, ' '.join(cmd))
        print('        Run to see full output:', rerun_hint)
        print('        {}Failed: {} {}{}'.format(
            ANSI['red'], type(err).__name__, err, ANSI['reset']))
        output = err

    return {'cmd': cmd, 'output': output}
Exemple #7
0
def fetch_dom(link_dir,
              link,
              timeout=TIMEOUT,
              user_data_dir=CHROME_USER_DATA_DIR):
    """print HTML of site to file using chrome --dump-dom

    link_dir:      folder of the link; the DOM is written to output.html there
    link:          link dict, link['type'] and link['url'] are read
    timeout:       seconds allowed for chrome (passed to chrome in milliseconds)
    user_data_dir: forwarded to chrome_headless()

    Returns {'cmd': ..., 'output': ...} where output is 'output.html' on
    success or the caught exception on failure. PDF/image links and links
    that already have an output.html return early without a 'cmd' entry.
    """

    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    output_path = os.path.join(link_dir, 'output.html')

    if os.path.exists(output_path):
        return {'output': 'output.html', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir), '--dump-dom',
        '--timeout={}'.format((timeout) * 1000), link['url']
    ]
    end = progress(timeout, prefix='      ')
    try:
        # chrome prints the DOM on stdout, which is redirected into the file
        with open(output_path, 'w+') as f:
            result = run(CMD,
                         stdout=f,
                         stderr=PIPE,
                         cwd=link_dir,
                         timeout=timeout)  # output.html
        end()
        if result.returncode:
            print('     ', (result.stderr).decode())
            raise Exception('Failed to fetch DOM')
        chmod_file('output.html', cwd=link_dir)
        output = 'output.html'
    except Exception as e:
        end()
        # Remove the empty/partial dump: the file did not exist when this
        # attempt started, and leaving it behind would make the exists()
        # check above report the link as 'skipped' on every later run.
        try:
            os.remove(output_path)
        except OSError:
            # nothing was created (or it is already gone) - nothing to clean up
            pass
        print('        {}Failed: {} {}{}'.format(ANSI['red'],
                                                 e.__class__.__name__, e,
                                                 ANSI['reset']))
        print('        Run to see full output:')
        print('            cd {};'.format(link_dir))
        print('            {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }
Exemple #8
0
def write_json_links_index(out_dir, links):
    """Dump the full list of links plus index metadata to <out_dir>/index.json"""

    index_path = os.path.join(out_dir, 'index.json')

    # unix timestamp of this write, stored as a string
    updated_at = str(datetime.now().timestamp())
    link_count = len(links)

    payload = {
        'info': 'Bookmark Archiver Index',
        'help': 'https://github.com/pirate/bookmark-archiver',
        'version': GIT_SHA,
        'num_links': link_count,
        'updated': updated_at,
        'links': links,
    }

    with open(index_path, mode='w', encoding='utf-8') as index_file:
        # default=str stringifies any value json cannot serialize natively
        json.dump(payload, index_file, indent=4, default=str)

    chmod_file(index_path)
Exemple #9
0
def fetch_screenshot(link_dir, link, timeout=TIMEOUT):
    """take screenshot of site using headless chrome, or firefox when chrome is unavailable

    link_dir: folder of the link; used as the browser's working directory
    link:     link dict, only link['url'] is read
    timeout:  seconds allowed for the browser process

    Returns a dict with 'cmd', 'pwd', 'output', 'status' and the timer stats.
    output is 'screenshot.png' on success or the caught exception on failure;
    status is 'succeeded' or 'failed'.
    """

    output = 'screenshot.png'
    if CHROME_AVAILABLE:
        cmd = [
            *chrome_args(TIMEOUT=timeout),
            '--screenshot',
            link['url'],
        ]
    elif FIREFOX_AVAILABLE:
        cmd = [
            *firefox_args(),
            '--screenshot',
            link['url'],
        ]
    else:
        # No browser at all: keep cmd bound so the result dict below can still
        # be built (previously this path ended in an UnboundLocalError).
        cmd = []
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')
    try:
        if not cmd:
            raise ArchiveError(
                'Failed to take screenshot',
                'Neither Chrome nor Firefox is available.',
            )

        result = run(cmd,
                     stdout=PIPE,
                     stderr=PIPE,
                     cwd=link_dir,
                     timeout=timeout)

        if result.returncode:
            hints = (result.stderr or result.stdout).decode()
            raise ArchiveError('Failed to take screenshot', hints)

        chmod_file(output, cwd=link_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        # always stop the progress timer, success or failure
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
Exemple #10
0
def fetch_pdf(link_dir,
              link,
              timeout=TIMEOUT,
              user_data_dir=CHROME_USER_DATA_DIR):
    """Print the link's page to output.pdf with headless chrome.

    Returns a dict whose 'output' is 'output.pdf' on success, or the
    exception that was caught on failure. PDF/image links and links that
    already have an output.pdf return early without a 'cmd' entry.
    """

    # PDFs and images are not printed; report html_appended_url(link) instead
    if link['type'] in ('PDF', 'image'):
        return {'output': html_appended_url(link)}

    already_saved = os.path.exists(os.path.join(link_dir, 'output.pdf'))
    if already_saved:
        return {'output': 'output.pdf', 'status': 'skipped'}

    cmd = list(chrome_headless(user_data_dir=user_data_dir))
    cmd.extend(['--print-to-pdf', link['url']])

    stop_progress = progress(timeout, prefix='      ')
    try:
        # the subprocess gets one second more than the progress bar shows;
        # chrome writes output.pdf into its working directory (link_dir)
        proc = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)
        stop_progress()
        if proc.returncode:
            print('     ', (proc.stderr or proc.stdout).decode())
            raise Exception('Failed to print PDF')
        chmod_file('output.pdf', cwd=link_dir)
        output = 'output.pdf'
    except Exception as err:
        stop_progress()
        rerun_hint = 'cd {}; {}'.format(link_dir, ' '.join(cmd))
        print('       Run to see full output:', rerun_hint)
        print('       {}Failed: {} {}{}'.format(
            ANSI['red'], type(err).__name__, err, ANSI['reset']))
        output = err

    return {'cmd': cmd, 'output': output}
Exemple #11
0
def write_html_link_index(out_dir, link):
    """Render the link index template for one link into <out_dir>/index.html"""

    with open(LINK_INDEX_TEMPLATE, mode='r', encoding='utf-8') as template_file:
        template_text = template_file.read()

    index_path = os.path.join(out_dir, 'index.html')

    print('      √ index.html')

    date_format = '%Y-%m-%d %H:%M'

    with open(index_path, mode='w', encoding='utf-8') as index_file:
        latest = link['latest']

        # later updates win: link fields < latest results < computed fields
        context = dict(link)
        context.update(latest)
        context.update({
            'type': link['type'] or 'website',
            'tags': link['tags'] or 'untagged',
            'bookmarked': datetime.fromtimestamp(float(link['timestamp'])).strftime(date_format),
            'updated': datetime.fromtimestamp(float(link['updated'])).strftime(date_format),
            # fall back to the "save" url when no archive.org result is stored
            'archive_org': latest.get('archive_org') or 'https://web.archive.org/save/{}'.format(link['url']),
            'wget': latest.get('wget') or wget_output_path(link),
        })

        index_file.write(Template(template_text).substitute(context))

    chmod_file(index_path)
Exemple #12
0
def write_html_links_index(out_dir, links):
    """write the html link index to a given path

    out_dir: folder that receives index.html, robots.txt and the static/ tree
    links:   iterable of link dicts; each one is rendered as one index row

    Side effects: copies TEMPLATES_DIR/static into out_dir/static, writes a
    robots.txt that disallows everything, writes index.html and chmods it.
    """

    check_links_structure(links)

    path = os.path.join(out_dir, 'index.html')

    copy_tree(os.path.join(TEMPLATES_DIR, 'static'),
              os.path.join(out_dir, 'static'))

    with open(os.path.join(out_dir, 'robots.txt'), 'w+') as f:
        f.write('User-agent: *\nDisallow: /')

    with open(os.path.join(TEMPLATES_DIR, 'index.html'), 'r',
              encoding='utf-8') as f:
        index_html = f.read()

    with open(os.path.join(TEMPLATES_DIR, 'index_row.html'),
              'r',
              encoding='utf-8') as f:
        link_row_html = f.read()

    # build the row template once instead of once per link
    row_template = Template(link_row_html)
    link_rows = '\n'.join(
        row_template.substitute(**derived_link_info(link))
        for link in links)

    # read the clock once so date_updated and time_updated always agree
    now = datetime.now()

    template_vars = {
        'num_links': len(links),
        'date_updated': now.strftime('%Y-%m-%d'),
        'time_updated': now.strftime('%Y-%m-%d %H:%M'),
        'footer_info': FOOTER_INFO,
        'git_sha': GIT_SHA,
        'short_git_sha': GIT_SHA[:8],
        'rows': link_rows,
    }

    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(index_html).substitute(**template_vars))

    chmod_file(path)
Exemple #13
0
def fetch_dom(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
    """print HTML of site to file using chrome --dump-dom

    link_dir:      folder of the link; the DOM is written to output.html there
    link:          link dict, link['type'] and link['url'] are read
    timeout:       seconds allowed for chrome (passed to chrome in milliseconds)
    user_data_dir: forwarded to chrome_headless()

    Returns {'cmd': ..., 'output': ...} where output is 'output.html' on
    success or the caught exception on failure. PDF/image links and links
    that already have an output.html return early without a 'cmd' entry.
    """

    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    output_path = os.path.join(link_dir, 'output.html')

    if os.path.exists(output_path):
        return {'output': 'output.html', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--dump-dom',
        '--timeout={}'.format((timeout) * 1000),
        # only override the user agent when one is configured
        *(('--user-agent={}'.format(CHROME_USER_AGENT),) if CHROME_USER_AGENT else ()),
        link['url']
    ]
    end = progress(timeout, prefix='      ')
    try:
        # chrome prints the DOM on stdout, which is redirected into the file
        with open(output_path, 'w+') as f:
            result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout)
        end()
        if result.returncode:
            print('     ', (result.stderr).decode())
            raise Exception('Failed to fetch DOM')
        chmod_file('output.html', cwd=link_dir)
        output = 'output.html'
    except Exception as e:
        end()
        # Remove the empty/partial dump: the file did not exist when this
        # attempt started, and leaving it behind would make the exists()
        # check above report the link as 'skipped' on every later run.
        try:
            os.remove(output_path)
        except OSError:
            # nothing was created (or it is already gone) - nothing to clean up
            pass
        output = e
        print_error_hints(cmd=CMD, pwd=link_dir, err=e)

    return {
        'cmd': CMD,
        'output': output,
    }
Exemple #14
0
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
    """download site favicon from google's favicon api

    link_dir: folder of the link; the icon is written to favicon.ico there
    link:     link dict, only link['url'] is read
    timeout:  seconds allowed for curl (also passed as --max-time)

    Returns {'cmd': ..., 'output': ...} where output is 'favicon.ico' on
    success or the caught exception on failure. Links that already have a
    favicon.ico return early with status 'skipped' and no 'cmd' entry.
    """

    favicon_path = os.path.join(link_dir, 'favicon.ico')
    if os.path.exists(favicon_path):
        return {'output': 'favicon.ico', 'status': 'skipped'}

    CMD = [
        CURL_BINARY,
        '--max-time',
        str(timeout),
        *(() if CHECK_SSL_VALIDITY else ('--insecure', )),
        'https://www.google.com/s2/favicons?domain={}'.format(
            domain(link['url'])),
    ]
    # The context manager closes the handle on every path, including when
    # progress() itself raises; curl writes to the underlying descriptor.
    with open(favicon_path, 'w') as fout:
        end = progress(timeout, prefix='      ')
        try:
            run(CMD, stdout=fout, stderr=DEVNULL, cwd=link_dir,
                timeout=timeout)  # favicon.ico
            end()
            chmod_file('favicon.ico', cwd=link_dir)
            output = 'favicon.ico'
        except Exception as e:
            end()
            print('        {}Failed: {} {}{}'.format(ANSI['red'],
                                                     e.__class__.__name__, e,
                                                     ANSI['reset']))
            print('        Run to see full output:')
            print('            {}'.format(' '.join(CMD)))
            output = e

    return {
        'cmd': CMD,
        'output': output,
    }
Exemple #15
0
def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
    """Print the link's page to output.pdf with headless chrome.

    Returns a dict whose 'output' is 'output.pdf' on success, or the
    exception that was caught on failure. PDF/image links and links that
    already have an output.pdf return early without a 'cmd' entry.
    """

    # PDFs and images are not printed; report the wget output path instead
    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    already_saved = os.path.exists(os.path.join(link_dir, 'output.pdf'))
    if already_saved:
        return {'output': 'output.pdf', 'status': 'skipped'}

    cmd = list(chrome_headless(user_data_dir=user_data_dir))
    cmd += [
        '--print-to-pdf',
        '--hide-scrollbars',
        '--timeout={}'.format(timeout * 1000),  # seconds -> milliseconds
    ]
    if not CHECK_SSL_VALIDITY:
        cmd += ['--disable-web-security', '--ignore-certificate-errors']
    if CHROME_USER_AGENT:
        # only override the user agent when one is configured
        cmd.append('--user-agent={}'.format(CHROME_USER_AGENT))
    cmd.append(link['url'])

    stop_progress = progress(timeout, prefix='      ')
    try:
        proc = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        stop_progress()
        if proc.returncode:
            print('     ', (proc.stderr or proc.stdout).decode())
            raise Exception('Failed to print PDF')
        chmod_file('output.pdf', cwd=link_dir)
        output = 'output.pdf'
    except Exception as err:
        stop_progress()
        output = err
        print_error_hints(cmd=cmd, pwd=link_dir, err=err)

    return {'cmd': cmd, 'output': output}
Exemple #16
0
def fetch_pdf(link_dir, link, timeout=TIMEOUT):
    """Print the link's page to output.pdf with headless chrome.

    Returns a dict with 'cmd', 'pwd', 'output', 'status' and the timer stats.
    output is 'output.pdf' on success or the caught exception on failure.
    """

    output, status = 'output.pdf', 'succeeded'

    cmd = list(chrome_args(TIMEOUT=timeout))
    cmd.extend(['--print-to-pdf', link['url']])

    timer = TimedProgress(timeout, prefix='      ')
    try:
        proc = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)

        if proc.returncode:
            # prefer stderr, fall back to stdout, as the failure hints
            hints = (proc.stderr or proc.stdout).decode()
            raise ArchiveError('Failed to print PDF', hints)

        chmod_file('output.pdf', cwd=link_dir)
    except Exception as err:
        status, output = 'failed', err
    finally:
        # always stop the progress timer, success or failure
        timer.end()

    result = {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
    }
    result.update(timer.stats)
    return result
Exemple #17
0
def archive_dot_org(link_dir, link, timeout=TIMEOUT):
    """submit site to archive.org for archiving via their service, save returned archive url

    link_dir: folder of the link; the saved url is cached in archive.org.txt there
    link:     link dict, only link['url'] is read
    timeout:  seconds allowed for curl (also passed as --max-time)

    Returns {'output': <cached url>, 'status': 'skipped'} when archive.org.txt
    already exists, otherwise {'cmd': ..., 'output': ...} where output is the
    saved archive url, the submit url (robots.txt denial), or the caught exception.
    """

    path = os.path.join(link_dir, 'archive.org.txt')
    if os.path.exists(path):
        # use a context manager so the handle is closed, and read with the
        # same encoding the file is written with further down
        with open(path, 'r', encoding='utf-8') as f:
            archive_org_url = f.read().strip()
        return {'output': archive_org_url, 'status': 'skipped'}

    submit_url = 'https://web.archive.org/save/{}'.format(link['url'])

    success = False
    CMD = [
        'curl',
        '--location',
        '--head',
        '--max-time',
        str(timeout),
        '--get',
        *(() if CHECK_SSL_VALIDITY else ('--insecure', )),
        submit_url,
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD,
                     stdout=PIPE,
                     stderr=DEVNULL,
                     cwd=link_dir,
                     timeout=timeout)  # archive.org.txt
        end()

        # Parse archive.org response headers
        headers = defaultdict(list)

        # lowercase all the header names and store in dict
        for header in result.stdout.splitlines():
            if b':' not in header or not header.strip():
                continue
            name, val = header.decode().split(':', 1)
            headers[name.lower().strip()].append(val.strip())

        # Get successful archive url in "content-location" header or any errors
        content_location = headers['content-location']
        errors = headers['x-archive-wayback-runtime-error']

        if content_location:
            saved_url = 'https://web.archive.org{}'.format(content_location[0])
            success = True
        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
            # denied by robots.txt: not treated as a failure, report the submit url
            output = submit_url
            # raise Exception('Archive.org denied by {}/robots.txt'.format(link['domain']))
        elif errors:
            raise Exception(', '.join(errors))
        else:
            raise Exception(
                'Failed to find "content-location" URL header in Archive.org response.'
            )
    except Exception as e:
        end()
        print('        {}Failed: {} {}{}'.format(ANSI['red'],
                                                 e.__class__.__name__, e,
                                                 ANSI['reset']))
        print('        Run to see full output:')
        print('            {}'.format(' '.join(CMD)))
        output = e

    if success:
        # cache the archive url so later runs are skipped
        with open(path, 'w', encoding='utf-8') as f:
            f.write(saved_url)
        chmod_file('archive.org.txt', cwd=link_dir)
        output = saved_url

    return {
        'cmd': CMD,
        'output': output,
    }
Exemple #18
0
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
    """Download playlists or individual video, audio, and subtitles using youtube-dl

    link_dir:  folder of the link; downloads go into its media/ subfolder
    link:      link dict, only link['url'] is read
    timeout:   seconds shown on the progress bar (youtube-dl gets one extra second)
    overwrite: when False, an existing media/ folder short-circuits with 'skipped'

    Returns {'cmd': ..., 'output': ...} where output is 'media' or the caught
    exception. A non-zero exit whose stderr matches one of the known
    "not a media page" messages is not treated as a failure.
    """

    output = os.path.join(link_dir, 'media')
    already_done = os.path.exists(output)  # and os.listdir(output)
    if already_done and not overwrite:
        return {'output': 'media', 'status': 'skipped'}

    os.makedirs(output, exist_ok=True)
    # NOTE: a bare '--user-agent' used to sit in front of '--all-subs'. The
    # option takes a value, so it swallowed '--all-subs' as the user agent
    # string and subtitles were never requested. No user agent value is
    # available here, so the dangling flag is dropped.
    # The unconditional '--no-check-certificate' is dropped as well, so that
    # CHECK_SSL_VALIDITY actually decides whether certificates are verified.
    CMD = [
        YOUTUBEDL_BINARY,
        '--write-description',
        '--write-info-json',
        '--write-annotations',
        '--yes-playlist',
        '--write-thumbnail',
        '--no-call-home',
        '--all-subs',
        '--extract-audio',
        '--keep-video',
        '--ignore-errors',
        '--geo-bypass',
        '--audio-format',
        'mp3',
        '--audio-quality',
        '320K',
        '--embed-thumbnail',
        '--add-metadata',
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate', )),
        link['url'],
    ]

    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD,
                     stdout=PIPE,
                     stderr=PIPE,
                     cwd=output,
                     timeout=timeout + 1)
        chmod_file('media', cwd=link_dir)
        output = 'media'
        end()
        if result.returncode:
            if (b'ERROR: Unsupported URL' in result.stderr
                    or b'HTTP Error 404' in result.stderr
                    or b'HTTP Error 403' in result.stderr
                    or b'URL could be a direct video link' in result.stderr
                    or b'Unable to extract container ID' in result.stderr):
                # These happen too frequently on non-media pages to warrant printing to console
                pass
            else:
                print('        got youtubedl response code {}:'.format(
                    result.returncode))
                print(result.stderr)
                raise Exception('Failed to download media')
    except Exception as e:
        end()
        output = e
        print_error_hints(cmd=CMD, pwd=link_dir, err=e)

    return {
        'cmd': CMD,
        'output': output,
    }
Exemple #19
0
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
    """Download playlists or individual video, audio, and subtitles using youtube-dl

    link_dir:  folder of the link; downloads go into its media/ subfolder
    link:      link dict, only link['url'] is read
    timeout:   seconds shown on the progress bar (youtube-dl gets one extra second)
    overwrite: when False, an existing media/ folder short-circuits with 'skipped'

    Returns {'cmd': ..., 'output': ...} where output is 'media' or the caught
    exception. A non-zero exit whose stderr matches one of the known
    "not a media page" messages is not treated as a failure.
    """

    output = os.path.join(link_dir, 'media')
    already_done = os.path.exists(output)  # and os.listdir(output)
    if already_done and not overwrite:
        return {'output': 'media', 'status': 'skipped'}

    os.makedirs(output, exist_ok=True)
    # NOTE: a bare '--user-agent' used to sit in front of '--all-subs'. The
    # option takes a value, so it swallowed '--all-subs' as the user agent
    # string and subtitles were never requested. No user agent value is
    # available here, so the dangling flag is dropped.
    CMD = [
        'youtube-dl',
        '--write-description',
        '--write-info-json',
        '--write-annotations',
        '--yes-playlist',
        '--write-thumbnail',
        '--no-call-home',
        '--no-check-certificate',
        '--all-subs',
        '-x',
        '-k',
        '--audio-format',
        'mp3',
        '--audio-quality',
        '320K',
        '--embed-thumbnail',
        '--add-metadata',
        link['url'],
    ]

    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD,
                     stdout=PIPE,
                     stderr=PIPE,
                     cwd=output,
                     timeout=timeout + 1)  # audio/audio.mp3
        chmod_file('media', cwd=link_dir)
        output = 'media'
        end()
        if result.returncode:
            if (b'ERROR: Unsupported URL' in result.stderr
                    or b'HTTP Error 404' in result.stderr
                    or b'HTTP Error 403' in result.stderr
                    or b'URL could be a direct video link' in result.stderr
                    or b'Unable to extract container ID' in result.stderr):
                # These happen too frequently on non-media pages to warrant printing to console
                pass
            else:
                print('        got youtubedl response code {}:'.format(
                    result.returncode))
                print(result.stderr)
                raise Exception('Failed to download media')
    except Exception as e:
        end()
        print('        {}Failed: {} {}{}'.format(ANSI['red'],
                                                 e.__class__.__name__, e,
                                                 ANSI['reset']))
        print('        Run to see full output:')
        print('            cd {};'.format(link_dir))
        print('            {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }
Exemple #20
0
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT):
    """Download playlists or individual video, audio, and subtitles using youtube-dl

    link_dir: folder of the link; downloads go into its media/ subfolder
    link:     link dict, only link['url'] is read
    timeout:  seconds shown on the progress timer (youtube-dl gets one extra second)

    Returns a dict with 'cmd', 'pwd', 'output', 'status' and the timer stats.
    output is 'media' or the caught exception. A non-zero exit whose stderr
    matches one of the known "not a media page" messages is not a failure.
    """

    output = 'media'
    output_path = os.path.join(link_dir, 'media')
    os.makedirs(output_path, exist_ok=True)
    # NOTE: a bare '--user-agent' used to sit in front of '--all-subs'. The
    # option takes a value, so it swallowed '--all-subs' as the user agent
    # string and subtitles were never requested. No user agent value is
    # available here, so the dangling flag is dropped.
    # The unconditional '--no-check-certificate' is dropped as well, so that
    # CHECK_SSL_VALIDITY actually decides whether certificates are verified.
    cmd = [
        YOUTUBEDL_BINARY,
        '--write-description',
        '--write-info-json',
        '--write-annotations',
        '--yes-playlist',
        '--write-thumbnail',
        '--no-call-home',
        '--all-subs',
        '--extract-audio',
        '--keep-video',
        '--ignore-errors',
        '--geo-bypass',
        '--audio-format',
        'mp3',
        '--audio-quality',
        '320K',
        '--embed-thumbnail',
        '--add-metadata',
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate', )),
        link['url'],
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')
    try:
        result = run(cmd,
                     stdout=PIPE,
                     stderr=PIPE,
                     cwd=output_path,
                     timeout=timeout + 1)
        chmod_file(output, cwd=link_dir)
        if result.returncode:
            if (b'ERROR: Unsupported URL' in result.stderr
                    or b'HTTP Error 404' in result.stderr
                    or b'HTTP Error 403' in result.stderr
                    or b'URL could be a direct video link' in result.stderr
                    or b'Unable to extract container ID' in result.stderr):
                # These happen too frequently on non-media pages to warrant printing to console
                pass
            else:
                hints = (
                    'Got youtube-dl response code: {}.'.format(
                        result.returncode),
                    *result.stderr.decode().split('\n'),
                )
                raise ArchiveError('Failed to download media', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        # always stop the progress timer, success or failure
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
Exemple #21
0
def archive_dot_org(link_dir, link, timeout=TIMEOUT):
    """Ask archive.org to save the link's url and record the resulting url.

    Returns a dict with 'cmd', 'pwd', 'output', 'status' and the timer stats.
    output is the url written to archive.org.txt, or the caught exception
    when the submission failed.
    """

    output, status = 'archive.org.txt', 'succeeded'
    archive_org_url = None
    submit_url = 'https://web.archive.org/save/{}'.format(link['url'])

    # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
    user_agent = 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA)

    cmd = [
        CURL_BINARY,
        '--location',
        '--head',
        '--user-agent',
        user_agent,
        '--max-time',
        str(timeout),
    ]
    if not CHECK_SSL_VALIDITY:
        cmd.append('--insecure')
    cmd.append(submit_url)

    timer = TimedProgress(timeout, prefix='      ')
    try:
        proc = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout)
        content_location, errors = parse_archive_dot_org_response(proc.stdout)

        if content_location:
            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
            # denied by robots.txt: not a failure, archive_org_url stays None
            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link['url'])))
            pass
        elif errors:
            raise ArchiveError(', '.join(errors))
        else:
            raise ArchiveError(
                'Failed to find "content-location" URL header in Archive.org response.'
            )
    except Exception as err:
        status, output = 'failed', err
    finally:
        # always stop the progress timer, success or failure
        timer.end()

    if status != 'failed':
        # When archive.org rejected the url there is no archive url to store.
        # Store the submit url instead of None: visiting it retries the
        # archiving and shows archive.org's own explanation if it fails again.
        if not archive_org_url:
            archive_org_url = submit_url
        txt_path = os.path.join(link_dir, output)
        with open(txt_path, mode='w', encoding='utf-8') as txt_file:
            txt_file.write(archive_org_url)
        chmod_file('archive.org.txt', cwd=link_dir)
        output = archive_org_url

    result = {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
    }
    result.update(timer.stats)
    return result