Example #1
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT):
    """Download playlists or individual video, audio, and subtitles using youtube-dl"""

    output = 'media'
    output_path = os.path.join(link_dir, 'media')
    os.makedirs(output_path, exist_ok=True)
    cmd = [
        YOUTUBEDL_BINARY,
        '--write-description',
        '--write-info-json',
        '--write-annotations',
        '--yes-playlist',
        '--write-thumbnail',
        '--no-call-home',
        '--all-subs',
        '--extract-audio',
        '--keep-video',
        '--ignore-errors',
        '--geo-bypass',
        '--audio-format',
        'mp3',
        '--audio-quality',
        '320K',
        '--embed-thumbnail',
        '--add-metadata',
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate', )),
        link['url'],
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')
    try:
        result = run(cmd,
                     stdout=PIPE,
                     stderr=PIPE,
                     cwd=output_path,
                     timeout=timeout + 1)
        chmod_file(output, cwd=link_dir)
        if result.returncode:
            if (b'ERROR: Unsupported URL' in result.stderr
                    or b'HTTP Error 404' in result.stderr
                    or b'HTTP Error 403' in result.stderr
                    or b'URL could be a direct video link' in result.stderr
                    or b'Unable to extract container ID' in result.stderr):
                # These happen too frequently on non-media pages to warrant printing to console
                pass
            else:
                hints = (
                    'Got youtube-dl response code: {}.'.format(
                        result.returncode),
                    *result.stderr.decode().split('\n'),
                )
                raise ArchiveError('Failed to download media', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
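
All three examples rely on two helpers that are not shown on this page: TimedProgress and ArchiveError. The minimal sketch below is inferred only from how they are used above (the constructor signature, .end(), .stats, and an exception carrying hint lines); the real ArchiveBox versions also render an animated progress bar, and the exact .stats keys here are an assumption.

from datetime import datetime

class ArchiveError(Exception):
    """Error raised by an archive method, optionally carrying extra hint lines."""

    def __init__(self, message, hints=None):
        super().__init__(message)
        self.hints = hints or ()

class TimedProgress:
    """Times one archive step (sketch; the real class also draws a progress bar)."""

    def __init__(self, seconds, prefix=''):
        self.seconds = seconds
        self.prefix = prefix
        self.start_ts = datetime.now()
        self.end_ts = None

    def end(self):
        # Called in each method's finally block to stamp the finish time
        self.end_ts = datetime.now()

    @property
    def stats(self):
        # Key names are an assumption based on the **timer.stats unpacking above
        end_ts = self.end_ts or datetime.now()
        return {
            'start_ts': self.start_ts,
            'end_ts': end_ts,
            'duration': (end_ts - self.start_ts).total_seconds(),
        }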
Example #2
def archive_dot_org(link_dir, link, timeout=TIMEOUT):
    """submit site to archive.org for archiving via their service, save returned archive url"""

    output = 'archive.org.txt'
    archive_org_url = None
    submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
    cmd = [
        CURL_BINARY,
        '--location',
        '--head',
        '--user-agent',
        'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(
            GIT_SHA
        ),  # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
        '--max-time',
        str(timeout),
        *(() if CHECK_SSL_VALIDITY else ('--insecure', )),
        submit_url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')
    try:
        result = run(cmd,
                     stdout=PIPE,
                     stderr=DEVNULL,
                     cwd=link_dir,
                     timeout=timeout)
        content_location, errors = parse_archive_dot_org_response(
            result.stdout)
        if content_location:
            archive_org_url = 'https://web.archive.org{}'.format(
                content_location[0])
        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
            archive_org_url = None
            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link['url'])))
        elif errors:
            raise ArchiveError(', '.join(errors))
        else:
            raise ArchiveError(
                'Failed to find "content-location" URL header in Archive.org response.'
            )
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    if not isinstance(output, Exception):
        # When archive.org rejects the URL, write the submit URL instead of None
        # so that opening the saved output re-attempts the archive, and surfaces
        # archive.org's friendlier error page explaining why it was rejected.
        archive_org_url = archive_org_url or submit_url
        with open(os.path.join(link_dir, output), 'w', encoding='utf-8') as f:
            f.write(archive_org_url)
        chmod_file('archive.org.txt', cwd=link_dir)
        output = archive_org_url

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
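
Example #2 calls parse_archive_dot_org_response() without defining it. From the call site one can tell it takes the raw bytes of a `curl --head` response and returns (content_location_values, error_values). The sketch below assumes the errors come from the Wayback Machine's X-Archive-Wayback-Runtime-Error header, which is where strings like RobotAccessControlException appear; treat it as a plausible stand-in rather than the actual helper.

from collections import defaultdict

def parse_archive_dot_org_response(response):
    """Sketch: split raw `curl --head` bytes into a lowercase header -> values map."""
    headers = defaultdict(list)
    for line in response.splitlines():
        if b':' in line:
            name, val = line.decode(errors='replace').split(':', 1)
            headers[name.lower().strip()].append(val.strip())

    # The saved snapshot path arrives in Content-Location; robots.txt-style
    # rejections arrive in X-Archive-Wayback-Runtime-Error (assumed header name)
    content_location = headers['content-location']
    errors = headers['x-archive-wayback-runtime-error']
    return content_location, errors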
Example #3
def fetch_wget(link_dir, link, timeout=TIMEOUT):
    """download full site using wget"""

    if FETCH_WARC:
        warc_dir = os.path.join(link_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    output = None
    cmd = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e',
        'robots=off',
        '--restrict-file-names=unix',
        '--timeout={}'.format(timeout),
        *(('--compression=auto', ) if WGET_AUTO_COMPRESSION else ()),
        *(() if FETCH_WARC else ('--timestamping', )),
        *(('--warc-file={}'.format(warc_path), ) if FETCH_WARC else ()),
        *(('--page-requisites', ) if FETCH_WGET_REQUISITES else ()),
        *(('--user-agent={}'.format(WGET_USER_AGENT), ) if WGET_USER_AGENT else
          ()),
        *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
        *(() if CHECK_SSL_VALIDITY else
          ('--no-check-certificate', '--no-hsts')),
        link['url'],
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')
    try:
        result = run(cmd,
                     stdout=PIPE,
                     stderr=PIPE,
                     cwd=link_dir,
                     timeout=timeout)
        output = wget_output_path(link)

        # parse out number of files downloaded from last line of stderr:
        #  "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        output_tail = [
            line.strip()
            for line in (result.stdout +
                         result.stderr).decode().rsplit('\n', 3)[-3:]
            if line.strip()
        ]
        files_downloaded = (int(output_tail[-1].strip().split(' ', 2)[1] or 0)
                            if output_tail and 'Downloaded:' in output_tail[-1]
                            else 0)

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            hints = (
                'Got wget response code: {}.'.format(result.returncode),
                *output_tail,
            )
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError(
                    '403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Got an error from the server', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
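
All three methods share the same signature and return the same record shape, so a caller can drive them uniformly. A hypothetical invocation follows; the link fields and directory layout are illustrative assumptions, not the exact ArchiveBox schema.

import os

link = {'url': 'https://example.com', 'timestamp': '1570000000'}  # hypothetical record
link_dir = os.path.join('output', 'archive', link['timestamp'])   # assumed layout
os.makedirs(link_dir, exist_ok=True)

for method in (fetch_wget, fetch_media, archive_dot_org):
    result = method(link_dir, link)
    # Every method returns the same keys: cmd, pwd, output, status, plus timer stats
    print(result['status'], result['output'])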