# Shared imports assumed by every snippet below; module-level config values
# (TIMEOUT, FETCH_WGET_REQUISITES, WGET_USER_AGENT, CHECK_SSL_VALIDITY, ANSI,
# etc.) are defined elsewhere in the original project.
import os
from datetime import datetime
from subprocess import run, PIPE

def fetch_wget(link_dir,
               link,
               requisites=FETCH_WGET_REQUISITES,
               timeout=TIMEOUT):
    """download full site using wget"""

    domain_dir = os.path.join(link_dir, link['domain'])
    existing_file = wget_output_path(link)
    if os.path.exists(domain_dir) and existing_file:
        return {'output': existing_file, 'status': 'skipped'}

    CMD = [
        # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
        *'wget -N -E -np -x -H -k -K -S --restrict-file-names=unix'.split(' '),
        *(('-p', ) if requisites else ()),  # --page-requisites
        *(('--user-agent={}'.format(WGET_USER_AGENT), ) if WGET_USER_AGENT else
          ()),
        *((() if CHECK_SSL_VALIDITY else ('--no-check-certificate', ))),
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD,
                     stdout=PIPE,
                     stderr=PIPE,
                     cwd=link_dir,
                     timeout=timeout + 1)  # index.html
        end()
        output = wget_output_path(link, look_in=domain_dir)

        # Check for common failure cases
        if result.returncode > 0:
            print('        Got wget response code {}:'.format(
                result.returncode))
            if result.returncode != 8:
                print('\n'.join('          ' + line
                                for line in (result.stderr or result.stdout
                                             ).decode().rsplit('\n', 10)[-10:]
                                if line.strip()))
            if b'403: Forbidden' in result.stderr:
                raise Exception('403 Forbidden (try changing WGET_USER_AGENT)')
            if b'404: Not Found' in result.stderr:
                raise Exception('404 Not Found')
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise Exception('500 Internal Server Error')
            if result.returncode == 4:
                raise Exception('Failed wget download')
    except Exception as e:
        end()
        print('        Run to see full output:',
              'cd {}; {}'.format(link_dir, ' '.join(CMD)))
        print('        {}Failed: {} {}{}'.format(ANSI['red'],
                                                 e.__class__.__name__, e,
                                                 ANSI['reset']))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }
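
A hedged usage sketch: 'url', 'domain', and 'type' are the only link fields these snippets actually read, so a minimal link dict might look like the one below (the real link index likely carries more keys, and the archive path is hypothetical).

link = {
    'url': 'https://example.com/page',
    'domain': 'example.com',
    'type': None,
}
result = fetch_wget('/data/archive/1559170900', link)
if isinstance(result['output'], Exception):
    print('archiving failed:', result['output'])
else:
    print('saved main page at:', result['output'])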
Example #2
def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
    """print PDF of site to file using chrome --headless"""

    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}
    
    if os.path.exists(os.path.join(link_dir, 'output.pdf')):
        return {'output': 'output.pdf', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--print-to-pdf',
        link['url']
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # output.pdf
        end()
        if result.returncode:
            print('     ', (result.stderr or result.stdout).decode())
            raise Exception('Failed to print PDF')
        chmod_file('output.pdf', cwd=link_dir)
        output = 'output.pdf'
    except Exception as e:
        end()
        print('        Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }
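
The chrome_headless() helper is referenced but never shown in these snippets. A minimal sketch of what it might look like, assuming CHROME_BINARY is a config value naming the Chrome executable; the original project's exact flag set may differ.

def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR):
    """Assemble the base argv shared by all headless-Chrome fetchers (sketch)."""
    args = [binary, '--headless', '--disable-gpu']
    if user_data_dir:
        # reuse an existing browser profile so logged-in pages archive correctly
        args.append('--user-data-dir={}'.format(user_data_dir))
    return args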
Example #3
def fetch_screenshot(link_dir,
                     link,
                     timeout=TIMEOUT,
                     user_data_dir=CHROME_USER_DATA_DIR,
                     resolution=RESOLUTION):
    """take screenshot of site using chrome --headless"""

    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
        return {'output': 'screenshot.png', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--screenshot',
        '--window-size={}'.format(resolution),
        '--hide-scrollbars',
        '--timeout={}'.format(timeout * 1000),
        *(() if CHECK_SSL_VALIDITY else
          ('--disable-web-security', '--ignore-certificate-errors')),
        # '--full-page',   # TODO: make this actually work using ./bin/screenshot fullPage: true
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD,
                     stdout=PIPE,
                     stderr=PIPE,
                     cwd=link_dir,
                     timeout=timeout + 1)  # screenshot.png; +1s margin over Chrome's own --timeout
        end()
        if result.returncode:
            print('     ', (result.stderr or result.stdout).decode())
            raise Exception('Failed to take screenshot')
        chmod_file('screenshot.png', cwd=link_dir)
        output = 'screenshot.png'
    except Exception as e:
        end()
        print('        {}Failed: {} {}{}'.format(ANSI['red'],
                                                 e.__class__.__name__, e,
                                                 ANSI['reset']))
        print('        Run to see full output:')
        print('            cd {};'.format(link_dir))
        print('            {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }
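
progress() is also assumed rather than defined here: it starts a countdown indicator for the subprocess and returns an end() callable that the caller invokes to stop it. A minimal sketch using a daemon thread:

import sys
import threading

def progress(seconds, prefix=''):
    """Print elapsed seconds until end() is called or the timeout passes (sketch)."""
    done = threading.Event()

    def tick():
        for elapsed in range(seconds):
            if done.wait(1):  # returns True as soon as end() sets the event
                return
            sys.stdout.write('\r{}{}s elapsed of {}s'.format(prefix, elapsed + 1, seconds))
            sys.stdout.flush()

    threading.Thread(target=tick, daemon=True).start()

    def end():
        done.set()
        sys.stdout.write('\n')

    return end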
Example #4
def fetch_dom(link_dir,
              link,
              timeout=TIMEOUT,
              user_data_dir=CHROME_USER_DATA_DIR):
    """print HTML of site to file using chrome --dump-html"""

    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    output_path = os.path.join(link_dir, 'output.html')

    if os.path.exists(output_path):
        return {'output': 'output.html', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--dump-dom',
        '--timeout={}'.format(timeout * 1000),
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        with open(output_path, 'w+') as f:
            result = run(CMD,
                         stdout=f,
                         stderr=PIPE,
                         cwd=link_dir,
                         timeout=timeout + 1)  # output.html; +1s margin over Chrome's own --timeout
        end()
        if result.returncode:
            print('     ', result.stderr.decode())
            raise Exception('Failed to fetch DOM')
        chmod_file('output.html', cwd=link_dir)
        output = 'output.html'
    except Exception as e:
        end()
        print('        {}Failed: {} {}{}'.format(ANSI['red'],
                                                 e.__class__.__name__, e,
                                                 ANSI['reset']))
        print('        Run to see full output:')
        print('            cd {};'.format(link_dir))
        print('            {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }
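
chmod_file() is another assumed helper; judging by its call sites it normalizes permissions on the freshly written output file. A minimal sketch (the 0o644 mode is an assumption, not a value taken from the original project):

def chmod_file(path, cwd='.', permissions=0o644):
    """Make an archived output file readable by the web server (sketch)."""
    abs_path = os.path.join(cwd, path)
    if not os.path.exists(abs_path):
        raise FileNotFoundError(abs_path)
    os.chmod(abs_path, permissions)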
Example #5
def fetch_screenshot(link_dir,
                     link,
                     timeout=TIMEOUT,
                     user_data_dir=CHROME_USER_DATA_DIR,
                     resolution=RESOLUTION,
                     firefox_profile=FIREFOX_PROFILE):
    """take screenshot of site using chrome --headless"""

    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
        return {'output': 'screenshot.png', 'status': 'skipped'}

    CMD = [
        *firefox_headless(profile=firefox_profile),
        '--screenshot',
        link['url'],
    ]

    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD,
                     stdout=PIPE,
                     stderr=PIPE,
                     cwd=link_dir,
                     timeout=timeout + 1)  # screenshot.png
        end()
        if result.returncode:
            print('     ', (result.stderr or result.stdout).decode())
            raise Exception('Failed to take screenshot')
        chmod_file('screenshot.png', cwd=link_dir)
        output = 'screenshot.png'
    except Exception as e:
        end()
        print('        Run to see full output:',
              'cd {}; {}'.format(link_dir, ' '.join(CMD)))
        print('        {}Failed: {} {}{}'.format(ANSI['red'],
                                                 e.__class__.__name__, e,
                                                 ANSI['reset']))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }
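
This variant swaps in a firefox_headless() helper, also undefined here. A sketch under the assumption that FIREFOX_BINARY is a config value naming the Firefox executable and FIREFOX_PROFILE points at a profile directory (--headless, --screenshot, and --profile are real Firefox CLI flags):

def firefox_headless(binary=FIREFOX_BINARY, profile=FIREFOX_PROFILE):
    """Assemble the base argv for headless-Firefox fetchers (sketch)."""
    args = [binary, '--headless']
    if profile:
        args += ['--profile', profile]  # a profile directory path, not a profile name
    return args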
Example #6
def fetch_wget(link_dir,
               link,
               requisites=FETCH_WGET_REQUISITES,
               warc=FETCH_WARC,
               timeout=TIMEOUT):
    """download full site using wget"""

    domain_dir = os.path.join(link_dir, link['domain'])
    existing_file = wget_output_path(link)
    if os.path.exists(domain_dir) and existing_file:
        return {'output': existing_file, 'status': 'skipped'}

    if warc:
        warc_dir = os.path.join(link_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    CMD = [
        'wget',
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '--restrict-file-names=unix',
        '--timeout={}'.format(timeout),
        *(() if warc else ('--timestamping', )),
        *(('--warc-file={}'.format(warc_path), ) if warc else ()),
        *(('--page-requisites', ) if requisites else ()),
        *(('--user-agent="{}"'.format(WGET_USER_AGENT), )
          if WGET_USER_AGENT else ()),
        *((() if CHECK_SSL_VALIDITY else ('--no-check-certificate', ))),
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD,
                     stdout=PIPE,
                     stderr=PIPE,
                     cwd=link_dir,
                     timeout=timeout)  # index.html
        end()
        output = wget_output_path(link, look_in=domain_dir)

        # Check for common failure cases
        if result.returncode > 0:
            print('        Got wget response code {}:'.format(
                result.returncode))
            print('\n'.join(
                '          ' + line
                for line in (result.stdout +
                             result.stderr).decode().rsplit('\n', 3)[-3:]
                if line.strip()))
            if b'403: Forbidden' in result.stderr:
                raise Exception('403 Forbidden (try changing WGET_USER_AGENT)')
            if b'404: Not Found' in result.stderr:
                raise Exception('404 Not Found')
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise Exception('500 Internal Server Error')
            raise Exception('Got an error from the server')
    except Exception as e:
        end()
        print('        {}Some resources were skipped: {}{}'.format(
            ANSI['lightyellow'], e, ANSI['reset']))
        print('        Run to see full output:')
        print('            cd {};'.format(link_dir))
        print('            {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }
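
wget_output_path(), used by every fetch_wget variant, has to guess where wget's --adjust-extension and --force-directories flags put the main page. A hedged minimal sketch covering only the common cases; the real helper presumably also scans look_in on disk to resolve ambiguous URLs:

from urllib.parse import urlparse, unquote

def wget_output_path(link, look_in=None):
    """Guess the path (relative to link_dir) where wget saved the main page (sketch)."""
    url_path = unquote(urlparse(link['url']).path)
    if not url_path or url_path.endswith('/'):
        url_path += 'index.html'      # wget names directory listings index.html
    elif not url_path.endswith(('.html', '.htm')):
        url_path += '.html'           # mirrors --adjust-extension for text/html pages
    return os.path.join(link['domain'], url_path.lstrip('/'))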
Example #7
def fetch_wget(link_dir,
               link,
               requisites=FETCH_WGET_REQUISITES,
               warc=FETCH_WARC,
               timeout=TIMEOUT):
    """download full site using wget"""

    domain_dir = os.path.join(link_dir, link['domain'])
    existing_file = wget_output_path(link)
    if os.path.exists(domain_dir) and existing_file:
        return {'output': existing_file, 'status': 'skipped'}

    if warc:
        warc_dir = os.path.join(link_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    CMD = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e',
        'robots=off',
        '--restrict-file-names=unix',
        '--timeout={}'.format(timeout),
        *(() if warc else ('--timestamping', )),
        *(('--warc-file={}'.format(warc_path), ) if warc else ()),
        *(('--page-requisites', ) if requisites else ()),
        *(('--user-agent={}'.format(WGET_USER_AGENT), ) if WGET_USER_AGENT else
          ()),
        *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
        *((() if CHECK_SSL_VALIDITY else
           ('--no-check-certificate', '--no-hsts'))),
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD,
                     stdout=PIPE,
                     stderr=PIPE,
                     cwd=link_dir,
                     timeout=timeout)  # index.html
        end()
        output = wget_output_path(link, look_in=domain_dir)

        output_tail = [
            '          ' + line
            for line in (result.stdout +
                         result.stderr).decode().rsplit('\n', 3)[-3:]
            if line.strip()
        ]

        # parse out number of files downloaded from "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        files_downloaded = (int(output_tail[-1].strip().split(' ', 2)[1] or 0)
                            if output_tail and 'Downloaded:' in output_tail[-1]
                            else 0)

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            print('        Got wget response code {}:'.format(
                result.returncode))
            print('\n'.join(output_tail))
            if b'403: Forbidden' in result.stderr:
                raise Exception('403 Forbidden (try changing WGET_USER_AGENT)')
            if b'404: Not Found' in result.stderr:
                raise Exception('404 Not Found')
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise Exception('500 Internal Server Error')
            raise Exception('Got an error from the server')
    except Exception as e:
        end()

        # to let the user copy-paste the command and run it safely we have
        # to quote some of the arguments that could have spaces in them
        quoted_cmd = ' '.join(CMD)
        quoted_cmd = quoted_cmd.replace(WGET_USER_AGENT,
                                        '"{}"'.format(WGET_USER_AGENT))
        if COOKIES_FILE:
            quoted_cmd = quoted_cmd.replace(COOKIES_FILE,
                                            '"{}"'.format(COOKIES_FILE))

        print('        {}Some resources were skipped: {}{}'.format(
            ANSI['lightyellow'], e, ANSI['reset']))
        print('        Run to see full output:')
        print('            cd {};'.format(link_dir))
        print('            {}'.format(quoted_cmd))
        output = e
    return {
        'cmd': CMD,
        'output': output,
    }
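
Note the design choice shared by all of these fetchers: failures are caught and stashed in the returned dict rather than raised, so callers distinguish success from failure by type. A hedged usage sketch, reusing the link dict from the first example:

result = fetch_wget('/data/archive/1559170900', link, warc=True)
if isinstance(result['output'], Exception):
    print('wget failed:', result['output'])
else:
    print('main page saved at:', result['output'])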