Ejemplo n.º 1
0
def fetch_wget(link_dir,
               link,
               requisites=FETCH_WGET_REQUISITES,
               timeout=TIMEOUT):
    """Download a full copy of the link's site with wget.

    Args:
        link_dir: Directory wget is run in (used as ``cwd``). If it already
            contains an entry named after ``link['domain']`` the download
            is skipped.
        link: Mapping with at least ``'url'`` and ``'domain'`` keys; it is
            also handed to ``html_appended_url()`` to compute the output.
        requisites: When truthy, ``--page-requisites`` and
            ``--convert-links`` are added to the wget command line.
        timeout: Seconds given to the progress indicator; the subprocess
            itself is allowed ``timeout + 1`` seconds.

    Returns:
        ``{'output': ..., 'status': 'skipped'}`` when the download already
        exists, otherwise ``{'cmd': CMD, 'output': ...}`` where ``output``
        is ``html_appended_url(link)`` on completion or the caught
        exception instance on failure.
    """
    # Local import: only needed to render a copy-pasteable shell command.
    import shlex

    if os.path.exists(os.path.join(link_dir, link['domain'])):
        return {'output': html_appended_url(link), 'status': 'skipped'}

    CMD = [
        # Docs: https://www.gnu.org/software/wget/manual/wget.html
        'wget',
        '--timestamping',
        '--adjust-extension',
        '--no-parent',
        # Honor the caller's `requisites` argument rather than the global.
        *(('--page-requisites', '--convert-links') if requisites else ()),
        # No shell is involved, so the value must not be wrapped in literal
        # quotes; they would become part of the User-Agent header.
        *(('--user-agent={}'.format(WGET_USER_AGENT), )
          if WGET_USER_AGENT else ()),
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate', )),
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD,
                     stdout=PIPE,
                     stderr=PIPE,
                     cwd=link_dir,
                     timeout=timeout + 1)  # index.html
        end()
        output = html_appended_url(link)
        if result.returncode > 0:
            # A non-zero exit is only reported; the fetch is still returned
            # with its normal output rather than treated as a failure.
            print('       got wget response code {}:'.format(
                result.returncode))
            # errors='replace' keeps undecodable bytes in wget's output from
            # raising and turning a finished download into a failure.
            raw_output = result.stderr or result.stdout
            tail = raw_output.decode(errors='replace').rsplit('\n', 10)[-10:]
            print('\n'.join('         ' + line for line in tail
                            if line.strip()))
    except Exception as e:
        # NOTE(review): end() may already have run above if the failure
        # happened after the subprocess returned -- assumes the callback
        # tolerates being called twice; confirm against progress().
        end()
        # Quote every argument so the printed command can be pasted into a
        # shell even when the URL contains characters such as '&' or '?'.
        shell_cmd = ' '.join(shlex.quote(str(arg)) for arg in CMD)
        print('       Run to see full output:',
              'cd {}; {}'.format(shlex.quote(str(link_dir)), shell_cmd))
        print('       {}Failed: {} {}{}'.format(ANSI['red'],
                                                e.__class__.__name__, e,
                                                ANSI['reset']))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }
Ejemplo n.º 2
0
def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR, resolution=RESOLUTION):
    """Take a screenshot of the link's URL using chrome --headless.

    Args:
        link_dir: Directory chrome is run in (used as ``cwd``); also checked
            for an existing ``screenshot.png``.
        link: Mapping with at least ``'url'`` and ``'type'`` keys; it is
            passed to ``html_appended_url()`` for PDF/image links.
        timeout: Seconds given to the progress indicator; the subprocess
            itself is allowed ``timeout + 1`` seconds.
        user_data_dir: Forwarded to ``chrome_headless()``.
        resolution: Value formatted into chrome's ``--window-size=`` flag.

    Returns:
        ``{'output': html_appended_url(link)}`` for links of type ``'PDF'``
        or ``'image'``; ``{'output': 'screenshot.png', 'status': 'skipped'}``
        when the file already exists; otherwise ``{'cmd': CMD, 'output': ...}``
        where ``output`` is ``'screenshot.png'`` on success or the caught
        exception instance on failure.
    """
    # Local import: only needed to render a copy-pasteable shell command.
    import shlex

    if link['type'] in ('PDF', 'image'):
        return {'output': html_appended_url(link)}

    if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
        return {'output': 'screenshot.png', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--screenshot',
        '--window-size={}'.format(resolution),
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # screenshot.png
        end()
        if result.returncode:
            # errors='replace' so undecodable bytes in chrome's output cannot
            # raise here and hide the real failure raised just below.
            print('     ', (result.stderr or result.stdout).decode(errors='replace'))
            raise Exception('Failed to take screenshot')
        chmod_file('screenshot.png', cwd=link_dir)
        output = 'screenshot.png'
    except Exception as e:
        # NOTE(review): end() has already run when the failure is raised
        # after the subprocess returned -- assumes the callback tolerates
        # being called twice; confirm against progress().
        end()
        # Quote every argument so the printed command can be pasted into a
        # shell even when the URL contains characters such as '&' or '?'.
        shell_cmd = ' '.join(shlex.quote(str(arg)) for arg in CMD)
        print('       Run to see full output:', 'cd {}; {}'.format(shlex.quote(str(link_dir)), shell_cmd))
        print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }
Ejemplo n.º 3
0
def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
    """Print the link's URL to a PDF file using chrome --headless.

    Args:
        link_dir: Directory chrome is run in (used as ``cwd``); also checked
            for an existing ``output.pdf``.
        link: Mapping with at least ``'url'`` and ``'type'`` keys; it is
            passed to ``html_appended_url()`` for PDF/image links.
        timeout: Seconds given to the progress indicator; the subprocess
            itself is allowed ``timeout + 1`` seconds.
        user_data_dir: Forwarded to ``chrome_headless()``.

    Returns:
        ``{'output': html_appended_url(link)}`` for links of type ``'PDF'``
        or ``'image'``; ``{'output': 'output.pdf', 'status': 'skipped'}``
        when the file already exists; otherwise ``{'cmd': CMD, 'output': ...}``
        where ``output`` is ``'output.pdf'`` on success or the caught
        exception instance on failure.
    """
    # Local import: only needed to render a copy-pasteable shell command.
    import shlex

    if link['type'] in ('PDF', 'image'):
        return {'output': html_appended_url(link)}

    if os.path.exists(os.path.join(link_dir, 'output.pdf')):
        return {'output': 'output.pdf', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--print-to-pdf',
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # output.pdf
        end()
        if result.returncode:
            # errors='replace' so undecodable bytes in chrome's output cannot
            # raise here and hide the real failure raised just below.
            print('     ', (result.stderr or result.stdout).decode(errors='replace'))
            raise Exception('Failed to print PDF')
        output = 'output.pdf'
    except Exception as e:
        # NOTE(review): end() has already run when the failure is raised
        # after the subprocess returned -- assumes the callback tolerates
        # being called twice; confirm against progress().
        end()
        # Quote every argument so the printed command can be pasted into a
        # shell even when the URL contains characters such as '&' or '?'.
        shell_cmd = ' '.join(shlex.quote(str(arg)) for arg in CMD)
        print('       Run to see full output:', 'cd {}; {}'.format(shlex.quote(str(link_dir)), shell_cmd))
        print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }