Example #1
import yaml

def get_info(crawl_id, urls):
    """ Get info on existing crawl(s)

        :param crawl_id: list of crawl ids to get info on
        :param urls: if True, fetch detailed info for each crawl, listing all urls
    """
    for id_ in crawl_id:
        if urls:
            res = sesh_get('/crawl/{0}/info'.format(id_))
        else:
            res = sesh_get('/crawl/{0}'.format(id_))

        print(yaml.dump(res))
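
Every example on this page calls sesh_get (and Example #5 calls sesh_delete), but the helpers themselves are not shown. The following is a minimal sketch of what they might look like, assuming a shared requests.Session wrapped around the crawl API; the names sesh and DEFAULT_PREFIX and the error handling are assumptions, not the real browsertrix-cli code:

import requests

# Shared HTTP session for all API calls (assumed; the real helper may differ)
sesh = requests.Session()
DEFAULT_PREFIX = 'http://localhost:8000'  # hypothetical API endpoint

def sesh_get(path, prefix=None):
    # GET a JSON resource from the crawl API and decode it
    res = sesh.get((prefix or DEFAULT_PREFIX) + path)
    res.raise_for_status()
    return res.json()

def sesh_delete(path, prefix=None):
    # DELETE a resource and return the decoded JSON response
    res = sesh.delete((prefix or DEFAULT_PREFIX) + path)
    res.raise_for_status()
    return res.json()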
Example #2
from collections import defaultdict

def watch_crawl(crawl_id):
    """ Watch crawling browsers in local browser

        :param crawl_id: list of crawl ids to watch
    """
    for id_ in crawl_id:
        res = sesh_get('/crawl/{0}'.format(id_))

        if res.get('headless'):
            if not is_quiet():
                print("Cannot watch, crawl is running in headless mode")
            continue

        if res.get('status') != 'running':
            if not is_quiet():
                print('Crawl not running: {0}'.format(id_))
            continue

        browsers = res['browsers']

        # Tally how many tabs each browser has already finished
        done_count = defaultdict(int)

        for info in res.get('tabs_done', []):
            done_count[info['id']] += 1

        if not browsers:
            if not is_quiet():
                print('No Browsers')
            continue

        open_browsers(browsers, id_, done_count, res['num_tabs'])
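
watch_crawl, like the other examples, consults is_quiet() before printing. One plausible shape for that helper is a module-level flag set from a global --quiet option; this is an assumption, not the actual implementation:

# Hypothetical quiet-mode flag; the real CLI may store this on click's context
_quiet = False

def set_quiet(value):
    global _quiet
    _quiet = bool(value)

def is_quiet():
    return _quiet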
Example #3
import sys

def list_crawls():
    """ List all available crawls
    """
    res = sesh_get('/crawls')

    sorted_list = sorted(res['crawls'],
                         key=lambda x: x['start_time'],
                         reverse=True)

    if is_quiet():
        for crawl in sorted_list:
            print(crawl['id'])

        return

    format_str = '{value: <{size}}  '

    for _, text, size in COLUMNS:
        sys.stdout.write(format_str.format(value=text, size=size))
    print()

    for crawl in sorted_list:
        for field, _, size in COLUMNS:
            value = crawl[field]
            if field == 'start_time':
                value = format_duration(value, None) + ' ago'
            elif field == 'finish_time':
                value = format_duration(crawl['start_time'], value)

            sys.stdout.write(format_str.format(value=value, size=size))
        print()
    print()
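
list_crawls depends on a COLUMNS table of (field, header, width) tuples and a format_duration helper, neither shown here. A sketch under those assumptions follows; the field names, widths, and the epoch-seconds timestamp format are illustrative, not the real browsertrix-cli definitions:

import datetime

# (crawl dict key, column header, column width) -- illustrative values
COLUMNS = [
    ('id', 'CRAWL ID', 14),
    ('name', 'NAME', 20),
    ('status', 'STATUS', 10),
    ('start_time', 'STARTED', 14),
    ('finish_time', 'DURATION', 14),
]

def format_duration(start_time, finish_time):
    # Render the span between two epoch timestamps; if finish_time is None,
    # measure from start_time to now (used for the "STARTED ... ago" column)
    end = finish_time or datetime.datetime.now().timestamp()
    return str(datetime.timedelta(seconds=int(float(end) - float(start_time))))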
Example #4
import time
import webbrowser

import click

def create_profile(browser):
    """ Interactively create and commit a new browser profile image
    """
    res = sesh_get('/api/request/{0}/about:blank'.format(browser),
                   prefix=settings.shepherd_prefix)

    reqid = res.get('reqid')

    curr_browser = None

    webbrowser.open(settings.view_browsers_prefix + reqid)

    print('A new browser window should have been opened')
    print(
        'You can use the browser to log-in to accounts or otherwise prepare the browser profile'
    )
    print('(The content will not be recorded to WARC)')

    while True:
        profile_name = click.prompt(
            'When done, please enter a new name to save the browser profile',
            type=str)

        if not curr_browser:
            curr_browser = docker_api.containers.get('browser-' + reqid)

        # exit_code, output = curr_browser.exec_run('/app/prep-commit.sh')
        exit_code, output = curr_browser.exec_run(
            'pkill -f "/usr/bin/google-chrome"')
        if not is_quiet():
            print('Killed Chrome to Save Profile for Commit')
            print('Result: {0}'.format(exit_code))
            print(output.decode('utf-8'))

        time.sleep(1.5)

        conf = {
            'Labels': {
                LABEL_BROWSERPROFILE: profile_name,
                LABEL_BASEBROWSER: browser
            }
        }

        res = curr_browser.commit(
            repository=PROFILE_PREFIX[:-1],
            tag=profile_name,
            message='Browser Profile',
            conf=conf,
        )

        if not is_quiet():
            print('Created Image: {0} ({1})'.format(res.tags[0], res.short_id))

        print('The browser should have restarted to about:blank')
        if not click.confirm('Continue browsing to create another profile?'):
            break
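
create_profile also assumes a docker_api client, the PROFILE_PREFIX and label constants, and a module-level settings object (shepherd_prefix, view_browsers_prefix). A sketch assuming the official docker Python SDK, whose containers.get, exec_run, and commit calls match the usage above; the constant values are placeholders, not the real ones:

import docker

# Docker client; the real module presumably builds this the same way
docker_api = docker.from_env()

PROFILE_PREFIX = 'webrecorder/profile:'      # assumed; [:-1] strips the ':'
LABEL_BROWSERPROFILE = 'wr.browserprofile'   # assumed label keys
LABEL_BASEBROWSER = 'wr.basebrowser'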
Example #5
def remove_all():
    """ Stop and remove all crawls
    """
    res = sesh_get('/crawls')

    crawls = res['crawls']

    for crawl in crawls:
        id_ = crawl['id']
        res = sesh_delete('/crawl/{0}'.format(id_))
        if not is_quiet():
            print('Removed Crawl: {0}'.format(id_))
Example #6
def logs(crawl_id, browser, follow):
    """ View crawl logs for one or all crawlers
    :param crawl_id: The crawl_id to view logs for
    :param browser: 1-based index of browser to show logs for, or 0 for all (default)
    :param follow: follow crawl log in real-time (for one browser only)
    """
    res = sesh_get('/crawl/{0}'.format(crawl_id))

    num_browsers = len(res['browsers'])
    if browser <= 0:
        print_logs(res['browsers'], follow=follow)
    elif browser > num_browsers:
        print('Crawl has {0} browsers. Index must be 1 to {0}'.format(
            num_browsers))
    else:
        print_logs([res['browsers'][browser - 1]], follow=follow)
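
These functions read like click commands (Example #4 already uses click.prompt and click.confirm). A minimal sketch of how logs might be wired into a CLI group, with assumed option names and the hypothetical set_quiet helper from the earlier sketch:

import click

@click.group()
@click.option('--quiet', is_flag=True, default=False, help='Suppress output')
def cli(quiet):
    set_quiet(quiet)  # hypothetical helper; see the is_quiet sketch above

@cli.command(name='logs')
@click.argument('crawl_id')
@click.option('--browser', '-b', type=int, default=0,
              help='1-based browser index, 0 for all')
@click.option('--follow', '-f', is_flag=True, default=False)
def logs_cmd(crawl_id, browser, follow):
    logs(crawl_id, browser, follow)

if __name__ == '__main__':
    cli()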