def watch_crawl(crawl_id):
    """ Watch crawling browsers in local browser

    :param crawl_id: list of crawl ids to watch
    """
    for id_ in crawl_id:
        res = sesh_get('/crawl/{0}'.format(id_))

        # A headless crawl has no remote display to attach to.
        if res.get('headless'):
            if not is_quiet():
                print("Can not watch, crawl is running in headless mode")
            continue

        if res.get('status') != 'running':
            if not is_quiet():
                print('Crawl not running: {0}'.format(id_))
            continue

        browsers = res['browsers']

        # Tally finished tabs per browser so open_browsers() can skip
        # browsers whose tabs are all done.
        # BUG FIX: res.get('tabs_done') returns None when the key is
        # missing, which made the for-loop raise TypeError; default to [].
        done_count = defaultdict(int)
        for info in res.get('tabs_done') or []:
            done_count[info['id']] += 1

        if not browsers:
            if not is_quiet():
                print('No Browsers')
            continue

        open_browsers(browsers, id_, done_count, res['num_tabs'])
def create_profile(browser):
    """Interactively prepare and commit browser-profile Docker images.

    Starts *browser* on about:blank, opens it locally for the user to
    prepare (log in to accounts, etc.), then commits the container as a
    tagged profile image. Repeats until the user declines to continue.
    """
    res = sesh_get('/api/request/{0}/about:blank'.format(browser),
                   prefix=settings.shepherd_prefix)
    reqid = res.get('reqid')

    container = None

    webbrowser.open(settings.view_browsers_prefix + reqid)
    print('A new browser window should have been opened')
    print(
        'You can use the browser to log-in to accounts or otherwise prepare the browser profile'
    )
    print('(The content will not be recorded to WARC)')

    while True:
        profile_name = click.prompt(
            'When done, please enter a new name to save the browser profile',
            type=str)

        # Lazily resolve the container on the first commit.
        if not container:
            container = docker_api.containers.get('browser-' + reqid)

        # Chrome must be shut down so the profile is flushed to disk
        # before the container is committed.
        # exit_code, output = container.exec_run('/app/prep-commit.sh')
        exit_code, output = container.exec_run(
            'pkill -f "/usr/bin/google-chrome"')

        if not is_quiet():
            print('Killed Chrome to Save Profile for Commit')
            print('Result: {0}'.format(exit_code))
            print(output.decode('utf-8'))

        time.sleep(1.5)

        commit_conf = {
            'Labels': {
                LABEL_BROWSERPROFILE: profile_name,
                LABEL_BASEBROWSER: browser,
            }
        }

        res = container.commit(
            repository=PROFILE_PREFIX[:-1],
            tag=profile_name,
            message='Browser Profile',
            conf=commit_conf,
        )

        if not is_quiet():
            print('Created Image: {0} ({1})'.format(res.tags[0], res.short_id))
            print('The browser should have restarted to about:blank')

        if not click.confirm('Continue browsing to create another profile?'):
            break
def remove_profile(profile):
    """Delete the Docker image backing *profile*; exit(1) if it is absent."""
    image_tag = PROFILE_PREFIX + profile
    try:
        docker_api.images.remove(image_tag, force=True, noprune=False)
    except docker.errors.ImageNotFound:
        if not is_quiet():
            print('Profile "{0}" not found'.format(profile))
        sys.exit(1)
    else:
        if not is_quiet():
            print('Removed profile "{0}"!'.format(profile))
def list_crawls():
    """ List all available crawls """
    res = sesh_get('/crawls')

    # Newest first.
    crawls = sorted(res['crawls'],
                    key=lambda c: c['start_time'],
                    reverse=True)

    # Quiet mode emits bare ids only.
    if is_quiet():
        for crawl in crawls:
            print(crawl['id'])
        return

    cell = '{value: <{size}} '

    # Header row.
    for _, heading, width in COLUMNS:
        sys.stdout.write(cell.format(value=heading, size=width))
    print()

    for crawl in crawls:
        for field, _, width in COLUMNS:
            value = crawl[field]
            # Render timestamps as human-readable durations.
            if field == 'start_time':
                value = format_duration(value, None) + ' ago'
            elif field == 'finish_time':
                value = format_duration(crawl['start_time'], value)
            sys.stdout.write(cell.format(value=value, size=width))
        print()
    print()
def remove_all():
    """ Stop and remove all crawls """
    listing = sesh_get('/crawls')
    for crawl in listing['crawls']:
        crawl_id = crawl['id']
        sesh_delete('/crawl/{0}'.format(crawl_id))
        if not is_quiet():
            print('Removed Crawl: {0}'.format(crawl_id))
def start_crawl(crawl_id, browser, headless, behavior_time):
    """ Start an existing crawl

    :param crawl_id: list of crawl ids to start
    """
    # browser / headless / behavior_time are accepted for CLI-interface
    # compatibility but are not consulted when restarting an existing crawl.
    for id_ in crawl_id:
        res = sesh_post('/crawl/{0}/start'.format(id_))
        started = res['id']
        print(started if is_quiet() else 'Started Crawl: {0}'.format(started))
def list_profiles():
    """List locally stored browser-profile images (ids only when quiet)."""
    images = docker_api.images.list(filters={'label': LABEL_BROWSERPROFILE})

    row = '{profile: <16} {base}'

    if not is_quiet():
        print(row.format(profile='PROFILE', base='BASE BROWSER'))

    for image in images:
        tags = image.tags
        # Only consider images tagged with the profile prefix.
        if not tags or not tags[0].startswith(PROFILE_PREFIX):
            continue

        profile = tags[0][len(PROFILE_PREFIX):]
        if is_quiet():
            print(profile)
        else:
            base = image.labels.get(LABEL_BASEBROWSER, '(unknown)')
            print(row.format(profile=profile, base=base))

    if not is_quiet():
        print()
def get_profile_image(profile):
    """Resolve *profile* to a 'profile:<name>' reference, or exit(1).

    Lazily initializes the module-level docker client on first use.
    """
    try:
        global docker_api
        if not docker_api:
            docker_api = docker.from_env(version='auto')

        image = docker_api.images.get(PROFILE_PREFIX + profile)

        # Guard against an unrelated image that merely shares the
        # naming prefix: its label must match the profile name.
        assert image.labels.get(LABEL_BROWSERPROFILE) == profile

        return 'profile:' + profile
    except (docker.errors.ImageNotFound, AssertionError):
        if not is_quiet():
            print('Profile "{0}" not found'.format(profile))
        sys.exit(1)
def remove_crawl(crawl_id):
    """ Remove one or more existing crawls

    :param crawl_id: list of crawl ids to remove
    """
    for id_ in crawl_id:
        res = sesh_delete('/crawl/{0}'.format(id_))
        if not res.get('success'):
            # BUG FIX: res is a dict, so 'str' + res raised TypeError and
            # masked the actual API error; format it into the message instead.
            print('Error removing: {0}'.format(res))
            return
        if is_quiet():
            print(id_)
        else:
            print('Removed Crawl: {0}'.format(id_))
def stop_crawl(crawl_id):
    """ Stop one or more existing crawls

    :param crawl_id: list of crawl ids to stop
    """
    for id_ in crawl_id:
        res = sesh_post('/crawl/{0}/stop'.format(id_))
        if not res.get('success'):
            # BUG FIX: res is a dict, so 'str' + res raised TypeError and
            # masked the actual API error; format it into the message instead.
            print('Error stopping: {0}'.format(res))
            return
        if is_quiet():
            print(id_)
        else:
            print('Stopped Crawl: {0}'.format(id_))
def open_browsers(browsers, crawl_id, tabs_done=None, num_tabs=-1):
    """Open a local browser tab for each remote crawl browser.

    Browsers whose finished-tab count already equals *num_tabs* are
    reported but not opened.
    """
    total = len(browsers)
    for index, reqid in enumerate(browsers, 1):
        finished = bool(tabs_done) and tabs_done.get(reqid) == num_tabs

        if finished:
            msg = 'Skipping Finished Browser {0} of {1}, ({2}) for crawl {3}'
        else:
            msg = 'Opening Browser {0} of {1} ({2}) for crawl {3}'

        if not is_quiet():
            print(msg.format(index, total, reqid, crawl_id))

        if not finished:
            webbrowser.open(settings.view_browsers_prefix + reqid)
def create_crawl(
    crawl_spec_file,
    start,
    browser,
    profile,
    coll,
    mode,
    screenshot_coll,
    headless,
    behavior_time,
    watch,
    log,
):
    """ Create a new crawl!

    :param crawl_spec_file: YAML file with one or more crawls in 'crawls' key
    :param start: If true, start crawl immediately after creation
    :param browser: Browser Docker image to use for crawling (overrides setting in spec)
    :param profile: Browser Profile Docker image to use for crawling (overrides "browser" setting)
    :param coll: Set the collection (overrides setting in spec)
    :param mode: Set the capture mode (overrides setting in spec)
    :param screenshot_coll: Set the collection to save screenshots (overrides setting in spec)
    :param headless: Use headless mode. Browsers can not be opened for watching the crawl
    :param behavior_time: Max duration (in seconds) to run each in-page behavior
    :param watch: Watch all started browsers in a local browser (only if starting crawl)
    :param log: Follow and print crawl logs after creation
    """
    # NOTE(review): yaml.Loader can execute arbitrary constructors; if crawl
    # specs may come from untrusted sources, yaml.safe_load should be used.
    root = yaml.load(crawl_spec_file, Loader=yaml.Loader)

    msg = 'Created and Started' if start else 'Created'

    # CLI flags that map one-to-one onto spec keys; None means "not given".
    overrides = {
        'headless': headless,
        'behavior_time': behavior_time,
        'coll': coll,
        'mode': mode,
        'screenshot_coll': screenshot_coll,
    }

    for crawl_spec in root['crawls']:
        for key, value in overrides.items():
            if value is not None:
                crawl_spec[key] = value

        # A profile image takes precedence over a plain browser image.
        if profile is not None:
            browser = get_profile_image(profile)

        if browser is not None:
            crawl_spec['browser'] = browser

        if not is_quiet():
            print('Creating New Crawl, Please Wait...')

        res = sesh_post('/crawls', json=crawl_spec)

        if is_quiet():
            print(res['id'])
        else:
            print('Crawl {0}: {1}'.format(msg, res['id']))
            print('Status: {0}'.format(res['status']))

        if watch:
            if not start:
                if not is_quiet():
                    print("Can't watch, crawl not started")
            elif headless:
                if not is_quiet():
                    print("Can't watch, crawl is running in headless mode")
            else:
                open_browsers(res['browsers'], res['id'])

        if log:
            print_logs(res['browsers'], follow=True, wait=True)