def evaluation_loop():
    """Poll every known job's schedule forever and dispatch the ones that are due.

    Runs as a green thread: every 3 seconds it re-reads the spider and
    command lists and dispatches any job whose ``next_execution_at``
    timestamp is already in the past. Never returns.
    """
    while True:
        spiders = settings.get_spiders()
        commands = settings.get_commands()
        # When the system is starting up, spiders/commands may return empty
        # because we're using async execution `green_threads.find_new_spiders`.
        if spiders and commands:
            for job_key in spiders + commands:
                register = get_job_object(key=job_key)
                # Skip jobs with no stored state or no schedule yet.
                if not register or not register.get('next_execution_at'):
                    continue
                due_at = timestamp_to_utc(
                    iso_to_timestamp(register['next_execution_at']))
                if due_at < datetime.utcnow():
                    dispatch(key=job_key, register=register)
        gevent.sleep(3)
def evaluation_loop():
    """Dispatch every scheduled job whose next execution time has passed.

    Infinite green-thread loop; scans all spiders and commands, then
    sleeps 3 seconds before the next pass.
    """
    while True:
        _spiders = settings.get_spiders()
        _commands = settings.get_commands()
        # Startup race: both lists may still be empty while the async
        # `green_threads.find_new_spiders` task populates the settings.
        if _spiders and _commands:
            for key in _spiders + _commands:
                obj = get_job_object(key=key)
                if obj and obj.get('next_execution_at'):
                    scheduled_for = timestamp_to_utc(
                        iso_to_timestamp(obj['next_execution_at']))
                    now = datetime.utcnow()
                    if scheduled_for < now:
                        dispatch(key=key, register=obj)
        gevent.sleep(3)
def listing():
    """Return a JSON listing of every known spider and command job.

    Jobs that have no stored state yet are reported with a default
    configuration. While spider discovery is still in progress an empty
    JSON object is returned instead.
    """
    spiders = settings.get_spiders()
    commands = settings.get_commands()
    # When the system is starting up, spiders may return empty because
    # we're using async execution `green_threads.find_new_spiders`.
    if not spiders:
        return flask.Response(response=json.dumps({}, sort_keys=True),
                              status=200, mimetype="application/json")
    spiders.sort()
    jobs = OrderedDict()
    for name in spiders:
        stored = get_job_object(key=name)
        if stored:
            jobs[name] = stored
            continue
        # Jobs without previous information, using default config.
        jobs[name] = {
            'active': False,
            'job_type': 'spider',
            'min_concurrency': 1,
            'max_concurrency': 5,
            'max_memory_mb': 200,
            'priority': 1,
            'frequency_minutes': 60,
            'start_urls': [],
            'last_started_at': datetime.utcnow().isoformat(),
        }
        jobs[name]['next_execution_at'] = (
            datetime.utcnow()
            + timedelta(minutes=jobs[name]['frequency_minutes'])).isoformat()
    for name in commands:
        stored = get_job_object(key=name)
        if stored:
            jobs[name] = stored
            continue
        # Command never seen before: seed a default configuration.
        jobs[name] = {
            'active': False,
            'job_type': 'command',
            'min_concurrency': 1,
            'max_concurrency': 1,
            'max_memory_mb': 50,
            'priority': 1,
            'frequency_minutes': 60,
            'last_started_at': None,
            'next_execution_at': None,
        }
    return flask.Response(response=json.dumps(jobs, sort_keys=True),
                          status=200, mimetype="application/json")
def listing():
    """Return a JSON listing of known spider and command jobs.

    Spiders with no stored job object are currently omitted from the
    response (see TODO below); commands with no stored state fall back
    to a default configuration. Returns an empty JSON object while the
    spider list has not yet been populated.
    """
    _spiders = settings.get_spiders()
    _commands = settings.get_commands()
    # May happen to request this route before we've the settings filled
    # by the gevent async execution `green_threads.find_new_spiders`.
    if not _spiders:
        return flask.Response(response=json.dumps({}, sort_keys=True),
                              status=200, mimetype="application/json")
    _spiders.sort()
    d = OrderedDict()
    for s in _spiders:
        obj = get_job_object(key=s)
        if obj:
            d[s] = obj
        # TODO: decide a default config for newly discovered spiders;
        # until then they are silently left out of the listing.
    for file_name in _commands:
        obj = get_job_object(key=file_name)
        if obj:
            d[file_name] = obj
        else:
            # Command never run before: seed a default configuration.
            d[file_name] = {}
            d[file_name]['active'] = False
            d[file_name]['job_type'] = 'command'
            d[file_name]['min_concurrency'] = 1
            d[file_name]['max_concurrency'] = 3
            d[file_name]['max_memory_mb'] = 50
            d[file_name]['priority'] = 2
            d[file_name]['frequency_minutes'] = 60
            d[file_name]['last_started_at'] = None
    return flask.Response(response=json.dumps(d, sort_keys=True),
                          status=200, mimetype="application/json")
def send_resources_info(socketio, subprocess_pids, public_ip):
    """Continuously broadcast host and per-subprocess resource usage.

    Once per second, concurrently gathers resource info for this
    process, the host server and each tracked subprocess via gevent,
    merges everything into one payload and emits it on the
    ``/resources`` socket.io namespace. Never returns.
    """
    while True:
        own_info_job = gevent.spawn(get_resources_info_from_pid)
        host_info_job = gevent.spawn(get_resources_info_from_server)
        child_jobs = []
        for pid, spider, command, base_dir, created_at in subprocess_pids:
            # We pass all the parameters that we like to keep instead of
            # simply use a .update() here because the returned instance
            # is a Greenlet instead of a dict.
            child_jobs.append(gevent.spawn(
                get_resources_info_from_pid,
                pid=pid,
                spider=spider,
                command=command,
                base_dir=base_dir,
                created_at=created_at,
            ))
        own_info_job.join()
        dict_info = own_info_job.get()
        dict_info['public_ip'] = public_ip
        host_info_job.join()
        dict_info.update(host_info_job.get())
        gevent.joinall(child_jobs)
        child_results = [job.get() for job in child_jobs]
        # get_resources_info returns None for PIDs that no longer exist —
        # subprocess_pids lags behind finished processes — so drop those.
        dict_info['sub'] = [info for info in child_results if info]
        _spiders = settings.get_spiders()
        dict_info['spiders'] = _spiders or []
        # NOTE(review): debug print left in place to preserve behavior;
        # consider switching to logging.
        print('\n\ndict_info: ', dict_info, '\n\n')
        socketio.emit('resources_info', {'data': dict_info},
                      namespace="/resources", broadcast=True)
        gevent.sleep(1)
def send_resources_info(socketio, subprocess_pids, public_ip):
    """Broadcast host, spider and subprocess resource usage every second.

    Spawns greenlets to collect resource info for this process, the host
    and every tracked subprocess in parallel, merges the results and
    emits them on the ``/resources`` socket.io namespace. Never returns.
    """
    while True:
        self_greenlet = gevent.spawn(get_resources_info_from_pid)
        server_greenlet = gevent.spawn(get_resources_info_from_server)
        per_pid_greenlets = []
        for pid, spider, command, base_dir, created_at in subprocess_pids:
            # We pass all the parameters that we like to keep instead of
            # simply use a .update() here because the returned instance
            # is a Greenlet instead of a dict.
            per_pid_greenlets.append(
                gevent.spawn(
                    get_resources_info_from_pid,
                    pid=pid,
                    spider=spider,
                    command=command,
                    base_dir=base_dir,
                    created_at=created_at,
                )
            )
        self_greenlet.join()
        dict_info = self_greenlet.get()
        dict_info['public_ip'] = public_ip
        server_greenlet.join()
        dict_info.update(server_greenlet.get())
        gevent.joinall(per_pid_greenlets)
        gathered = [g.get() for g in per_pid_greenlets]
        # get_resources_info returns None for PIDs that already exited
        # (subprocess_pids is pruned with some delay) — filter them out.
        dict_info['sub'] = [entry for entry in gathered if entry]
        _spiders = settings.get_spiders()
        _commands = settings.get_commands()
        dict_info['spiders'] = _spiders or []
        dict_info['commands'] = _commands or []
        # NOTE(review): debug print kept to preserve behavior; consider
        # replacing with logging.
        print('\n\ndict_info: ', dict_info, '\n\n')
        socketio.emit('resources_info', {'data': dict_info},
                      namespace="/resources", broadcast=True)
        gevent.sleep(1)
def listing():
    """JSON listing of all jobs (spiders and commands), seeding defaults for new ones."""

    def _json_response(payload):
        # Every response from this route shares these headers.
        return flask.Response(
            response=json.dumps(payload, sort_keys=True),
            status=200,
            mimetype="application/json"
        )

    def _default_spider():
        # Default config for a spider with no previous information.
        cfg = {
            'active': False,
            'job_type': 'spider',
            'min_concurrency': 1,
            'max_concurrency': 5,
            'max_memory_mb': 200,
            'priority': 1,
            'frequency_minutes': 60,
            'start_urls': [],
            'last_started_at': datetime.utcnow().isoformat(),
        }
        cfg['next_execution_at'] = (
            datetime.utcnow() + timedelta(minutes=cfg['frequency_minutes'])
        ).isoformat()
        return cfg

    def _default_command():
        # Default config for a command with no previous information.
        return {
            'active': False,
            'job_type': 'command',
            'min_concurrency': 1,
            'max_concurrency': 1,
            'max_memory_mb': 50,
            'priority': 1,
            'frequency_minutes': 60,
            'last_started_at': None,
            'next_execution_at': None,
        }

    _spiders = settings.get_spiders()
    _commands = settings.get_commands()
    # When the system is starting up, spiders may return empty because
    # we're using async execution `green_threads.find_new_spiders`.
    if not _spiders:
        return _json_response({})
    _spiders.sort()
    d = OrderedDict()
    for s in _spiders:
        d[s] = get_job_object(key=s) or _default_spider()
    for file_name in _commands:
        d[file_name] = get_job_object(key=file_name) or _default_command()
    return _json_response(d)