コード例 #1
0
def evaluation_loop():
    """Poll every registered job and dispatch the ones whose schedule is due.

    Runs forever inside a green thread: every 3 seconds it reads the current
    spider/command lists from ``settings``, loads each job's persisted
    register and dispatches any job whose ``next_execution_at`` is in the
    past.
    """
    while True:

        _spiders = settings.get_spiders()
        _commands = settings.get_commands()

        # When the system is starting up, spiders/commands may return empty
        # because we're using async execution `green_threads.find_new_spiders`.
        # Guard each list independently so jobs are still dispatched when only
        # one of the two sources is populated (the original `and` gate skipped
        # everything unless BOTH lists were non-empty).
        for key in (_spiders or []) + (_commands or []):

            obj = get_job_object(key=key)

            # Jobs without a stored register, or without a schedule yet,
            # are skipped until they are configured.
            if not obj or not obj.get('next_execution_at'):
                continue

            next_execution_at = timestamp_to_utc(
                iso_to_timestamp(obj['next_execution_at']))

            # NOTE(review): assumes `timestamp_to_utc` yields a naive UTC
            # datetime comparable with `datetime.utcnow()` — confirm.
            if next_execution_at < datetime.utcnow():
                dispatch(key=key, register=obj)

        gevent.sleep(3)
コード例 #2
0
def evaluation_loop():
    """Forever scan the known jobs and dispatch any that are overdue.

    Re-reads the spider/command lists on every pass and sleeps 3 seconds
    between passes.
    """
    while True:

        spiders = settings.get_spiders()
        commands = settings.get_commands()

        # During start-up both lists can still be empty, because they are
        # filled asynchronously by `green_threads.find_new_spiders`.
        if spiders and commands:

            for job_key in spiders + commands:

                register = get_job_object(key=job_key)

                if register and register.get('next_execution_at'):

                    due_at = timestamp_to_utc(
                        iso_to_timestamp(register['next_execution_at']))

                    if due_at < datetime.utcnow():
                        dispatch(key=job_key, register=register)

        gevent.sleep(3)
コード例 #3
0
def _default_spider_register():
    """Default register for a spider that has no stored configuration yet."""
    now = datetime.utcnow()
    frequency_minutes = 60
    return {
        'active': False,
        'job_type': 'spider',
        'min_concurrency': 1,
        'max_concurrency': 5,
        'max_memory_mb': 200,
        'priority': 1,
        'frequency_minutes': frequency_minutes,
        'start_urls': [],
        'last_started_at': now.isoformat(),
        'next_execution_at': (
            now + timedelta(minutes=frequency_minutes)).isoformat(),
    }


def _default_command_register():
    """Default register for a command that has no stored configuration yet."""
    return {
        'active': False,
        'job_type': 'command',
        'min_concurrency': 1,
        'max_concurrency': 1,
        'max_memory_mb': 50,
        'priority': 1,
        'frequency_minutes': 60,
        'last_started_at': None,
        'next_execution_at': None,
    }


def listing():
    """Flask view: list every known job (spiders and commands) as JSON.

    Returns a JSON object mapping job name -> register dict; jobs without
    persisted state are reported with the default configuration above.
    """
    _spiders = settings.get_spiders()
    _commands = settings.get_commands()

    # When the system is starting up, spiders may return empty because
    # we're using async execution `green_threads.find_new_spiders`.
    if not _spiders:
        return flask.Response(response=json.dumps({}, sort_keys=True),
                              status=200,
                              mimetype="application/json")

    _spiders.sort()

    d = OrderedDict()

    for s in _spiders:
        # Stored register wins; otherwise fall back to the default config.
        d[s] = get_job_object(key=s) or _default_spider_register()

    for file_name in _commands:
        d[file_name] = (get_job_object(key=file_name) or
                        _default_command_register())

    return flask.Response(response=json.dumps(d, sort_keys=True),
                          status=200,
                          mimetype="application/json")
コード例 #4
0
ファイル: jobs.py プロジェクト: leandroloi/scrapy-eagle
def listing():
    """List every job register currently known, serialized as JSON.

    Spiders with no stored register are skipped (still undecided how to
    handle brand-new jobs); commands with no stored register get a default
    configuration.
    """
    spiders = settings.get_spiders()
    commands = settings.get_commands()

    # May happen to request this route before we've
    # the settings filled by the gevent async execution `green_threads.find_new_spiders`
    if not spiders:
        return flask.Response(response=json.dumps({}, sort_keys=True),
                              status=200,
                              mimetype="application/json")

    spiders.sort()

    jobs = OrderedDict()

    for spider_name in spiders:

        register = get_job_object(key=spider_name)

        if register:
            jobs[spider_name] = register
        # TODO: How to proceed for newly jobs — currently omitted entirely.

    for command_name in commands:

        register = get_job_object(key=command_name)

        if register:
            jobs[command_name] = register
        else:
            jobs[command_name] = {
                'active': False,
                'job_type': 'command',
                'min_concurrency': 1,
                'max_concurrency': 3,
                'max_memory_mb': 50,
                'priority': 2,
                'frequency_minutes': 60,
                'last_started_at': None,
            }

    return flask.Response(response=json.dumps(jobs, sort_keys=True),
                          status=200,
                          mimetype="application/json")
コード例 #5
0
def send_resources_info(socketio, subprocess_pids, public_ip):
    """Every second, gather host and per-subprocess resource stats and
    broadcast them on the `/resources` socket.io namespace.
    """
    while True:

        pid_stats_job = gevent.spawn(get_resources_info_from_pid)
        host_stats_job = gevent.spawn(get_resources_info_from_server)

        # We pass all the parameters that we like to keep instead of simply
        # using a .update() here, because gevent.spawn returns a Greenlet
        # instance rather than a dict.
        child_jobs = [
            gevent.spawn(
                get_resources_info_from_pid,
                pid=pid,
                spider=spider,
                command=command,
                base_dir=base_dir,
                created_at=created_at,
            )
            for pid, spider, command, base_dir, created_at in subprocess_pids
        ]

        pid_stats_job.join()
        dict_info = pid_stats_job.get()
        dict_info['public_ip'] = public_ip

        host_stats_job.join()
        dict_info.update(host_stats_job.get())

        gevent.joinall(child_jobs)

        # get_resources_info returns None when asked about a PID that no
        # longer exists (subprocess_pids takes a moment to drop finished
        # processes), so filter those out while collecting results.
        dict_info['sub'] = [job.get() for job in child_jobs if job.get()]

        dict_info['spiders'] = settings.get_spiders() or []

        print('\n\ndict_info: ', dict_info, '\n\n')

        socketio.emit('resources_info', {'data': dict_info},
                      namespace="/resources",
                      broadcast=True)

        gevent.sleep(1)
コード例 #6
0
ファイル: stats.py プロジェクト: rafaelcapucho/scrapy-eagle
def send_resources_info(socketio, subprocess_pids, public_ip):
    """Periodically broadcast resource usage of the host and every tracked
    subprocess on the `/resources` socket.io namespace.

    Runs forever in a green thread, emitting one `resources_info` event per
    second.
    """
    while True:

        dict_info_pid_greenlet = gevent.spawn(get_resources_info_from_pid)
        dict_info_host_greenlet = gevent.spawn(get_resources_info_from_server)

        subprocess_info_greenlets = []

        for pid, spider, command, base_dir, created_at in subprocess_pids:

            # We pass all the parameters that we like to keep instead
            # of simply use a .update() here because the returned instance
            # is a Greenlet instead of a dict.
            info_greenlet = gevent.spawn(
                get_resources_info_from_pid,
                pid=pid,
                spider=spider,
                command=command,
                base_dir=base_dir,
                created_at=created_at,
            )

            subprocess_info_greenlets.append(info_greenlet)

        # Greenlet.get() blocks until the result is ready, so the explicit
        # .join() the original made before each .get() was redundant.
        dict_info = dict_info_pid_greenlet.get()
        dict_info['public_ip'] = public_ip

        dict_info.update(dict_info_host_greenlet.get())

        gevent.joinall(subprocess_info_greenlets)

        # get_resources_info returns None when it queries a PID that no
        # longer exists (subprocess_pids takes some time to drop finished
        # PIDs), so filter those results out.
        dict_info['sub'] = [
            greenlet.get()
            for greenlet in subprocess_info_greenlets
            if greenlet.get()
        ]

        _spiders = settings.get_spiders()
        _commands = settings.get_commands()

        dict_info['spiders'] = _spiders or []
        dict_info['commands'] = _commands or []

        # Leftover debug print removed: the payload is already observable
        # through the emitted socket.io event.
        socketio.emit('resources_info', {'data': dict_info},
                      namespace="/resources",
                      broadcast=True)

        gevent.sleep(1)
コード例 #7
0
ファイル: jobs.py プロジェクト: rafaelcapucho/scrapy-eagle
def listing():
    """Serve the full job registry (spiders and commands) as a JSON response.

    Jobs with a persisted register are returned as stored; jobs without one
    are reported with a default configuration.
    """
    _spiders = settings.get_spiders()
    _commands = settings.get_commands()

    # When the system is starting up, spiders may return empty because
    # we're using async execution `green_threads.find_new_spiders`.
    if not _spiders:
        return flask.Response(
            response=json.dumps({}, sort_keys=True),
            status=200,
            mimetype="application/json"
        )

    _spiders.sort()

    d = OrderedDict()

    for s in _spiders:

        obj = get_job_object(key=s)

        if obj:
            d[s] = obj
        else:
            # Jobs without previous information, using default config
            d[s] = {
                'active': False,
                'job_type': 'spider',
                'min_concurrency': 1,
                'max_concurrency': 5,
                'max_memory_mb': 200,
                'priority': 1,
                'frequency_minutes': 60,
                'start_urls': [],
                'last_started_at': datetime.utcnow().isoformat(),
                'next_execution_at': (
                    datetime.utcnow() + timedelta(minutes=60)).isoformat(),
            }

    for file_name in _commands:

        obj = get_job_object(key=file_name)

        if obj:
            d[file_name] = obj
        else:
            d[file_name] = {
                'active': False,
                'job_type': 'command',
                'min_concurrency': 1,
                'max_concurrency': 1,
                'max_memory_mb': 50,
                'priority': 1,
                'frequency_minutes': 60,
                'last_started_at': None,
                'next_execution_at': None,
            }

    return flask.Response(
        response=json.dumps(d, sort_keys=True),
        status=200,
        mimetype="application/json"
    )