コード例 #1
0
def listing():
    """Return a JSON listing of every known spider and command job.

    Each spider name from ``settings.get_spiders()`` and each command file
    name from ``settings.get_commands()`` is mapped to the object returned by
    ``get_job_object`` or, when that is empty, to a default configuration.

    Returns:
        flask.Response: status 200, ``application/json``. The body is ``{}``
        while no spiders are known yet.
    """
    _spiders = settings.get_spiders()
    _commands = settings.get_commands()

    # When the system is starting up, spiders may return empty because
    # we're using async execution `green_threads.find_new_spiders`.
    if not _spiders:
        return flask.Response(response=json.dumps({}, sort_keys=True),
                              status=200,
                              mimetype="application/json")

    d = OrderedDict()

    # Iterate over a sorted copy: the list comes from `settings` and may be
    # shared with other callers, so it is not reordered in place here.
    for s in sorted(_spiders):

        obj = get_job_object(key=s)

        if obj:
            d[s] = obj
            continue

        # Jobs without previous information, using default config.
        # The clock is read once so that both timestamps are exactly one
        # period apart.
        now = datetime.utcnow()
        frequency_minutes = 60
        d[s] = {
            'active': False,
            'job_type': 'spider',
            'min_concurrency': 1,
            'max_concurrency': 5,
            'max_memory_mb': 200,
            'priority': 1,
            'frequency_minutes': frequency_minutes,
            'start_urls': [],
            'last_started_at': now.isoformat(),
            'next_execution_at': (
                now + timedelta(minutes=frequency_minutes)).isoformat(),
        }

    # Only the spiders were checked above; tolerate commands that are still
    # unset (None) instead of failing on iteration.
    for file_name in _commands or ():

        obj = get_job_object(key=file_name)

        if obj:
            d[file_name] = obj
            continue

        # Commands without previous information, using default config.
        d[file_name] = {
            'active': False,
            'job_type': 'command',
            'min_concurrency': 1,
            'max_concurrency': 1,
            'max_memory_mb': 50,
            'priority': 1,
            'frequency_minutes': 60,
            'last_started_at': None,
            'next_execution_at': None,
        }

    return flask.Response(response=json.dumps(d, sort_keys=True),
                          status=200,
                          mimetype="application/json")
コード例 #2
0
ファイル: jobs.py プロジェクト: leandroloi/scrapy-eagle
def listing():
    """Return a JSON listing of the stored spider jobs and all command jobs.

    Spiders are included only when ``get_job_object`` returns something for
    them; commands without a stored object get a default configuration.

    Returns:
        flask.Response: status 200, ``application/json``. The body is ``{}``
        while no spiders are known yet.
    """
    _spiders = settings.get_spiders()
    _commands = settings.get_commands()

    # May happen to request this route before we've
    # the settings filled by the gevent async execution `green_threads.find_new_spiders`
    if not _spiders:
        return flask.Response(response=json.dumps({}, sort_keys=True),
                              status=200,
                              mimetype="application/json")

    d = OrderedDict()

    # Iterate over a sorted copy: the list comes from `settings` and may be
    # shared with other callers, so it is not reordered in place here.
    for s in sorted(_spiders):

        obj = get_job_object(key=s)

        if obj:
            d[s] = obj

        # TODO: How to proceed for newly discovered spiders. They are
        # currently left out of the listing; a default configuration
        # (active, job_type, min/max_concurrency, max_memory_mb, priority,
        # frequency_minutes, start_urls, last_started_at) is still undecided.

    # Only the spiders were checked above; tolerate commands that are still
    # unset (None) instead of failing on iteration.
    for file_name in _commands or ():

        obj = get_job_object(key=file_name)

        if obj:
            d[file_name] = obj
            continue

        # Commands without previous information, using default config.
        d[file_name] = {
            'active': False,
            'job_type': 'command',
            'min_concurrency': 1,
            'max_concurrency': 3,
            'max_memory_mb': 50,
            'priority': 2,
            'frequency_minutes': 60,
            'last_started_at': None,
        }

    return flask.Response(response=json.dumps(d, sort_keys=True),
                          status=200,
                          mimetype="application/json")
コード例 #3
0
def evaluation_loop():
    """Loop forever, dispatching every job whose next execution time passed.

    Each pass reads the spider and command names from ``settings``, looks up
    each job with ``get_job_object`` and calls ``dispatch`` for those whose
    ``next_execution_at`` is earlier than the current UTC time. The loop
    sleeps three seconds (``gevent.sleep``) between passes and never returns.
    """
    while True:

        spider_keys = settings.get_spiders()
        command_keys = settings.get_commands()

        # When the system is starting up, spiders/commands may return empty
        # because we're using async execution
        # `green_threads.find_new_spiders`; wait and try again.
        if not spider_keys or not command_keys:
            gevent.sleep(3)
            continue

        for job_key in spider_keys + command_keys:
            job = get_job_object(key=job_key)

            # Nothing to schedule without a stored job and a planned time.
            if not job or not job.get('next_execution_at'):
                continue

            due_at = timestamp_to_utc(
                iso_to_timestamp(job['next_execution_at']))

            if due_at < datetime.utcnow():
                dispatch(key=job_key, register=job)

        gevent.sleep(3)
コード例 #4
0
def evaluation_loop():
    """Periodically dispatch the jobs that are due.

    Runs forever: on each pass, every spider and command known to
    ``settings`` is looked up via ``get_job_object``; a job carrying a
    ``next_execution_at`` value that lies before the current UTC time is
    handed to ``dispatch``. A three second ``gevent.sleep`` separates passes.
    """
    while True:

        spider_names = settings.get_spiders()
        command_names = settings.get_commands()

        # When the system is starting up, spiders/commands may return empty because
        # we're using async execution `green_threads.find_new_spiders`.
        if spider_names and command_names:

            for name in spider_names + command_names:
                register = get_job_object(key=name)

                if not (register and register.get('next_execution_at')):
                    continue

                scheduled_iso = register['next_execution_at']
                scheduled_at = timestamp_to_utc(iso_to_timestamp(scheduled_iso))
                current_time = datetime.utcnow()

                if scheduled_at < current_time:
                    dispatch(key=name, register=register)

        gevent.sleep(3)
コード例 #5
0
ファイル: jobs.py プロジェクト: leandroloi/scrapy-eagle
def update():
    """Create or update a job's configuration from the submitted form.

    Reads ``key``, ``job_type``, ``active``, ``frequency_minutes``,
    ``max_concurrency``, ``min_concurrency``, ``priority``, ``max_memory_mb``
    and ``start_urls`` from ``flask.request.form``, merges them into the
    object returned by ``get_job_object`` (or a new dict) and stores the
    result with ``update_job_object``.

    Returns:
        flask.Response: always status 200, ``application/json``. The body is
        ``{"status": "ok"}`` on success, otherwise
        ``{"status": "error", "msg": ...}``.
    """

    #TODO: Ensure that the incoming request comes from the same IP (Security)

    form = flask.request.form

    def _reply(payload):
        # Single place that builds the JSON response for every outcome.
        return flask.Response(response=json.dumps(payload, sort_keys=True),
                              status=200,
                              mimetype="application/json")

    def _int_field(name):
        # int(None) raises TypeError, which the ValueError handler below
        # would not catch. An absent field is returned as None so that it is
        # reported by the "missing information" check instead of crashing.
        raw = form.get(name)
        return None if raw is None else int(raw)

    # Never trust in the user input type
    try:
        frequency_minutes = _int_field('frequency_minutes')
        max_concurrency = _int_field('max_concurrency')
        min_concurrency = _int_field('min_concurrency')
        priority = _int_field('priority')
        max_memory_mb = _int_field('max_memory_mb')
    except ValueError:
        return _reply({
            'status':
            'error',
            'msg':
            'You sent wrong datatypes, like a letter when it should be numeric.'
        })

    key = form.get('key')
    job_type = form.get('job_type')
    start_urls = form.get('start_urls')

    # Anything other than the literal string 'true' counts as inactive.
    active = form.get('active') == 'true'

    # NOTE: a numeric value of 0 is falsy and is therefore also reported as
    # missing here.
    if not all([
            key, job_type, frequency_minutes, max_concurrency,
            min_concurrency, priority, max_memory_mb
    ]):
        return _reply({
            'status':
            'error',
            'msg':
            'You are missing some information, please check your form.'
        })

    if not start_urls and job_type == 'spider':
        return _reply({
            'status':
            'error',
            'msg':
            'You should provide the Start URLs information for spiders.'
        })

    # A brand new job has no stored object yet; start from an empty one.
    actual_obj = get_job_object(key=key) or {}

    actual_obj.update({
        'active': active,
        'job_type': job_type,
        'frequency_minutes': frequency_minutes,
        'max_concurrency': max_concurrency,
        'min_concurrency': min_concurrency,
        'priority': priority,
        'max_memory_mb': max_memory_mb
    })

    if job_type == 'spider':
        # One URL per line; blank lines are dropped.
        actual_obj.update(
            {'start_urls': [x for x in start_urls.split("\n") if x]})

    update_job_object(key=key, fields=actual_obj)

    return _reply({'status': 'ok'})
コード例 #6
0
ファイル: jobs.py プロジェクト: rafaelcapucho/scrapy-eagle
def update():
    """Create or update a job's configuration from the submitted form.

    Reads ``key``, ``job_type``, ``active``, ``frequency_minutes``,
    ``max_concurrency``, ``min_concurrency``, ``priority``, ``max_memory_mb``
    and ``start_urls`` from ``flask.request.form``, merges them into the
    object returned by ``get_job_object`` (or a new dict) and stores the
    result with ``update_job_object``. When the frequency differs from the
    stored one, or the job is new, ``next_execution_at`` is set to the
    current UTC time plus the new frequency.

    Returns:
        flask.Response: always status 200, ``application/json``. The body is
        ``{"status": "ok"}`` on success, otherwise
        ``{"status": "error", "msg": ...}``.
    """

    #TODO: Ensure that the incoming request comes from the same IP (Security)

    form = flask.request.form

    def _reply(payload):
        # Single place that builds the JSON response for every outcome.
        return flask.Response(
            response=json.dumps(payload, sort_keys=True),
            status=200,
            mimetype="application/json"
        )

    def _int_field(name):
        # int(None) raises TypeError, which the ValueError handler below
        # would not catch. An absent field is returned as None so that it is
        # reported by the "missing information" check instead of crashing.
        raw = form.get(name)
        return None if raw is None else int(raw)

    # Never trust in the user input type
    try:
        frequency_minutes = _int_field('frequency_minutes')
        max_concurrency = _int_field('max_concurrency')
        min_concurrency = _int_field('min_concurrency')
        priority = _int_field('priority')
        max_memory_mb = _int_field('max_memory_mb')
    except ValueError:
        return _reply({
            'status': 'error',
            'msg': 'You sent wrong datatypes, like a letter when it should be numeric.'
        })

    key = form.get('key')
    job_type = form.get('job_type')
    start_urls = form.get('start_urls')

    # Anything other than the literal string 'true' counts as inactive.
    active = form.get('active') == 'true'

    # NOTE: a numeric value of 0 is falsy and is therefore also reported as
    # missing here.
    if not all([key, job_type, frequency_minutes, max_concurrency, min_concurrency, priority, max_memory_mb]):
        return _reply({
            'status': 'error',
            'msg': 'You are missing some information, please check your form.'
        })

    if not start_urls and job_type == 'spider':
        return _reply({
            'status': 'error',
            'msg': 'You should provide the Start URLs information for spiders.'
        })

    actual_obj = get_job_object(key=key)

    if actual_obj:
        # .get(): a stored object without a frequency is treated like a
        # changed frequency rather than raising KeyError.
        current_frequency = actual_obj.get('frequency_minutes')
    else:
        # A brand new job: there is no previous frequency, so the comparison
        # below always schedules its first execution.
        actual_obj = {}
        current_frequency = None

    actual_obj.update({
        'active': active,
        'job_type': job_type,
        'frequency_minutes': frequency_minutes,
        'max_concurrency': max_concurrency,
        'min_concurrency': min_concurrency,
        'priority': priority,
        'max_memory_mb': max_memory_mb
    })

    # If the frequency change, recalculate the next execution
    if current_frequency != frequency_minutes:
        actual_obj['next_execution_at'] = (
            datetime.utcnow() + timedelta(minutes=frequency_minutes)
        ).isoformat()

    if job_type == 'spider':
        # One URL per line; blank lines are dropped.
        actual_obj.update({'start_urls': [x for x in start_urls.split("\n") if x]})

    update_job_object(key=key, fields=actual_obj)

    return _reply({'status': 'ok'})
コード例 #7
0
ファイル: jobs.py プロジェクト: rafaelcapucho/scrapy-eagle
def listing():
    """Return a JSON listing of every known spider and command job.

    Each spider name from ``settings.get_spiders()`` and each command file
    name from ``settings.get_commands()`` is mapped to the object returned by
    ``get_job_object`` or, when that is empty, to a default configuration.

    Returns:
        flask.Response: status 200, ``application/json``. The body is ``{}``
        while no spiders are known yet.
    """
    _spiders = settings.get_spiders()
    _commands = settings.get_commands()

    # When the system is starting up, spiders may return empty because
    # we're using async execution `green_threads.find_new_spiders`.
    if not _spiders:
        return flask.Response(
            response=json.dumps({}, sort_keys=True),
            status=200,
            mimetype="application/json"
        )

    d = OrderedDict()

    # Iterate over a sorted copy: the list comes from `settings` and may be
    # shared with other callers, so it is not reordered in place here.
    for s in sorted(_spiders):

        obj = get_job_object(key=s)

        if obj:
            d[s] = obj
            continue

        # Jobs without previous information, using default config.
        # The clock is read once so that both timestamps are exactly one
        # period apart.
        now = datetime.utcnow()
        frequency_minutes = 60
        d[s] = {
            'active': False,
            'job_type': 'spider',
            'min_concurrency': 1,
            'max_concurrency': 5,
            'max_memory_mb': 200,
            'priority': 1,
            'frequency_minutes': frequency_minutes,
            'start_urls': [],
            'last_started_at': now.isoformat(),
            'next_execution_at': (now + timedelta(minutes=frequency_minutes)).isoformat(),
        }

    # Only the spiders were checked above; tolerate commands that are still
    # unset (None) instead of failing on iteration.
    for file_name in _commands or ():

        obj = get_job_object(key=file_name)

        if obj:
            d[file_name] = obj
            continue

        # Commands without previous information, using default config.
        d[file_name] = {
            'active': False,
            'job_type': 'command',
            'min_concurrency': 1,
            'max_concurrency': 1,
            'max_memory_mb': 50,
            'priority': 1,
            'frequency_minutes': 60,
            'last_started_at': None,
            'next_execution_at': None,
        }

    return flask.Response(
        response=json.dumps(d, sort_keys=True),
        status=200,
        mimetype="application/json"
    )