Example #1
 def initialize(self):
     super(SaveLogTask, self).initialize()
     # self.logger = get_task_logger(self.name)
     pagedb = get_connection('page')
     logdb = get_connection('log')
     taskdb = get_connection('task')
     self._task_coll = taskdb['task_infos']
     self._page_coll = pagedb['page_infos']
     self._log_coll = logdb['task_logs']
     self._log_coll.create_index([('created', -1), ('task_id', 1)])
     self._page_coll.create_index([('task_id', 1)])
     self._task_coll.create_index([('received', -1), ('task_id', 1)])
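Every snippet in this collection obtains its backing store through `get_connection(alias)` from `yunduo.resource`: MongoDB databases for 'page', 'log', 'task' and 'result', Redis clients for 'run', 'conf' and 'df', and an InfluxDB handle for 'stat'. The real factory is not shown in these excerpts; a minimal sketch of such an alias-keyed, cached factory, assuming a simple config mapping, could look like this:

# Hypothetical sketch only; the config layout and supported backends are assumptions,
# not the actual yunduo.resource implementation.
import pymongo
import redis

RESOURCES = {
    'page': {'type': 'mongodb', 'url': 'mongodb://localhost:27017', 'db': 'page'},
    'run':  {'type': 'redis',   'url': 'redis://localhost:6379/0'},
}
_clients = {}

def get_connection(alias):
    """Return a cached client for the configured alias."""
    if alias not in _clients:
        conf = RESOURCES[alias]
        if conf['type'] == 'mongodb':
            _clients[alias] = pymongo.MongoClient(conf['url'])[conf['db']]
        else:
            _clients[alias] = redis.from_url(conf['url'])
    return _clients[alias]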
Example #2
def on_worker_ready(sender, signal, **kwargs):
    # redis_conf = redis.from_url(app.conf.redis['conf'])
    from yunduo.resource import get_connection

    redis_run = get_connection('run')
    kw = {
        'exchange': 'xcrawl',
        'reply': True,
        'binding_key': None,
        'exchange_type': 'direct',
        'queue_arguments': {
            'x-max-priority': 10
        },
        'consumer_arguments': {
            'x-priority': 8
        }
    }
    # for p, s in redis_run.zscan_iter('queue:running'):
    running = redis_run.hgetall('queue:running')
    for q, s in running.items():
        if bytes_to_str(s) == 'pause':
            continue
        logger.info('resume consuming %s %s', q, s)
        q1 = bytes_to_str(q)
        kkw = kw.copy()
        kkw['binding_key'] = q1.split(':')[3]
        sender.add_task_queue(q1, **kkw)  # use the per-queue copy so binding_key is applied
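The handler above rebuilds the worker's consumer set from the `queue:running` Redis hash, skipping any queue whose value is 'pause' and deriving the binding key from the fourth colon-separated field of the queue name. A hedged sketch of how a controller might register or pause a queue in that hash (the key layout is an assumption inferred from `q1.split(':')[3]`):

# Hypothetical: queue names assumed to look like 'queue:<project>:<job>:<binding_key>'.
from yunduo.resource import get_connection

redis_run = get_connection('run')
queue_name = 'queue:demo_project:demo_job:crawl'   # made-up queue name

redis_run.hset('queue:running', queue_name, 'run')    # worker resumes it on restart
redis_run.hset('queue:running', queue_name, 'pause')  # worker skips it on restart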
Example #3
 def initialize(self):
     super(CrawlTask, self).initialize()
     # self.logger = get_task_logger(self.name)
     self.df = Dupefilter()
     # self.exchange = Exchange('xcrawl', type='direct')
     # self.HTTP_MAX_RETRIES = conf.get_http('max_retries')
     self.influx_stat = get_connection('stat')
     self.http_max_retries = xconf.get_http('max_retries')
     self.http_retry_codes = xconf.get_http('retry_codes')
Example #4
    def default_save_result(self, data, kwargs):
        project = kwargs['project']
        page = kwargs['page']
        _meta = {
            'project': project,
            'job': kwargs.get('job'),
            'page': page,
            'batch_id': kwargs.get('batch_id'),
            'created': kwargs.get('created', datetime.now()),
        }

        db = get_connection('result')
        col = db['%s_%s' % (project, page)]
        if isinstance(data, dict):
            data['_meta'] = _meta
            col.insert_one(data)
            return 1
        else:
            for it in data:
                it['_meta'] = _meta
            col.insert_many(data)
            return len(data)
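`default_save_result` stamps each item with a `_meta` envelope and writes it to the `<project>_<page>` collection of the 'result' database, returning the number of stored documents. A hedged usage sketch (the `crawler` task instance and the project/page names are made up):

# Hypothetical call; 'demo'/'detail' are invented names.
rows = [
    {'title': 'item 1', 'price': 9.9},
    {'title': 'item 2', 'price': 19.9},
]
saved = crawler.default_save_result(rows, {
    'project': 'demo',        # documents land in collection 'demo_detail'
    'page': 'detail',
    'job': 'job-1',
    'batch_id': 'b-001',
})
print(saved)  # 2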
Example #5
import cgi

from wtforms import form
from flask import (request, redirect, flash, Markup)

from flask_admin import expose
from flask_admin.model import template
from flask_admin.model.helpers import get_mdict_item_or_list
from flask_admin.helpers import get_redirect_target
from flask_admin.babel import gettext
from flask_admin.contrib.pymongo import ModelView, filters
# from connections import mongo_log
from yunduo.resource import get_connection
from xadmin.utils.format import date_format, map_format
from xadmin.view.base import MongoView


mongo_logs = get_connection('log')
PageDB = get_connection('page')


def message_format(view, context, model, name):
    if 'exception' in model:
        exc = model['exception']
        s = '<div data-toggle="popover" data-trigger="hover" title="%s" data-content="%s">%s</div>' \
            % (cgi.escape(exc['message'], True), cgi.escape('<pre>%s</pre>' % exc['stackTrace'], True), model[name])
        return Markup(s)
    else:
        return model[name]


class TaskView(MongoView):
Example #6
from yunduo.parser.htmlextractor import Link, Extractor, ItemResult
# from conf.common import HTTP_MAX_RETRIES, HTTP_RETRY_CODE
from yunduo.conf import xconf
# from connections import redis_conf, influx_stat, mongo_page
from yunduo.utils import merge, arg_to_iter
from yunduo.code import compile, get_function, get_script
from yunduo.downloader import download, proxy as proxy_mod
# from connections import get_connection
from yunduo.resource import get_connection
from xspider.jobaction import JobAction
from xspider.app import app
# from xspider.log import get_task_logger
# from xspider.job import gen_queue_name
from .base import StrategyTask

redis_run = get_connection('run')
# 'queue', 'routing_key', 'exchange', 'priority', 'expires',
# 'serializer', 'delivery_mode', 'compression', 'time_limit',

class CrawlTask(StrategyTask):
    # define this task's own strategy
    # Strategy = 'xspider.strategy:default'
    # rate_limit = True
    # counter = True
    # store_info = True
    # custom_queue = True

    # def __init__(self):
    #     super(CrawlTask, self).__init__()
    #     self.df = Dupefilter()
    #     self._exchange = Exchange('xcrawl', type='direct')
Example #7
 def initialize(self):
     # super(SaveResultTask, self).initialize()
     # self.logger = get_task_logger(self.name)
     self.logger = get_task_logger(self.name, save=True)
     self.df = Dupefilter()
     self.influx_stat = get_connection('stat')
Example #8
class Store(object):
    _redis = get_connection('conf')
    entries_key = 'beat_entries'
    schedule_key = 'schedule_entries'

    def __init__(self, lock_ttl=None):
        self.lock_ttl = lock_ttl
        self.lock = self._redis.lock('beat_lock', lock_ttl)

    def __getitem__(self, key):
        data = self._redis.hget(self.entries_key, key)
        if not data:
            raise KeyError()
        return deserialize_entry(json.loads(data), ScheduleEntry)

    def __setitem__(self, key, entry):
        # print('__setitem__', key, entry)
        if entry:
            self.add(entry)
        else:
            self.remove(key)
        # next_time = entry.next_time()
        # if next_time is None:
        #     score = -1
        # else:
        #     score = next_time.timestamp()
        # self._redis.zadd(self.schedule_key, entry.name, score)
        # self._redis.hset(self.entries_key, entry.name, serialize_entry(entry))

    def __iter__(self):
        if self.lock.acquire(False):
            try:
                max_score = time.time()
                keys = self._redis.zrangebyscore(self.schedule_key, 0,
                                                 max_score)
                for key in keys:
                    yield self[key]
                key = self._redis.zrange(self.schedule_key, 0, 1)
                if key:
                    yield self[key[0]]
            finally:
                try:
                    self.lock.release()
                except Exception as e:
                    logger.exception('release lock')
        else:
            yield  # lock not acquired: yield a single None so callers still complete a pass

    def get(self, key, default=None):
        try:
            return self[key]
        except KeyError:
            return default

    def update(self, key, value=None):
        if isinstance(key, dict) and value is None:
            for k in key:
                self[k] = key[k]
        else:
            self[key] = value

    @classmethod
    def add(cls, entry):
        if not isinstance(entry, ScheduleEntry):
            entry = ScheduleEntry(**entry)
        next_time = entry.next_time()
        if next_time is None:
            score = float("inf")
        else:
            score = next_time.timestamp()

        with cls._redis.pipeline() as pipe:
            pipe.zadd(cls.schedule_key, entry.name, score)
            pipe.hset(cls.entries_key, entry.name,
                      json.dumps(serialize_entry(entry)))
            pipe.execute()

    @classmethod
    def remove(cls, entry):
        if isinstance(entry, ScheduleEntry):
            key = entry.name
        else:
            key = entry
        with cls._redis.pipeline() as pipe:
            pipe.zrem(cls.schedule_key, key)
            pipe.hdel(cls.entries_key, key)
            pipe.execute()
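Store keeps two Redis structures in step: a hash (`beat_entries`) holding the serialized entries and a sorted set (`schedule_entries`) scored by each entry's next run time, which `__iter__` scans for due work under the `beat_lock`. A hedged usage sketch, assuming `ScheduleEntry` accepts the keyword arguments shown:

# Hypothetical usage; the ScheduleEntry fields below are assumptions.
store = Store(lock_ttl=60)
store.add({'name': 'nightly-report', 'task': 'xspider.report', 'schedule': 7200})
entry = store.get('nightly-report')   # hget + deserialize_entry
for due in store:                      # yields entries scored at or before now
    if due:
        print(due.name)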
Example #9
        m = self._resolve_symbol(context, 'row_actions.link')
        get_url = self._resolve_symbol(context, 'get_url')
        meta = row.get('meta')
        if not meta or self.id_field not in row:
            return ''

        kwargs = dict(self.url_args) if self.url_args else {}
        kwargs[self.id_arg] = row[self.id_field]

        view = context['admin_view']
        url = get_url(self.endpoint, **kwargs)

        return m(self, url)


page_db = get_connection('page')
log_db = get_connection('log')
task_db = get_connection('task')
log_task_coll = task_db['task_infos']
log_page_coll = page_db['page_infos']
log_log_coll = log_db['task_logs']


class LogView(MongoView):

    collection = log_log_coll

    column_list = (
        'meta.project',
        'meta.job',
        'meta.page',
Example #10
 def __init__(self, *args, **kwargs):
     super(StatView, self).__init__(*args, **kwargs)
     self.influx_stat = get_connection('stat')
Example #11
def default(task,
            app,
            consumer,
            info=logger.info,
            error=logger.error,
            task_reserved=task_reserved,
            to_system_tz=timezone.to_system,
            bytes=bytes,
            buffer_t=buffer_t):
    """Default task execution strategy.

    Note:
        Strategies are here as an optimization, so sadly
        it's not very easy to override.
    """
    redis_run = get_connection('run')

    hostname = consumer.hostname
    connection_errors = consumer.connection_errors
    _does_info = logger.isEnabledFor(logging.INFO)

    # task event related
    # (optimized to avoid calling request.send_event)
    eventer = consumer.event_dispatcher
    events = eventer and eventer.enabled
    send_event = eventer.send
    task_sends_events = events and task.send_events

    call_at = consumer.timer.call_at
    apply_eta_task = consumer.apply_eta_task
    rate_limits_enabled = not consumer.disable_rate_limits
    get_bucket = consumer.task_buckets.__getitem__
    handle = consumer.on_task_request
    limit_task = consumer._limit_task
    body_can_be_buffer = consumer.pool.body_can_be_buffer

    # Req = create_request_cls(Request, task, consumer.pool, hostname, eventer)
    # def create_request_cls(base, task, pool, hostname, eventer,
    #                        ref=ref, revoked_tasks=revoked_tasks,
    #                        task_ready=task_ready, trace=trace_task_ret):
    # default_time_limit = task.time_limit
    # default_soft_time_limit = task.soft_time_limit
    # apply_async = pool.apply_async
    # acks_late = task.acks_late
    # events = eventer and eventer.enabled
    # task_ready = state.task_ready
    # task_accepted = state.task_accepted
    task_ready = state.task_ready
    revoked_tasks = state.revoked

    default_time_limit = task.time_limit
    default_soft_time_limit = task.soft_time_limit
    apply_async = consumer.pool.apply_async
    # print '=======-----', consumer, consumer.pool, apply_async
    acks_late = task.acks_late
    events = eventer and eventer.enabled
    # == END == Request var

    controller_revoked_tasks = consumer.controller.state.revoked

    task_name = task.name
    # celery_app = task._get_app()
    # task_send_task = task.send_task
    # log_exception = task.logger.exception
    # _logger = get_task_logger(task_name, save=True)
    # _info = _logger.info
    # _error = _logger.error
    _info = task.logger.info
    _error = task.logger.error

    # task_store_info = task.store_info
    # task_rate_limit = task.rate_limit
    # task_counter = task.counter
    get_task_info = task.brief
    # task_counter_key = task.counter_key
    # task_on_all_finished = task.on_all_finished

    # taskstore = TaskStore(task_name)

    task_save = app.tasks['xspider.save_log']

    def save_task_status(type_, task_id, data):
        type_, _, subject = type_.partition('-')
        if type_ != 'task' or not data:
            return

        data[subject] = datetime.now()
        data['task_id'] = task_id
        task_save.apply_async(('task', task_name, {task_id: data}))

    # dispatcher = consumer.event_dispatcher
    # if dispatcher.groups and 'project' not in dispatcher.groups:
    #     dispatcher.groups.add('project')
    #     info('Events of group {project} enabled by local.')

    class BaseReq(Request):
        def __init__(self, *args, **kwargs):
            super(BaseReq, self).__init__(*args, **kwargs)
            self._args, self._kwargs, self._embed = self._payload

        def execute_using_pool(self, pool, **kwargs):
            task_id = self.id
            if (self.expires or task_id in revoked_tasks) and self.revoked():
                raise TaskRevokedError(task_id)

            time_limit, soft_time_limit = self.time_limits
            result = pool.apply_async(
                trace_task_ret,
                args=(self.type, task_id, self.request_dict, self.body,
                      self.content_type, self.content_encoding),
                accept_callback=self.on_accepted,
                timeout_callback=self.on_timeout,
                callback=self.on_success,
                error_callback=self.on_failure,
                soft_timeout=soft_time_limit or default_soft_time_limit,
                timeout=time_limit or default_time_limit,
                correlation_id=task_id,
            )
            # cannot create weakref to None
            # pylint: disable=attribute-defined-outside-init
            self._apply_result = maybe(ref, result)
            return result

        def on_success(self, failed__retval__runtime, **kwargs):
            failed, retval, runtime = failed__retval__runtime
            if failed:
                if isinstance(retval.exception,
                              (SystemExit, KeyboardInterrupt)):
                    raise retval.exception
                return self.on_failure(retval, return_ok=True)
            task_ready(self)

            if acks_late:
                self.acknowledge()

            if events:
                self.send_event(
                    'task-succeeded',
                    result=retval,
                    runtime=runtime,
                )

        def send_event(self, type_, **fields):
            super(BaseReq, self).send_event(type_, **fields)
            if type_ == 'task-succeeded':
                try:
                    if 'result' in fields:
                        fields['result'] = json.dumps(fields['result'])
                except Exception:
                    pass
            # taskstore.save(type_, self.id, fields)
            save_task_status(type_, self.id, fields)

        def task_info(self):
            info = get_task_info(self._args, self._kwargs)
            info['task_id'] = self.id
            info['task_name'] = task_name
            info['worker'] = self.hostname
            return info

    def task_message_handler(message,
                             body,
                             ack,
                             reject,
                             callbacks,
                             to_timestamp=to_timestamp):
        # print('crawl_task_message_handler %s %s' % (task_name, repr(body)))
        body, headers, decoded, utc = (
            message.body,
            message.headers,
            False,
            True,
        )
        if not body_can_be_buffer:
            body = bytes(body) if isinstance(body, buffer_t) else body

        req = BaseReq(
            message,
            on_ack=ack,
            on_reject=reject,
            app=app,
            hostname=hostname,
            eventer=eventer,
            task=task,
            connection_errors=connection_errors,
            body=body,
            headers=headers,
            decoded=decoded,
            utc=utc,
        )
        # if _does_info:
        meta = req.task_info()
        taskinfo = {'meta': meta}
        _info(u'task received', extra=taskinfo)

        if (req.expires
                or req.id in controller_revoked_tasks) and req.revoked():
            return

        # req_args, req_kwargs, req_embed = req._payload
        if task_sends_events:
            send_event(
                'task-received',
                uuid=req.id,
                name=req.name,
                args=req.argsrepr,
                kwargs=req.kwargsrepr,
                root_id=req.root_id,
                parent_id=req.parent_id,
                retries=req.request_dict.get('retries', 0),
                eta=req.eta and req.eta.isoformat(),
                expires=req.expires and req.expires.isoformat(),
            )

        # save the received-task record
        # ti = get_task_info(req._args, req._kwargs)
        fields = dict(
            name=req.name,
            # project=req._project, page=req._page, url=req._url,
            kwargs=json.dumps(req._kwargs),
            # args=req_args, kwargs=req_kwargs,
            root_id=req.root_id,
            parent_id=req.parent_id,
            retries=req.request_dict.get('retries', 0),
            eta=req.eta and req.eta.isoformat(),
            expires=req.expires and req.expires.isoformat(),
            meta=meta)
        save_task_status('task-received', req.id, fields)

        # rate limiting
        if req._kwargs.get('__limit__'):
            try:
                key = 'rate:%s' % meta['project']
                pending = get_expected_time(key)
                # print '----Rate limit pending: %s %r' % (req.id, pending)
                if pending > 0:
                    req.eta = maybe_make_aware(datetime.utcnow() +
                                               timedelta(seconds=pending))
                    info('Rate Limit [%s.%s] %s', meta['project'],
                         meta['page'], pending)
            except Exception:
                error('Rate limit. Task: %r',
                      req.info(safe=True),
                      exc_info=True)

        if req.eta:
            try:
                if req.utc:
                    eta = to_timestamp(to_system_tz(req.eta))
                else:
                    eta = to_timestamp(req.eta, timezone.local)
            except (OverflowError, ValueError):
                error("Couldn't convert ETA %r to timestamp. Task: %r",
                      req.eta,
                      req.info(safe=True),
                      exc_info=True)
                req.reject(requeue=False)
            else:
                consumer.qos.increment_eventually()
                call_at(eta, apply_eta_task, (req, ), priority=6)
        else:
            if rate_limits_enabled:
                bucket = get_bucket(task.name)
                if bucket:
                    return limit_task(req, bucket, 1)
            task_reserved(req)
            if callbacks:
                [callback(req) for callback in callbacks]
            handle(req)

    return task_message_handler
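This factory mirrors the signature of Celery's built-in default strategy and is attached to a task through the task's `Strategy` attribute; Example #6 carries the same hook as a commented-out hint. A minimal sketch, assuming the factory is importable as `xspider.strategy:default`:

# Hedged sketch; the dotted path is taken from the commented hint in Example #6.
from celery import Task

class CrawlTask(Task):
    # Task.Strategy is the 'module:attribute' path Celery resolves to the strategy factory.
    Strategy = 'xspider.strategy:default'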
Example #12
 def initialize(self):
     self.redis_run = get_connection('run')
     self.influx_stat = get_connection('stat')
Example #13
# coding=utf8

import time
import six
from hashlib import sha1
from redis.exceptions import NoScriptError
from yunduo.resource import get_connection

redis_conf = get_connection('conf')
RATE_LIMIT_SCRIPT = '''\
local key, now, token = KEYS[1], tonumber(ARGV[1]), tonumber(ARGV[2])
local timestamp, fill_rate, capacity, tokens, rhold = 0, 0, 0, 0, 0
local vals = redis.call("hgetall", key)

for i = 1, #vals, 2 do
    if     vals[i] == "timestamp" then timestamp = tonumber(vals[i+1])
    elseif vals[i] == "fill_rate" then fill_rate = tonumber(vals[i+1])
    elseif vals[i] == "capacity"  then capacity  = tonumber(vals[i+1])
    elseif vals[i] == "tokens"    then tokens    = tonumber(vals[i+1])
    elseif vals[i] == "rhold"     then rhold     = tonumber(vals[i+1])
    end
end
if fill_rate == 0 then
    return 0
end

local delta = fill_rate * (now - timestamp)
rhold = rhold - delta
if rhold < 0 then
    rhold = 0
end
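The excerpt is cut off before the Lua script is finished or executed, but the `sha1` and `NoScriptError` imports point at the usual EVALSHA-with-EVAL-fallback pattern for cached server-side scripts. A hedged sketch of how the complete script might be invoked; the helper below is an assumption, not the module's actual API:

# Hypothetical wrapper around the rate-limit script.
RATE_LIMIT_SHA = sha1(RATE_LIMIT_SCRIPT.encode('utf-8')).hexdigest()

def consume_token(key, token=1):
    """Run the script against one bucket key; fall back to EVAL if the SHA is not cached."""
    now = time.time()
    try:
        return redis_conf.evalsha(RATE_LIMIT_SHA, 1, key, now, token)
    except NoScriptError:
        return redis_conf.eval(RATE_LIMIT_SCRIPT, 1, key, now, token)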
Example #14
 def __init__(self):
     self.expire = xconf.get('df_expire', 1296000)
     self.redisobj = get_connection('df')
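The dupe filter only holds a Redis handle (the 'df' alias) and an expiry window, 1296000 seconds by default, i.e. 15 days. A hedged sketch of what a seen-check might look like on top of it, using SET NX EX so the first caller claims a URL fingerprint for the expiry window (the method name and key scheme are assumptions):

# Hypothetical method; not the actual Dupefilter API.
from hashlib import sha1

def seen(self, url):
    fp = 'df:%s' % sha1(url.encode('utf-8')).hexdigest()
    # set(..., nx=True) returns None when the key already exists -> duplicate
    return self.redisobj.set(fp, 1, nx=True, ex=self.expire) is None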
Example #15
 def __init__(self, project, job, batch_id):
     super(JobAction, self).__init__()
     self.project = project
     self.job = job
     self.batch_id = batch_id
     self.redis_run = get_connection('run')