Example 1
def __init__(self, **kw):
    self.recipe = kw.pop('recipe_obj')
    self.sous_chef_path = kw.pop('sous_chef_path')
    self.sous_chef_kwargs = kw
    self.kw_prefix = settings.MERLYNNE_KWARGS_PREFIX
    self.kw_ttl = settings.MERLYNNE_KWARGS_TTL
    self.result_ttl = settings.MERLYNNE_RESULTS_TTL
    self.q = queues.get('recipe')
Example 2
def __init__(self, **kw):
    self.recipe = kw.pop('recipe_obj')
    self.sous_chef_path = kw.pop('sous_chef_path')
    self.sous_chef_kwargs = kw
    self.kw_prefix = settings.MERLYNNE_KWARGS_PREFIX
    self.kw_ttl = settings.MERLYNNE_KWARGS_TTL
    self.result_ttl = settings.MERLYNNE_RESULTS_TTL
    self.passthrough = kw.get('passthrough', False)
    self.q = queues.get('recipe')
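Both constructors follow the same convention: the named arguments the task runner itself needs are popped off, and whatever remains in kw is kept aside as the sous chef's own keyword arguments. A minimal calling sketch, assuming a hypothetical Merlynne class and placeholder recipe/sous-chef values (none of these names appear in the examples above):

# hypothetical usage; 'Merlynne', 'recipe', and the sous-chef path are assumed names.
task = Merlynne(
    recipe_obj=recipe,                        # popped into self.recipe
    sous_chef_path='sous_chefs.rss.Feed',     # popped into self.sous_chef_path
    org_id=1,                                 # everything left over lands in
    feed_url='http://example.com/feed.xml'    # self.sous_chef_kwargs
)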
Example 3
def bulkload(data, **kw):
    """
    Bulk Load any data.
    """
    kw['src'] = kw.pop('q_src', kw.pop('src', None))
    if not kw['src']:
        raise ValueError('Missing src.')

    job_id = gen_uuid()

    # set queue defaults
    qkw = dict(
        queued=kw.pop('queued', True),
        job_id=job_id,
        timeout=kw.pop('q_timeout', 1000),
        serializer=kw.pop('q_serializer', 'json'),
        result_ttl=kw.pop('q_result_ttl', 60),
        kwargs_ttl=kw.pop('q_kwargs_ttl', 120),
        name=kw.pop('q_name', 'bulk'),
        max_workers=kw.pop('q_max_workers', MAX_WORKERS),
        job_key_fmt=kw.pop('q_job_key', 'rq:{src}:bulk:'.format(**kw)+"{}"),
        chunk_size=kw.pop('q_chunk_size', MAX_CHUNK_SIZE)
    )
    kw.update({'queued': qkw.get('queued', True)})

    # if this is not a queued job, just run ingest.
    if not qkw.get('queued'):
        return ingest.source(data, **kw)

    q = queues.get(qkw.pop('name', 'bulk'))

    # store the data + kwargs in redis temporarily
    # this makes the enqueuing process much, much more
    # efficient by allowing us to only pass a single key
    # into the queue rather than a massive dump of data
    # however it also means that all kwargs must be
    # json serializable
    job_key = qkw['job_key_fmt'].format(job_id)
    job = {'data': data, 'kw': kw}

    if qkw['serializer'] == 'json':
        job = obj_to_json(job)

    elif qkw['serializer'] == 'pickle':
        job = obj_to_pickle(job)

    rds.set(job_key, job, ex=qkw['kwargs_ttl'])

    q.enqueue(bulkworker, job_id, **qkw)
    return job_id
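The comments above explain the trade-off: only the job id travels through the queue, while the full payload waits in Redis under a key derived from job_key_fmt. The worker therefore has to rebuild that key and deserialize the payload before ingesting. A sketch of what that receiving side could look like, assuming bulkworker receives the same qkw options and that a json_to_obj helper mirrors obj_to_json (neither the body below nor json_to_obj comes from the excerpt):

def bulkworker(job_id, **qkw):
    # assumed counterpart to bulkload(): look up the staged payload,
    # deserialize it with the matching serializer, then ingest it.
    job_key = qkw['job_key_fmt'].format(job_id)
    raw = rds.get(job_key)
    if raw is None:
        raise ValueError('Payload for job {} expired or is missing.'.format(job_id))
    if qkw['serializer'] == 'json':
        job = json_to_obj(raw)      # assumed inverse of obj_to_json
    else:
        job = pickle_to_obj(raw)
    rds.delete(job_key)
    return ingest.source(job['data'], **job['kw'])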
Example 4
def bulkload(data, **kw):
    """
    Bulk Load any data.
    """
    kw['src'] = kw.pop('q_src', kw.pop('src', None))
    if not kw['src']:
        raise ValueError('Missing src.')

    job_id = gen_uuid()

    # set queue defaults
    qkw = dict(queued=kw.pop('queued', True),
               job_id=job_id,
               timeout=kw.pop('q_timeout', 1000),
               serializer=kw.pop('q_serializer', 'json'),
               result_ttl=kw.pop('q_result_ttl', 60),
               kwargs_ttl=kw.pop('q_kwargs_ttl', 120),
               name=kw.pop('q_name', 'bulk'),
               max_workers=kw.pop('q_max_workers', MAX_WORKERS),
               job_key_fmt=kw.pop('q_job_key',
                                  'rq:{src}:bulk:'.format(**kw) + "{}"),
               chunk_size=kw.pop('q_chunk_size', MAX_CHUNK_SIZE))
    kw.update({'queued': qkw.get('queued', True)})

    # if this is not a queued job, just run ingest.
    if not qkw.get('queued'):
        return ingest.source(data, **kw)

    q = queues.get(qkw.pop('name', 'bulk'))

    # store the data + kwargs in redis temporarily
    # this makes the enqueuing process much, much more
    # efficient by allowing us to only pass a single key
    # into the queue rather than a massive dump of data
    # however it also means that all kwargs must be
    # json serializable
    job_key = qkw['job_key_fmt'].format(job_id)
    job = {'data': data, 'kw': kw}

    if qkw['serializer'] == 'json':
        job = obj_to_json(job)

    elif qkw['serializer'] == 'pickle':
        job = obj_to_pickle(job)

    rds.set(job_key, job, ex=qkw['kwargs_ttl'])

    q.enqueue(bulkworker, job_id, **qkw)
    return job_id
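Examples 3 and 4 differ only in formatting, so the same calling pattern applies to both: q_-prefixed keywords configure the queue, queued=False bypasses it entirely, and the default path returns a job id that can be polled later. An illustrative sketch with placeholder values:

rows = [{'title': 'a'}, {'title': 'b'}]   # placeholder payload

# run inline; falls straight through to ingest.source(), no Redis involved.
result = bulkload(rows, src='recipes', queued=False)

# enqueue with custom queue options and get back a job id to poll.
job_id = bulkload(rows, src='recipes', q_timeout=300, q_serializer='json')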
Example 5
def get_status(user, job_id):
    """
    Get the status of a queued job.
    """

    # parse args.
    queue = request.args.get('queue')
    if not queue:
        raise RequestError(
            'You must pass in the queue name to fetch a job\'s status')

    if queue not in queues:
        raise RequestError('"{}" is not a valid queue.'.format(queue))

    q = queues.get(queue)
    job = q.fetch_job(job_id)
    if not job:
        raise RequestError('A job with ID {} does not exist'.format(job_id))

    # fetch metadata about this job
    # from the session
    # parse args.
    started = request.args.get('started')
    orig_url = request.args.get('orig_url')

    if started:
        started = dates.parse_iso(started)

    # format return value
    ret = {
        'job_id': job_id,
        'queue': queue,
        'status': None,
        'started': started,
        'orig_url': orig_url
    }

    # determine time since start
    if started:
        ret['time_since_start'] = (dates.now() - started).seconds

    # determine status
    if job.is_queued:
        ret['status'] = 'queued'

    if job.is_started:
        ret['status'] = 'running'

    if job.is_failed:
        ret['status'] = 'error'
        ret['message'] = "An unknown error occurred."

    if job.is_finished:
        rv = job.return_value

        # job will return true if successful
        if rv is True:
            ret['status'] = 'success'

        # job will return an error if unsuccessful
        else:
            ret['status'] = 'error'
            ret['message'] = rv.message

    return jsonify(ret)
Example 6
def get_status(user, job_id):
    """
    Get the status of a queued job.
    """

    # parse args.
    queue = request.args.get('queue')
    if not queue:
        raise RequestError(
            'You must pass in the queue name to fetch a job\'s status')

    if queue not in queues:
        raise RequestError(
            '"{}" is not a valid queue.'
            .format(queue))

    q = queues.get(queue)
    job = q.fetch_job(job_id)
    if not job:
        raise RequestError(
            'A job with ID {} does not exist'
            .format(job_id))

    # fetch metadata about this job
    # from the session
    # parse args.
    started = request.args.get('started')
    orig_url = request.args.get('orig_url')

    if started:
        started = dates.parse_iso(started)

    # format return value
    ret = {
        'job_id': job_id,
        'queue': queue,
        'status': None,
        'started': started,
        'orig_url': orig_url
    }

    # determine time since start
    if started:
        ret['time_since_start'] = (dates.now() - started).seconds

    # determine status
    if job.is_queued:
        ret['status'] = 'queued'

    if job.is_started:
        ret['status'] = 'running'

    if job.is_failed:
        ret['status'] = 'error'
        ret['message'] = "An unknown error occurred."

    if job.is_finished:
        rv = job.return_value

        # job will return true if successful
        if rv is True:
            ret['status'] = 'success'

        # job will return an error if unsuccessful
        else:
            ret['status'] = 'error'
            ret['message'] = str(rv.message)

    return jsonify(ret)
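In both versions the dict assembled above is what the endpoint serializes, with status resolved in order: queued, running, error on failure, and finally success or error depending on the finished job's return value. An illustrative payload for a finished, successful job (every value below is a placeholder):

example_response = {
    'job_id': 'c0ffee00-0000-0000-0000-000000000000',   # placeholder uuid
    'queue': 'bulk',
    'status': 'success',
    'started': '2015-06-01T12:00:00Z',                   # placeholder timestamp
    'orig_url': 'http://example.com/api/v1/content/bulk',
    'time_since_start': 42
}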
Example 7
class BulkLoader(object):

    __module__ = 'newslynx.tasks.bulk'

    returns = None  # either "model" or "query"
    timeout = 1000  # seconds
    result_ttl = 60  # seconds
    kwargs_ttl = 1000  # in case there is a backup in the queue
    max_workers = 7
    concurrent = True
    kwargs_key = 'rq:kwargs:{}'
    q = queues.get('bulk')
    redis = rds

    def load_one(self, item, **kw):
        """
        The method to overwrite.
        """
        raise NotImplementedError

    def _load_one(self, item, **kw):
        """
        A wrapper which will catch errors
        and bubble them up
        """
        try:
            return self.load_one(item, **kw)
        except Exception as e:
            return Exception(e.message)

    def _handle_errors(self, errors):
        if not isinstance(errors, list):
            errors = [errors]
        return RequestError('There was an error while bulk uploading: '
                            '{}'.format(errors[0].message))

    def load_all(self, kwargs_key):
        """
        Do the work.
        """
        start = time.time()
        try:
            # create a session specific to this task
            session = gen_session()

            # get the inputs from redis
            kwargs = self.redis.get(kwargs_key)
            if not kwargs:
                raise InternalServerError(
                    'An unexpected error occurred while processing bulk upload.'
                )

            kwargs = pickle_to_obj(kwargs)
            data = kwargs.get('data')
            kw = kwargs.get('kw')

            # delete them
            self.redis.delete(kwargs_key)

            outputs = []
            errors = []

            fx = partial(self._load_one, **kw)

            if self.concurrent:
                pool = Pool(min([len(data), self.max_workers]))
                for res in pool.imap_unordered(fx, data):
                    if isinstance(res, Exception):
                        errors.append(res)
                    else:
                        outputs.append(res)
            else:
                for item in data:
                    res = fx(item)
                    if isinstance(res, Exception):
                        errors.append(res)
                    else:
                        outputs.append(res)

            # return errors
            if len(errors):
                return self._handle_errors(errors)

            # add objects and execute
            if self.returns == 'model':
                for o in outputs:
                    if o is not None:
                        try:
                            session.add(o)
                            session.commit()
                        except Exception as e:
                            return self._handle_errors(e)

            # union all queries
            elif self.returns == 'query':
                for query in outputs:
                    if query is not None:
                        try:
                            session.execute(query)
                        except Exception as e:
                            return self._handle_errors(e)

            try:
                session.commit()

            except Exception as e:
                session.rollback()
                session.remove()
                return self._handle_errors(e)

            # return true if everything worked.
            session.close()
            return True

        except JobTimeoutException:
            end = time.time()
            return InternalServerError(
                'Bulk loading timed out after {} seconds'.format(end - start))

    def run(self, data, **kw):

        # store the data + kwargs in redis temporarily
        # this makes the enqueuing process much, much more
        # efficient by allowing us to only pass a single key
        # into the queue rather than a massive dump of data
        # however it also means that all kwargs must be
        # json serializable
        job_id = gen_uuid()
        kwargs_key = self.kwargs_key.format(job_id)
        kwargs = {'data': data, 'kw': kw}
        self.redis.set(kwargs_key, obj_to_pickle(kwargs), ex=self.kwargs_ttl)

        # send the job to the task queue
        self.q.enqueue(self.load_all,
                       kwargs_key,
                       job_id=job_id,
                       timeout=self.timeout,
                       result_ttl=self.result_ttl)

        return job_id
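Concrete loaders only have to override load_one and declare whether they return model instances or raw queries; staging the payload in Redis, fanning work out over the pool, and committing the session all come from the base class. A minimal subclass sketch, assuming a SomeModel SQLAlchemy model and an org_id keyword, neither of which appears in the excerpt:

class SomeModelLoader(BulkLoader):
    returns = 'model'   # load_all() will session.add() whatever load_one returns

    def load_one(self, item, **kw):
        # build one un-committed model instance per raw item;
        # 'SomeModel' and 'org_id' are assumed example names.
        return SomeModel(org_id=kw['org_id'], **item)

loader = SomeModelLoader()
# stages {'data': ..., 'kw': ...} in redis, enqueues load_all(), and returns the job id.
job_id = loader.run([{'title': 'a'}, {'title': 'b'}], org_id=1)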