def __init__(self, **kw):
    self.recipe = kw.pop('recipe_obj')
    self.sous_chef_path = kw.pop('sous_chef_path')
    self.sous_chef_kwargs = kw
    self.kw_prefix = settings.MERLYNNE_KWARGS_PREFIX
    self.kw_ttl = settings.MERLYNNE_KWARGS_TTL
    self.result_ttl = settings.MERLYNNE_RESULTS_TTL
    self.q = queues.get('recipe')

def __init__(self, **kw):
    self.recipe = kw.pop('recipe_obj')
    self.sous_chef_path = kw.pop('sous_chef_path')
    self.sous_chef_kwargs = kw
    self.kw_prefix = settings.MERLYNNE_KWARGS_PREFIX
    self.kw_ttl = settings.MERLYNNE_KWARGS_TTL
    self.result_ttl = settings.MERLYNNE_RESULTS_TTL
    self.passthrough = kw.get('passthrough', False)
    self.q = queues.get('recipe')

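# usage sketch (illustrative, not from the source). the enclosing class is
# not shown above; "Merlynne" is assumed from the MERLYNNE_* settings names,
# and the argument values below are hypothetical. `recipe_obj` and
# `sous_chef_path` are popped off, and whatever remains in **kw becomes
# `sous_chef_kwargs`:
#
#   merlynne = Merlynne(
#       recipe_obj=recipe,                     # a Recipe model instance
#       sous_chef_path='sous_chefs.rss.Feed',  # hypothetical dotted path
#       org_id=1, passthrough=True)            # remainder -> sous_chef_kwargs
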
def bulkload(data, **kw):
    """
    Bulk Load any data.
    """
    kw['src'] = kw.pop('q_src', kw.pop('src', None))
    if not kw['src']:
        raise ValueError('Missing src.')
    job_id = gen_uuid()

    # set queue defaults
    qkw = dict(
        queued=kw.pop('queued', True),
        job_id=job_id,
        timeout=kw.pop('q_timeout', 1000),
        serializer=kw.pop('q_serializer', 'json'),
        result_ttl=kw.pop('q_result_ttl', 60),
        kwargs_ttl=kw.pop('q_kwargs_ttl', 120),
        name=kw.pop('q_name', 'bulk'),
        max_workers=kw.pop('q_max_workers', MAX_WORKERS),
        job_key_fmt=kw.pop('q_job_key', 'rq:{src}:bulk:'.format(**kw) + "{}"),
        chunk_size=kw.pop('q_chunk_size', MAX_CHUNK_SIZE)
    )
    kw.update({'queued': qkw.get('queued', True)})

    # if this is not a queued job, just run ingest.
    if not qkw.get('queued'):
        return ingest.source(data, **kw)

    q = queues.get(qkw.pop('name', 'bulk'))

    # store the data + kwargs in redis temporarily.
    # this makes the enqueuing process much, much more
    # efficient by allowing us to only pass a single key
    # into the queue rather than a massive dump of data.
    # however, it also means that all kwargs must be
    # json serializable.
    job_key = qkw['job_key_fmt'].format(job_id)
    job = {'data': data, 'kw': kw}
    if qkw['serializer'] == 'json':
        job = obj_to_json(job)
    elif qkw['serializer'] == 'pickle':
        job = obj_to_pickle(job)
    rds.set(job_key, job, ex=qkw['kwargs_ttl'])
    q.enqueue(bulkworker, job_id, **qkw)
    return job_id

def bulkload(data, **kw):
    """
    Bulk Load any data.
    """
    kw['src'] = kw.pop('q_src', kw.pop('src', None))
    if not kw['src']:
        raise ValueError('Missing src.')
    job_id = gen_uuid()

    # set queue defaults
    qkw = dict(queued=kw.pop('queued', True),
               job_id=job_id,
               timeout=kw.pop('q_timeout', 1000),
               serializer=kw.pop('q_serializer', 'json'),
               result_ttl=kw.pop('q_result_ttl', 60),
               kwargs_ttl=kw.pop('q_kwargs_ttl', 120),
               name=kw.pop('q_name', 'bulk'),
               max_workers=kw.pop('q_max_workers', MAX_WORKERS),
               job_key_fmt=kw.pop('q_job_key', 'rq:{src}:bulk:'.format(**kw) + "{}"),
               chunk_size=kw.pop('q_chunk_size', MAX_CHUNK_SIZE))
    kw.update({'queued': qkw.get('queued', True)})

    # if this is not a queued job, just run ingest.
    if not qkw.get('queued'):
        return ingest.source(data, **kw)

    q = queues.get(qkw.pop('name', 'bulk'))

    # store the data + kwargs in redis temporarily.
    # this makes the enqueuing process much, much more
    # efficient by allowing us to only pass a single key
    # into the queue rather than a massive dump of data.
    # however, it also means that all kwargs must be
    # json serializable.
    job_key = qkw['job_key_fmt'].format(job_id)
    job = {'data': data, 'kw': kw}
    if qkw['serializer'] == 'json':
        job = obj_to_json(job)
    elif qkw['serializer'] == 'pickle':
        job = obj_to_pickle(job)
    rds.set(job_key, job, ex=qkw['kwargs_ttl'])
    q.enqueue(bulkworker, job_id, **qkw)
    return job_id

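# pattern sketch (illustrative, not from the source): enqueue a redis key,
# not the payload. a minimal, self-contained version of the trick bulkload()
# relies on, assuming the `redis` and `rq` libraries and a local redis server;
# `process_job`, `enqueue_bulk`, and the key format are hypothetical names.
import json
import uuid

import redis
from rq import Queue

_rds = redis.Redis()
_q = Queue('bulk', connection=_rds)


def process_job(job_key):
    # worker side: pull the stashed payload back out of redis by its key.
    payload = json.loads(_rds.get(job_key))
    return len(payload['data'])


def enqueue_bulk(data, **kw):
    # stash the (potentially large) payload under a short-lived key ...
    job_id = uuid.uuid4().hex
    job_key = 'rq:kwargs:{}'.format(job_id)
    _rds.set(job_key, json.dumps({'data': data, 'kw': kw}), ex=120)
    # ... and enqueue only the key, so the queue entry stays tiny.
    _q.enqueue(process_job, job_key, result_ttl=60)
    return job_id
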
def get_status(user, job_id):
    """
    Get the status of a queued job.
    """
    # parse args.
    queue = request.args.get('queue')
    if not queue:
        raise RequestError(
            'You must pass in the queue name to fetch a job\'s status')
    if queue not in queues:
        raise RequestError('"{}" is not a valid queue.'.format(queue))
    q = queues.get(queue)
    job = q.fetch_job(job_id)
    if not job:
        raise RequestError('A job with ID {} does not exist'.format(job_id))

    # fetch metadata about this job from the session.
    # parse args.
    started = request.args.get('started')
    orig_url = request.args.get('orig_url')
    if started:
        started = dates.parse_iso(started)

    # format return value
    ret = {
        'job_id': job_id,
        'queue': queue,
        'status': None,
        'started': started,
        'orig_url': orig_url
    }

    # determine time since start
    if started:
        ret['time_since_start'] = (dates.now() - started).seconds

    # determine status
    if job.is_queued:
        ret['status'] = 'queued'
    if job.is_started:
        ret['status'] = 'running'
    if job.is_failed:
        ret['status'] = 'error'
        ret['message'] = "An unknown error occurred."
    if job.is_finished:
        rv = job.return_value
        # job will return true if successful
        if rv is True:
            ret['status'] = 'success'
        # job will return an error if unsuccessful
        else:
            ret['status'] = 'error'
            ret['message'] = rv.message
    return jsonify(ret)

def get_status(user, job_id):
    """
    Get the status of a queued job.
    """
    # parse args.
    queue = request.args.get('queue')
    if not queue:
        raise RequestError(
            'You must pass in the queue name to fetch a job\'s status')
    if queue not in queues:
        raise RequestError(
            '"{}" is not a valid queue.'
            .format(queue))
    q = queues.get(queue)
    job = q.fetch_job(job_id)
    if not job:
        raise RequestError(
            'A job with ID {} does not exist'
            .format(job_id))

    # fetch metadata about this job from the session.
    # parse args.
    started = request.args.get('started')
    orig_url = request.args.get('orig_url')
    if started:
        started = dates.parse_iso(started)

    # format return value
    ret = {
        'job_id': job_id,
        'queue': queue,
        'status': None,
        'started': started,
        'orig_url': orig_url
    }

    # determine time since start
    if started:
        ret['time_since_start'] = (dates.now() - started).seconds

    # determine status
    if job.is_queued:
        ret['status'] = 'queued'
    if job.is_started:
        ret['status'] = 'running'
    if job.is_failed:
        ret['status'] = 'error'
        ret['message'] = "An unknown error occurred."
    if job.is_finished:
        rv = job.return_value
        # job will return true if successful
        if rv is True:
            ret['status'] = 'success'
        # job will return an error if unsuccessful
        else:
            ret['status'] = 'error'
            ret['message'] = str(rv.message)
    return jsonify(ret)

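# status-mapping sketch (illustrative, not from the source). the same rq
# job-state translation as above, pulled out of the flask view so it can be
# reused or tested on its own; assumes an `rq.job.Job` instance and the
# return-value convention used by the bulk loaders (True on success, an
# exception-like object on failure).
def job_status(job):
    status, message = None, None
    if job.is_queued:
        status = 'queued'
    if job.is_started:
        status = 'running'
    if job.is_failed:
        status = 'error'
        message = 'An unknown error occurred.'
    if job.is_finished:
        rv = job.return_value
        # True means success; anything else is treated as an error object.
        if rv is True:
            status = 'success'
        else:
            status = 'error'
            message = str(rv)
    return {'status': status, 'message': message}
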
class BulkLoader(object):

    __module__ = 'newslynx.tasks.bulk'

    returns = None       # either "model" or "query"
    timeout = 1000       # seconds
    result_ttl = 60      # seconds
    kwargs_ttl = 1000    # in case there is a backup in the queue
    max_workers = 7
    concurrent = True
    kwargs_key = 'rq:kwargs:{}'
    q = queues.get('bulk')
    redis = rds

    def load_one(self, item, **kw):
        """
        The method to overwrite.
        """
        raise NotImplementedError

    def _load_one(self, item, **kw):
        """
        A wrapper which will catch errors and bubble them up.
        """
        try:
            return self.load_one(item, **kw)
        except Exception as e:
            return Exception(e.message)

    def _handle_errors(self, errors):
        if not isinstance(errors, list):
            errors = [errors]
        return RequestError(
            'There was an error while bulk uploading: {}'
            .format(errors[0].message))

    def load_all(self, kwargs_key):
        """
        Do the work.
        """
        start = time.time()
        try:
            # create a session specific to this task
            session = gen_session()

            # get the inputs from redis
            kwargs = self.redis.get(kwargs_key)
            if not kwargs:
                raise InternalServerError(
                    'An unexpected error occurred while processing bulk upload.')
            kwargs = pickle_to_obj(kwargs)
            data = kwargs.get('data')
            kw = kwargs.get('kw')

            # delete them
            self.redis.delete(kwargs_key)

            # map load_one over the data, concurrently or serially,
            # collecting outputs and errors as we go.
            outputs = []
            errors = []
            fx = partial(self._load_one, **kw)
            if self.concurrent:
                pool = Pool(min([len(data), self.max_workers]))
                for res in pool.imap_unordered(fx, data):
                    if isinstance(res, Exception):
                        errors.append(res)
                    else:
                        outputs.append(res)
            else:
                for item in data:
                    res = fx(item)
                    if isinstance(res, Exception):
                        errors.append(res)
                    else:
                        outputs.append(res)

            # return errors
            if len(errors):
                return self._handle_errors(errors)

            # add objects and execute
            if self.returns == 'model':
                for o in outputs:
                    if o is not None:
                        try:
                            session.add(o)
                            session.commit()
                        except Exception as e:
                            return self._handle_errors(e)

            # union all queries
            elif self.returns == 'query':
                for query in outputs:
                    if query is not None:
                        try:
                            session.execute(query)
                        except Exception as e:
                            return self._handle_errors(e)

            try:
                session.commit()
            except Exception as e:
                session.rollback()
                session.remove()
                return self._handle_errors(e)

            # return true if everything worked.
            session.close()
            return True

        except JobTimeoutException:
            end = time.time()
            return InternalServerError(
                'Bulk loading timed out after {} seconds'.format(end - start))

    def run(self, data, **kw):
        # store the data + kwargs in redis temporarily.
        # this makes the enqueuing process much, much more
        # efficient by allowing us to only pass a single key
        # into the queue rather than a massive dump of data.
        # however, it also means that all kwargs must be
        # pickle serializable.
        job_id = gen_uuid()
        kwargs_key = self.kwargs_key.format(job_id)
        kwargs = {'data': data, 'kw': kw}
        self.redis.set(kwargs_key, obj_to_pickle(kwargs), ex=self.kwargs_ttl)

        # send the job to the task queue
        self.q.enqueue(
            self.load_all, kwargs_key,
            job_id=job_id,
            timeout=self.timeout,
            result_ttl=self.result_ttl)
        return job_id
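

# subclass sketch (illustrative, not from the source). how a concrete loader
# might be declared against BulkLoader: override load_one(), pick a `returns`
# mode, and call run() with the raw records. `ContentItem` and the field names
# are hypothetical stand-ins, not actual newslynx models.
class ContentItemBulkLoader(BulkLoader):

    returns = 'model'   # load_one returns ORM objects to be session.add()-ed
    concurrent = True
    max_workers = 5

    def load_one(self, item, **kw):
        # validate / transform a single record into a model instance.
        return ContentItem(url=item['url'], title=item.get('title'), **kw)

# enqueue the whole payload; the returned job_id can then be polled
# via get_status():
# job_id = ContentItemBulkLoader().run(rows, org_id=1)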