def hdf2(hdf1):
    #taskset = TaskSet(tasks.hdf2geotiff(hdf1))
    taskset = TaskSet(tasks.hdf2geotiff.subtask((x, )) for x in hdf1)
    taskset_result = taskset.apply_async()
    results = taskset_result.join_native()
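# A minimal sketch (assumed, not part of the snippet above): TaskSet is the
# legacy Celery API; on Celery 3.x and later the same fan-out is usually
# written with `group`. `tasks.hdf2geotiff` is the task used above, while
# `hdf2_with_group` is a hypothetical name for illustration.
from celery import group

def hdf2_with_group(hdf1):
    # group(...)() dispatches all signatures at once and returns a GroupResult
    return group(tasks.hdf2geotiff.subtask((x, )) for x in hdf1)().join()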
def bulk_process(request): "Process a bulk form" if request.method == 'POST': form = BulkQuarantineProcessForm(request.POST) choices = request.session['quarantine_choices'] form.fields['message_id']._choices = choices if form.is_valid(): messages = Message.objects.values( 'id', 'from_address', 'date', 'hostname', 'to_address').filter(id__in=form.cleaned_data['message_id']) del form.cleaned_data['message_id'] formvals = [] for message in messages: message.update(form.cleaned_data) message['date'] = str(message['date']) message['message_id'] = message['id'] del message['id'] formvals.append(message) taskset = TaskSet(tasks=[ ProcessQuarantinedMsg.subtask( args=[formval], options=dict(queue=formval['hostname'])) for formval in formvals ]) task = taskset.apply_async() task.save() return HttpResponseRedirect( reverse('task-status', args=[task.taskset_id])) msg = _('System was unable to process your request') djmessages.info(request, msg) return HttpResponseRedirect( reverse('all-messages-index', args=['quarantine']))
def run(self, flavor, repo_path, data):
    backend = load_backend(flavor, repo_path, cache=False)
    out = StringIO()
    proto = ReceivableProtocol(StringIO(data).read, out.write)
    handler = _ReceivePackHandler(WebBackend(), [backend], proto,
                                  stateless_rpc=True)
    handler.handle()

    sync_tasks = []
    for oldrev, newrev, name in handler._good_refs:
        if name.startswith('refs/heads/'):
            branch = name[11:]
            sync_tasks.append(
                subtask(SyncTask,
                        args=[backend.flavor, backend.path,
                              oldrev, newrev, branch]))

    if sync_tasks:
        taskset = TaskSet(tasks=sync_tasks)
        taskset.apply_async().join()

    return out.getvalue(), handler._good_refs
def bags_migrate_s3(mongo_host='oulib_mongo'): #catalog db=MongoClient(mongo_host) #Celery Worker storage connections celery_worker_hostname = os.getenv('celery_worker_hostname', "dev-mstacy") celery_config=db.catalog.celery_worker_config.find_one({"celery_worker":celery_worker_hostname}) #get variable by celory worker norfile_bagit=celery_config['norfile']['bagit'] s3_bucket=celery_config['s3']['bucket'] subtasks=[] check_catalog=[] s3 = boto3.client('s3') for itm in db.catalog.bagit_inventory.find({"s3.exists":False}): #double check to make sure not already in s3 s3_key = s3.list_objects(Bucket=s3_bucket, Prefix=itm['bag'] ,MaxKeys=1) if not 'Contents' in s3_key: subtasks.append(upload_bag_s3.subtask(args=(itm['bag'],norfile_bagit))) else: check_catalog.append(itm['bag']) if subtasks: job = TaskSet(tasks=subtasks) result_set = job.apply_async() check=",".join(check_catalog) return "{0} subtasks('upload_bag_s3') submitted. Check Catalog: {1}".format(len(subtasks),check)
def update_user(user, requester): """ Fetch new weeks, or possibly those that failed before.""" # TODO: fail here if couldn't contact last.fm # Have to fetch the chart list from last.fm because their timestamps are awkward, especially # those on the first few charts released. chart_list = fetch_chart_list(user.username, requester) successful_requests = Update.objects.weeks_fetched(user) # create taskset and run it. update_tasks = [] updates = [] with transaction.commit_on_success(): for start, end in chart_list: idx = ldates.index_of_timestamp(end) # Skip if this week is before the user signed up if not idx < user.first_sunday_with_data: # skip if data has already been successfully fetched if (idx, Update.ARTIST) not in successful_requests: update = Update(user=user, week_idx=idx, type=Update.ARTIST) updates.append(update) update_tasks.append(fetch_week_data.subtask((user, requester, start, end, Update.ARTIST))) # if (idx, Update.TRACK) not in successful_requests: # Update.objects.create(user=user, week_idx=idx, type=Update.TRACK) # update_tasks.append(fetch_week_data.subtask((user, requester, start, end, Update.TRACK))) Update.objects.bulk_create(updates) ts = TaskSet(update_tasks) ts.apply_async() user.last_updated = date.today() user.save() return len(update_tasks) > 0
def test_interface__compat(self):
    warnings.resetwarnings()
    with catch_warnings(record=True) as log:
        ts = TaskSet(MockTask, [[(2, 2)], [(4, 4)], [(8, 8)]])
        self.assertTrue(log)
        self.assertIn("Using this invocation of TaskSet is deprecated",
                      log[0].message.args[0])
        self.assertListEqual(ts.tasks,
                             [MockTask.subtask((i, i)) for i in (2, 4, 8)])
        return ts

    # TaskSet.task (deprecated)
    with catch_warnings(record=True) as log:
        ts = TaskSet(MockTask, [[(2, 2)], [(4, 4)], [(8, 8)]])
        self.assertEqual(ts.task.name, MockTask.name)
        self.assertTrue(log)
        self.assertIn("TaskSet.task is deprecated", log[0].message.args[0])

    # TaskSet.task_name (deprecated)
    with catch_warnings(record=True) as log:
        ts = TaskSet(MockTask, [[(2, 2)], [(4, 4)], [(8, 8)]])
        self.assertEqual(ts.task_name, MockTask.name)
        self.assertTrue(log)
        self.assertIn("TaskSet.task_name is deprecated",
                      log[0].message.args[0])
def bulk_process(request): "Process a bulk form" if request.method == 'POST': form = BulkQuarantineProcessForm(request.POST) choices = request.session['quarantine_choices'] form.fields['message_id']._choices = choices if form.is_valid(): messages = Message.objects.values('id', 'from_address', 'date', 'hostname', 'to_address').filter(id__in=form.cleaned_data['message_id']) del form.cleaned_data['message_id'] formvals = [] for message in messages: message.update(form.cleaned_data) message['date'] = str(message['date']) message['message_id'] = message['id'] del message['id'] formvals.append(message) taskset = TaskSet(tasks=[ProcessQuarantinedMsg.subtask( args=[formval], options=dict(queue=formval['hostname'])) for formval in formvals]) task = taskset.apply_async() task.save() return HttpResponseRedirect(reverse('task-status', args=[task.taskset_id])) msg = _('System was unable to process your request') djmessages.info(request, msg) return HttpResponseRedirect(reverse('all-messages-index', args=['quarantine']))
def update_documents(self, documents, count): sys.stdout.write('Graph size is {0:d} nodes.\n'.format(count)) sys.stdout.flush() processed_count = 0 subtasks = [] timings = [] average_per_s = 0 if self.index == 'concurrently': index_during_subtask = True else: index_during_subtask = False for doc in documents: processed_count += 1 subtasks.append( update_document.subtask((doc, index_during_subtask))) if processed_count % 1000 == 1: t1 = time.time() if processed_count % 1000 == 0: t2 = time.time() timings.append(t2 - t1) average_per_s = 1000 / (sum(timings) / float(len(timings))) sys.stdout.write( "\rProcessing items in Celery queue: {:.0%} ({}/{}, {:.1f}/s, Last id: {})" .format( processed_count * 1.0 / count, processed_count, count, average_per_s, doc.pk, )) sys.stdout.flush() last_document = (count == processed_count) if (processed_count % 50 == 0) or last_document: # Every 50 documents, we send the subtasks off for processing # Poll to see when they're done. job = TaskSet(tasks=subtasks) result = job.apply_async() while not result.ready(): time.sleep(1) # The jobs finished - clean things up for the next round subtasks = [] if self.index == 'all_at_end': call_command( 'cl_update_index', '--type', 'opinions', '--solr-url', settings.SOLR_OPINION_URL, '--noinput', '--update', '--everything', '--do-commit', ) elif self.index == 'False': sys.stdout.write("Solr index not updated after running citation " "finder. You may want to do so manually.")
def map_reduce(task, task_args, agg, acc):
    """
    Given a task and an iterable of positional arguments, apply the task
    function to the arguments in parallel and return an aggregate result
    depending on the initial value of the accumulator and on the
    aggregation function. To save memory, the order is not preserved and
    there is no list of intermediate results: the accumulator is updated
    as soon as a task result comes in.

    NB: if the environment variable OQ_NO_DISTRIBUTE is set, the tasks are
    run sequentially in the current process and then
    map_reduce(task, task_args, agg, acc) is the same as
    reduce(agg, itertools.starmap(task, task_args), acc).

    Users of map_reduce should be aware that when thousands of tasks are
    spawned and large arguments are passed or large results are returned,
    they may run into memory issues: this is why the calculators limit the
    queue with the `concurrent_task` concept.

    :param task: a `celery` task callable
    :param task_args: an iterable over positional arguments
    :param agg: the aggregation function, (acc, val) -> new acc
    :param acc: the initial value of the accumulator
    :returns: the final value of the accumulator
    """
    if no_distribute():
        for the_args in task_args:
            result, exctype = safely_call(task.task_func, the_args)
            if exctype:
                raise RuntimeError(result)
            acc = agg(acc, result)
    else:
        backend = current_app().backend
        unpik = 0
        job_id = task_args[0][0]
        taskname = task.__name__
        mon = LightMonitor("unpickling %s" % taskname, job_id, task)
        to_send = 0
        pickled_args = []
        for args in task_args:
            piks = pickle_sequence(args)
            pickled_args.append(piks)
            to_send += sum(len(p) for p in piks)
        logs.LOG.info("Sending %dM", to_send / ONE_MB)
        taskset = TaskSet(tasks=map(task.subtask, pickled_args))
        for task_id, result_dict in taskset.apply_async().iter_native():
            check_mem_usage()  # log a warning if too much memory is used
            result_pik = result_dict["result"]
            with mon:
                result, exctype = result_pik.unpickle()
            if exctype:
                raise RuntimeError(result)
            unpik += len(result_pik)
            acc = agg(acc, result)
            del backend._cache[task_id]  # work around a celery bug
        logs.LOG.info("Unpickled %dM of received data in %s seconds",
                      unpik / ONE_MB, mon.duration)
    return acc
def update_documents(self, documents, count): sys.stdout.write('Graph size is {0:d} nodes.\n'.format(count)) sys.stdout.flush() processed_count = 0 subtasks = [] timings = [] average_per_s = 0 if self.index == 'concurrently': index_during_subtask = True else: index_during_subtask = False for doc in documents: processed_count += 1 if processed_count % 10000 == 0: # Send the commit every 10000 times. self.si.commit() subtasks.append(update_document.subtask((doc, index_during_subtask))) if processed_count % 1000 == 1: t1 = time.time() if processed_count % 1000 == 0: t2 = time.time() timings.append(t2 - t1) average_per_s = 1000 / (sum(timings) / float(len(timings))) sys.stdout.write("\rProcessing items in Celery queue: {:.0%} ({}/{}, {:.1f}/s, Last id: {})".format( processed_count * 1.0 / count, processed_count, count, average_per_s, doc.pk, )) sys.stdout.flush() last_document = (count == processed_count) if (processed_count % 50 == 0) or last_document: # Every 5000 documents, we send the subtasks off for processing # Poll to see when they're done. job = TaskSet(tasks=subtasks) result = job.apply_async() while not result.ready(): time.sleep(1) # The jobs finished - clean things up for the next round subtasks = [] if self.index == 'all_at_end': call_command( 'cl_update_index', '--type', 'opinions', '--solr-url', settings.SOLR_OPINION_URL, '--noinput', '--update', '--everything', '--do-commit', ) elif self.index == 'false': sys.stdout.write("Solr index not updated after running citation " "finder. You may want to do so manually.")
def dispatch_image(**params):
    the_subtasks = []
    the_subtasks.append(McAllisterAnaglyphTask.subtask(**params))
    the_subtasks.append(McAllisterAnaglyphTask.subtask(**params))
    the_subtasks.append(McAllisterAnaglyphTask.subtask(**params))
    the_subtasks.append(McAllisterAnaglyphTask.subtask(**params))
    the_subtasks.append(McAllisterAnaglyphTask.subtask(**params))
    job = TaskSet(tasks=the_subtasks)
    result = job.apply_async()
    return result
def SiteMaintenance():
    session = DBSession()
    categories = session.query(Category)
    subtasks = [UpdateFromFeed.subtask((cat.feedurl, cat.idcategory))
                for cat in categories]
    subtasks.append(UpdateDlpStats.subtask())
    maintenanceJobs = TaskSet(tasks=subtasks)
    maintenanceJobs.apply_async().join()
    UpdateIndex.delay()
def get_multiple_clients_daily_rate(client_ids):
    tasks = []
    for client_id in client_ids:
        y = get_clients_daily_rate.subtask((client_id, ))
        tasks.append(y)
    job = TaskSet(tasks=tasks)
    result = job.apply_async()
    result = result.join()
    return result
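# A minimal usage sketch (assumed, not from the original source): fan out one
# rate lookup per client and collect the joined results; the client ids here
# are made-up illustration values.
daily_rates = get_multiple_clients_daily_rate([101, 102, 103])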
def fetch_feeds(callback=None):
    logging.warn("updating all feeds")
    feeds = Feed.objects.all()
    task_list = []
    for feed in feeds:
        s = fetch_feed.subtask([feed.id])
        task_list.append(s)
    fetch_all_tasks = TaskSet(tasks=task_list)
    fetch_all_tasks.apply_async()
    return None
def _exec_callbacks(callback):
    """ Exec the callback or list of callbacks. Return asynchronous results
    as a TaskSetResult object. """
    async_result = None
    if callback:
        if not isinstance(callback, (list, tuple)):  # not iterable
            callback = [callback, ]
        taskset = TaskSet(tasks=callback)
        async_result = taskset.apply_async()
    return async_result
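# A minimal usage sketch (assumed, not from the original source):
# _exec_callbacks accepts either a single subtask or a list of them.
# `notify_done` and `job_id` are hypothetical names used only for illustration.
single = _exec_callbacks(notify_done.subtask((job_id, )))
several = _exec_callbacks([notify_done.subtask((job_id, )),
                           notify_done.subtask((job_id + 1, ))])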
def test_apply(self):
    applied = [0]

    class mocksubtask(subtask):

        def apply(self, *args, **kwargs):
            applied[0] += 1

    ts = TaskSet([mocksubtask(MockTask, (i, i)) for i in (2, 4, 8)])
    ts.apply()
    self.assertEqual(applied[0], 3)
def make_pi_tasks():
    taskset = TaskSet(tasks.make_pi.subtask((x, )) for x in NUM_CALCS)
    print "Dispatching tasks"
    taskset_result = taskset.apply_async()
    print "Waiting for results"
    results = taskset_result.join_native()
    print "Results:"
    for i in results:
        print i
def _run(fn_name, xs):
    fn = getattr(tasks, fn_name)
    job = TaskSet(tasks=[apply(fn.subtask, (x,)) for x in xs])
    result = job.apply_async()
    while not result.ready():
        time.sleep(5)
    out = []
    for x in result.join():
        if x:
            out.extend(x)
    return out
def test_apply(self):
    applied = [0]

    class mocksubtask(Signature):

        def apply(self, *args, **kwargs):
            applied[0] += 1

    ts = TaskSet([mocksubtask(MockTask, (i, i)) for i in (2, 4, 8)])
    ts.apply()
    self.assertEqual(applied[0], 3)
def run(self, set, body, interval=1, max_retries=None, **kwargs):
    if not isinstance(set, TaskSet):
        set = TaskSet(set)
    r = []
    setid = gen_unique_id()
    for task in set.tasks:
        uuid = gen_unique_id()
        task.options.update(task_id=uuid, chord=body)
        r.append(current_app.AsyncResult(uuid))
    current_app.TaskSetResult(setid, r).save()
    self.backend.on_chord_apply(setid, body, interval, max_retries)
    return set.apply_async(taskset_id=setid)
def update_documents(self, documents, count): sys.stdout.write('Graph size is {0:d} nodes.\n'.format(count)) sys.stdout.flush() processed_count = 0 subtasks = [] timings = [] average_per_s = 0 if self.index == 'concurrently': index_during_subtask = True else: index_during_subtask = False for doc in documents: processed_count += 1 if processed_count % 10000 == 0: # Send the commit every 10000 times. self.si.commit() subtasks.append(update_document.subtask((doc, index_during_subtask))) if processed_count % 1000 == 1: t1 = time.time() if processed_count % 1000 == 0: t2 = time.time() timings.append(t2 - t1) average_per_s = 1000 / (sum(timings) / float(len(timings))) sys.stdout.write("\rProcessing items in Celery queue: {:.0%} ({}/{}, {:.1f}/s)".format( processed_count * 1.0 / count, processed_count, count, average_per_s )) sys.stdout.flush() last_document = (count == processed_count) if (processed_count % 500 == 0) or last_document: # Every 500 documents, we send the subtasks off for processing # Poll to see when they're done. job = TaskSet(tasks=subtasks) result = job.apply_async() while not result.ready(): time.sleep(0.5) # The jobs finished - clean things up for the next round subtasks = [] if self.index == 'all_at_end': call_command( 'cl_update_index', update_mode=True, everything=True, solr_url='http://127.0.0.1:8983/solr/collection1' ) elif self.index == 'false': sys.stdout.write("Solr index not updated after running citation " "finder. You may want to do so manually.")
def _exec_callbacks(callback):
    """ Exec the callback or list of callbacks. Return asynchronous results
    as a TaskSetResult object. """
    async_result = None
    if callback:
        if not isinstance(callback, (list, tuple)):  # not iterable
            callback = [
                callback,
            ]
        taskset = TaskSet(tasks=callback)
        async_result = taskset.apply_async()
    return async_result
def proccesing_pictrures(shop_id, **kwargs):
    """ Task for processing offer pictures """
    picture_urls = {}
    tasks_image = []
    for pict_url in db.Offers.find(spec={'shopId': shop_id},
                                   fields=['id', 'picture'], slave_ok=True):
        picture_urls[pict_url['id']] = pict_url['picture']
        tasks_image.append(download_image.subtask(url=pict_url['picture'],
                                                  id=pict_url['id']))
    job = TaskSet(tasks=tasks_image)
    result = job.apply_async()  # connection, connect_timeout, publisher, taskset_id
    result.wait()
    result.join()
def _chunk_queryset_into_tasks(self, items, count, chunksize=5000, bundle_size=250): """Chunks the queryset passed in, and dispatches it to Celery for adding to the index. Potential performance improvements: - Postgres is quiescent when Solr is popping tasks from Celery, instead, it should be fetching the next 1,000 - The wait loop (while not result.ready()) polls for the results, at a 1s interval. Could this be reduced or somehow eliminated while keeping Celery's tasks list from running away? """ processed_count = 0 subtasks = [] item_bundle = [] for item in items: last_item = (count == processed_count + 1) if self.verbosity >= 2: self.stdout.write('Indexing item %s' % item.pk) item_bundle.append(item) if (processed_count % bundle_size == 0) or last_item: # Every bundle_size documents we create a subtask subtasks.append( add_or_update_items.subtask((item_bundle, self.solr_url))) item_bundle = [] processed_count += 1 if (processed_count % chunksize == 0) or last_item: # Every chunksize items, we send the subtasks for processing job = TaskSet(tasks=subtasks) result = job.apply_async() while not result.ready(): time.sleep(1) subtasks = [] if (processed_count % 50000 == 0) or last_item: # Do a commit every 50000 items, for good measure. self.stdout.write("...running commit command...") self.si.commit() sys.stdout.write("\rProcessed {}/{} ({:.0%})".format( processed_count, count, processed_count * 1.0 / count, )) self.stdout.flush() self.stdout.write('\n')
def run(self, content_producer_pk):
    content = ContentProducer.objects.get(pk=content_producer_pk)
    self.text = content.text_body
    conditions = content.conditions.all()
    end_execution = datetime.now() + timedelta(seconds=15)
    users = SimpleUser.objects.all()
    # Build the query with the given conditions
    for condition in conditions:
        filters = Q(**{condition.field: condition.value})
        users = users.filter(filters)
    jobs = TaskSet(tasks=[send_sms.subtask((content.text_body, user,),
                                           expires=end_execution)
                          for user in users])
    jobs.apply_async()
def _refinery_file_import(analysis_uuid): """ Check on the status of the files being imported into Refinery. Fail the task appropriately if we cannot retrieve the status. """ analysis = _get_analysis(analysis_uuid) analysis_status = _get_analysis_status(analysis_uuid) if not analysis_status.refinery_import_task_group_id: logger.info("Starting analysis '%s'", analysis) analysis.set_status(Analysis.RUNNING_STATUS) logger.info("Starting input file import tasks for analysis '%s'", analysis) refinery_import_taskset = TaskSet( tasks=analysis.get_refinery_import_task_signatures()).apply_async( ) refinery_import_taskset.save() analysis_status.refinery_import_task_group_id = \ refinery_import_taskset.taskset_id analysis_status.save() run_analysis.retry(countdown=RETRY_INTERVAL) # check if all files were successfully imported into Refinery refinery_import_taskset = get_taskset_result( analysis_status.refinery_import_task_group_id) if not refinery_import_taskset.ready(): logger.debug("Input file import pending for analysis '%s'", analysis) run_analysis.retry(countdown=RETRY_INTERVAL) elif not refinery_import_taskset.successful(): error_msg = "Analysis '{}' failed during file import".format(analysis) logger.error(error_msg) analysis.set_status(Analysis.FAILURE_STATUS, error_msg) analysis.send_email() refinery_import_taskset.delete()
def test_apply_async(self): applied = [0] class mocksubtask(Signature): def apply_async(self, *args, **kwargs): applied[0] += 1 ts = TaskSet([mocksubtask(MockTask, (i, i)) for i in (2, 4, 8)]) ts.apply_async() self.assertEqual(applied[0], 3) class Publisher(object): def send(self, *args, **kwargs): pass ts.apply_async(publisher=Publisher()) # setting current_task @current_app.task def xyz(): pass from celery._state import _task_stack xyz.push_request() _task_stack.push(xyz) try: ts.apply_async(publisher=Publisher()) finally: _task_stack.pop() xyz.pop_request()
def startDownload(request):
    """ Ajax call to start the download """
    if request.user.is_authenticated():
        if request.method == 'POST':
            profile = request.user.profile
            if profile.stage > 0:
                if profile.stage < 3:
                    result = TaskSetResult.restore(profile.task_id)
                    response_data = {
                        "error": "download already started",
                        "stage": profile.stage,
                        "completed": result.completed_count(),
                        "total": result.total,
                    }
                else:
                    response_data = {
                        "error": "download already finished",
                        "stage": profile.stage,
                        "state": "completed",
                    }
            else:
                graphapi = facebook.GraphAPI(profile.fblogin.access_token)
                me = graphapi.get_object('me')
                friends = [(f['id'], f['name'])
                           for f in graphapi.get_connections('me', 'friends')['data']]
                friends.append((me['id'], me['name']))
                subtasks = [tasks.dlUser.subtask((profile.id, graphapi, fbid, name))
                            for (fbid, name) in friends]
                result = TaskSet(tasks=subtasks).apply_async()
                result.save()
                profile.stage = 1
                profile.task_id = result.taskset_id
                profile.save()
                r = tasks.checkTaskSet.delay(result, profile.id)
                response_data = {
                    "stage": 1,
                    "completed": result.completed_count(),
                    "total": result.total,
                }
        else:
            response_data = {"error": "must be a post request"}
    else:
        response_data = {"error": "user must be logged in"}
    return HttpResponse(json.dumps(response_data), mimetype="application/json")
def _run(fn_name, xs):
    fn = getattr(tasks, fn_name)
    job = TaskSet(tasks=[apply(fn.subtask, (x,)) for x in xs])
    result = job.apply_async()
    out = []
    if wait:
        with _close_taskset(result):
            while not result.ready():
                time.sleep(5)
            if result.failed():
                raise ValueError("Failed distributed task; cleaning up")
            for x in result.join():
                if x:
                    out.extend(x)
    return out
def checkTaskSet(result, profile_id):
    if result.ready():
        profile = Profile.objects.get(id=profile_id)
        pages = profile.getActivePages()
        numpeople = profile.getActivePeople().count()
        subtasks = [calcPMIs.subtask((profile_id, page.id, numpeople))
                    for page in pages]
        r = TaskSet(tasks=subtasks).apply_async()
        r.save()
        r2 = checkPMISet.delay(r, profile.id)
        profile.stage = 2
        profile.task_id = r.taskset_id
        profile.save()
    else:
        checkTaskSet.retry(countdown=15, max_retries=100)
def testDl(profile_id):
    profile = Profile.objects.get(id=profile_id)
    graphapi = facebook.GraphAPI(profile.fblogin.access_token)
    me = graphapi.get_object('me')
    friends = [(f['id'], f['name'])
               for f in graphapi.get_connections('me', 'friends')['data']]
    friends.append((me['id'], me['name']))
    subtasks = [dlUser.subtask((profile.id, graphapi, fbid, name))
                for (fbid, name) in friends]
    result = TaskSet(tasks=subtasks).apply_async()
    result.save()
    profile.stage = 1
    profile.task_id = result.taskset_id
    profile.save()
    r = checkTaskSet.delay(result, profile.id)
    return result
def _chunk_queryset_into_tasks(self, items, count, chunksize=5000, bundle_size=250): """Chunks the queryset passed in, and dispatches it to Celery for adding to the index. Potential performance improvements: - Postgres is quiescent when Solr is popping tasks from Celery, instead, it should be fetching the next 1,000 - The wait loop (while not result.ready()) polls for the results, at a 1s interval. Could this be reduced or somehow eliminated while keeping Celery's tasks list from running away? """ processed_count = 0 subtasks = [] item_bundle = [] for item in items: last_item = (count == processed_count + 1) if self.verbosity >= 2: self.stdout.write('Indexing item %s' % item.pk) item_bundle.append(item) if (processed_count % bundle_size == 0) or last_item: # Every bundle_size documents we create a subtask subtasks.append(add_or_update_items.subtask( (item_bundle, self.solr_url))) item_bundle = [] processed_count += 1 if (processed_count % chunksize == 0) or last_item: # Every chunksize items, we send the subtasks for processing job = TaskSet(tasks=subtasks) result = job.apply_async() while not result.ready(): time.sleep(1) subtasks = [] if (processed_count % 50000 == 0) or last_item: # Do a commit every 50000 items, for good measure. self.stdout.write("...running commit command...") self.si.commit() sys.stdout.write("\rProcessed {}/{} ({:.0%})".format( processed_count, count, processed_count * 1.0 / count, )) self.stdout.flush() self.stdout.write('\n')
def index_all_profiles():
    # Get an es object, delete index and re-create it
    es = get_es(timeout=settings.ES_INDEXING_TIMEOUT)
    mappings = {
        'mappings': {
            UserProfileMappingType.get_mapping_type_name():
                UserProfileMappingType.get_mapping()
        }
    }

    def _recreate_index(index):
        es.indices.delete(index=index, ignore=[400, 404])
        es.indices.create(index, body=mappings)

    _recreate_index(settings.ES_INDEXES['default'])
    _recreate_index(settings.ES_INDEXES['public'])

    # mozillians index
    ids = UserProfile.objects.complete().values_list('id', flat=True)
    ts = [
        index_objects.subtask(args=[UserProfileMappingType, chunk, 150, False])
        for chunk in chunked(sorted(list(ids)), 150)
    ]
    # public index
    ts += [
        index_objects.subtask(args=[UserProfileMappingType, chunk, 150, True])
        for chunk in chunked(sorted(list(ids)), 150)
    ]
    TaskSet(ts).apply_async()
def index_all_profiles():
    # Get an es object, delete index and re-create it
    index = settings.ES_INDEXES['default']
    es = get_es(timeout=settings.ES_INDEXING_TIMEOUT)
    try:
        es.delete_index_if_exists(index)
    except pyes.exceptions.IndexMissingException:
        pass
    mappings = {
        'mappings': {
            UserProfile._meta.db_table: UserProfile.get_mapping()
        }
    }
    es.create_index(index, settings=mappings)
    ids = (UserProfile.objects.exclude(full_name='')
                              .values_list('id', flat=True))
    ts = [
        tasks.index_objects.subtask(args=[UserProfile, chunk])
        for chunk in chunked(sorted(list(ids)), 150)
    ]
    TaskSet(ts).apply_async()
def addon_total_contributions():
    addons = Addon.objects.values_list('id', flat=True)
    ts = [
        tasks.addon_total_contributions.subtask(args=chunk)
        for chunk in chunked(addons, 100)
    ]
    TaskSet(ts).apply_async()
def unsubscribe_from_basket(modeladmin, request, queryset):
    """Unsubscribe from Basket."""
    ts = [(unsubscribe_from_basket_task
           .subtask(args=[userprofile.user.email, [newsletter]]))
          for userprofile in queryset]
    TaskSet(ts).apply_async()
    messages.success(request, 'Basket update started.')
def index_all_profiles():
    ids = (UserProfile.objects.values_list('id', flat=True))
    ts = [
        tasks.index_objects.subtask(args=[UserProfile, chunk])
        for chunk in chunked(sorted(list(ids)), 150)
    ]
    TaskSet(ts).apply_async()
def unsubscribe_from_basket(modeladmin, request, queryset):
    """Unsubscribe from Basket."""
    ts = [(mozillians.users.tasks.remove_from_basket_task
           .subtask(args=[userprofile.user.email, userprofile.basket_token]))
          for userprofile in queryset]
    TaskSet(ts).apply_async()
    messages.success(request, 'Basket update started.')
def reindex_users(index=None):
    from . import tasks
    ids = UserProfile.objects.values_list('id', flat=True)
    taskset = [tasks.index_users.subtask(args=[chunk],
                                         kwargs=dict(index=index))
               for chunk in chunked(sorted(list(ids)), 150)]
    TaskSet(taskset).apply_async()
def subscribe_to_basket(modeladmin, request, queryset):
    """Subscribe to Basket or update details of already subscribed."""
    ts = [(mozillians.users.tasks.update_basket_task
           .subtask(args=[userprofile.id]))
          for userprofile in queryset]
    TaskSet(ts).apply_async()
    messages.success(request, 'Basket update started.')
def index_all_profiles(): # Get an es object, delete index and re-create it es = get_es(timeout=settings.ES_INDEXING_TIMEOUT) mappings = {'mappings': {UserProfile._meta.db_table: UserProfile.get_mapping()}} def _recreate_index(index): try: es.delete_index_if_exists(index) except pyes.exceptions.IndexMissingException: pass es.create_index(index, settings=mappings) _recreate_index(settings.ES_INDEXES['default']) _recreate_index(settings.ES_INDEXES['public']) # mozillians index ids = UserProfile.objects.complete().values_list('id', flat=True) ts = [index_objects.subtask(args=[UserProfile, chunk, False]) for chunk in chunked(sorted(list(ids)), 150)] # public index ids = (UserProfile.objects.complete().public_indexable() .privacy_level(PUBLIC).values_list('id', flat=True)) ts += [index_objects.subtask(args=[UserProfile, chunk, True]) for chunk in chunked(sorted(list(ids)), 150)] TaskSet(ts).apply_async()
def update_user_ratings():
    """Update add-on author's ratings."""
    cursor = connections[multidb.get_slave()].cursor()
    # We build this query ahead of time because the cursor complains about
    # data truncation if it does the parameters.  Also, this query is
    # surprisingly quick, <1sec for 6100 rows returned
    q = """ SELECT
                addons_users.user_id as user_id,
                AVG(rating) as avg_rating
            FROM reviews
                INNER JOIN versions
                INNER JOIN addons_users
                INNER JOIN addons
            ON reviews.version_id = versions.id
                AND addons.id = versions.addon_id
                AND addons_users.addon_id = addons.id
            WHERE reviews.reply_to IS NULL
                AND reviews.rating > 0
                AND addons.status IN (%s)
            GROUP BY addons_users.user_id
            """ % (",".join(map(str, VALID_STATUSES)))
    cursor.execute(q)
    d = cursor.fetchall()
    cursor.close()

    ts = [update_user_ratings_task.subtask(args=[chunk])
          for chunk in chunked(d, 1000)]
    TaskSet(ts).apply_async()
def get_clients_with_prepaid_accounts(): sql = text(""" SELECT DISTINCT(leads_id) FROM subcontractors WHERE prepaid = 'yes' """) conn = engine.connect() r = conn.execute(sql).fetchall() client_ids = [] for x in r: client_ids.append(x[0]) #retrieve leads_id of timesheet records for the past 90 days #this is to prevent subcontractors record recently terminated and does not show up on adjustment now = get_ph_time() prev_date = now - timedelta(days=90) now_string = now.strftime('%F %T') prev_date_string = prev_date.strftime('%F %T') sql = text(""" SELECT DISTINCT(t.leads_id) FROM timesheet AS t JOIN subcontractors AS s ON t.leads_id = s.leads_id WHERE s.prepaid = 'yes' AND month_year BETWEEN :prev_date_string AND :now_string """) r = conn.execute(sql, prev_date_string=prev_date_string, now_string=now_string) for x in r: client_id = x[0] if client_id not in client_ids: client_ids.append(client_id) tasks = [] for client_id in client_ids: y = get_client_details.subtask((client_id, )) tasks.append(y) job = TaskSet(tasks=tasks) result = job.apply_async() data = result.join() conn.close() return data
def chord(set, body, interval=1, max_retries=None, propagate=False, **kwargs):
    if not isinstance(set, TaskSet):
        set = TaskSet(set)
    r = []
    setid = uuid()
    for task in set.tasks:
        tid = uuid()
        task.options.update(task_id=tid, chord=body)
        r.append(app.AsyncResult(tid))
    app.backend.on_chord_apply(setid, body,
                               interval=interval,
                               max_retries=max_retries,
                               propagate=propagate,
                               result=r)
    set.apply_async(taskset_id=setid)
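# A minimal sketch (assumed, not from the original source): the helper above
# mirrors what newer Celery ships as the `chord` primitive, where a header
# group is followed by a body callback. `header_tasks` and `body_task` are
# placeholder signatures used only for illustration.
from celery import chord as celery_chord

result = celery_chord(header_tasks)(body_task)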
def apply_async(self):
    tasks = []
    for node in self.children:
        func = node.func
        args = node.args
        kwargs = node.kwargs
        callback = kwargs.pop('callback', [])
        if not isinstance(callback, (list, tuple)):
            callback = [callback]
        subtasks = node._get_child_tasks()
        callback += subtasks
        kwargs = dict(callback=callback, **kwargs)
        task = func.subtask(args=args, kwargs=kwargs)
        tasks.append(task)
    taskset = TaskSet(tasks)
    result = taskset.apply_async()
    return result
def invalidate_users(**kw):
    """Invalidate all users to reflect latest Marketplace access whitelist."""
    from amo.utils import chunked
    from users.models import UserProfile
    log.info('Invalidating users for access whitelist.')
    d = UserProfile.objects.values_list('id', flat=True)
    ts = [_invalidate_users.subtask(args=[chunk]) for chunk in chunked(d, 100)]
    TaskSet(ts).apply_async()
def reindex_collections(index=None):
    from . import tasks
    ids = (Collection.objects.exclude(type=amo.COLLECTION_SYNCHRONIZED)
           .values_list('id', flat=True))
    taskset = [tasks.index_collections.subtask(args=[chunk],
                                               kwargs=dict(index=index))
               for chunk in chunked(sorted(list(ids)), 150)]
    TaskSet(taskset).apply_async()
def apply_async(self):
    tasks = []
    for node in self.children:
        func = node.func
        args = node.args
        kwargs = node.kwargs
        callback = kwargs.pop('callback', [])
        if not isinstance(callback, (list, tuple)):
            callback = [callback]
        subtasks = node._get_child_tasks()
        callback += subtasks
        kwargs = dict(callback=callback, **kwargs)
        _task = func.subtask(args=args, kwargs=kwargs)
        tasks.append(_task)
    taskset = TaskSet(tasks)
    result = taskset.apply_async()
    return result
def reindex_addons(index=None, addon_type=None):
    from . import tasks
    ids = Addon.unfiltered.values_list('id', flat=True)
    if addon_type:
        ids = ids.filter(type=addon_type)
    ts = [tasks.index_addons.subtask(args=[chunk], kwargs=dict(index=index))
          for chunk in chunked(sorted(list(ids)), 150)]
    TaskSet(ts).apply_async()
def test_apply_async(self):
    applied = [0]

    class mocksubtask(subtask):

        def apply_async(self, *args, **kwargs):
            applied[0] += 1

    ts = TaskSet([mocksubtask(MockTask, (i, i)) for i in (2, 4, 8)])
    ts.apply_async()
    self.assertEqual(applied[0], 3)

    class Publisher(object):

        def send(self, *args, **kwargs):
            pass

    ts.apply_async(publisher=Publisher())
def test_apply_async(self): applied = [0] class mocksubtask(Signature): def apply_async(self, *args, **kwargs): applied[0] += 1 ts = TaskSet([mocksubtask(MockTask, (i, i)) for i in (2, 4, 8)]) ts.apply_async() self.assertEqual(applied[0], 3) class Publisher(object): def send(self, *args, **kwargs): pass ts.apply_async(publisher=Publisher()) # setting current_task @current_app.task def xyz(): pass from celery.app.state import _tls _tls.current_task = xyz try: ts.apply_async(publisher=Publisher()) finally: _tls.current_task = None xyz.request.clear()
def run(self, flavor, repo_path, data):
    backend = load_backend(flavor, repo_path, cache=False)
    out = StringIO()
    proto = ReceivableProtocol(StringIO(data).read, out.write)
    handler = _ReceivePackHandler(WebBackend(), [backend], proto,
                                  stateless_rpc=True)
    handler.handle()

    sync_tasks = []
    for oldrev, newrev, name in handler._good_refs:
        if name.startswith('refs/heads/'):
            branch = name[11:]
            sync_tasks.append(subtask(SyncTask,
                                      args=[backend.flavor, backend.path,
                                            oldrev, newrev, branch]))

    if sync_tasks:
        taskset = TaskSet(tasks=sync_tasks)
        taskset.apply_async().join()

    return out.getvalue(), handler._good_refs
def run_jobs_update(category_parser=CategoryParser, jobs_parser=JobsParser):
    # allow mocking of parsers
    Province(search_id=1, name='All').save()
    Province(search_id=2, name='Gauteng').save()
    Province(search_id=5, name='Western Cape').save()
    Province(search_id=6, name='KZN').save()
    Province(search_id=-1, name='Limpopo').save()
    Province(search_id=-2, name='Mpumalanga').save()
    Province(search_id=-3, name='Free State').save()
    Province(search_id=-4, name='Northern Cape').save()
    Province(search_id=-5, name='Eastern Cape').save()
    Province(search_id=-6, name='North West').save()
    now = datetime.now()
    taskset = TaskSet(
        queue_categories.subtask(
            (province.search_id, category_parser, jobs_parser, ),
            options={'eta': now + timedelta(seconds=10 * i)})
        for i, province in enumerate(Province.objects.all())
        if province.search_id > 0)
    return taskset.apply_async()
def post_save_handler( sender, instance=False, **kwargs): try: logging.debug("Prestoprime POST save handler %s %s" % (instance, instance.file)) # Record Additional Usage if instance.name is not None and not instance.initial_usage_recorded and instance.name.endswith(".mxf"): logging.debug("Prestoprime POST save handler %s id:'%s' " % (instance,instance.id)) mxfframecounttask = mxfframecount.subtask([[instance.id],[]]) #count = result["frames"] #logging.info("Recording ingest usage %s " % (count)) #usage_store.record(instance.id,pp_mxfframe_ingested_metric,count) job = Job(name="Prestoprime Tasks",mfile=instance) job.save() output = JobOutput(name="Job Output",job=job,mimetype="text/plain") output.save() ## Do Post Processing mfiled10check = MFileD10Check(mfile=instance) temp_handle = StringIO() temp_handle.seek(0) suf = SimpleUploadedFile("checkfile", temp_handle.read()) mfiled10check.checkfile.save(suf.name+'.txt', suf, save=False) mfiled10check.save() d10mxfchecksumtask = d10mxfchecksum.subtask([[instance.id],[output]]) ts = TaskSet(tasks=[d10mxfchecksumtask,mxfframecounttask]) tsr = ts.apply_async() tsr.save() job.taskset_id=tsr.taskset_id job.save() logging.debug("Prestoprime task %s " % task ) except Exception as e: logging.error("Prestoprime POST save handler failed %s " % e)
def _chunk_queryset_into_tasks(self, items, count, chunksize=5000, bundle_size=250): """Chunks the queryset passed in, and dispatches it to Celery for adding to the index. Potential performance improvements: - Postgres is quiescent when Solr is popping tasks from Celery, instead, it should be fetching the next 1,000 """ processed_count = 0 subtasks = [] item_bundle = [] for item in items: last_item = (count == processed_count + 1) if self.verbosity >= 2: self.stdout.write('Indexing item %s' % item.pk) item_bundle.append(item) if (len(item_bundle) >= bundle_size) or last_item: # Every bundle_size documents we create a subtask subtasks.append( add_or_update_items.subtask((item_bundle, self.solr_url)) ) item_bundle = [] processed_count += 1 if (len(subtasks) >= chunksize) or last_item: # Every chunksize items, we send the subtasks for processing job = TaskSet(tasks=subtasks) job.apply_async().join() subtasks = [] if (processed_count % 50000 == 0) or last_item: # Do a commit every 50000 items, for good measure. self.stdout.write("...running commit command...") self.si.commit() sys.stdout.write("\rProcessed {}/{} ({:.0%})".format( processed_count, count, processed_count * 1.0 / count, )) self.stdout.flush() self.stdout.write('\n')