def next_hit_in(domain, gap=GAP, callback=None):
    """
    Return the number of seconds until we may next hit the given domain.
    Returns 0 if the domain has not been hit within the last `gap` seconds.
    """
    if domain == 'rocwiki.org':
        # We know we can handle the traffic. :-)
        gap = 1
    mc = memcache.Client(MEMCACHE)
    keyname = __name__ + '_hittime_' + domain
    keyname = keyname.encode('ascii', 'ignore')
    result = 0
    now = int(time.time())
    last_hit = mc.get(keyname)
    if last_hit:
        result = gap - int(time.time() - last_hit)
        if result < 1:
            result = 0
    mc.set(keyname, now, time=now + gap)
    if callback is not None:
        subtask(callback).delay(result)
    return result
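# A hedged sketch of the callback convention used throughout these tasks:
# callers pass a subtask signature, and the task forwards its own result to
# it via subtask(callback).delay(...). 'record_wait' is a hypothetical
# registered task, not part of the source.
from celery.task import task

@task
def record_wait(seconds):
    print "next hit allowed in %d seconds" % seconds

# next_hit_in computes the wait and also dispatches it to the callback:
next_hit_in('example.com', callback=record_wait.subtask())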
def extractfragment(inputs, outputs, options={}, callbacks=[]):
    try:
        mfileid = inputs[0]
        videopath = _get_mfile(mfileid)
        tempout = tempfile.NamedTemporaryFile(suffix=".mp4")
        logging.info("temp file: %s" % tempout.name)
        intime = options["intime"]
        fragmentlength = options["fragmentlength"]
        # Extract a 'fragmentlength' video fragment starting at 'intime' (seconds), e.g.:
        # ffmpeg -ss 00:00:30.0 -t 00:00:10.0 -i input.wmv -acodec copy -vcodec copy -async 1 output.wmv
        args = ["ffmpeg -y -ss", intime, "-t", fragmentlength, "-i", videopath,
                "-acodec copy -vcodec copy -async 1", tempout.name]
        cmd = " ".join(args)
        logging.info(cmd)
        p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE, close_fds=True)
        (stdout, stderr) = p.communicate()
        logging.info(stdout)
        if p.returncode != 0:
            raise Exception("Command %s exited with code %d. Stderr: %s"
                            % (cmd, p.returncode, stderr))
        # Make the job output available.
        _save_joboutput(outputs[0], tempout)
        for callback in callbacks:
            subtask(callback).delay()
        return {"success": True, "message": "extractfragment successful"}
    except Exception as e:
        logging.info("Error with extractfragment %s." % e)
        raise
def _unlock_chord(setid, callback, interval=1, max_retries=None):
    result = TaskSetResult.restore(setid)
    if result.ready():
        subtask(callback).delay(result.join())
        result.delete()
    else:
        _unlock_chord.retry(countdown=interval, max_retries=max_retries)
def thumboutput(inputs, outputs, options={}, callbacks=[]):
    try:
        inputid = inputs[0]
        width = int(options["width"])
        height = int(options["height"])
        from jobservice.models import JobOutput
        jo = JobOutput.objects.get(pk=inputid)
        path = jo.file.path
        logging.info("Creating %sx%s image for %s" % (width, height, inputid))
        image = _thumbimage(path, width, height)
        if image:
            if not _save_joboutput_thumb(inputid, image):
                thumboutput.retry([inputs, outputs, options, callbacks])
            logging.info("Thumbnail created %s" % image)
            for callback in callbacks:
                subtask(callback).delay()
            return {"success": True,
                    "message": "Thumbnail '%sx%s' successful" % (width, height)}
        else:
            raise Exception("Could not create image")
    except Exception as e:
        logging.info("Error with thumbimage %s" % e)
        raise
def import_sizes(provider_id, callback=None, **kwargs):
    logger = import_sizes.get_logger(**kwargs)
    prov = Provider.objects.get(id=provider_id)
    logger.debug('Importing sizes for provider %s...' % prov)
    prov.import_sizes()
    if callback:
        subtask(callback).delay(provider_id)
def pluck_links_from_text(text, callback=None):
    """
    Given a string, return a list of linkinfo dicts.
    Calls back on each link if callback is set.
    """
    result = []
    for candidate in re.finditer(r"\[[^]]*\]", text):
        if candidate.group().startswith('[http'):
            # We have a link!
            bunch = candidate.group().strip('[]').split(' ', 1)
            link_url = bunch[0]
            if len(bunch) == 1:
                link_text = ''
            else:
                link_text = bunch[1]
            linkinfo = {
                'url': link_url,
                'text': link_text,
            }
            if callback is not None:
                subtask(callback).delay(linkinfo)
            result.append(linkinfo)
    return result
def check_robot_ok(url, callback=None):
    """
    Check whether we may crawl the url in question.
    """
    urlp = urlparse.urlparse(url)
    mc = memcache.Client(MEMCACHE)
    keyname = __name__ + '_robotstxt_' + urlp.netloc
    keyname = keyname.encode('ascii', 'ignore')
    robotstxt = mc.get(keyname)
    if not robotstxt:
        # No robots.txt on file within the past 24 hours; get one.
        # Use a separate variable so the original url is still what we
        # check below (reusing `url` here was a bug).
        robots_url = urlparse.urljoin(urlp.scheme + '://' + urlp.netloc,
                                      'robots.txt')
        robotstxt, headers = fetch_url(robots_url)
        mc.set(keyname, robotstxt, time=time.time() + 86400)
    # Use robotparser to evaluate the situation; parse() expects a list
    # of lines, not a single string.
    rp = robotparser.RobotFileParser()
    rp.parse(robotstxt.splitlines())
    result = rp.can_fetch(USERAGENT, url)
    if callback is not None:
        subtask(callback).delay(result)
    return result
def run(self, url, download_parent, file_number, **kwargs):
    self.debug_prefix = str(download_parent.id) + "_" + str(file_number)
    logger = logging.getLogger('ohdei.downloader.downloader.run')
    # Celery docs say this is deprecated and self.request.id should be used,
    # but I can't get it to work...
    self.kwargs = kwargs
    if file_number == 1:
        logger.debug("%s: main file for Download %d"
                     % (self.debug_prefix, download_parent.id))
    file = File(task_id=kwargs["task_id"], url=url,
                download_parent=download_parent, file_number=file_number)
    file.save()
    logger.debug("%s: filename: %s, redirected_url: %s"
                 % (self.debug_prefix, file.filename, file.redirected_url))
    ret = self._download(file.filename, file.redirected_url)
    # this is hacky FIXME
    if ret:
        logger.debug("%s: download was aborted" % self.debug_prefix)
        return True
    elif ret is False:
        logger.debug("%s: download had an error" % self.debug_prefix)
        return False
    if file.is_html and file_number == 1:  # FIXME not only on the first download
        logger.debug("%s: parsing HTML for images, css, etc" % self.debug_prefix)
        try:
            with open(file.filename, "r+") as temp:
                soup = BeautifulSoup.BeautifulSoup(temp)
                links, soup = self._parse(soup, file.redirected_url,
                                          file.download_parent)
                temp.seek(0)
                temp.write(str(soup))
            for k, v in links.iteritems():
                logger.debug("%s: launching download subtask %d: %s"
                             % (self.debug_prefix, v, k))
                subtask("ohdei.downloader.downloader.Downloader",
                        url=k, download_parent=download_parent,
                        file_number=v).delay()
        except HTMLParser.HTMLParseError as e:
            logger.debug("%s: error parsing HTML: %s"
                         % (self.debug_prefix, e.value))
            return True
        logger.debug("%s: finished parsing HTML, all done" % self.debug_prefix)
def md5fileverify(inputs, outputs, options={}, callbacks=[]):
    """Verify that the MD5 digest of an MFile matches the stored checksum."""
    try:
        mfileid = inputs[0]
        from dataservice.models import MFile
        mf = MFile.objects.get(id=mfileid)
        path = _get_mfile(mfileid)
        file = open(path, 'rb')
        md5 = hashlib.md5()
        while True:
            data = file.read(8192)  # a multiple of 128 bytes is best
            if not data:
                break
            md5.update(data)
        file.close()
        calculated_md5 = md5.hexdigest()
        logging.info("Verify MD5 calculated %s" % calculated_md5)
        db_md5 = mf.checksum
        if db_md5 != calculated_md5:
            raise Exception("MD5 Verification Failed")
        for callback in callbacks:
            subtask(callback).delay()
        return {"message": "Verification of '%s' successful %s=%s"
                           % (mf, db_md5, calculated_md5),
                "md5": calculated_md5}
    except Exception as e:
        logging.info("Error with md5fileverify %s" % e)
        raise
def ffmbc(inputs, outputs, options={}, callbacks=[]):
    try:
        mfileid = inputs[0]
        videopath = _get_mfile(mfileid)
        tempout = tempfile.NamedTemporaryFile()
        logging.info("temp file: %s" % tempout.name)
        ffmpeg_args = options["args"]
        # Run ffmbc on the input with the caller-supplied arguments.
        args = ["ffmbc -y -i", videopath, ffmpeg_args, tempout.name]
        cmd = " ".join(args)
        logging.info(cmd)
        p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE, close_fds=True)
        (stdout, stderr) = p.communicate()
        logging.info(stdout)
        if p.returncode != 0:
            raise Exception("Command %s exited with code %d. Stderr: %s"
                            % (cmd, p.returncode, stderr))
        # Make the job output available.
        _save_joboutput(outputs[0], tempout)
        for callback in callbacks:
            subtask(callback).delay()
        return {"success": True, "message": "ffmbc successful"}
    except Exception as e:
        logging.info("Error with ffmbc %s." % e)
        raise
def mimefile(inputs, outputs, options={}, callbacks=[]):
    try:
        mfileid = inputs[0]
        path = _get_mfile(mfileid)
        m = magic.open(magic.MAGIC_MIME)
        m.load()
        upath = path.encode("utf-8")
        result = m.file(upath)
        mimetype = result.split(';')[0]
        from dataservice.models import MFile
        mf = MFile.objects.get(id=mfileid)
        mf.mimetype = mimetype
        mf.save()
        for callback in callbacks:
            logging.info("Mimefile callback - %s" % callback)
            subtask(callback).delay()
        return {"success": True, "message": "Mime detection successful",
                "mimetype": mimetype}
    except Exception as e:
        logging.info("Error with mime %s" % e)
        import sys
        import traceback
        traceback.print_exc(file=sys.stdout)
        raise
def on_success(self, retval, task_id, *args, **kwargs):
    """When the urls are retrieved, import them into the 'urls' table
    and run the check as a separate task.
    """
    session_id = args[0][0].id
    out_directory = retval[1]
    hash_ = retval[0]
    # 1. get the session
    session = DBSession.query(ValidationSessionModel).filter(
        ValidationSessionModel.id == session_id
    ).one()
    # 2. change session status
    session.status = 2
    # 2.a create all tables or clean them
    create_or_clean_tables(session.code)
    # 3. rebuild the urls table
    urls_model = get_urls_model(session.code)
    fp = open('/'.join((out_directory, 'pages.csv')))
    for url in fp:
        record = urls_model(url=url.strip())
        DBSession.add(record)
    fp.close()
    transaction.commit()
    # 4. run the validation subtask
    checking = CheckTask()
    subtask(checking).delay(hash_, session_id)
def send_email(user, content, preview=False, callback=None):
    from_email = "Sorbet <*****@*****.**>"
    to_email = user.email
    if preview:
        subject = u"Sorbet preview for {0}".format(content.title)
        template = "feedmanager/email/feed_preview.html"
        items = content.item_set.order_by("-pubdate")[:5]
        context = {"feed": content, "items": items}
    else:
        if len(content) < 1:
            raise AssertionError("send_email called but no feeds passed")
        subject = u"Feed Updates from Sorbet"
        template = "feedmanager/email/new_items.html"
        context = {"feeds": content}
    html_content = render_to_string(template, context)
    text_content = strip_tags(html_content)
    msg = EmailMultiAlternatives(subject, text_content, from_email, [to_email])
    msg.attach_alternative(html_content, "text/html")
    msg.send()
    if callback:
        subtask(callback).delay()
def d10mxfchecksum(inputs, outputs, options={}, callbacks=[]):
    try:
        mfileid = inputs[0]
        joboutput = outputs[0]
        inputfile = _get_mfile(mfileid)
        outputfile = tempfile.NamedTemporaryFile()
        logging.info("Processing d10mxfchecksum job on %s" % inputfile)
        if not os.path.exists(inputfile):
            logging.info("Inputfile %s does not exist" % inputfile)
            return False
        args = ["d10sumchecker", "-i", inputfile, "-o", outputfile.name]
        ret = subprocess.call(args)
        if ret != 0:
            raise Exception("d10mxfchecksum failed")
        outputfile.seek(0)
        suf = SimpleUploadedFile("mfile", outputfile.read(),
                                 content_type='text/plain')
        from jobservice.models import JobOutput
        jo = JobOutput.objects.get(id=joboutput)
        jo.file.save('d10mxfchecksum.txt', suf, save=True)
        for callback in callbacks:
            subtask(callback).delay()
        return {"success": True, "message": "d10mxfchecksum successful"}
    except Exception as e:
        logging.info("Error with d10mxfchecksum %s" % e)
        raise
def import_provider_info(provider_id, **kwargs):
    logger = import_provider_info.get_logger(**kwargs)
    prov = Provider.objects.get(id=provider_id)
    logger.debug('Importing info for provider %s...' % prov)
    import_images.delay(
        provider_id,
        callback=subtask(import_locations,
                         callback=subtask(import_sizes,
                                          callback=subtask(import_nodes))))
def md5file(inputs, outputs, options={}, callbacks=[]):
    """Calculate and store the hex MD5 digest of an MFile."""
    try:
        mfileid = inputs[0]
        path = _get_mfile(mfileid)
        file = open(path, 'rb')
        md5 = hashlib.md5()
        while True:
            data = file.read(8192)  # a multiple of 128 bytes is best
            if not data:
                break
            md5.update(data)
        file.close()
        md5string = md5.hexdigest()
        logging.info("MD5 calculated %s" % md5string)
        from dataservice.models import MFile
        _mf = MFile.objects.get(id=mfileid)
        _mf.checksum = md5string
        _mf.save()
        for callback in callbacks:
            logging.info("Running Callback %s" % callback)
            subtask(callback).delay()
        return {"success": True, "message": "MD5 successful", "md5": md5string}
    except Exception as e:
        logging.info("Error with md5 %s" % e)
        raise
def sha1file(inputs, outputs, options={}, callbacks=[]):
    """Return the hex SHA1 digest of an MFile."""
    try:
        mfileid = inputs[0]
        path = _get_mfile(mfileid)
        file = open(path, 'rb')
        sha1 = hashlib.sha1()
        while True:
            data = file.read(8192)  # a multiple of 128 bytes is best
            if not data:
                break
            sha1.update(data)
        file.close()
        sha1string = sha1.hexdigest()
        logging.info("SHA1 calculated %s" % sha1string)
        # TODO: move to dataservice and store checksum in file?
        # from dataservice.models import MFile
        # _mf = MFile.objects.get(id=mfileid)
        # _mf.checksum = sha1string
        # _mf.save()
        for callback in callbacks:
            logging.info("Running Callback %s" % callback)
            subtask(callback).delay()
        return {"success": True, "message": "SHA1 successful",
                "sha1": sha1string}
    except Exception as e:
        logging.info("Error with sha1 %s" % e)
        raise
def find_links(doc_id, doc_callback=None, callback_for_doc_callback=None,
               links_callback=None, callback_for_links_callback=None):
    link_single_re = re.compile(r"<a[^>]+href='([^']+)'")
    link_double_re = re.compile(r'<a[^>]+href="([^"]+)"')
    doc = models.Page.load(settings.DB, doc_id)
    if doc is None or not len(doc.content):
        return
    raw_links = set()
    try:
        for match in link_single_re.finditer(doc.content):
            raw_links.add(match.group(1))
        for match in link_double_re.finditer(doc.content):
            raw_links.add(match.group(1))
    except TypeError:
        # Content is not a string.
        pass
    doc.links = []
    parseable_links = []
    parse = urlparse.urlparse(doc['url'])
    for link in raw_links:
        possible_paths = []
        if link.startswith('#') or link.startswith("//"):
            continue
        elif link.startswith('http://') or link.startswith('https://'):
            pass
        elif link.startswith('/'):
            possible_paths = parse.path.split('/')[:-1]
        else:
            link = '/' + link
            possible_paths = parse.path.split('/')[:-1]
        link, parseable = check(iri_to_uri(link.split("#")[0]), parse,
                                possible_paths)
        if link:
            doc.links.append(link)
        if parseable:
            parseable_links.append(link)
    doc.store(settings.DB)
    if doc_callback is not None:
        subtask(doc_callback).delay(doc.id, callback=callback_for_doc_callback)
    for link in parseable_links:
        page = models.Page.get_by_url(link, update=False)
        if page is None and links_callback is not None:
            # Do I need a subtask or task here?
            links_callback.delay(link, callback=callback_for_links_callback)
        elif doc_callback is not None:
            subtask(doc_callback).delay(page.id,
                                        callback=callback_for_doc_callback)
        else:
            # Useful for testing.
            if links_callback is None:
                return doc.links, parseable_links
def unlock_chord(setid, callback, interval=1, propagate=False,
                 max_retries=None, result=None):
    result = _res.TaskSetResult(setid, map(_res.AsyncResult, result))
    j = result.join_native if result.supports_native_join else result.join
    if result.ready():
        subtask(callback).delay(j(propagate=propagate))
    else:
        unlock_chord.retry(countdown=interval, max_retries=max_retries)
def _unlock_chord(setid, callback, interval=1, propagate=False,
                  max_retries=None):
    result = TaskSetResult.restore(setid)
    if result.ready():
        subtask(callback).delay(result.join(propagate=propagate))
        result.delete()
    else:
        _unlock_chord.retry(countdown=interval, max_retries=max_retries)
def on_chord_part_return(self, task, keyprefix="chord-unlock-%s"):
    setid = task.request.taskset
    key = keyprefix % setid
    deps = TaskSetResult.restore(setid, backend=task.backend)
    if self.client.incr(key) >= deps.total:
        subtask(task.request.chord).delay(deps.join())
        deps.delete()
    self.client.expire(key, 86400)
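# For context, the chord-unlock variants above all follow the same pattern:
# poll a saved TaskSetResult until every member task is done, then feed the
# joined results to the callback signature. A minimal, hedged sketch of how
# such an unlock task is typically wired up with the old celery.task API
# (the 'add'/'summarize' tasks are hypothetical, and _unlock_chord is
# assumed to be registered as a task, since it calls .retry()):
from celery.task import task
from celery.task.sets import TaskSet, subtask

@task
def add(x, y):
    return x + y

@task
def summarize(results):
    # Receives the joined list of results once the whole set is ready.
    return sum(results)

def start_chord():
    # Save the TaskSetResult so _unlock_chord can restore() it by id.
    result = TaskSet(tasks=[add.subtask((i, i)) for i in range(10)]).apply_async()
    result.save()
    _unlock_chord.delay(result.taskset_id, subtask(summarize))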
def deploy(hosts, callback=puppet_run):
    puppet_dir = r'/pxeinstall/puppet/files'
    if isinstance(hosts, dict):
        #printlog(run_cmd("sudo sed -i 's/NODE/transfer/g' /etc/puppet/manifests/site.pp"))
        #for host in hosts["cc"] + hosts["nc"]:
        #    subtask(callback).delay(host)
        #printlog(run_cmd("sudo sed -i 's/transfer/NODE/g' /etc/puppet/manifests/site.pp"))
        #printlog("copy config files. ")
        #if os.path.exists(conf_path("localrc")):
        #    # clean the former config files
        #    try:
        #        clean_dir(puppet_folder)
        #        os.rename(conf_path("localrc"), os.path.join(puppet_folder, 'localrc'))
        #    except OSError, err:
        #        pass
        #        printlog("Failed to move localrc to puppet folder with err %s " % err)
        #    p = run_cmd("sudo cp %s /pxeinstall/puppet/files" % conf_path("localrc"))
        #    printlog(p.communicate()[0])
        #else:
        #    printlog("localrc does not exist.")
        #if os.path.exists(conf_path("localnc")):
        #    p = run_cmd("sudo cp %s /pxeinstall/puppet/files" % abs_path("localnc"))
        #    printlog(p.communicate()[0])
        #else:
        #    printlog("localnc does not exist.")
        puppet_path = lambda hostname: os.path.join(puppet_dir, hostname)
        if "cc" in hosts:
            printlog("copy config files... ")
            # clean_dir(puppet_dir)
            # transfer_configs(conf_path(hosts["cc"][0]), puppet_path(hosts["cc"][0]))
            printlog("add node cc to deploy.pp")
            if checkfile(hosts["cc"][0], r"/etc/puppet/manifests/deploy.pp") == -1:
                printlog(run_cmd("sudo echo -e node \"'%s.sh.intel.com'\" '{ \n include deploy\n}' >> /etc/puppet/manifests/deploy.pp" % hosts["cc"][0]))
            ret = subtask(callback).delay(hosts["cc"][0])
            time.sleep(250)
            if ret.ready():
                printlog("succeeded to deploy cc.")
            else:
                printlog("failed to deploy cc on host %s " % hosts["cc"][0])
        if "nc" in hosts:
            for host in hosts["nc"]:
                printlog("copy config file localnc_%s. " % host)
                # transfer_configs(conf_path(host), puppet_path(host))
                if checkfile(host, r"/etc/puppet/manifests/deploy.pp") == -1:
                    printlog(run_cmd("sudo echo -e node \"'%s.sh.intel.com'\" '{ \n include deploy\n}' >> /etc/puppet/manifests/deploy.pp" % host))
                ret = subtask(callback).delay(host)
                if ret.ready():
                    printlog("succeeded to deploy nc on host %s." % host)
                else:
                    printlog("failed to deploy nc on host %s. " % host)
    else:
        printlog("hosts dict is incorrect. ")
def fetch_contents(url, callback):
    contents, real_url = fetch_url(url)
    # Stick the contents into the cache.
    cache.put_contents(url, contents, real_url)
    # TODO: in celery 2.6 this is unnecessary - see
    # http://ask.github.com/celery/whatsnew-2.6.html#group-chord-chain-are-now-subtasks
    subtask(callback).delay(contents, real_url)
def decompress(content, extension='bz2', callback=None):
    """Decompress a string. Currently only does bzip2."""
    if extension == 'bz2':
        decompress.update_state(state="UNBZIP2")
        out = bz2.decompress(content)
    else:
        out = content
    if callback is not None:
        subtask(callback).delay(out)
    return out
def compute_hazard_curve(job_id, site_list, realization, callback=None):
    """Generate hazard curve for a given site list."""
    hazengine = job.Job.from_kvs(job_id)
    with mixins.Mixin(hazengine, hazjob.HazJobMixin, key="hazard"):
        keys = hazengine.compute_hazard_curve(site_list, realization)
        if callback:
            subtask(callback).delay(job_id, site_list)
        return keys
def page_parser(url, depth=0):
    print 'Task {0} starts parsing: {1}'.format(depth, url)
    parser = WLParser()
    r = requests.get(url)
    page = r.text
    parser.feed(page)
    print 'Task {0}: {1} links found'.format(depth, len(parser.links))
    if depth < 3:
        subtask(page_parser).delay(url, depth + 1)
def retrieve_page(url, callback=None):
    page = models.Page.get_by_url(url)
    if page is None or page.id is None:
        return
    if callback is not None:
        subtask(callback).delay(page.id, links_callback=retrieve_page,
                                callback_for_links_callback=find_links,
                                doc_callback=calculate_rank,
                                callback_for_doc_callback=calculate_rank)
def on_chord_part_return(self, task, propagate=False,
                         keyprefix="chord-unlock-%s"):
    from celery.task.sets import subtask
    from celery.result import TaskSetResult
    setid = task.request.taskset
    key = keyprefix % setid
    deps = TaskSetResult.restore(setid, backend=task.backend)
    if self.client.incr(key) >= deps.total:
        subtask(task.request.chord).delay(deps.join(propagate=propagate))
        deps.delete()
    self.client.expire(key, 86400)
def fetch(user_id, url, host, callback=None):
    try:
        video = _fetcher.fetch(user_id, url, host, fetch.get_logger())
        if callback is not None:
            subtask(callback).delay(video)
    except UrlNotSupported:
        pass
    except Exception as exc:
        fetch.retry(exc=exc)
def f(cbs, cb, x):
    if x == 0:
        # Base case: pop the most recent continuation and invoke it.
        bV = cbs.pop()
        bVf = bV['func']
        bVV = bV['val']
        return subtask(bVf).delay(cbs, cb, bVV)
    else:
        # Push a continuation and recurse asynchronously.
        cbs.append({'func': g, 'val': x})
        return subtask(f).delay(cbs, cb, x - 1)
def fibo(cbs, x):
    if x == 0 or x == 1:
        # Base case: pop the most recent continuation and invoke it.
        bV = cbs.pop()
        bVf = bV['func']
        bVV = bV['val']
        subtask(bVf).delay(cbs, bVV)
    else:
        # Push a continuation and recurse asynchronously.
        cbs.append({'func': gibo, 'val': x - 1})
        subtask(fibo).delay(cbs, x - 1)
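# A hedged reading of the two continuation-passing tasks above: each
# recursive step pushes a {'func': ..., 'val': ...} continuation onto the
# 'cbs' stack, and the base case pops the most recent one and dispatches it
# as the next subtask. Kicking fibo off might look like this, assuming
# 'done' is some final task registered elsewhere:
subtask(fibo).delay([{'func': done, 'val': 0}], 10)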
def get_new_emails():
    """
    Read new emails from an email server, and schedule them for delivery
    to Indivo. Parsing of the emails is handled in the subtask
    (deliver_email_to_indivo()).
    """
    # TODO
    logger = get_new_emails.get_logger()
    logger.info('getting new emails...')
    emails = ['a', 'b', 'c']
    # Schedule a task to deliver each message to Indivo.
    for email in emails:
        subtask(deliver_email_to_indivo).delay(email)
def test_solr_ingestion_and_deletion(self):
    """Do items get added to the Solr index when they are ingested?"""
    site = test_opinion_scraper.Site().parse()
    path = os.path.join(settings.INSTALL_ROOT, 'alert',
                        site.download_urls[0])  # a simple PDF
    with open(path) as f:
        content = f.read()
        cf = ContentFile(content)
        extension = get_extension(content)
    cite = Citation()
    cite.save(index=False)
    docket = Docket(
        court=self.court,
        case_name=site.case_names[0],
    )
    docket.save()
    doc = Document(
        date_filed=site.case_dates[0],
        docket=docket,
        citation=cite,
    )
    file_name = trunc(site.case_names[0].lower(), 75) + extension
    doc.local_path.save(file_name, cf, save=False)
    doc.save(index=False)
    extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
    response = self.si.raw_query(**{
        'q': 'supreme',
        'caller': 'scraper_test',
    }).execute()
    count = response.result.numFound
    self.assertEqual(
        count, 1,
        "There were %s items found when there should have been 1" % count)
def extract_from_pdf(doc, path, DEVNULL, callback=None):
    """Extract text from PDFs.

    Here, we use pdftotext. If that fails, try to use tesseract under the
    assumption it's an image-based PDF. Once that is complete, we check for
    the letter e in our content. If it's not there, we try to fix the
    mojibake that ca9 sometimes creates.
    """
    process = subprocess.Popen(
        ["pdftotext", "-layout", "-enc", "UTF-8", path, "-"],
        shell=False, stdout=subprocess.PIPE, stderr=DEVNULL)
    content, err = process.communicate()
    if content.strip() == '' and callback:
        # Probably an image PDF. Send it to OCR.
        result = subtask(callback).delay(path)
        success, content = result.get()
        if success:
            doc.extracted_by_ocr = True
        elif content == '' or not success:
            content = 'Unable to extract document content.'
    elif 'e' not in content:
        # It's a corrupt PDF from ca9. Fix it.
        content = fix_mojibake(unicode(content, 'utf-8', errors='ignore'))
    return doc, content, err
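# For context, the OCR callback dispatched above must return a
# (success, content) pair, since result.get() is unpacked that way. A
# minimal, purely illustrative stand-in (the real extract_by_ocr lives
# elsewhere; ocr_image is an assumed helper, not part of the source):
from celery.task import task

@task
def extract_by_ocr_stub(path):
    try:
        content = ocr_image(path)  # assumed helper
        return True, content
    except Exception:
        return False, ''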
def run(self, flavor, repo_path, data):
    backend = load_backend(flavor, repo_path, cache=False)
    out = StringIO()
    proto = ReceivableProtocol(StringIO(data).read, out.write)
    handler = _ReceivePackHandler(WebBackend(), [backend], proto,
                                  stateless_rpc=True)
    handler.handle()
    sync_tasks = []
    for oldrev, newrev, name in handler._good_refs:
        if name.startswith('refs/heads/'):
            branch = name[11:]
            sync_tasks.append(subtask(SyncTask, args=[
                backend.flavor, backend.path, oldrev, newrev, branch
            ]))
    if sync_tasks:
        taskset = TaskSet(tasks=sync_tasks)
        taskset.apply_async().join()
    return out.getvalue(), handler._good_refs
def save(self, *args, **kwargs):
    from tasks import encode_media, upload_media
    if not self.id:
        self.file_type = "audio"
    super(Audio, self).save(*args, **kwargs)
    if self.encode and (not self.encoded):
        encode_media.delay(self.id, callback=subtask(upload_media))
def rebuild_repo(spec):
    from celery.task.sets import subtask
    from .models import BuildTask
    from irgsh_repo.tasks import RebuildRepo

    package = spec.package
    dist = spec.distribution.repo
    pkgdist = package.packagedistribution_set.get(distribution=dist)

    tasks = BuildTask.objects.filter(specification=spec) \
                             .filter(status=999) \
                             .select_related()
    task_arch_list = [(task.task_id, task.architecture.name)
                      for task in tasks]

    task_name = RebuildRepo.name
    args = [spec.id, package.name, spec.version, dist.name,
            pkgdist.component.name, task_arch_list, spec.section,
            spec.priority]
    kwargs = None
    opts = {'exchange': 'repo',
            'exchange_type': 'direct',
            'routing_key': 'repo'}

    s = subtask(task_name, args, kwargs, opts)
    return s.apply_async()
def check_sync(route_name=None, selected_routes=[]):
    from flowspec.models import Route, MatchPort, MatchDscp, ThenAction
    if not selected_routes:
        routes = Route.objects.all()
    else:
        routes = selected_routes
    if route_name:
        routes = routes.filter(name=route_name)
    for route in routes:
        if route.has_expired() and (route.status != 'EXPIRED' and
                                    route.status != 'ADMININACTIVE' and
                                    route.status != 'INACTIVE'):
            if route.status != 'ERROR':
                logger.info('Expiring %s route %s' % (route.status, route.name))
                subtask(delete).delay(route, reason="EXPIRED")
        else:
            if route.status != 'EXPIRED':
                route.check_sync()
def extract_all_docs(docs):
    num_docs = docs.count()
    if num_docs == 0:
        print "Nothing to parse for this court."
    else:
        print "%s documents in this court." % num_docs
        for doc in docs:
            extract_doc_content.delay(doc.pk,
                                      callback=subtask(extract_by_ocr))
def run(self, update_image_info, update_article_info, callback=None):
    image_instance_key = generate_image_instance_key(
        update_article_info.article_id, update_image_info.image_url)
    try:
        create_myimage_instance(update_article_info.user_id,
                                image_instance_key,
                                update_image_info.image_url,
                                update_article_info.article_id)
    except Exception:
        MarkImagetobedoneHandler.delay(update_image_info, update_article_info)
    else:
        update_image_info.image_instance_key = image_instance_key
        # Call the next step.
        subtask(callback).delay(update_image_info, update_article_info)
    return None
def test_is_JSON_serializable(self):
    s = MockTask.subtask((2, ), {"cache": True},
                         {"routing_key": "CPU-bound"})
    s.args = list(s.args)  # tuples are not preserved,
    # but this doesn't matter.
    self.assertEqual(s, subtask(anyjson.deserialize(anyjson.serialize(s))))
def test_is_JSON_serializable(self):
    s = MockTask.subtask(
        (2, ), {'cache': True}, {'routing_key': 'CPU-bound'},
    )
    s.args = list(s.args)  # tuples are not preserved,
    # but this doesn't matter.
    self.assertEqual(s, subtask(anyjson.loads(anyjson.dumps(s))))
def save(self, make_thumbnail=True, *args, **kwargs):
    from tasks import encode_media, generate_thumbnail, upload_media
    if not self.id:
        self.file_type = "video"
    super(Video, self).save(*args, **kwargs)
    if self.encode and (not self.encoded):
        # Encode, then upload.
        encode_media.delay(self.id, callback=subtask(upload_media))
    if self.auto_thumbnail and make_thumbnail:
        generate_thumbnail.delay(self.id)
def save(self, *args, **kwargs):
    super(Video, self).save(*args, **kwargs)
    from transcode.tasks import encode_video
    from transcode.tasks import upload_file
    if self.encode_status == 0:
        encode_video.delay(self.id, callback=subtask(upload_file))
    # if self.transfer_status == 0:
    #     print self.upload_cmd
    #     upload_file.delay(self.id)
def get_new_emails():
    """
    Read new emails from an email server, and schedule them for delivery
    to Indivo. Parsing of the emails is handled in the subtask
    (deliver_email_to_indivo()).
    """
    # TODO
    logger = get_new_emails.get_logger()
    logger.info('connecting to the mail server...')
    conn = mail_server_connect()
    try:
        logger.info('getting new emails...')
        typ, message_id_list = conn.search(None, 'UNSEEN')
        if typ != 'OK':
            raise MailServerException("Error reading new messages: %s"
                                      % message_id_list[0])
        message_ids = [m for m in message_id_list[0].split(" ") if m]
        logger.info('%s new messages found' % len(message_ids))
        for m_id in message_ids:
            logger.info('fetching message with id %s' % m_id)
            typ, msg_data = conn.fetch(m_id, '(RFC822)')
            if typ != 'OK':
                raise MailServerException("Error fetching message %s: %s"
                                          % (m_id, msg_data[0]))
            parsed_email = email.message_from_string(msg_data[0][1])
            if deliver_email_p(parsed_email):
                # Schedule a task to deliver the message to Indivo.
                logger.info('New email! scheduling for delivery...')
                subtask(deliver_email_to_indivo).delay(parsed_email)
            else:
                logger.warning('Rejecting message from %s: Not in approved '
                               'senders list' % parsed_email.get('From', ''))
    except conn.error as e:
        logger.error(str(e))
    finally:
        logger.info('disconnecting from mail server...')
        mail_server_disconnect(conn)
def publish_entry(video_id, callback=None):
    video = Video.objects.get(pk=video_id)
    data = {}
    # for field, value in video:
    #     data[field] = value
    data['title'] = video.title
    data['url'] = video.file
    data['publisher'] = video.uploader
    data['pubdate'] = video.upload_datetime
    url = conf.PUBLISH_URL
    headers = {'User-Agent': 'test'}
    r = requests.post(url, data, headers=headers)
    if r.text == "0":
        video.publish_status = True
        video.save()
    if callback:
        subtask(callback).delay(video.id)
def fetch_document(url, useragent, return_html=False):
    try:
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', useragent)]
        response = opener.open(url)
        html = response.read()
        links = subtask(extract_urls).apply_async([(url, html)])
        # Avoid filling memory with useless html if we don't want it.
        if return_html:
            return (url, html, links, len(html))
        return (url, "", links, len(html))
    except Exception:
        return (url, "", FakeAsyncResult(result=set()), 0)
def fixer(simulate=False, verbose=False):
    """OCR documents that lack content"""
    # docs = queryset_generator(Document.objects.filter(source='C', plain_text=''))
    # docs = Document.objects.raw('''select "pk" from "Document" where "source" = 'C' and "plain_text" ~ '^[[:space:]]*$' ''')
    docs = Document.objects.raw(
        """select "pk" from "Document"
           where "source" = 'C'
             and "plain_text" = 'Unable to extract document content.' """
    )
    for doc in docs:
        if verbose:
            print "Fixing document number %s: %s" % (doc.pk, doc)
        if not simulate:
            # Extract the contents asynchronously.
            extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
def test_content_extraction(self):
    """Do all of the supported mimetypes get extracted to text
    successfully, including OCR?"""
    test_strings = ['supreme', 'intelligence', 'indiana', 'reagan',
                    'indiana', 'fidelity']
    opinions = Opinion.objects.all()
    for op, test_string in zip(opinions, test_strings):
        ext = get_extension(op.local_path.file.read())
        op = extract_doc_content(op.pk, callback=subtask(extract_by_ocr))
        if ext in ['.html', '.wpd']:
            self.assertIn(test_string, op.html.lower())
        else:
            self.assertIn(test_string, op.plain_text.lower())
def monitor_workflow(instance, connection, interval=5.0):
    """Run and monitor a test workflow in Galaxy."""
    # Create library and history.
    library_id = connection.create_library(
        Site.objects.get_current().name + " Test Library - "
        + str(datetime.now()))
    history_id = connection.create_history(
        Site.objects.get_current().name + " Test History - "
        + str(datetime.now()))
    workflow_task = subtask(run_workflow).delay(instance, connection,
                                                library_id, history_id)
    while True:
        progress = connection.get_progress(history_id)
        monitor_workflow.update_state(state="PROGRESS", meta=progress)
        print "Sleeping ..."
        time.sleep(interval)
        print "Awake ..."
        print "Workflow Task State: " + workflow_task.state + "\n"
        print "Workflow State: " + progress["workflow_state"] + "\n"
        if workflow_task.state == "SUCCESS":
            print "Workflow task finished successfully."
        if progress["workflow_state"] == "ok":
            print "Workflow finished successfully. Stopping monitor ..."
            break
        if progress["workflow_state"] == "error":
            print "Workflow failed. Stopping monitor ..."
            break
        if progress["workflow_state"] == "queued":
            print "Workflow running."
        if progress["workflow_state"] == "new":
            print "Workflow being prepared."
        if workflow_task.state == "FAILURE":
            print "Workflow task failed. Stopping monitor ..."
            break
    # Return the final state information.
    return progress
def encode_again(self, request, queryset):
    rows_updated = 0
    for media in queryset:
        if media.encode:
            rows_updated += 1
            encode_media.delay(media.id, callback=subtask(upload_media))
            media.encoded = False
            media.uploaded = False
            media.encoding = True
            media.save()
    if rows_updated == 1:
        message_bit = "Your file is"
    elif rows_updated > 1:
        message_bit = "Your files are"
    if rows_updated > 0:
        messages.success(
            request,
            "%s being encoded and uploaded. An email notification will be "
            "sent when complete." % message_bit)
def test_content_extraction(self):
    """Do all of the supported mimetypes get extracted to text
    successfully, including OCR?"""
    site = test_opinion_scraper.Site().parse()
    test_strings = ['supreme', 'intelligence', 'indiana', 'reagan',
                    'indiana', 'fidelity']
    for i in range(0, len(site.case_names)):
        path = os.path.join(settings.INSTALL_ROOT, 'alert',
                            site.download_urls[i])
        with open(path) as f:
            content = f.read()
            cf = ContentFile(content)
            extension = get_extension(content)
        cite = Citation()
        cite.save(index=False)
        docket = Docket(
            case_name=site.case_names[i],
            court=self.court,
        )
        docket.save()
        doc = Document(
            date_filed=site.case_dates[i],
            citation=cite,
            docket=docket,
        )
        file_name = trunc(site.case_names[i].lower(), 75) + extension
        doc.local_path.save(file_name, cf, save=False)
        doc.save(index=False)
        doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
        if extension in ['.html', '.wpd']:
            self.assertIn(test_strings[i], doc.html.lower())
        else:
            self.assertIn(test_strings[i], doc.plain_text.lower())
        doc.delete()
def test_task_argument_can_be_task_cls(self):
    s = subtask(MockTask, (2, 2))
    self.assertEqual(s.task, MockTask.name)
def test_behaves_like_type(self):
    s = subtask("tasks.add", (2, 2), {"cache": True},
                {"routing_key": "CPU-bound"})
    self.assertDictEqual(subtask(s), s)
def subtask(*args, **kwargs):
    # Imported lazily so this module can be loaded before Celery is configured.
    from celery.task.sets import subtask
    return subtask(*args, **kwargs)
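# A hedged usage sketch for the lazy wrapper above: because the real import
# happens at call time, this module can be imported before the Celery app
# is configured. 'tasks.add' is a hypothetical registered task name.
sig = subtask('tasks.add', args=(2, 2), options={'routing_key': 'CPU-bound'})
result = sig.delay()            # dispatched asynchronously to a worker
print result.get(timeout=10)    # -> 4, once a worker has processed it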