Example #1
def next_hit_in(domain, gap=GAP, callback=None):
    """
    Gives the number of seconds until the next time we can hit a given domain.
    Returns 0 if we haven't hit it in the last [gap] seconds (and records a new hit).
    """
    if domain == 'rocwiki.org':
        # We know we can handle the traffic.  :-)
        gap = 1

    mc = memcache.Client(MEMCACHE)
    keyname = __name__ + '_hittime_' + domain
    keyname = keyname.encode('ascii', 'ignore')
    result = 0
    now = int(time.time())
    last_hit = mc.get(keyname)

    if last_hit:
        result = gap - int(time.time()-last_hit)

    if result < 1:
        result = 0
        mc.set(keyname, now, time=now+gap)

    if callback is not None:
        subtask(callback).delay(result)
    return result
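A call-site sketch for the snippet above, assuming next_hit_in is registered as a Celery task and fetch_page is a hypothetical follow-up task; note that subtask(callback).delay(result) prepends the computed wait to the callback's positional arguments:

# Hypothetical usage: fetch_page(wait_seconds, url=...) receives the wait first.
next_hit_in.delay('rocwiki.org',
                  callback=subtask(fetch_page, (), {'url': 'http://rocwiki.org/'}))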
Example #2
def extractfragment(inputs, outputs, options={}, callbacks=[]):
    try:
        mfileid = inputs[0]
        videopath = _get_mfile(mfileid)

        tempout = tempfile.NamedTemporaryFile(suffix=".mp4")
        logging.info("temp file: %s" % tempout.name)

        intime = options["intime"]
        fragmentlength = options["fragmentlength"]

        # extract a 'fragmentlength' video fragment starting at 'intime' (seconds)
        # ffmpeg -ss 00:00:30.0 -t 00:00:10.0 -i input.wmv -acodec copy -vcodec copy -async 1 output.wmv
        args = ["ffmpeg -y -ss", intime, "-t", fragmentlength, "-i", videopath,
                "-acodec copy -vcodec copy -async 1", tempout.name]
        cmd = " ".join(args)
        logging.info(cmd)
        p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE, close_fds=True)
        (stdout, stderr) = p.communicate()
        logging.info(stdout)

        if p.returncode != 0:
            raise Exception("Command %s exited with code %d. Stderr: %s" % (cmd, p.returncode, stderr))

        # make job outputs available
        _save_joboutput(outputs[0], tempout)

        for callback in callbacks:
            subtask(callback).delay()

        return {"success": True, "message": "extractfragment successful"}
    except Exception as e:
        logging.info("Error with extractfragment %s." % e)
        raise e
Example #3
def _unlock_chord(setid, callback, interval=1, max_retries=None):
    result = TaskSetResult.restore(setid)
    if result.ready():
        subtask(callback).delay(result.join())
        result.delete()
    else:
        _unlock_chord.retry(countdown=interval, max_retries=max_retries)
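This is the polling half of Celery's chord primitive: the task retries on a countdown until every member of the task set has finished, then delivers the joined results to the callback. A minimal sketch of the user-facing side on the same era's API, assuming add and tsum are registered tasks:

from celery.task import chord

# tsum receives the list of all ten results once the unlock task sees the set ready.
result = chord(add.subtask((i, i)) for i in xrange(10))(tsum.subtask())
print result.get()  # 90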
Example #4
def thumboutput(inputs,outputs,options={},callbacks=[]):

    try:
        inputid = inputs[0]

        widthS = options["width"]
        heightS = options["height"]
        height = int(heightS)
        width  = int(widthS)

        from jobservice.models import JobOutput
        jo = JobOutput.objects.get(pk=inputid)
        path = jo.file.path

        logging.info("Creating %sx%s image for %s" % (width,height,inputid))

        image = _thumbimage(path,width,height)

        if image:

            if not _save_joboutput_thumb(inputid,image):
                thumboutput.retry([inputs,outputs,options,callbacks])

            logging.info("Thumbnail created %s" % (image))

            for callback in callbacks:
                subtask(callback).delay()

            return {"success":True,"message":"Thumbnail '%sx%s' successful"%(width,height)}
        else:
            raise Exception("Could not create image")

    except Exception as e:
        logging.info("Error with thumbimage %s" % e)
        raise e
Example #5
def import_sizes(provider_id, callback=None, **kwargs):
    logger = import_sizes.get_logger(**kwargs)
    prov = Provider.objects.get(id=provider_id)
    logger.debug('Importing sizes for provider %s...' % prov)
    prov.import_sizes()
    if callback:
        subtask(callback).delay(provider_id)
Example #6
def pluck_links_from_text(text, callback=None):
    """
    Given a string, returns a list of linkinfo dicts.
    Calls back on each link if callback is set.
    """
    result = []
    for candidate in re.finditer("\[[^]]*\]", text):
        if candidate.group().startswith('[http'):
            # we have a link!
            bunch = candidate.group().strip('[]').split(' ', 1)
            link_url = bunch[0]
            if len(bunch) == 1:
                link_text = ''
            else:
                link_text = bunch[1]

            linkinfo = {
                'url': link_url,
                'text': link_text,
            }

            if callback is not None:
                subtask(callback).delay(linkinfo)
            result.append(linkinfo)

    return result
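A usage sketch for the wiki-style link plucker, with a hypothetical per-link consumer task index_link; each linkinfo dict is returned and, when a callback is given, also dispatched asynchronously:

links = pluck_links_from_text('See [http://rocwiki.org RocWiki] for details',
                              callback=subtask(index_link))
# links == [{'url': 'http://rocwiki.org', 'text': 'RocWiki'}]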
Example #7
def check_robot_ok(url, callback=None):
    """
    Checks to see if we can crawl the url in question.
    """
    urlp = urlparse.urlparse(url)

    mc = memcache.Client(MEMCACHE)
    keyname = __name__ + '_robotstxt_' + urlp.netloc
    keyname = keyname.encode('ascii', 'ignore')
    robotstxt = mc.get(keyname)

    if not robotstxt:
        # No robots.txt on file within the past 24 hours; get one.
        # (Use a separate name so we don't clobber the url checked below.)
        robots_url = urlparse.urljoin(urlp.scheme + '://' + urlp.netloc, 'robots.txt')
        robotstxt, headers = fetch_url(robots_url)

        mc.set(keyname, robotstxt, time=time.time()+86400)

    # Use robotparser to evaluate the situation.
    rp = robotparser.RobotFileParser()
    rp.parse(robotstxt)
    result = rp.can_fetch(USERAGENT, url)

    if callback is not None:
        subtask(callback).delay(result)
    return result
Example #8
    def run(self, url, download_parent, file_number, **kwargs):
        self.debug_prefix = str(download_parent.id) + "_" + str(file_number)
        logger = logging.getLogger('ohdei.downloader.downloader.run')
        # Celery docs say this is deprecated and self.request.id should be used
        # but I can't get it to work...
        self.kwargs = kwargs
        if file_number == 1:
            logger.debug("%s: main file for Download %d" % (self.debug_prefix, download_parent.id, ))
        file = File(task_id=kwargs["task_id"], url=url, download_parent=download_parent, file_number=file_number)
        file.save()
        logger.debug("%s: filename: %s, redirected_url: %s" % (self.debug_prefix, file.filename, file.redirected_url,))
        ret = self._download(file.filename, file.redirected_url)
        # this is hacky FIXME
        if ret:
            logger.debug("%s: download was aborted" % self.debug_prefix)
            return True
        elif ret is False:
            logger.debug("%s: download had an error" % self.debug_prefix)
            return False
        if file.is_html and file_number == 1: #FIXME not only on the first download
            logger.debug("%s: parsing HTML for images, css, etc" % self.debug_prefix)
            try:
                with open(file.filename, "r+") as temp:
                    soup = BeautifulSoup.BeautifulSoup(temp)
                    links, soup = self._parse(soup, file.redirected_url, file.download_parent)
                    temp.seek(0)
                    temp.write(str(soup))
                    for k, v in links.iteritems():
                        logger.debug("%s: launching download subtask %d: %s" % (self.debug_prefix, v, k, ))
                        subtask("ohdei.downloader.downloader.Downloader", url=k, download_parent=download_parent, file_number=v).delay()
            except HTMLParser.HTMLParseError, e:
                logger.debug("%s: error parsing HTML: %s" % (self.debug_prefix, e.value, ))
                return True
            logger.debug("%s: finished parsing HTML, all done" % self.debug_prefix)
Example #9
def md5fileverify(inputs,outputs,options={},callbacks=[]):

    """Verify the hex md5 digest of a Django FieldFile against the stored checksum"""
    try:
        mfileid = inputs[0]
        from dataservice.models import MFile
        mf = MFile.objects.get(id=mfileid)
        path = _get_mfile(mfileid)
        file = open(path,'r')
        md5 = hashlib.md5()
        while True:
            data = file.read(8192)  # multiple of 128 bytes is best
            if not data:
                break
            md5.update(data)
        file.close()
        calculated_md5 = md5.hexdigest()

        logging.info("Verify MD5 calculated %s" % calculated_md5)

        db_md5 = mf.checksum

        if db_md5 != calculated_md5:
            raise Exception("MD5 Verification Failed")

        for callback in callbacks:
            subtask(callback).delay()

        return {"message":"Verification of '%s' successful %s=%s" % (mf,db_md5,calculated_md5), "md5" : calculated_md5  }

    except Exception as e:
        logging.info("Error with md5fileverify %s" % e)
        raise e
Example #10
def ffmbc(inputs, outputs, options={}, callbacks=[]):
    try:
        mfileid = inputs[0]
        videopath = _get_mfile(mfileid)

        tempout = tempfile.NamedTemporaryFile()
        logging.info("temp file: %s" % tempout.name)

        ffmpeg_args = options["args"]

        # run ffmbc on the input with the caller-supplied arguments
        args = ["ffmbc -y -i", videopath, ffmpeg_args, tempout.name]
        cmd = " ".join(args)
        logging.info(cmd)
        p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE, close_fds=True)
        (stdout, stderr) = p.communicate()
        logging.info(stdout)

        if p.returncode != 0:
            raise Exception("Command %s exited with code %d. Stderr: %s" % (cmd, p.returncode, stderr))

        # make job outputs available
        _save_joboutput(outputs[0], tempout)

        for callback in callbacks:
            subtask(callback).delay()

        return {"success": True, "message": "ffmbc successful"}
    except Exception as e:
        logging.info("Error with ffmbc %s." % e)
        raise e
Example #11
def mimefile(inputs,outputs,options={},callbacks=[]):
    try:
        mfileid = inputs[0]
        path = _get_mfile(mfileid)
        m = magic.open(magic.MAGIC_MIME)
        m.load()

        upath = path.encode("utf-8")
        result = m.file(upath)
        mimetype = result.split(';')[0]

        from dataservice.models import MFile
        mf = MFile.objects.get(id=mfileid)
        mf.mimetype = mimetype
        mf.save()

        for callback in callbacks:
            logging.info("Mimefile callback - %s" % callback)
            subtask(callback).delay()

        return {"success":True,"message":"Mime detection successful", "mimetype" : mimetype}
    except Exception as e:
        logging.info("Error with mime %s" % e)
        import sys
        import traceback
        traceback.print_exc(file=sys.stdout)
        raise e
Example #12
    def on_success(self, retval, task_id, *args, **kwargs):
        """When the URLs are retrieved they will be imported
        into the 'urls' table, and the check will be executed by a dedicated task.
        """
        session_id = args[0][0].id
        out_directory = retval[1]
        hash_ = retval[0]

        # 1. get the session
        session = DBSession.query(ValidationSessionModel).filter(
            ValidationSessionModel.id == session_id
        ).one()
        # 2. change session status
        session.status = 2

        # 2.a create all tables or clean them
        create_or_clean_tables(session.code)

        # 3. Rebuild urls table
        urls_model = get_urls_model(session.code)
        fp = open('/'.join((out_directory, 'pages.csv')))
        for url in fp:
            record = urls_model(url=url.strip())
            DBSession.add(record)
        fp.close()
        transaction.commit()

        # 4. run validation subtask
        checking = CheckTask()
        subtask(checking).delay(hash_, session_id)
Example #13
def send_email(user, content, preview=False, callback=None):
    from_email = "Sorbet <*****@*****.**>"
    to_email = user.email

    if preview:
        subject = u"Sorbet preview for {0}".format(content.title)
        template = "feedmanager/email/feed_preview.html"
        items = content.item_set.order_by("-pubdate")[:5]
        context = {"feed": content, "items": items}
    else:
        if len(content) < 1:
            raise AssertionError("send_email called but no feeds passed")
        subject = u"Feed Updates from Sorbet"
        template = "feedmanager/email/new_items.html"
        context = {"feeds": content}

    html_content = render_to_string(template, context)
    text_content = strip_tags(html_content)

    msg = EmailMultiAlternatives(subject, text_content, from_email, [to_email])
    msg.attach_alternative(html_content, "text/html")
    msg.send()

    if callback:
        subtask(callback).delay()
Example #14
def d10mxfchecksum(inputs,outputs,options={},callbacks=[]):
    try:
        mfileid = inputs[0]
        joboutput = outputs[0]

        inputfile = _get_mfile(mfileid)
        outputfile = tempfile.NamedTemporaryFile()

        logging.info("Processing d10mxfchecksum job on %s" % (inputfile))

        if not os.path.exists(inputfile):
            logging.info("Input file %s does not exist" % (inputfile))
            return False

        args = ["d10sumchecker","-i",inputfile,"-o",outputfile.name]

        ret = subprocess.call(args)

        if ret != 0:
            raise Exception("d10mxfchecksum failed")

        outputfile.seek(0)
        suf = SimpleUploadedFile("mfile",outputfile.read(), content_type='text/plain')

        from jobservice.models import JobOutput
        jo = JobOutput.objects.get(id=joboutput)
        jo.file.save('d10mxfchecksum.txt', suf, save=True)

        for callback in callbacks:
            subtask(callback).delay()

        return {"success":True,"message":"d10mxfchecksum successful"}
    except Exception as e:
        logging.info("Error with d10mxfchecksum %s" % e)
        raise e
Example #15
def import_provider_info(provider_id, **kwargs):
    logger = import_provider_info.get_logger(**kwargs)
    prov = Provider.objects.get(id=provider_id)
    logger.debug('Importing info for provider %s...' % prov)
    import_images.delay(provider_id, callback=subtask(import_locations,
                                callback=subtask(import_sizes,
                                    callback=subtask(import_nodes))))
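The nested callbacks run import_images, then import_locations, then import_sizes, then import_nodes, each receiving provider_id (see import_sizes in Example #5). A hedged sketch of the same pipeline on later Celery (3.0+), using immutable signatures so each stage is handed provider_id explicitly:

from celery import chain

chain(import_images.si(provider_id),
      import_locations.si(provider_id),
      import_sizes.si(provider_id),
      import_nodes.si(provider_id)).apply_async()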
Example #16
def md5file(inputs,outputs,options={},callbacks=[]):

    """Return hex md5 digest for a Django FieldFile"""
    try:
        mfileid = inputs[0]
        path = _get_mfile(mfileid)
        file = open(path,'r')
        md5 = hashlib.md5()
        while True:
            data = file.read(8192)  # multiple of 128 bytes is best
            if not data:
                break
            md5.update(data)
        file.close()
        md5string = md5.hexdigest()
        logging.info("MD5 calculated %s" % (md5string))

        from dataservice.models import MFile
        _mf = MFile.objects.get(id=mfileid)
        _mf.checksum = md5string
        _mf.save()

        for callback in callbacks:
            logging.info("Running Callback %s" % callback)
            subtask(callback).delay()

        return {"success":True,"message":"MD5 successful", "md5" : md5string}
    except Exception, e:
        logging.info("Error with md5 %s" % e)
        raise
Example #17
def sha1file(inputs,outputs,options={},callbacks=[]):

    """Return hex sha1 digest for a Django FieldFile"""
    try:
        mfileid = inputs[0]
        path = _get_mfile(mfileid)
        file = open(path,'r')
        sha1 = hashlib.sha1()
        while True:
            data = file.read(8192)  # multiple of 128 bytes is best
            if not data:
                break
            sha1.update(data)
        file.close()
        sha1string = sha1.hexdigest()
        logging.info("SHA1 calculated %s" % (sha1string))

	# TODO: move to dataservice and store checksum in file?
        #from dataservice.models import MFile
        #_mf = MFile.objects.get(id=mfileid)
        #_mf.checksum = md5string
        #_mf.save()

        for callback in callbacks:
            logging.info("Running Callback %s" % callback)
            subtask(callback).delay()

        return {"success":True,"message":"SHA1 successful", "sha1" : sha1string}
    except Exception, e:
        logging.info("Error with sha1 %s" % e)
        raise e
Example #18
def find_links(doc_id,  doc_callback=None, callback_for_doc_callback=None,
               links_callback=None, callback_for_links_callback=None):
    link_single_re = re.compile(r"<a[^>]+href='([^']+)'")
    link_double_re = re.compile(r'<a[^>]+href="([^"]+)"')

    doc = models.Page.load(settings.DB, doc_id)
    if doc is None or not len(doc.content):
        return

    raw_links = set()

    try:
        for match in link_single_re.finditer(doc.content):
            raw_links.add(match.group(1))
        for match in link_double_re.finditer(doc.content):
            raw_links.add(match.group(1))
    except TypeError:
        # Content is not a string
        pass

    doc.links = []
    parseable_links = []
    parse = urlparse.urlparse(doc['url'])

    for link in raw_links:
        possible_paths = []
        if link.startswith('#') or link.startswith("//"):
            continue
        elif link.startswith('http://') or link.startswith('https://'):
            pass
        elif link.startswith('/'):
            possible_paths = parse.path.split('/')[:-1]
        else:
            link = '/' + link
            possible_paths = parse.path.split('/')[:-1]

        link, parseable = check(iri_to_uri(link.split("#")[0]), parse,
                                possible_paths)
        link and doc.links.append(link)
        if parseable:
            parseable_links.append(link)

    doc.store(settings.DB)

    if doc_callback is not None:
        subtask(doc_callback).delay(doc.id, callback=callback_for_doc_callback)

    for link in parseable_links:
        page = models.Page.get_by_url(link, update=False)
        if page is None and not links_callback is None:
            # Do I need a subtask or task here?
            links_callback.delay(link, callback=callback_for_links_callback)
        elif not doc_callback is None:
            subtask(doc_callback).delay(page.id,
                callback=callback_for_doc_callback)
    else:
        # Useful for testing
        if links_callback is None:
            return doc.links, parseable_links
Example #19
def unlock_chord(setid, callback, interval=1, propagate=False,
        max_retries=None, result=None):
    result = _res.TaskSetResult(setid, map(_res.AsyncResult, result))
    j = result.join_native if result.supports_native_join else result.join
    if result.ready():
        subtask(callback).delay(j(propagate=propagate))
    else:
        unlock_chord.retry(countdown=interval, max_retries=max_retries)
Example #20
def _unlock_chord(setid, callback, interval=1, propagate=False,
        max_retries=None):
    result = TaskSetResult.restore(setid)
    if result.ready():
        subtask(callback).delay(result.join(propagate=propagate))
        result.delete()
    else:
        _unlock_chord.retry(countdown=interval, max_retries=max_retries)
Example #21
    def on_chord_part_return(self, task, keyprefix="chord-unlock-%s"):
        setid = task.request.taskset
        key = keyprefix % setid
        deps = TaskSetResult.restore(setid, backend=task.backend)
        if self.client.incr(key) >= deps.total:
            subtask(task.request.chord).delay(deps.join())
            deps.delete()
        self.client.expire(key, 86400)
Example #23
def deploy(hosts, callback=puppet_run):
    puppet_dir = r'/pxeinstall/puppet/files'
    if isinstance(hosts, dict):
        #printlog(run_cmd("sudo sed -i 's/NODE/transfer/g' /etc/puppet/manifests/site.pp"))
        #for host in hosts["cc"] + hosts["nc"]:
        #    subtask(callback).delay(host)
        #printlog(run_cmd("sudo sed -i 's/transfer/NODE/g' /etc/puppet/manifests/site.pp"))
        #printlog("copy config files. ")
        #if os.path.exists(conf_path("localrc")):
            #clean the former config files
        #    try:
        #        clean_dir(puppet_folder)
        #        os.rename(conf_path("localrc"),os.path.join(puppet_folder,'localrc'))
        #    except OSError, err:
        #        pass
        #        printlog("Failed to move localrc to puppet folder with err %s " % err)
            #p = run_cmd("sudo cp %s /pxeinstall/puppet/files" % conf_path("localrc"))
            #printlog(p.communicate()[0])
        #else:
        #    printlog("localrc does not exist.")
        
        #if os.path.exists(conf_path("localnc")):
        #    p = run_cmd("sudo cp %s /pxeinstall/puppet/files" % abs_path("localnc"))
        #    printlog(p.communicate()[0])
        #else:
        #    printlog("localnc does not exist.")
        puppet_path = lambda hostname: os.path.join(puppet_dir,hostname)

        if hosts.has_key("cc"):
            printlog("copy config files... ")
#            clean_dir(puppet_dir)
#            transfer_configs(conf_path(hosts["cc"][0]),puppet_path(hosts["cc"][0]))
            
            printlog("add node cc to deploy.pp")
            if checkfile(hosts["cc"][0],r"/etc/puppet/manifests/deploy.pp") == -1:
                printlog(run_cmd("sudo echo -e node \"'%s.sh.intel.com'\" '{ \n  include deploy\n}' >> /etc/puppet/manifests/deploy.pp" % hosts["cc"][0]))

            ret = subtask(callback).delay(hosts["cc"][0])
            time.sleep(250)
            if ret.ready():
                printlog("succeeded to deploy cc.")
            else:
                printlog("failed to deploy cc on host %s " % hosts["cc"][0])
        
        if hosts.has_key("nc"):
            for host in hosts["nc"]:
                printlog("copy config file localnc_%s. " % host)
#                transfer_configs(conf_path(host),puppet_path(host))

                if checkfile(host,r"/etc/puppet/manifests/deploy.pp") == -1:
                    printlog(run_cmd("sudo echo -e node \"'%s.sh.intel.com'\" '{ \n  include deploy\n}' >> /etc/puppet/manifests/deploy.pp" % host))
                ret = subtask(callback).delay(host)
                if ret.ready():
                    printlog("succeeded to deploy nc on host %s." % host)
                else:
                    printlog("failed to deploy nc on host %s. " % host)
    else:
        printlog("hosts dict is incorrect. ")
Example #24
def fetch_contents(url, callback):
    
    contents, real_url = fetch_url(url)
    
    # stick the contents into the cache
    cache.put_contents(url, contents, real_url)
    
    # TODO: in celery 2.6, this is unnecessary - see http://ask.github.com/celery/whatsnew-2.6.html#group-chord-chain-are-now-subtasks
    subtask(callback).delay(contents, real_url)
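The TODO points at the Celery 2.6 canvas, where the manual callback can become a chain; a sketch under the assumption that fetch_contents is rewritten to return (contents, real_url) and that process_page is a hypothetical consumer task:

from celery import chain

# 2.6-style equivalent of fetch-then-callback:
chain(fetch_contents.s(url), process_page.s()).apply_async()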
Example #26
def decompress(content, extension='bz2', callback=None):
    "Decompresses a string.  Currently only does bzip2."
    if extension == 'bz2':
        decompress.update_state(state="UNBZIP2")
        out = bz2.decompress(content)
    else:
        out = content
    if callback is not None:
        subtask(callback).delay(out)
    return out
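update_state above publishes a custom UNBZIP2 state while the payload is being inflated; a caller-side sketch, assuming store_output is a registered consumer task:

# Hypothetical caller; store_output receives the decompressed bytes.
with open('page.bz2', 'rb') as f:
    res = decompress.delay(f.read(), extension='bz2', callback=subtask(store_output))
print res.state  # may briefly read "UNBZIP2" before "SUCCESS"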
Example #27
def compute_hazard_curve(job_id, site_list, realization, callback=None):
    """ Generate hazard curve for a given site list. """
    hazengine = job.Job.from_kvs(job_id)
    with mixins.Mixin(hazengine, hazjob.HazJobMixin, key="hazard"):
        keys = hazengine.compute_hazard_curve(site_list, realization)

        if callback:
            subtask(callback).delay(job_id, site_list)

        return keys
Example #28
def page_parser(url, depth=0):
    print 'Task {0} starts parsing : {1}'.format(depth, url)
    parser = WLParser()
    r = requests.get(url)
    page = r.text
    parser.feed(page)
    print 'Task {0}: {1} links found'.format(depth, len(parser.links))

    if (depth < 3):
        subtask(page_parser).delay(url, depth+1)
Example #29
def retrieve_page(url, callback=None):
    page = models.Page.get_by_url(url)
    if page is None or page.id is None:
        return

    if not callback is None:
        subtask(callback).delay(page.id, links_callback=retrieve_page,
                     callback_for_links_callback=find_links,
                     doc_callback=calculate_rank,
                     callback_for_doc_callback=calculate_rank)
Example #32
    def on_chord_part_return(self, task, propagate=False,
            keyprefix="chord-unlock-%s"):
        from celery.task.sets import subtask
        from celery.result import TaskSetResult
        setid = task.request.taskset
        key = keyprefix % setid
        deps = TaskSetResult.restore(setid, backend=task.backend)
        if self.client.incr(key) >= deps.total:
            subtask(task.request.chord).delay(deps.join(propagate=propagate))
            deps.delete()
        self.client.expire(key, 86400)
Example #33
def fetch(user_id, url, host, callback=None):
    try:
        video = _fetcher.fetch(user_id, url, host, fetch.get_logger())
        if callback is not None:
            subtask(callback).delay(video)

    except UrlNotSupported:
        pass

    except Exception, exc:
        fetch.retry(exc=exc)
Example #34
def f(cbs, cb, x):
    if x == 0:
        bV = cbs.pop()
        bVf = bV['func']
        bVV = bV['val']
        return subtask(bVf).delay(cbs, cb, bVV)
    else:
        cbs.append({
            'func':g,
            'val':x
        })
        return subtask(f).delay(cbs, cb, x-1)
Example #35
def fibo(cbs, x):
    if x == 0 or x == 1:
        bV = cbs.pop()
        bVf = bV['func']
        bVV = bV['val']
        subtask(bVf).delay(cbs, bVV)
    else:
        cbs.append({
            'func':gibo,
            'val':x-1
        })
        subtask(fibo).delay(cbs, x-1)
Example #36
    def on_chord_part_return(self,
                             task,
                             propagate=False,
                             keyprefix="chord-unlock-%s"):
        from celery.task.sets import subtask
        from celery.result import TaskSetResult
        setid = task.request.taskset
        key = keyprefix % setid
        deps = TaskSetResult.restore(setid, backend=task.backend)
        if self.client.incr(key) >= deps.total:
            subtask(task.request.chord).delay(deps.join(propagate=propagate))
            deps.delete()
        self.client.expire(key, 86400)
Example #37
def get_new_emails():
    """ Read new emails from an email server, and schedule them for delivery to Indivo.

    Parsing of the emails is handled in the subtask (deliver_email_to_indivo()).

    """

    # TODO
    logger = get_new_emails.get_logger()
    logger.info('getting new emails...')
    emails = ['a', 'b', 'c']

    # Schedule a task to deliver each message to Indivo
    for email in emails:
        subtask(deliver_email_to_indivo).delay(email)
Example #38
    def test_solr_ingestion_and_deletion(self):
        """Do items get added to the Solr index when they are ingested?"""
        site = test_opinion_scraper.Site().parse()
        path = os.path.join(settings.INSTALL_ROOT, 'alert',
                            site.download_urls[0])  # a simple PDF
        with open(path) as f:
            content = f.read()
            cf = ContentFile(content)
            extension = get_extension(content)
        cite = Citation()
        cite.save(index=False)
        docket = Docket(
            court=self.court,
            case_name=site.case_names[0],
        )
        docket.save()
        doc = Document(
            date_filed=site.case_dates[0],
            docket=docket,
            citation=cite,
        )
        file_name = trunc(site.case_names[0].lower(), 75) + extension
        doc.local_path.save(file_name, cf, save=False)
        doc.save(index=False)
        extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
        response = self.si.raw_query(**{
            'q': 'supreme',
            'caller': 'scraper_test',
        }).execute()
        count = response.result.numFound
        self.assertEqual(
            count, 1,
            "There were %s items found when there should have been 1" % count)
Example #39
def extract_from_pdf(doc, path, DEVNULL, callback=None):
    """ Extract text from pdfs.

    Here, we use pdftotext. If that fails, try to use tesseract under the
    assumption it's an image-based PDF. Once that is complete, we check for the
    letter e in our content. If it's not there, we try to fix the mojibake
    that ca9 sometimes creates.
    """
    process = subprocess.Popen(
        ["pdftotext", "-layout", "-enc", "UTF-8", path, "-"],
        shell=False,
        stdout=subprocess.PIPE,
        stderr=DEVNULL)
    content, err = process.communicate()
    if content.strip() == '' and callback:
        # probably an image PDF. Send it to OCR
        result = subtask(callback).delay(path)
        success, content = result.get()
        if success:
            doc.extracted_by_ocr = True
        elif content == '' or not success:
            content = 'Unable to extract document content.'
    elif 'e' not in content:
        # It's a corrupt PDF from ca9. Fix it.
        content = fix_mojibake(unicode(content, 'utf-8', errors='ignore'))

    return doc, content, err
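Unlike the fire-and-forget callbacks elsewhere in this list, extract_from_pdf blocks on the OCR subtask's result, so the calling worker holds its slot while OCR runs elsewhere. The call site mirrors Examples #38 and #44:

doc, content, err = extract_from_pdf(doc, path, DEVNULL,
                                     callback=subtask(extract_by_ocr))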
Example #40
    def run(self, flavor, repo_path, data):
        backend = load_backend(flavor, repo_path, cache=False)
        out = StringIO()
        proto = ReceivableProtocol(StringIO(data).read, out.write)
        handler = _ReceivePackHandler(WebBackend(), [backend],
                                      proto,
                                      stateless_rpc=True)
        handler.handle()

        sync_tasks = []
        for oldrev, newrev, name in handler._good_refs:
            if name.startswith('refs/heads/'):
                branch = name[11:]
                sync_tasks.append(
                    subtask(SyncTask,
                            args=[
                                backend.flavor, backend.path, oldrev, newrev,
                                branch
                            ]))

        if sync_tasks:
            taskset = TaskSet(tasks=sync_tasks)
            taskset.apply_async().join()

        return out.getvalue(), handler._good_refs
Example #41
    def save(self, *args, **kwargs):
        from tasks import encode_media, upload_media
        if not self.id:
            self.file_type = "audio"
        super(Audio, self).save(*args, **kwargs)
        if self.encode and (not self.encoded):
            encode_media.delay(self.id, callback=subtask(upload_media))
Example #42
def rebuild_repo(spec):
    from celery.task.sets import subtask

    from .models import BuildTask
    from irgsh_repo.tasks import RebuildRepo

    package = spec.package
    dist = spec.distribution.repo
    pkgdist = package.packagedistribution_set.get(distribution=dist)

    tasks = BuildTask.objects.filter(specification=spec) \
                             .filter(status=999) \
                             .select_related()
    task_arch_list = [(task.task_id, task.architecture.name) for task in tasks]

    task_name = RebuildRepo.name
    args = [
        spec.id, package.name, spec.version, dist.name, pkgdist.component.name,
        task_arch_list, spec.section, spec.priority
    ]
    kwargs = None
    opts = {
        'exchange': 'repo',
        'exchange_type': 'direct',
        'routing_key': 'repo'
    }

    s = subtask(task_name, args, kwargs, opts)
    return s.apply_async()
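Because the subtask here is built from the task's registered name plus explicit args, kwargs, and options, the sender never has to import RebuildRepo's module; the positional order is (task, args, kwargs, options), the same as in Example #59. A toy sketch with a hypothetical tasks.add:

s = subtask('tasks.add', [2, 2], None, {'routing_key': 'default'})
result = s.apply_async()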
Example #43
def check_sync(route_name=None, selected_routes=[]):
    from flowspec.models import Route, MatchPort, MatchDscp, ThenAction
    if not selected_routes:
        routes = Route.objects.all()
    else:
        routes = selected_routes
    if route_name:
        routes = routes.filter(name=route_name)
    for route in routes:
        if route.has_expired() and route.status not in ('EXPIRED', 'ADMININACTIVE', 'INACTIVE'):
            if route.status != 'ERROR':
                logger.info('Expiring %s route %s' % (route.status, route.name))
                subtask(delete).delay(route, reason="EXPIRED")
        else:
            if route.status != 'EXPIRED':
                route.check_sync()
Example #44
def extract_all_docs(docs):
    num_docs = docs.count()
    if num_docs == 0:
        print "Nothing to parse for this court."
    else:
        print "%s documents in this court." % (num_docs, )
        for doc in docs:
            extract_doc_content.delay(doc.pk, callback=subtask(extract_by_ocr))
Example #45
    def run(self, update_image_info, update_article_info, callback=None):
        image_instance_key = generate_image_instance_key(
            update_article_info.article_id, update_image_info.image_url)
        try:
            create_myimage_instance(update_article_info.user_id,
                                    image_instance_key,
                                    update_image_info.image_url,
                                    update_article_info.article_id)
        except Exception:
            MarkImagetobedoneHandler.delay(update_image_info,
                                           update_article_info)
        else:
            update_image_info.image_instance_key = image_instance_key
            # call next step
            subtask(callback).delay(update_image_info, update_article_info)

        return None
Example #46
    def test_is_JSON_serializable(self):
        s = MockTask.subtask((2, ), {"cache": True},
                {"routing_key": "CPU-bound"})
        s.args = list(s.args)                   # tuples are not preserved
                                                # but this doesn't matter.
        self.assertEqual(s,
                         subtask(anyjson.deserialize(
                             anyjson.serialize(s))))
Example #47
    def test_is_JSON_serializable(self):
        s = MockTask.subtask(
            (2, ),
            {'cache': True},
            {'routing_key': 'CPU-bound'},
        )
        s.args = list(s.args)  # tuples are not preserved
        # but this doesn't matter.
        self.assertEqual(s, subtask(anyjson.loads(anyjson.dumps(s))))
Example #48
    def save(self, make_thumbnail=True, *args, **kwargs):
        from tasks import encode_media, generate_thumbnail, upload_media
        if not self.id:
            self.file_type = "video"
        super(Video, self).save(*args, **kwargs)
        if self.encode and (not self.encoded):
            # encode then upload
            encode_media.delay(self.id, callback=subtask(upload_media))
        if self.auto_thumbnail and make_thumbnail:
            generate_thumbnail.delay(self.id)
Example #49
    def save(self, *args, **kwargs):
        super(Video, self).save(*args, **kwargs)

        from transcode.tasks import encode_video
        from transcode.tasks import upload_file
        if self.encode_status == 0:
            encode_video.delay(self.id, callback=subtask(upload_file))
        #if self.transfer_status == 0:
        #    print self.upload_cmd
        #    upload_file.delay(self.id)
Example #50
def get_new_emails():
    """ Read new emails from an email server, and schedule them for delivery to Indivo.

    Parsing of the emails is handled in the subtask (deliver_email_to_indivo()).

    """

    # TODO
    logger = get_new_emails.get_logger()
    logger.info('connecting to the mail server...')
    conn = mail_server_connect()

    try:
        logger.info('getting new emails...')
        typ, message_id_list = conn.search(None, 'UNSEEN')
        if typ != 'OK':
            raise MailServerException("Error reading new messages: %s" % message_id_list[0])
        
        message_ids = [m for m in message_id_list[0].split(" ") if m]
        logger.info('%s new messages found' % len(message_ids))
        for m_id in message_ids:
            logger.info('fetching message with id %s' % m_id)
            typ, msg_data = conn.fetch(m_id, '(RFC822)')
            if typ != 'OK':
                raise MailServerException("Error fetching message %s: %s" % (m_id, msg_data[0]))

            parsed_email = email.message_from_string(msg_data[0][1])
            if deliver_email_p(parsed_email):
                
                # Schedule a task to deliver the message to Indivo                
                logger.info('New email! scheduling for delivery...') 
                subtask(deliver_email_to_indivo).delay(parsed_email)

            else:
                logger.warning('Rejecting message from %s: Not in approved senders list' % parsed_email.get('From', ''))
    except conn.error as e:
        logger.error(str(e))
    finally:
        logger.info('disconnecting from mail server...')
        mail_server_disconnect(conn)
Example #51
def publish_entry(video_id, callback=None):
    video = Video.objects.get(pk=video_id)
    data = {}
    #for field, value in video:
    #    data[field] = value
    data['title'] = video.title
    data['url'] = video.file
    data['publisher'] = video.uploader
    data['pubdate'] = video.upload_datetime

    url = conf.PUBLISH_URL

    headers = {'User-Agent': 'test'}

    r = requests.post(url, data, headers=headers)

    if r.text == "0":
        video.publish_status = True
        video.save()

    if callback:
        subtask(callback).delay(video.id)
Example #52
def fetch_document(url, useragent, return_html=False):
    try:
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', useragent)]
        response = opener.open(url)
        html = response.read()
        links = subtask(extract_urls).apply_async([(url, html)])
        # avoid filling memory with useless html if we don't want it
        if return_html:
            return (url, html, links, len(html))
        return (url, "", links, len(html))
    except:
        return (url, "", FakeAsyncResult(result=set()), 0)
Example #53
def fixer(simulate=False, verbose=False):
    """OCR documents that lack content"""
    # docs = queryset_generator(Document.objects.filter(source='C', plain_text=''))
    # docs = Document.objects.raw('''select "pk"  from "Document" where "source" = 'C' and "plain_text" ~ '^[[:space:]]*$' ''')
    docs = Document.objects.raw(
        """select "pk" from "Document" where "source" = 'C' and "plain_text" = 'Unable to extract document content.' """
    )
    for doc in docs:
        if verbose:
            print "Fixing document number %s: %s" % (doc.pk, doc)

        if not simulate:
            # Extract the contents asynchronously.
            extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
Example #54
    def test_content_extraction(self):
        """Do all of the supported mimetypes get extracted to text
        successfully, including OCR?"""
        test_strings = [
            'supreme', 'intelligence', 'indiana', 'reagan', 'indiana',
            'fidelity'
        ]
        opinions = Opinion.objects.all()
        for op, test_string in zip(opinions, test_strings):
            ext = get_extension(op.local_path.file.read())
            op = extract_doc_content(op.pk, callback=subtask(extract_by_ocr))
            if ext in ['.html', '.wpd']:
                self.assertIn(test_string, op.html.lower())
            else:
                self.assertIn(test_string, op.plain_text.lower())
Example #55
def monitor_workflow( instance, connection, interval=5.0 ):
    '''
    Run and monitor a test workflow in Galaxy. 
    '''

    # create library and history
    library_id = connection.create_library(Site.objects.get_current().name + " Test Library - " + str( datetime.now() ) )    
    history_id = connection.create_history(Site.objects.get_current().name + " Test History - " + str( datetime.now() ) )
    
    workflow_task = subtask( run_workflow ).delay( instance, connection, library_id, history_id )
    
    while True:
        progress = connection.get_progress( history_id )
        monitor_workflow.update_state( state="PROGRESS", meta=progress )
        print  "Sleeping ..."
        time.sleep( interval );
        print  "Awake ..."
        print  "Workflow Task State: " + workflow_task.state + "\n"
        print  "Workflow State: " + progress["workflow_state"] + "\n"
                
        if workflow_task.state == "SUCCESS":
            print "Workflow task finished successfully."
            
            if progress["workflow_state"] == "ok":
                print "Workflow finished successfully. Stopping monitor ..."
                break

            if progress["workflow_state"] == "error":
                print "Workflow failed. Stopping monitor ..."
                break
             
            if progress["workflow_state"] == "queued":
                print "Workflow running."

            if progress["workflow_state"] == "new":
                print "Workflow being prepared."

        
        if workflow_task.state == "FAILURE":
            print "Workflow task failed . Stopping monitor ..."
            break
    
    # return the final state information  
    return progress
Example #56
    def encode_again(self, request, queryset):
        rows_updated = 0
        for media in queryset:
            if media.encode:
                rows_updated += 1
                encode_media.delay(media.id, callback=subtask(upload_media))
                media.encoded = False
                media.uploaded = False
                media.encoding = True
                media.save()
        if rows_updated == 1:
            message_bit = "Your file is"
        elif rows_updated > 1:
            message_bit = "Your files are"

        if rows_updated > 0:
            messages.success(
                request,
                "%s being encoded and uploaded.  An email notification will be sent when complete."
                % message_bit)
Example #57
    def test_content_extraction(self):
        """Do all of the supported mimetypes get extracted to text
        successfully, including OCR?"""
        site = test_opinion_scraper.Site().parse()

        test_strings = [
            'supreme', 'intelligence', 'indiana', 'reagan', 'indiana',
            'fidelity'
        ]
        for i in range(0, len(site.case_names)):
            path = os.path.join(settings.INSTALL_ROOT, 'alert',
                                site.download_urls[i])
            with open(path) as f:
                content = f.read()
                cf = ContentFile(content)
                extension = get_extension(content)
            cite = Citation()
            cite.save(index=False)
            docket = Docket(
                case_name=site.case_names[i],
                court=self.court,
            )
            docket.save()
            doc = Document(
                date_filed=site.case_dates[i],
                citation=cite,
                docket=docket,
            )
            file_name = trunc(site.case_names[i].lower(), 75) + extension
            doc.local_path.save(file_name, cf, save=False)
            doc.save(index=False)
            doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
            if extension in ['.html', '.wpd']:
                self.assertIn(test_strings[i], doc.html.lower())
            else:
                self.assertIn(test_strings[i], doc.plain_text.lower())

            doc.delete()
Example #58
    def test_task_argument_can_be_task_cls(self):
        s = subtask(MockTask, (2, 2))
        self.assertEqual(s.task, MockTask.name)
Example #59
    def test_behaves_like_type(self):
        s = subtask("tasks.add", (2, 2), {"cache": True},
                    {"routing_key": "CPU-bound"})
        self.assertDictEqual(subtask(s), s)
Example #60
def subtask(*args, **kwargs):
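    # Lazy indirection: import Celery's real subtask only at call time,
    # presumably to avoid a circular import when this module is loaded.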
    from celery.task.sets import subtask
    return subtask(*args, **kwargs)