def execute(self, test=False, threaded=False, concurrent=False, scope=None,
            chunk_size=10, *args, **kwargs):
    """
    Extract facts from the driver-license files stored in GridFS.

    :param test: if ``True`` do not write data to MongoDB
    :param threaded: if ``True`` use multiple threads for extraction
    :param concurrent: if ``True`` split the files into chunks and
        enqueue one :class:`ExtractFacts` job per chunk
    :param scope: explicit list of files to process; ``None`` processes
        every file found in GridFS
    :param chunk_size: number of files handed to each concurrent job
    """
    # connect the target collection and its GridFS bucket
    self.target = self.config.driverlicense.collection.data
    self.gfs = GridFS(self.target.connection[self.target.database])
    self.test = test
    if scope is not None:
        # explicit scope given: extract just these files
        self.extract(scope, threaded)
        return
    # no scope: operate on all files currently stored in GridFS
    files = self.gfs.list()
    if not concurrent:
        self.extract(files, threaded)
        return
    # concurrent mode: fan out one job per chunk of files
    chunks = [files[pos:pos + chunk_size]
              for pos in range(0, len(files), chunk_size)]
    self.logger.info("found [%d] files to extract in [%d] chunks",
                     len(files), len(chunks))
    for launch in chunks:
        enqueue(ExtractFacts, scope=launch, test=test)
def execute(self, test=False, threaded=False, concurrent=False, scope=None,
            chunk_size=10, *args, **kwargs):
    """
    Process the files stored in GridFS, optionally fanning the work out
    to multiple concurrent jobs.

    :param test: control, if test don't write data to mongoDB
    :param threaded: boolean, use multiple threads or not
    :param concurrent: boolean, launch job multiple times in parallel
    :param scope: files to be processed
    :param chunk_size: size of files chunk to be passed to each job if
        concurrent is true
    :param args:
    :param kwargs:
    :return:
    """
    # define database
    self.target = self.config.driverlicense.collection.data
    self.gfs = GridFS(self.target.connection[self.target.database])
    self.test = test
    if scope is not None:
        # caller passed an explicit file scope
        self.extract(scope, threaded)
        return
    # list of existing files in the database
    files = self.gfs.list()
    if not concurrent:
        self.extract(files, threaded)
        return
    # define file chunks and enqueue one ProcessFiles job per chunk
    chunks = [files[pos:pos + chunk_size]
              for pos in range(0, len(files), chunk_size)]
    self.logger.info("found [%d] files to extract in [%d] chunks",
                     len(files), len(chunks))
    for launch in chunks:
        enqueue(ProcessFiles, scope=launch, test=test)
def execute(self, start=None, end=None, size=None, mid=None, count=None,
            **kwargs):
    """
    Find prime numbers in ``[start, end)`` and store them in MongoDB.

    If ``size`` is given the job acts as a master: on its first trial it
    splits the range into chunks of ``size`` and enqueues one worker
    :class:`PrimeJob` per chunk, then defers; on subsequent trials it
    monitors the workers until all are done.  Without ``size`` the job
    is a worker and tests each number of its range itself.

    :param start: first number of the range (inclusive)
    :param end: last number of the range (exclusive)
    :param size: chunk size; its presence switches master/worker mode
    :param mid: master job id, set by the master on each worker so the
        master can find them in ``sys.queue``
    :param count: unused, kept for interface compatibility
    """
    if size:
        if self.trial == 1:
            # launching: one worker per chunk, clamped to ``end``
            for i in range(start, end, size):
                enqueue(PrimeJob, start=i, end=min(i + size, end),
                        mid=str(self._id), **kwargs)
            self.defer("waiting")
        # monitoring: count workers still in the queue
        running = self.config.sys.queue.count_documents(
            {"args.mid": str(self._id)})
        if running > 0:
            total = float(math.ceil((end - start + 1) / size))
            p = 1. - running / total
            self.progress(p, "%d of %d running", running, total)
            self.defer("check %f: waiting for %d of %d jobs to complete",
                       p, running, total)
        self.logger.info("check: seen all jobs complete")
        return
    # calculating: worker mode, test each number in the range
    coll = self.config.mypro.prime_collection
    self.set_source(self.started_at.isoformat())
    n = 0
    for i in range(start, end):
        if check_prime(i):
            n += 1
            try:
                coll.insert_one({"_id": i})
            except DuplicateKeyError:
                # prime already recorded by an earlier run; ignore
                pass
        self.progress(i / end)
    self.logger.debug("found [%d] primes", n)
def execute(self, count=20, prob_launch=0.5, max_launch=3, min_sleep=3,
            max_sleep=10, **kwargs):
    """
    Sleep a random amount of time, then randomly spawn child jobs of the
    same class with a decreased ``count``.

    :param count: upper bound for the number of spawn attempts; children
        are enqueued with ``count - 2``
    :param prob_launch: probability that a single attempt actually
        enqueues a child job
    :param max_launch: passed through to children unchanged
    :param min_sleep: minimum number of seconds to sleep
    :param max_sleep: maximum number of seconds to sleep
    """
    pause = random.randint(min_sleep, max_sleep)
    self.logger.info("sleeping [%d] seconds", pause)
    time.sleep(pause)
    if count <= 0:
        return
    attempts = random.randint(0, count)
    for num in range(attempts):
        if random.random() > prob_launch:
            continue
        enqueue(self.__class__, count=count - 2, prob_launch=prob_launch,
                max_launch=max_launch, id="%s.%d" % (self._id, num))
def execute(self, child=None):
    """
    Randomly spawn further swarm jobs, occasionally deferring or failing
    on purpose while the swarm is busy.

    :param child: dotted lineage string identifying this job's position
        in the swarm tree; ``None`` for the root job
    """
    active = find_job(name=re.compile("^meetup.example.job.Swarm"))
    if len(active) > 5:
        # under swarm pressure sometimes defer or fail deliberately
        if random.random() > 0.95:
            self.defer()
        if random.random() > 0.95:
            raise RuntimeError()
    if len(active) > 200:
        # hard stop: the swarm has grown large enough
        return
    lineage = [0] if child is None else child.split(".")
    lineage.append(0)
    first = True
    for slot in range(5):
        if first or random.random() < 0.3:
            first = False
            target = JOBS[random.randint(0, len(JOBS) - 1)]
            lineage[-1] = slot
            enqueue(target, child=".".join(str(part) for part in lineage))
            time.sleep(3)
def download(self, test):
    """
    This function retrieves the urls from agof website, and saves the
    urls and corresponding files in the database.

    :param test: if ``True`` do not write the downloaded files to mongoDB
    :return:
    """
    # get agof website's content
    # NOTE(review): ``url`` is expected to be defined at module level —
    # not visible in this block, confirm against the file header
    rv = requests.get(url)
    body = rv.content.decode("utf-8")
    # extract all hrefs and keep only the Excel ranking files
    links = re.findall(r"href=[\"\'](.+?)[\"\']", body)
    xls = [href for href in links
           if href.endswith((".xls", ".xlsx"))
           and "Angebote_Ranking" in href]
    self.logger.info("found [%d] xlsx files", len(xls))
    download = 0
    for link in xls:
        # check if file already exists in the database
        doc = self.gfs.find_one({"filename": link})
        if doc is None:
            # if not, save the file to mongoDB (unless in test mode)
            self.logger.info("download [%s]", link)
            rv = requests.get(link)
            if not test:
                self.gfs.put(rv.content, filename=link)
            download += 1
        self.progress(download / len(xls))
    self.logger.info("successfully retrieved [%d] of [%d] files",
                     download, len(xls))
    # hand the stored files over to concurrent processing
    enqueue(ProcessFiles, concurrent=True)
def execute(self, jobs=2, subreddit='europe', **kwargs):
    """
    Collect image posts from a subreddit's hot listing, store the result
    in MongoDB and fan out downloader jobs.

    :param jobs: number of :class:`RedditDownloader` jobs to enqueue
    :param subreddit: name of the subreddit to scan
    """
    reddit = praw.Reddit(client_id=self.class_config.client_id,
                         client_secret=self.class_config.secret,
                         user_agent='core4os')
    # keep only submissions whose url ends in a known image extension
    images = [(post.url, post.title, post.score)
              for post in reddit.subreddit(subreddit).hot(limit=100)
              if re.match(r".*(jpg|gif|png)$", post.url)]
    self.class_config.reddit_coll.insert_one({
        "_id": "_control_" + subreddit,
        "images": images
    })
    # launch the downloader jobs, one second apart
    for num in range(0, jobs):
        enqueue(RedditDownloader, master=str(self._id),
                id=subreddit + str(num), subreddit=subreddit)
        time.sleep(1)
async def post(self):
    """
    Identify and store prime number in mongo collection ``prime`` from
    ``start`` to ``end`` using chunks of ``size``.
    """
    params = {
        name: self.get_argument(name, as_type=int, default=None)
        for name in ("start", "end", "size")
    }
    if None in params.values():
        # at least one parameter missing: render the form, no job
        return self.render("templates/prime.html", job_id=None)
    job = enqueue(PrimeJob, **params)
    if self.wants_html():
        return self.render("templates/prime.html", job_id=str(job._id))
    # non-HTML clients are redirected to the job's stream endpoint
    url = await self.reverse_url("JobStream", str(job._id))
    return self.redirect(url)
    # http://devops:5001/core4/api/enter/bc8c3f196df700db3d1420a4d5a4d3b5?start=1&end=10000&size=500&content_type=json
def post(self):
    """
    Enqueue a single swarm job and reply with its job id.
    """
    job = enqueue(SwarmJob)
    self.reply(job._id)
def execute(self, n=5, sleep=5, **kwargs):
    """
    Enqueue ``n`` example DummyJob instances.

    :param n: number of jobs to enqueue
    :param sleep: seconds each DummyJob sleeps
    """
    for num in range(n):
        enqueue("core4.queue.helper.job.example.DummyJob",
                sleep=sleep, i=num)