Example 1
    def execute(self,
                test=False,
                threaded=False,
                concurrent=False,
                scope=None,
                chunk_size=10,
                *args,
                **kwargs):
        # target collection and GridFS handle for the stored files
        self.target = self.config.driverlicense.collection.data
        self.gfs = GridFS(self.target.connection[self.target.database])
        self.test = test
        if scope is None:
            files = self.gfs.list()
            if concurrent:
                # split the file list into chunks and enqueue one
                # ExtractFacts job per chunk
                chunks = [
                    files[i:i + chunk_size]
                    for i in range(0, len(files), chunk_size)
                ]
                self.logger.info("found [%d] files to extract in [%d] chunks",
                                 len(files), len(chunks))
                for launch in chunks:
                    enqueue(ExtractFacts, scope=launch, test=test)
            else:
                self.extract(files, threaded)
        else:
            self.extract(scope, threaded)
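A hypothetical call to kick off the concurrent extraction; the enqueue helper is the one used throughout these snippets (import path per core4os convention, treat it as an assumption), and the job's module path is a placeholder:

    from core4.queue.helper.functool import enqueue
    # hypothetical module path for the job class shown above
    from driverlicense.job import ExtractFacts

    enqueue(ExtractFacts, concurrent=True, chunk_size=20)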
Example 2
    def execute(self, test=False, threaded=False, concurrent=False, scope=None,
                chunk_size=10, *args, **kwargs):
        """
        :param test: if ``True``, do not write data to MongoDB
        :param threaded: if ``True``, use multiple threads
        :param concurrent: if ``True``, launch the job multiple times in
                           parallel
        :param scope: files to be processed
        :param chunk_size: number of files passed to each job if
                           ``concurrent`` is ``True``
        :param args: not used
        :param kwargs: not used
        :return: ``None``
        """
        # define the target database
        self.target = self.config.driverlicense.collection.data
        self.gfs = GridFS(self.target.connection[self.target.database])
        self.test = test
        if scope is None:
            # list the existing files in the database
            files = self.gfs.list()
            if concurrent:
                # split the files into chunks and enqueue one
                # ProcessFiles job per chunk
                chunks = [files[i:i + chunk_size]
                          for i in range(0, len(files), chunk_size)]
                self.logger.info("found [%d] files to extract in [%d] chunks",
                                 len(files), len(chunks))
                for launch in chunks:
                    enqueue(ProcessFiles, scope=launch, test=test)
            else:
                self.extract(files, threaded)
        else:
            self.extract(scope, threaded)
Example 3
    def execute(self,
                start=None,
                end=None,
                size=None,
                mid=None,
                count=None,
                **kwargs):
        if size:
            if self.trial == 1:
                # launching: enqueue one PrimeJob per chunk of ``size`` numbers
                for i in range(start, end, size):
                    e = min(i + size, end)
                    enqueue(PrimeJob,
                            start=i,
                            end=e,
                            mid=str(self._id),
                            **kwargs)
                self.defer("waiting")
            # monitoring: count the child jobs still in the queue
            running = self.config.sys.queue.count_documents(
                {"args.mid": str(self._id)})
            if running > 0:
                total = float(math.ceil((end - start + 1) / size))
                p = 1. - running / total
                self.progress(p, "%d of %d running", running, total)
                self.defer("check %f: waiting for %d of %d jobs to complete",
                           p, running, total)
            self.logger.info("check: seen all jobs complete")
            return

        # calculating: this is a child job working on the range [start, end)
        coll = self.config.mypro.prime_collection
        self.set_source(self.started_at.isoformat())
        n = 0
        for i in range(start, end):
            if check_prime(i):
                n += 1
                try:
                    coll.insert_one({"_id": i})
                except DuplicateKeyError:
                    # the prime has been recorded by an earlier run
                    pass
            self.progress(i / end)
        self.logger.debug("found [%d] primes", n)
Example 4
    def execute(self,
                count=20,
                prob_launch=0.5,
                max_launch=3,
                min_sleep=3,
                max_sleep=10,
                **kwargs):
        # simulate work with a random sleep
        sleep = random.randint(min_sleep, max_sleep)
        self.logger.info("sleeping [%d] seconds", sleep)
        time.sleep(sleep)
        if count > 0:
            # randomly re-enqueue up to ``count`` children of the same class;
            # each generation runs with ``count`` reduced by 2, so the swarm
            # eventually dies out
            n = random.randint(0, count)
            for i in range(n):
                if random.random() <= prob_launch:
                    enqueue(self.__class__,
                            count=count - 2,
                            prob_launch=prob_launch,
                            max_launch=max_launch,
                            id="%s.%d" % (self._id, i))
Example 5
    def execute(self, child=None):
        # find all current Swarm jobs of the meetup example project
        jobs = find_job(name=re.compile("^meetup.example.job.Swarm"))
        if len(jobs) > 5:
            # occasionally defer or fail to add noise to the swarm
            if random.random() > 0.95:
                self.defer()
            if random.random() > 0.95:
                raise RuntimeError()
            # stop spawning once the swarm has grown too large
            if len(jobs) > 200:
                return
        # track this job's position in the family tree, e.g. "0.2.1"
        if child is not None:
            child = child.split(".")
        else:
            child = [0]
        child.append(0)
        first = True
        for i in range(5):
            # always launch at least one child, further ones with 30% chance
            if first or random.random() < 0.3:
                first = False
                tgt = random.choice(JOBS)
                child[-1] = i
                enqueue(tgt, child=".".join(str(c) for c in child))
        time.sleep(3)
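The module-level JOBS list the snippet draws from is not shown; a minimal placeholder, assuming the swarm job classes live in the same meetup.example.job module:

    # hypothetical: the pool of job classes the swarm picks from; the
    # original populates this with the Swarm* classes of meetup.example.job
    JOBS = [SwarmJob]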
Example 6
    def download(self, test):
        """
        Retrieves the relevant links from the agof website and saves
        the URLs and the corresponding files in the database.

        :param test: if ``True``, do not write data to MongoDB
        :return: ``None``
        """
        # get the agof website's content (``url`` is defined elsewhere)
        rv = requests.get(url)
        body = rv.content.decode("utf-8")
        # extract the desired links from the content
        links = re.findall(r"href=[\"'](.+?)[\"']", body)
        xls_all = [
            href for href in links
            if href.endswith(".xls") or href.endswith(".xlsx")
        ]
        xls = [
            filename for filename in xls_all if "Angebote_Ranking" in filename
        ]
        self.logger.info("found [%d] xls/xlsx files", len(xls))

        downloaded = 0
        for n, link in enumerate(xls, start=1):
            # check if the file already exists in the database
            doc = self.gfs.find_one({"filename": link})
            if doc is None:
                # if not, download and save the file to MongoDB
                self.logger.info("download [%s]", link)
                rv = requests.get(link)
                if not test:
                    self.gfs.put(rv.content, filename=link)
                downloaded += 1
            self.progress(n / len(xls))
        self.logger.info("successfully retrieved [%d] of [%d] files",
                         downloaded, len(xls))
        enqueue(ProcessFiles, concurrent=True)
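The closing enqueue hands the stored files over to the ProcessFiles job shown in Example 2. The snippet also assumes a module-level url pointing at the agof page; a placeholder sketch (the real address is defined elsewhere in the original):

    # hypothetical placeholder; the original defines the actual agof page URL
    url = "https://www.agof.de/"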
Example 7
    def execute(self, jobs=2, subreddit='europe', **kwargs):
        # connect to the reddit API
        reddit = praw.Reddit(client_id=self.class_config.client_id,
                             client_secret=self.class_config.secret,
                             user_agent='core4os')

        # collect image links from the subreddit's current hot submissions
        images = []
        for submission in reddit.subreddit(subreddit).hot(limit=100):
            if re.match(r".*(jpg|gif|png)$", submission.url):
                images.append(
                    (submission.url, submission.title, submission.score))

        # store the work list in a control document shared by the downloaders
        self.class_config.reddit_coll.insert_one({
            "_id": "_control_" + subreddit,
            "images": images
        })

        # fan out the actual downloads to ``jobs`` RedditDownloader jobs
        for i in range(0, jobs):
            enqueue(RedditDownloader,
                    master=str(self._id),
                    id=subreddit + str(i),
                    subreddit=subreddit)
            time.sleep(1)
Example 8
    async def post(self):
        """
        Identify and store prime number in mongo collection ``prime`` from
        ``start`` to ``end`` using chunks of ``size``.
        """
        start = self.get_argument("start", as_type=int, default=None)
        end = self.get_argument("end", as_type=int, default=None)
        size = self.get_argument("size", as_type=int, default=None)
        if None in (start, end, size):
            return self.render("templates/prime.html", job_id=None)
        kwargs = {"start": start, "end": end, "size": size}
        job = enqueue(PrimeJob, **kwargs)
        if self.wants_html():
            return self.render("templates/prime.html", job_id=str(job._id))
        url = await self.reverse_url("JobStream", str(job._id))
        return self.redirect(url)


# http://devops:5001/core4/api/enter/bc8c3f196df700db3d1420a4d5a4d3b5?start=1&end=10000&size=500&content_type=json
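The commented URL above shows how the endpoint is addressed; an equivalent request from Python, using the same host, route and parameters:

    import requests

    # same route and parameters as in the sample URL above
    rv = requests.post(
        "http://devops:5001/core4/api/enter/bc8c3f196df700db3d1420a4d5a4d3b5",
        params={"start": 1, "end": 10000, "size": 500,
                "content_type": "json"})
    print(rv.status_code)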
Example 9
    def post(self):
        # enqueue a single SwarmJob and reply with its job id
        job = enqueue(SwarmJob)
        self.reply(job._id)
Example 10
    def execute(self, n=5, sleep=5, **kwargs):
        # enqueue ``n`` DummyJob instances, each with a distinct ``i``
        # argument so the enqueued jobs are unique
        for i in range(n):
            enqueue("core4.queue.helper.job.example.DummyJob",
                    sleep=sleep,
                    i=i)
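DummyJob ships with core4 and, as used here, simply sleeps for the given number of seconds. A hypothetical equivalent, to illustrate the minimal shape of such a job (class name and author value are placeholders):

    import time
    from core4.queue.job import CoreJob


    class MyDummyJob(CoreJob):
        # hypothetical stand-in for core4's stock DummyJob
        author = "xyz"  # placeholder; core4 jobs carry an author tag

        def execute(self, sleep=1, **kwargs):
            # pretend to work for ``sleep`` seconds
            time.sleep(sleep)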