Example #1
0
    def __init__(self, loop=None, target=None, name=None, args=(), kwargs=None):
        """Run *target* in a monitored child process.

        :param loop: IOLoop used by the watcher machinery; defaults to
            ``IOLoop.current(instance=False)``.
        :param target: callable to execute in the child process (required).
        :param name: optional name for the child process; the final name is
            read back from the created ``Process`` object.
        :param args: positional arguments forwarded to *target*.
        :param kwargs: keyword arguments forwarded to *target*; a fresh dict
            is created per call when omitted.
        :raises TypeError: if *target* is not callable.
        """
        if not callable(target):
            raise TypeError("`target` needs to be callable, not %r" %
                            (type(target), ))
        # Fresh dict per call -- the previous `kwargs={}` default was a single
        # mutable dict shared by every invocation.
        if kwargs is None:
            kwargs = {}
        self._state = _ProcessState()
        self._loop = loop or IOLoop.current(instance=False)

        # _keep_child_alive is the write side of a pipe, which, when it is
        # closed, causes the read side of the pipe to unblock for reading. Note
        # that it is never closed directly. The write side is closed by the
        # kernel when our process exits, or possibly by the garbage collector
        # closing the file descriptor when the last reference to
        # _keep_child_alive goes away. We can take advantage of this fact to
        # monitor from the child and exit when the parent goes away unexpectedly
        # (for example due to SIGKILL). This variable is otherwise unused except
        # for the assignment here.
        parent_alive_pipe, self._keep_child_alive = mp_context.Pipe(
            duplex=False)

        self._process = mp_context.Process(
            target=self._run,
            name=name,
            args=(target, args, kwargs, parent_alive_pipe,
                  self._keep_child_alive),
        )
        _dangling.add(self._process)
        self._name = self._process.name
        self._watch_q = PyQueue()
        self._exit_future = Future()
        self._exit_callback = None
        self._closed = False

        self._start_threads()
Example #2
0
    def __init__(self,
                 name,
                 providers=None,
                 push_q=None,
                 loop=None,
                 address=None,
                 mailbox_size=1000,
                 inbox=None,
                 empty_demand_logic="broadcast",
                 concurrency=None,
                 routing_logic="round_robin",
                 tick_delay=120):
        """
        Constructor

        :param name: Name of the actor
        :type name: str()
        :param providers: Upstream providers feeding this actor
        :type providers: list()
        :param push_q: Queue for the provider; a fresh ``SafeQ().async_q``
            is created per instance when omitted
        :type push_q: janus.Queue()
        :param loop: Asyncio loop for the actor; resolved via
            ``asyncio.get_event_loop()`` at construction time when omitted
        :type loop: AbstractEventLoop()
        :param address: Address for the actor
        :type address: str()
        :param mailbox_size: Size of the mailbox
        :type mailbox_size: int()
        :param inbox: Actor inbox; a fresh ``SafeQ().async_q`` is created
            per instance when omitted
        :type inbox: asyncio.Queue()
        :param empty_demand_logic: round_robin or broadcast
        :type empty_demand_logic: str()
        :param concurrency: The max concurrency in the system; defaults to
            ``cpu_count()``
        :type concurrency: int()
        :param routing_logic: round_robin or broadcast
        :type routing_logic: str()
        :param tick_delay: Seconds between pull ticks
        :type tick_delay: int()
        """
        # Resolve defaults at call time.  The previous signature evaluated
        # SafeQ() / get_event_loop() / cpu_count() once at import and shared
        # one mutable list and one queue across every instance of this class.
        if providers is None:
            providers = []
        if push_q is None:
            push_q = SafeQ().async_q
        if inbox is None:
            inbox = SafeQ().async_q
        if loop is None:
            loop = asyncio.get_event_loop()
        if concurrency is None:
            concurrency = cpu_count()
        super().__init__(name, loop, address, mailbox_size, inbox)
        self.push_q = push_q
        self.result_q = PyQueue()
        self.__subscribers = []
        self.__providers = providers
        self.__current_provider = 0
        self.__task_q = PyQueue()
        self.__empty_demand_logic = empty_demand_logic
        self.__concurrency = concurrency
        self.__routing_logic = routing_logic
        self.set_handlers()
        self.tick_delay = tick_delay
        self.run_on_empty(self.__concurrency)
        self.__pull_tick()
Example #3
0
    def __init__(self, name, providers=None, loop=None,
                 address=None, mailbox_size=1000, inbox=None,
                 empty_demand_logic="broadcast", concurrency=None,
                 tick_delay=120):
        """
        Constructor

        :param name: Name of the actor
        :type name: str()
        :param providers: Upstream providers feeding this actor
        :type providers: list()
        :param loop: Asyncio loop for the actor; resolved via
            ``asyncio.get_event_loop()`` at construction time when omitted
        :type loop: AbstractEventLoop()
        :param address: Address for the actor
        :type address: str()
        :param mailbox_size: Size of the mailbox
        :type mailbox_size: int()
        :param inbox: Actor inbox
        :type inbox: asyncio.Queue()
        :param empty_demand_logic: round_robin or broadcast
        :type empty_demand_logic: str()
        :param concurrency: Number concurrent tasks to run; defaults to
            ``cpu_count()``
        :type concurrency: int()
        :param tick_delay: Seconds between pull ticks
        :type tick_delay: int()
        """
        # Resolve defaults at call time.  The previous signature evaluated
        # get_event_loop() / cpu_count() once at import and shared a single
        # mutable providers list across every instance of this class.
        if providers is None:
            providers = []
        if loop is None:
            loop = asyncio.get_event_loop()
        if concurrency is None:
            concurrency = cpu_count()
        super().__init__(name, loop, address, mailbox_size, inbox)
        self.register_handler(Tick, self.__pull_tick)
        self.subscribers = []
        self.__providers = providers
        self.__current_provider = 0
        self.__task_q = PyQueue()
        self.__empty_logic = empty_demand_logic
        self.__result_q = PyQueue()
        self.router = None
        self.create_router(concurrency)
        self.set_handlers()
        self.tick_delay = tick_delay
        self.__concurrency = concurrency
        self.__pull_tick()
Example #4
0
def qumulo_treewalk(path, ip, ses, q_crawl, num_sep, level, batchsize, cliargs, logger, reindex_dict):
    """Walk a Qumulo filesystem tree via its API and enqueue batches of
    directory metadata for crawling.

    Spawns ``cliargs['walkthreads']`` worker threads that service the API
    walk, batches up (root, dirs, files) tuples, and enqueues them as
    ``scrape_tree_meta`` jobs on *q_crawl*.  A progress bar is shown unless
    quiet/debug/verbose output is requested.

    :param path: Root path to start the walk from.
    :param ip: Qumulo cluster address handed to the API walker threads.
    :param ses: Authenticated API session.
    :param q_crawl: Work queue (rq-style) receiving scrape_tree_meta jobs.
    :param num_sep: Path-separator count of the root path (maxdepth base).
    :param level: Max depth below the root when cliargs['maxdepth'] is set.
    :param batchsize: Directories per enqueued batch (may adapt at runtime).
    :param cliargs: Parsed CLI options dict.
    :param logger: Logger for progress/diagnostic messages.
    :param reindex_dict: Reindex lookup passed through to scrape_tree_meta.
    """
    batch = []
    dircount = 0
    totaldirs = 0
    totalfiles = 0
    starttime = time.time()

    # queue for paths
    q_paths = PyQueue()
    q_paths_results = PyQueue()
    lock = Lock()

    # set up threads for tree walk
    for i in range(cliargs['walkthreads']):
        t = Thread(target=apiwalk_worker, args=(ip, ses, q_paths, q_paths_results, lock,))
        t.daemon = True
        t.start()

    # set up progress bar
    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        widgets = [progressbar.AnimatedMarker(), ' Crawling (Queue: ', progressbar.Counter(),
                   progressbar.FormatLabel(''), ') ', progressbar.Timer()]

        bar = progressbar.ProgressBar(widgets=widgets, max_value=progressbar.UnknownLength)
        bar.start()
    else:
        bar = None

    bartimestamp = time.time()
    for root, dirs, files in qumulo_api_walk(path, ip, ses, q_paths, q_paths_results):
        dircount += 1
        totaldirs += 1
        files_len = len(files)
        dirs_len = len(dirs)
        totalfiles += files_len
        if dirs_len == 0 and files_len == 0 and not cliargs['indexemptydirs']:
            continue
        if root['path'] != '/':
            root_path = root['path'].rstrip(os.path.sep)
        else:
            root_path = root['path']
        if not dir_excluded(root_path, config, cliargs):
            batch.append((root, dirs, files))
            batch_len = len(batch)
            # NOTE(review): totalfiles is cumulative over the whole walk, so
            # once it crosses adaptivebatch_maxfiles every batch flushes
            # immediately -- confirm whether a per-batch counter was intended.
            if batch_len >= batchsize or (cliargs['adaptivebatch'] and totalfiles >= config['adaptivebatch_maxfiles']):
                q_crawl.enqueue(scrape_tree_meta, args=(batch, cliargs, reindex_dict,),
                                      result_ttl=config['redis_ttl'])
                if cliargs['debug'] or cliargs['verbose']:
                    logger.info("enqueued batchsize: %s (batchsize: %s)" % (batch_len, batchsize))
                del batch[:]
                if cliargs['adaptivebatch']:
                    batchsize = adaptive_batch(q_crawl, cliargs, batchsize)
                    if cliargs['debug'] or cliargs['verbose']:
                        logger.info("batchsize set to: %s" % batchsize)

            # check if at maxdepth level and delete dirs/files lists to not
            # descend further down the tree
            if cliargs['maxdepth']:
                num_sep_this = root_path.count(os.path.sep)
                if num_sep + level <= num_sep_this:
                    del dirs[:]
                    del files[:]

        else:  # directory excluded
            del dirs[:]
            del files[:]

        # update progress bar
        if bar:
            try:
                if time.time() - bartimestamp >= 2:
                    elapsed = round(time.time() - bartimestamp, 3)
                    dirspersec = round(dircount / elapsed, 3)
                    widgets[4] = progressbar.FormatLabel(', ' + str(dirspersec) + ' dirs/sec) ')
                    bartimestamp = time.time()
                    dircount = 0
                bar.update(len(q_crawl))
            except (ZeroDivisionError, ValueError):
                bar.update(0)

    # add any remaining in batch to queue (skip when nothing is left so we
    # don't enqueue a no-op job with an empty batch)
    if batch:
        q_crawl.enqueue(scrape_tree_meta, args=(batch, cliargs, reindex_dict,), result_ttl=config['redis_ttl'])

    # switch the progress bar to a bounded one showing queue drain progress
    if bar:
        bar.finish()
        bar_max_val = len(q_crawl)
        bar = progressbar.ProgressBar(max_value=bar_max_val)
        bar.start()

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q_crawl]):
        if bar:
            q_len = len(q_crawl)
            try:
                bar.update(bar_max_val - q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()

    elapsed = round(time.time() - starttime, 3)
    # guard against a sub-millisecond walk rounding elapsed down to 0.0,
    # which would raise ZeroDivisionError here (unlike the guarded division
    # inside the progress-bar loop above)
    dirspersec = round(totaldirs / elapsed, 3) if elapsed else 0.0

    logger.info("Finished crawling, elapsed time %s sec, dirs walked %s (%s dirs/sec)" %
                (elapsed, totaldirs, dirspersec))