Code example #1
File: th.py  Project: greydoubt/pypeln
    def __init__(self, maxsize, total_done, **kwargs):

        self.queue = Queue(maxsize=maxsize, **kwargs)
        self.lock = Lock()
        self.namespace = _get_namespace()
        self.namespace.remaining = total_done
Code example #2
def execute(trans, tool, mapping_params, history, rerun_remap_job_id=None, collection_info=None, workflow_invocation_uuid=None, invocation_step=None, max_num_jobs=None, job_callback=None, completed_jobs=None, workflow_resource_parameters=None):
    """
    Execute a tool and return an object containing a summary (output data,
    number of failures, etc.).
    """
    if max_num_jobs:
        assert invocation_step is not None
    if rerun_remap_job_id:
        assert invocation_step is None

    all_jobs_timer = ExecutionTimer()
    if invocation_step is None:
        execution_tracker = ToolExecutionTracker(trans, tool, mapping_params, collection_info)
    else:
        execution_tracker = WorkflowStepExecutionTracker(trans, tool, mapping_params, collection_info, invocation_step, job_callback=job_callback)
    app = trans.app
    execution_cache = ToolExecutionCache(trans)

    def execute_single_job(execution_slice, completed_job):
        job_timer = ExecutionTimer()
        params = execution_slice.param_combination
        if workflow_invocation_uuid:
            params['__workflow_invocation_uuid__'] = workflow_invocation_uuid
        elif '__workflow_invocation_uuid__' in params:
            # Only workflow invocation code gets to set this, ignore user supplied
            # values or rerun parameters.
            del params['__workflow_invocation_uuid__']
        if workflow_resource_parameters:
            params['__workflow_resource_params__'] = workflow_resource_parameters
        elif '__workflow_resource_params__' in params:
            # Only workflow invocation code gets to set this, ignore user supplied
            # values or rerun parameters.
            del params['__workflow_resource_params__']
        job, result = tool.handle_single_execution(trans, rerun_remap_job_id, execution_slice, history, execution_cache, completed_job)
        if job:
            message = EXECUTION_SUCCESS_MESSAGE % (tool.id, job.id, job_timer)
            log.debug(message)
            execution_tracker.record_success(execution_slice, job, result)
        else:
            execution_tracker.record_error(result)

    tool_action = tool.tool_action
    if hasattr(tool_action, "check_inputs_ready"):
        for params in execution_tracker.param_combinations:
            # This will throw an exception if the tool is not ready.
            tool_action.check_inputs_ready(
                tool,
                trans,
                params,
                history
            )

    execution_tracker.ensure_implicit_collections_populated(history, mapping_params.param_template)
    config = app.config
    burst_at = getattr(config, 'tool_submission_burst_at', 10)
    burst_threads = getattr(config, 'tool_submission_burst_threads', 1)

    job_count = len(execution_tracker.param_combinations)

    jobs_executed = 0
    has_remaining_jobs = False

    if (job_count < burst_at or burst_threads < 2):
        for i, execution_slice in enumerate(execution_tracker.new_execution_slices()):
            if max_num_jobs and jobs_executed >= max_num_jobs:
                has_remaining_jobs = True
                break
            else:
                execute_single_job(execution_slice, completed_jobs[i])
                jobs_executed += 1
    else:
        # TODO: re-record success...
        q = Queue()

        def worker():
            while True:
                execution_slice, completed_job = q.get()
                execute_single_job(execution_slice, completed_job)
                q.task_done()

        for i in range(burst_threads):
            t = Thread(target=worker)
            t.daemon = True
            t.start()

        for i, execution_slice in enumerate(execution_tracker.new_execution_slices()):
            if max_num_jobs and jobs_executed >= max_num_jobs:
                has_remaining_jobs = True
                break
            else:
                q.put((execution_slice, completed_jobs[i]))
                jobs_executed += 1

        q.join()

    if has_remaining_jobs:
        raise PartialJobExecution(execution_tracker)
    else:
        execution_tracker.finalize_dataset_collections(trans)

    log.debug("Executed %d job(s) for tool %s request: %s" % (job_count, tool.id, all_jobs_timer))
    return execution_tracker
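
A minimal, self-contained sketch of the burst pattern above (daemon workers draining a Queue of (execution_slice, completed_job) tuples, then q.join()); run_burst and run_one are illustrative names, not Galaxy APIs:

from threading import Thread
from queue import Queue   # the code above imports this via six.moves.queue

def run_burst(slices, completed_jobs, run_one, threads=4):
    """Run run_one(execution_slice, completed_job) over all slices with a small thread pool."""
    q = Queue()

    def worker():
        while True:
            execution_slice, completed_job = q.get()
            try:
                run_one(execution_slice, completed_job)
            finally:
                q.task_done()   # always mark the item done so q.join() can return

    for _ in range(threads):
        t = Thread(target=worker)
        t.daemon = True         # workers die with the program
        t.start()

    for i, execution_slice in enumerate(slices):
        q.put((execution_slice, completed_jobs[i]))

    q.join()                    # block until every queued slice has been processed

# e.g. run_burst(range(10), [None] * 10, lambda s, c: print(s), threads=2)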
Code example #3
    def __init__(self,
                 args,
                 stream_stdout=[],
                 stream_stderr=[],
                 buffer_size=0,
                 **kwargs):
        self.args = args

        # self.proc.stdout & self.proc.stderr are streams with process output
        kwargs['stdout'] = kwargs['stderr'] = subprocess.PIPE

        # On UNIX, close all file descriptors except 0, 1, 2 before child
        # process is executed. I've no idea why. Copied from
        # http://stackoverflow.com/a/4896288/100904
        kwargs['close_fds'] = 'posix' in sys.builtin_module_names

        self.proc = subprocess.Popen(args, **kwargs)  # nosec
        self.thread = {}  # Has the running threads
        self.future = {}  # Stores the futures indicating stream close
        self.loop = _get_current_ioloop()

        # Buffering has 2 modes. buffer_size='line' reads and writes line by line
        # buffer_size=<number> reads in byte chunks. Define the appropriate method
        if hasattr(buffer_size, 'lower') and 'line' in buffer_size.lower():

            def _write(stream, callbacks, future, retval):
                '''Call callbacks with content from stream. On EOF mark future as done'''
                while True:
                    content = stream.readline()
                    if len(content) > 0:
                        if isinstance(content, six.text_type):
                            content = content.encode('utf-8')
                        for callback in callbacks:
                            callback(content)
                    else:
                        stream.close()
                        break
                while self.proc.poll() is None:
                    time.sleep(MILLISECOND)
                self.loop.add_callback(future.set_result, retval())
        else:
            # If the buffer size is 0 or negative, use the default buffer size to read
            if buffer_size <= 0:
                buffer_size = io.DEFAULT_BUFFER_SIZE

            def _write(stream, callbacks, future, retval):
                '''Call callbacks with content from stream. On EOF mark future as done'''
                while True:
                    content = stream.read(buffer_size)
                    size = len(content)
                    if size > 0:
                        if isinstance(content, six.text_type):
                            content = content.encode('utf-8')
                        for callback in callbacks:
                            # This may raise a ValueError: write to closed file.
                            # TODO: decide how to handle it.
                            callback(content)
                    if size < buffer_size:
                        stream.close()
                        break
                while self.proc.poll() is None:
                    time.sleep(MILLISECOND)
                self.loop.add_callback(future.set_result, retval())

        callbacks_lookup = {'stdout': stream_stdout, 'stderr': stream_stderr}
        for stream in ('stdout', 'stderr'):
            callbacks = callbacks_lookup[stream]
            # If stream_stdout or stream_stderr are not defined, construct a
            # BytesIO and return its value when the stream is closed
            if not callbacks:
                ret_stream = io.BytesIO()
                callbacks = [ret_stream.write]
                retval = ret_stream.getvalue
            else:
                retval = lambda: b''  # noqa
            # If stream_stdout or stream_stderr contain strings like 'list_out' or
            # 'queue_err', create those attributes on self (a list or a Queue) and
            # stream output into them
            callbacks = list(callbacks) if isinstance(callbacks,
                                                      list) else [callbacks]
            for index, method in enumerate(callbacks):
                if isinstance(method, six.string_types):
                    if method.startswith('list_'):
                        if hasattr(self, method):
                            callbacks[index] = getattr(self, method).append
                        else:
                            log = []
                            setattr(self, method, log)
                            callbacks[index] = log.append
                    elif method.startswith('queue_'):
                        if hasattr(self, method):
                            callbacks[index] = getattr(self, method).put
                        else:
                            log = Queue()
                            setattr(self, method, log)
                            callbacks[index] = log.put
                    else:
                        raise ValueError('Invalid stream_%s: %s' %
                                         (stream, method))
            self.future[stream] = future = Future()
            # Thread writes from self.proc.stdout / stderr to appropriate callbacks
            self.thread[stream] = t = Thread(target=_write,
                                             name=f'cache.Subprocess: {args}',
                                             args=(getattr(self.proc, stream),
                                                   callbacks, future, retval))
            t.daemon = True  # Thread dies with the program
            t.start()
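
A reduced sketch of the same idea using only the standard library: one reader thread per pipe pushes output into a Queue until EOF; stream_command and pump are made-up names for illustration:

import subprocess
import sys
from threading import Thread
from queue import Queue

def stream_command(args):
    """Run a command and gather (stream_name, line) pairs via one reader thread per pipe."""
    proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out_q = Queue()

    def pump(stream, label):
        # readline() returns b'' at EOF, which ends the loop
        for line in iter(stream.readline, b''):
            out_q.put((label, line))
        stream.close()

    threads = [Thread(target=pump, args=(proc.stdout, 'stdout')),
               Thread(target=pump, args=(proc.stderr, 'stderr'))]
    for t in threads:
        t.daemon = True   # threads die with the program, as in the class above
        t.start()
    proc.wait()
    for t in threads:
        t.join()

    lines = []
    while not out_q.empty():
        lines.append(out_q.get_nowait())
    return lines

# e.g. stream_command([sys.executable, '-c', 'print("hello")'])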
Code example #4
def run_all_tests(tests, prefix, pb, options):
    """
    Uses a scatter-gather pattern over a thread pool to manage child workers.
    """
    qTasks, qResults = Queue(), Queue()

    workers = []
    watchdogs = []
    for _ in range(options.worker_count):
        qWatch = Queue()
        watcher = Thread(target=_do_watch, args=(qWatch, options.timeout))
        watcher.setDaemon(True)
        watcher.start()
        watchdogs.append(watcher)
        worker = Thread(
            target=_do_work,
            args=(
                qTasks,
                qResults,
                qWatch,
                prefix,
                options.run_skipped,
                options.timeout,
                options.show_cmd,
            ),
        )
        worker.setDaemon(True)
        worker.start()
        workers.append(worker)

    # Insert all jobs into the queue, followed by the queue-end
    # marker, one per worker. This will not block on growing the
    # queue, only on waiting for more items in the generator. The
    # workers are already started, however, so this will process as
    # fast as we can produce tests from the filesystem.
    def _do_push(num_workers, qTasks):
        for test in tests:
            qTasks.put(test)
        for _ in range(num_workers):
            qTasks.put(EndMarker)

    pusher = Thread(target=_do_push, args=(len(workers), qTasks))
    pusher.setDaemon(True)
    pusher.start()

    # Read from the results.
    ended = 0
    delay = ProgressBar.update_granularity().total_seconds()
    while ended < len(workers):
        try:
            result = qResults.get(block=True, timeout=delay)
            if result is EndMarker:
                ended += 1
            else:
                yield result
        except Empty:
            pb.poke()

    # Cleanup and exit.
    pusher.join()
    for worker in workers:
        worker.join()
    for watcher in watchdogs:
        watcher.join()
    assert qTasks.empty(), "Send queue not drained"
    assert qResults.empty(), "Result queue not drained"
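
The EndMarker handshake above can be sketched on its own, assuming a plain object() sentinel stands in for EndMarker; scatter_gather and END are illustrative names:

from threading import Thread
from queue import Queue

END = object()   # plays the role of EndMarker above

def scatter_gather(items, work, worker_count=4):
    """Yield work(item) for every item, using one sentinel per worker to detect completion."""
    tasks, results = Queue(), Queue()

    def worker():
        while True:
            item = tasks.get()
            if item is END:
                results.put(END)   # tell the consumer this worker has finished
                return
            results.put(work(item))

    workers = [Thread(target=worker) for _ in range(worker_count)]
    for w in workers:
        w.daemon = True
        w.start()

    for item in items:
        tasks.put(item)
    for _ in workers:
        tasks.put(END)             # one end marker per worker, as in _do_push above

    ended = 0
    while ended < len(workers):
        result = results.get()
        if result is END:
            ended += 1
        else:
            yield result
    for w in workers:
        w.join()

# e.g. sorted(scatter_gather(range(5), lambda x: x * x))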
Code example #5
class _RunBase(ChainedIdentity, HasRunPortal):

    _registered_kill_handlers = Queue()

    def __init__(self, experiment, run_id, outputs=None, logs=None,
                 _run_dto=None, _worker_pool=None, _user_agent=None, _ident=None,
                 _batch_upload_metrics=True, py_wd=None, deny_list=None,
                 flush_eager=False, redirect_output_stream=True, **kwargs):
        """
        :param experiment: The experiment.
        :type experiment: azureml.core.experiment.Experiment
        :param run_id: The run id for the run.
        :type run_id: str
        :param outputs: The outputs to be tracked
        :type outputs:
        :param logs: The logs directory to be tracked
        :type logs:
        :param _worker_pool: The worker pool for async tasks
        :type _worker_pool: azureml._async.worker_pool.WorkerPool
        """
        # _worker_pool needed for backwards compat

        from azureml._run_impl.run_history_facade import RunHistoryFacade

        self._experiment = experiment

        self._run_id = run_id

        _ident = _ident if _ident is not None else ChainedIdentity.DELIM.join([self.__class__.__name__, self._run_id])

        # We need to do this in order to resolve the history object :(
        # Get rid of just name and pass the objects around BUT
        # TODO: Everything needs to use the *SAME PARAMETER NAMES*
        super(_RunBase, self).__init__(
            experiment=self._experiment,
            run_id=self._run_id,
            _ident=_ident,
            **kwargs)

        user_agent = _user_agent if _user_agent is not None else RUN_USER_AGENT

        # Create an outputs directory if one does not exist
        if outputs is not None:
            outputs = [outputs] if isinstance(outputs, str) else outputs
        else:
            outputs = []

        for output in outputs:
            try:
                os.makedirs(output)
            except OSError as exception:
                if exception.errno != errno.EEXIST:
                    raise

        py_wd = get_py_wd() if py_wd is None else py_wd

        self._client = RunHistoryFacade(self._experiment, self._run_id, RUN_ORIGIN, run_dto=_run_dto,
                                        worker_pool=self._experiment.workspace.service_context.worker_pool,
                                        outputs=outputs, py_wd=py_wd, deny_list=deny_list,
                                        user_agent=user_agent, _parent_logger=self._logger,
                                        _batch_upload_metrics=_batch_upload_metrics, flush_eager=flush_eager)

        # self._run_dto property does some time-expensive serialization
        # so just materialize it once for use to populate all other fields
        _run_dto_as_dict = self._run_dto

        self._root_run_id = _run_dto_as_dict["root_run_id"]
        self._outputs = outputs
        self._run_number = _string_to_int(_run_dto_as_dict["run_number"], "run number")
        self._run_source = _run_dto_as_dict.get("properties", {}).get("azureml.runsource", None)
        self._runtype = _run_dto_as_dict.get("run_type", self._run_source)
        self._run_name = _run_dto_as_dict.get("name", None)
        self._logger.debug("Constructing run from dto. type: %s, source: %s, props: %s",
                           self._runtype,
                           self._run_source,
                           _run_dto_as_dict.get("properties", {}))
        run_type_v2 = _run_dto_as_dict.get("run_type_v2", None)
        if run_type_v2:
            self._runtype_v2 = RunTypeV2(orchestrator=run_type_v2.get("orchestrator", None),
                                         traits=run_type_v2.get("traits", None))
        self._context_manager = RunContextManager(self, logs=logs,
                                                  heartbeat_enabled=_run_dto_as_dict.get("heartbeat_enabled", False),
                                                  _parent_logger=self._logger, py_wd=py_wd,
                                                  redirect_output_stream=redirect_output_stream)
        self._register_kill_handler(self._cleanup)

    @classmethod
    def get_docs_url(cls):
        return get_docs_url(cls)

    @property
    def _run_dto(self):
        """Return the internal representation of a run."""

        run_dto = self._client.run_dto

        if isinstance(run_dto, dict):
            self._logger.debug("Return run dto as existing dict")
            return run_dto
        else:
            return self._client.run.dto_to_dictionary(run_dto)

    def _get_base_info_dict(self):
        return OrderedDict([
            ('Experiment', self._experiment.name),
            ('Id', self._run_id),
            ('Type', self._runtype),
            ('Status', self._client.run_dto.status)
        ])

    def __str__(self):
        info = self._get_base_info_dict()
        formatted_info = ',\n'.join(["{}: {}".format(k, v) for k, v in info.items()])
        return "Run({0})".format(formatted_info)

    def __repr__(self):
        return self.__str__()

    def _repr_html_(self):
        info = self._get_base_info_dict()
        info.update([
            ('Details Page', make_link(self.get_portal_url(), "Link to Azure Machine Learning studio")),
            ('Docs Page', make_link(self.get_docs_url(), "Link to Documentation"))
        ])
        return to_html(info)

    def __enter__(self):
        return self._context_manager.__enter__()

    def __exit__(self, exit_type, value, traceback):
        return self._context_manager.__exit__(exit_type, value, traceback)

    def _heartbeat(self):
        self._client.run.post_event_heartbeat(HEARTBEAT_INTERVAL)

    @classmethod
    def _kill(cls, timeout=40):
        print("Cleaning up all outstanding Run operations, waiting {} seconds".format(timeout))
        handlers = []
        while True:
            try:
                handlers.append(cls._registered_kill_handlers.get_nowait())
            except Empty as e:
                break

        print("{} items cleaning up...".format(len(handlers)))
        start_time = time.time()
        end_time = start_time + timeout
        for handler in handlers:
            if time.time() > end_time:
                module_logger.warn("Could not clean up all items! Data loss might occur!")
                return
            handler(timeout)

        print("Cleanup took {} seconds".format(time.time() - start_time))

    @classmethod
    def _register_kill_handler(cls, work):
        cls._registered_kill_handlers.put(work)

    def _cleanup(self, timeout):
        # TODO: Structure this better once we know of more cases
        self._client.flush(timeout)
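
A small sketch of the class-level kill-handler queue pattern, under the assumption that handlers accept a remaining-time argument as _kill passes above; CleanupRegistry is a made-up name:

import time
from queue import Queue, Empty

class CleanupRegistry(object):
    """Collect cleanup callbacks in a class-level Queue and drain them on shutdown."""

    _handlers = Queue()             # shared by all instances, like _registered_kill_handlers above

    @classmethod
    def register(cls, handler):
        cls._handlers.put(handler)

    @classmethod
    def run_all(cls, timeout=40):
        deadline = time.time() + timeout
        while True:
            try:
                handler = cls._handlers.get_nowait()
            except Empty:
                break               # queue drained; every handler has been collected
            remaining = deadline - time.time()
            if remaining <= 0:
                break               # out of time; the remaining handlers are skipped
            handler(remaining)

# e.g. CleanupRegistry.register(lambda t: print("flushing, %.1fs left" % t)); CleanupRegistry.run_all(5)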
Code example #6
    def __init__(self, config, maxCores, maxMemory, maxDisk):

        # Limit to the smaller of the user-imposed limit and what we actually
        # have on this machine for each resource.
        #
        # If we don't have up to the limit of the resource (and the resource
        # isn't the unlimited sentinel), warn.
        if maxCores > self.numCores:
            if maxCores != sys.maxsize:
                # We have an actually specified limit and not the default
                log.warning(
                    'Not enough cores! User limited to %i but we only have %i.',
                    maxCores, self.numCores)
            maxCores = self.numCores
        if maxMemory > self.physicalMemory:
            if maxMemory != sys.maxsize:
                # We have an actually specified limit and not the default
                log.warning(
                    'Not enough memory! User limited to %i bytes but we only have %i bytes.',
                    maxMemory, self.physicalMemory)
            maxMemory = self.physicalMemory
        self.physicalDisk = toil.physicalDisk(config)
        if maxDisk > self.physicalDisk:
            if maxDisk != sys.maxsize:
                # We have an actually specified limit and not the default
                log.warning(
                    'Not enough disk space! User limited to %i bytes but we only have %i bytes.',
                    maxDisk, self.physicalDisk)
            maxDisk = self.physicalDisk

        super(SingleMachineBatchSystem, self).__init__(config, maxCores,
                                                       maxMemory, maxDisk)
        assert self.maxCores >= self.minCores
        assert self.maxMemory >= 1

        # The scale allows the user to apply a factor to each task's cores requirement, thereby
        # squeezing more tasks onto each core (scale < 1) or stretching tasks over more cores
        # (scale > 1).
        self.scale = config.scale

        if config.badWorker > 0 and config.debugWorker:
            # We can't throw SIGUSR1 at the worker because it is also going to
            # be the leader and/or test harness.
            raise RuntimeError(
                "Cannot use badWorker and debugWorker together; "
                "worker would have to kill the leader")

        self.debugWorker = config.debugWorker

        # A counter to generate job IDs and a lock to guard it
        self.jobIndex = 0
        self.jobIndexLock = Lock()

        # A dictionary mapping IDs of submitted jobs to the command line
        self.jobs = {}
        """
        :type: dict[str,toil.job.JobNode]
        """

        # A queue of jobs waiting to be executed. Consumed by the daddy thread.
        self.inputQueue = Queue()

        # A queue of finished jobs. Produced by the daddy thread.
        self.outputQueue = Queue()

        # A dictionary mapping IDs of currently running jobs to their Info objects
        self.runningJobs = {}
        """
        :type: dict[str,Info]
        """

        # These next two are only used outside debug-worker mode

        # A dict mapping PIDs to Popen objects for running jobs.
        # Jobs that don't fork are executed one at a time in the main thread.
        self.children = {}
        """
        :type: dict[int,subprocess.Popen]
        """
        # A dict mapping child PIDs to the Job IDs they are supposed to be running.
        self.childToJob = {}
        """
        :type: dict[int,str]
        """

        # A pool representing available CPU in units of minCores
        self.coreFractions = ResourcePool(
            int(old_div(self.maxCores, self.minCores)), 'cores')
        # A pool representing available memory in bytes
        self.memory = ResourcePool(self.maxMemory, 'memory')
        # A pool representing the available space in bytes
        self.disk = ResourcePool(self.maxDisk, 'disk')

        # If we can't schedule something, we fill this in with a reason why
        self.schedulingStatusMessage = None

        # We use this event to signal shutdown
        self.shuttingDown = Event()

        # A thread in charge of managing all our child processes.
        # Also takes care of resource accounting.
        self.daddyThread = None
        # If it breaks it will fill this in
        self.daddyException = None

        if self.debugWorker:
            log.debug('Started in worker debug mode.')
        else:
            self.daddyThread = Thread(target=self.daddy, daemon=True)
            self.daddyThread.start()
            log.debug('Started in normal mode.')
Code example #7
File: batchSystem.py  Project: mkiwala/toil
    def __init__(self, config, maxCores, maxMemory, maxDisk):
        super(MesosBatchSystem, self).__init__(config, maxCores, maxMemory, maxDisk)

        # The hot-deployed resource representing the user script. Will be passed along in every
        # Mesos task. Also see setUserScript().
        self.userScript = None
        """
        :type: toil.resource.Resource
        """

        # Dictionary of queues, which toil assigns jobs to. Each queue represents a job type,
        # defined by resource usage
        self.jobQueues = JobQueue()

        # Address of the Mesos master in the form host:port where host can be an IP or a hostname
        self.mesosMasterAddress = config.mesosMasterAddress

        # Written to when Mesos kills tasks, as directed by Toil
        self.killedJobIds = set()

        # The IDs of jobs to be killed
        self.killJobIds = set()

        # Contains jobs on which killBatchJobs was called, regardless of whether or not they
        # actually were killed or ended by themselves
        self.intendedKill = set()

        # Map of host address to job ids
        # this is somewhat redundant since Mesos returns the number of workers per
        # node. However, that information isn't guaranteed to reach the leader,
        # so we also track the state here. When the information is returned from
        # mesos, prefer that information over this attempt at state tracking.
        self.hostToJobIDs = {}

        # see self.setNodeFilter
        self.nodeFilter = []

        # Dict of launched jobIDs to TaskData objects
        self.runningJobMap = {}

        # Mesos has no easy way of getting a task's resources so we track them here
        self.taskResources = {}

        # Queue of jobs whose status has been updated, according to Mesos
        self.updatedJobsQueue = Queue()

        # The Mesos driver used by this scheduler
        self.driver = None

        # A dictionary mapping a node's IP to an ExecutorInfo object describing important
        # properties of our executor running on that node. Only an approximation of the truth.
        self.executors = {}

        # A set of Mesos slave IDs, one for each slave running on a non-preemptable node. Only an
        #  approximation of the truth. Recently launched nodes may be absent from this set for a
        # while and a node's absence from this set does not imply its preemptability. But it is
        # generally safer to assume a node is preemptable since non-preemptability is a stronger
        # requirement. If we tracked the set of preemptable nodes instead, we'd have to use
        # absence as an indicator of non-preemptability and could therefore be misled into
        # believing that a recently launched preemptable node was non-preemptable.
        self.nonPreemptableNodes = set()

        self.executor = self._buildExecutor()

        self.unusedJobID = itertools.count()
        self.lastReconciliation = time.time()
        self.reconciliationPeriod = 120

        # These control how frequently to log a message that would indicate if no jobs are
        # currently able to run on the offers given. This can happen if the cluster is busy
        # or if the nodes in the cluster simply don't have enough resources to run the jobs
        self.lastTimeOfferLogged = 0
        self.logPeriod = 30  # seconds

        self._startDriver()
Code example #8
 def test_item_is_none_when_timeout_is_hit(self):
     queue = Queue()
     generator = consume_queue(queue, False)
     assert next(generator) is None
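
The test suggests consume_queue yields None when a get times out; a hypothetical stand-in with that behavior might look like the following (an assumption about the semantics, not compose's actual implementation):

from queue import Queue, Empty

def consume_queue_sketch(queue, cascade_stop, timeout=0.1):
    """Hypothetical stand-in for consume_queue: yield queued items, or None when a get times out."""
    # cascade_stop is accepted only to mirror the call in the test; it is unused in this sketch.
    while True:
        try:
            yield queue.get(timeout=timeout)
        except Empty:
            yield None   # lets the caller notice the timeout, which is what the test asserts

# next(consume_queue_sketch(Queue(), False)) is None when nothing has been queued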
Code example #9
File: reader_v2.py  Project: wxrui/petastorm
    def __init__(self,
                 dataset_url,
                 schema_fields=None,
                 predicate=None,
                 rowgroup_selector=None,
                 num_epochs=1,
                 sequence=None,
                 cur_shard=None,
                 shard_count=None,
                 read_timeout_s=None,
                 cache=None,
                 loader_pool=None,
                 decoder_pool=None,
                 shuffling_queue=None,
                 shuffle_row_groups=True,
                 shuffle_row_drop_partitions=1,
                 pyarrow_filesystem=None,
                 hdfs_driver='libhdfs3'):
        """Initializes a reader object.

        :param dataset_url: a file path or a URL to a parquet directory,
                       e.g. 'hdfs://some_hdfs_cluster/user/yevgeni/parquet8', or '/tmp/mydataset'
                       or ``'s3://bucket/mydataset'``.
        :param schema_fields:
            Either list of unischema fields to subset, or None to read all fields.
            OR an NGram object, then it will return an NGram of the specified properties.
        :param predicate: instance of predicate object to filter rows to be returned by reader.
        :param rowgroup_selector: instance of row group selector object to select row groups to be read
        :param reader_pool: parallelization pool. ThreadPool(10) (10 threads) is used by default.
                       This pool is a custom implementation used to parallelize reading data from the dataset.
                       Any object from workers_pool package can be used (e.g. ProcessPool)
        :param num_epochs: An epoch is a single pass over all samples in the dataset. Setting num_epochs to 'None' will
                       result in an infinite number of epochs.
        :param sequence: This is deprecated. To use sequence/ngram, please supply the argument in schema_fields instead.
        :param cur_shard: An int denoting the current shard number. Each node reading a shard should
                       pass in a unique shard number in the range [0, shard_count).
                       shard count must be supplied as well.
        :param shard_count: An int denoting the number of shards to break this dataset into.
        :param read_timeout_s: A numeric with the amount of time in seconds you would like to give a read before it
                       times out and raises an EmptyResultError. Pass in None for an infinite timeout
        :param cache: An object conforming to `cache.CacheBase` interface. Before loading row groups from a parquet file
                       the Reader will attempt to load these values from cache. Caching is useful when communication
                       to the main data store is either slow or expensive and the local machine has large enough storage
                       to store entire dataset (or a partition of a dataset if num_training_partitions is used).
        :param decoder_pool: An instance of a concurrent.futures pool executor used for decoding. If None,
          a default ThreadPoolExecutor(5) will be used.
        :param loader_pool: An instance of a concurrent.futures pool executor used for loading row groups. If None,
          a default ThreadPoolExecutor(5) will be used.
        :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)

        By default, a `NullCache` implementation is used (i.e. no caching).
        """

        # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
        # 2. Get a list of all groups
        # 3. Filter rowgroups
        #    a. predicates
        #    b. row-group selector (our indexing mechanism)
        #    c. partition: used to get a subset of data for distributed training
        # 4. Launch a new thread running `worker_loop` function.

        if dataset_url is None or not isinstance(dataset_url,
                                                 six.string_types):
            raise ValueError("""dataset_url must be a string""")

        if not (isinstance(schema_fields, collections.Iterable)
                or isinstance(schema_fields, NGram) or schema_fields is None):
            raise ValueError(
                """Fields must be either None, an iterable collection of Unischema fields or an NGram
            object.""")

        if sequence is not None:
            raise ValueError(
                """'sequence' argument of Reader object is deprecated. Please pass an NGram instance to
            'schema_fields' argument instead.""")

        # Can not rely on a check in epochs.py since it runs on a separate thread. Inform user earlier about invalid
        # argument value.
        if num_epochs is not None and (not isinstance(num_epochs, int)
                                       or num_epochs < 1):
            raise ValueError('iterations must be positive integer or None')

        self.ngram = schema_fields if isinstance(schema_fields,
                                                 NGram) else None

        if self.ngram and not self.ngram.timestamp_overlap and shuffle_row_drop_partitions > 1:
            raise NotImplementedError(
                'Using timestamp_overlap=False is not implemented with'
                ' shuffle_options.shuffle_row_drop_partitions > 1')

        cache = cache or NullCache()
        dataset_url = dataset_url[:-1] if dataset_url[-1] == '/' else dataset_url

        # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
        logger.debug('dataset_url: %s', dataset_url)

        if pyarrow_filesystem is not None:
            filesystem = pyarrow_filesystem
            dataset_path = urlparse(dataset_url).path
        else:
            resolver = FilesystemResolver(dataset_url)
            filesystem = resolver.filesystem()
            dataset_path = resolver.get_dataset_path()

        self._dataset = pq.ParquetDataset(dataset_path,
                                          filesystem=filesystem,
                                          validate_schema=False)

        shuffle_row_drop_partitions = self._normalize_shuffle_options(
            shuffle_row_drop_partitions, self._dataset)

        stored_schema = infer_or_load_unischema(self._dataset)

        # Make a schema view (a view is a Unischema containing only a subset of fields).
        # Will raise an exception if invalid schema fields are in schema_fields
        fields = schema_fields if isinstance(schema_fields,
                                             collections.Iterable) else None
        self.schema = stored_schema.create_schema_view(
            fields) if fields else stored_schema

        # 2. Get a list of all groups
        row_groups = dataset_metadata.load_row_groups(self._dataset)

        # 3. Filter rowgroups
        filtered_row_groups, worker_predicate = self._filter_row_groups(
            self._dataset, row_groups, predicate, rowgroup_selector, cur_shard,
            shard_count)

        epoch_items = self._apply_row_drop_partition(
            filtered_row_groups, shuffle_row_drop_partitions)

        # 4. Launch a new thread running `worker_loop` function.
        def epochs_iterator():
            return epoch_generator(epoch_items, num_epochs, shuffle_row_groups)

        self._results_queue = Queue(_OUTPUT_QUEUE_SIZE)

        loader = RowGroupLoader(dataset_url,
                                self.schema,
                                self.ngram,
                                cache,
                                worker_predicate,
                                hdfs_driver=hdfs_driver)
        decoder = RowDecoder(self.schema, self.ngram)
        self._loader_pool = loader_pool or ThreadPoolExecutor(5)
        self._decoder_pool = decoder_pool or ThreadPoolExecutor(5)
        self._stop_flow_manager_event = threading.Event()
        self._diags = Counter()

        if not shuffling_queue:
            shuffling_queue = NoopShufflingBuffer()

        self._flow_manager_thread = threading.Thread(
            target=worker_loop,
            args=(epochs_iterator, self._loader_pool, loader,
                  self._decoder_pool, decoder, shuffling_queue,
                  self._results_queue, self._stop_flow_manager_event,
                  self._diags))
        self._flow_manager_thread.daemon = True
        self._flow_manager_thread.start()

        self._read_timeout_s = read_timeout_s
        self.batched_output = False
Code example #10
    def handle_one_request(self):
        """
        This is the main HTTP/2.0 Handler.

        When a browser opens a connection to the server on the HTTP/2.0 port,
        the server enters this handler, which initiates the h2 connection,
        keeps running for the duration of the interaction, and reads/writes
        directly from the socket.

        Because there can be multiple H2 connections active at the same
        time, a UUID is created for each so that it is easier to tell them apart in the logs.
        """

        config = H2Configuration(client_side=False)
        self.conn = H2ConnectionGuard(H2Connection(config=config))
        self.close_connection = False

        # Generate a UUID to make it easier to distinguish different H2 connection debug messages
        self.uid = str(uuid.uuid4())[:8]

        self.logger.debug('(%s) Initiating h2 Connection' % self.uid)

        with self.conn as connection:
            connection.initiate_connection()
            data = connection.data_to_send()
            window_size = connection.remote_settings.initial_window_size

        self.request.sendall(data)

        # Dict of { stream_id: (thread, queue) }
        stream_queues = {}

        try:
            while not self.close_connection:
                data = self.request.recv(window_size)
                if data == '':
                    self.logger.debug('(%s) Socket Closed' % self.uid)
                    self.close_connection = True
                    continue

                with self.conn as connection:
                    frames = connection.receive_data(data)
                    window_size = connection.remote_settings.initial_window_size

                self.logger.debug('(%s) Frames Received: ' % self.uid +
                                  str(frames))

                for frame in frames:
                    if isinstance(frame, ConnectionTerminated):
                        self.logger.debug(
                            '(%s) Connection terminated by remote peer ' %
                            self.uid)
                        self.close_connection = True

                        # Flood all the streams with connection terminated, this will cause them to stop
                        for stream_id, (thread,
                                        queue) in stream_queues.items():
                            queue.put(frame)

                    elif hasattr(frame, 'stream_id'):
                        if frame.stream_id not in stream_queues:
                            queue = Queue()
                            stream_queues[frame.stream_id] = (
                                self.start_stream_thread(frame, queue), queue)
                        stream_queues[frame.stream_id][1].put(frame)

                        if isinstance(frame, StreamEnded) or (hasattr(
                                frame, "stream_ended") and frame.stream_ended):
                            del stream_queues[frame.stream_id]

        except (socket.timeout, socket.error) as e:
            self.logger.error('(%s) Closing Connection - \n%s' %
                              (self.uid, str(e)))
            if not self.close_connection:
                self.close_connection = True
                for stream_id, (thread, queue) in stream_queues.items():
                    queue.put(None)
        except Exception as e:
            self.logger.error('(%s) Unexpected Error - \n%s' %
                              (self.uid, str(e)))
        finally:
            for stream_id, (thread, queue) in stream_queues.items():
                thread.join()
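
A compact sketch of the per-stream dispatch pattern above (one (thread, queue) pair per key, with None as the shutdown signal); StreamDispatcher is an illustrative name, not part of the handler's API:

from threading import Thread
from queue import Queue

class StreamDispatcher(object):
    """Route items to one worker thread per key, mirroring the stream_queues dict above."""

    def __init__(self, handler):
        self.handler = handler      # called as handler(key, item) on the stream's thread
        self.streams = {}           # key -> (thread, queue)

    def dispatch(self, key, item):
        if key not in self.streams:
            q = Queue()
            t = Thread(target=self._run, args=(key, q))
            t.daemon = True
            t.start()
            self.streams[key] = (t, q)
        self.streams[key][1].put(item)

    def _run(self, key, q):
        while True:
            item = q.get()
            if item is None:        # None doubles as the shutdown signal, as in the except branch above
                return
            self.handler(key, item)

    def close(self):
        for _, (t, q) in self.streams.items():
            q.put(None)
        for _, (t, q) in self.streams.items():
            t.join()

# e.g. d = StreamDispatcher(lambda key, frame: print(key, frame)); d.dispatch(1, 'HEADERS'); d.close()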
Code example #11
def parallel_execute(objects, obj_callable, msg_index, msg):
    """
    For a given list of objects, call the callable on each object in its own
    thread and report progress on the output stream as results come back.
    """
    stream = get_output_stream(sys.stdout)
    lines = []

    for obj in objects:
        write_out_msg(stream, lines, msg_index(obj), msg)

    q = Queue()

    def inner_execute_function(an_callable, parameter, msg_index):
        error = None
        try:
            result = an_callable(parameter)
        except APIError as e:
            error = e.explanation
            result = "error"
        except Exception as e:
            error = e
            result = 'unexpected_exception'

        q.put((msg_index, result, error))

    for an_object in objects:
        t = Thread(
            target=inner_execute_function,
            args=(obj_callable, an_object, msg_index(an_object)),
        )
        t.daemon = True
        t.start()

    done = 0
    errors = {}
    total_to_execute = len(objects)

    while done < total_to_execute:
        try:
            msg_index, result, error = q.get(timeout=1)

            if result == 'unexpected_exception':
                errors[msg_index] = result, error
            if result == 'error':
                errors[msg_index] = result, error
                write_out_msg(stream, lines, msg_index, msg, status='error')
            else:
                write_out_msg(stream, lines, msg_index, msg)
            done += 1
        except Empty:
            pass

    if not errors:
        return

    stream.write("\n")
    for msg_index, (result, error) in errors.items():
        stream.write("ERROR: for {}  {} \n".format(msg_index, error))
        if result == 'unexpected_exception':
            raise error
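
A standalone sketch of this fan-out/fan-in shape, assuming each worker reports an (index, result, error) tuple on a shared Queue; run_in_parallel is a made-up helper, not compose's API:

from threading import Thread
from queue import Queue, Empty

def run_in_parallel(items, func):
    """Call func on every item in its own thread; return ({index: result}, {index: error})."""
    items = list(items)
    q = Queue()

    def call(index, item):
        try:
            q.put((index, func(item), None))
        except Exception as exc:     # report the failure instead of silently losing the thread
            q.put((index, None, exc))

    for index, item in enumerate(items):
        t = Thread(target=call, args=(index, item))
        t.daemon = True
        t.start()

    results, errors, done = {}, {}, 0
    while done < len(items):
        try:
            index, result, error = q.get(timeout=1)
        except Empty:
            continue                 # nothing arrived within a second; keep waiting
        if error is not None:
            errors[index] = error
        else:
            results[index] = result
        done += 1
    return results, errors

# e.g. run_in_parallel([1, 2, 3], lambda x: x * x)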
Code example #12
File: cache_service.py  Project: pessom/grab
 def create_input_queue(self):
     return Queue()
Code example #13
File: auto_dataset.py  Project: bityangke/aetros-cli
def get_images(job_model, dataset, node, trainer):
    concurrent = 15

    from PIL import ImageFile
    if hasattr(ImageFile, 'LOAD_TRUNCATED_IMAGES'):
        ImageFile.LOAD_TRUNCATED_IMAGES = True

    q = Queue(concurrent)
    config = dataset['config']

    dir = trainer.job_model.get_dataset_downloads_dir(dataset)

    ensure_dir(dir)

    if 'classes' not in config or not config['classes']:
        print("Dataset %s does not contain any classes." % (dataset['id'], ))
        return {
            'X_train': np.array([]),
            'Y_train': np.array([]),
            'X_test': np.array([]),
            'Y_test': np.array([])
        }

    classes = config['classes']

    trainer.set_status('PREPARE_IMAGES')

    max = 0
    images = {}

    dataset_path = trainer.job_model.get_dataset_downloads_dir(dataset)
    meta_information_file = dataset_path + '/meta.json'

    classes_changed = False
    config_changed = False
    had_previous = False
    classes_md5 = hashlib.md5(
        json.dumps(classes,
                   default=invalid_json_values).encode('utf-8')).hexdigest()

    validationFactor = 0.2

    if os.path.isdir(dataset_path):
        if os.path.isfile(meta_information_file):
            with open(meta_information_file) as f:
                meta = json.load(f)
                if meta:
                    had_previous = True
                    if 'classes_md5' in meta and meta[
                            'classes_md5'] != classes_md5:
                        classes_changed = True

                    trigger_changed = [
                        'resize', 'resizeWidth', 'resizeHeight',
                        'resizeCompression'
                    ]
                    for i in trigger_changed:
                        if i in meta['config'] and i in config and meta[
                                'config'][i] != config[i]:
                            config_changed = True
                else:
                    config_changed = True
        else:
            config_changed = True

    need_download = classes_changed or config_changed

    if need_download:
        if had_previous:
            print("Reset dataset and re-download images to " + dir)
            if classes_changed:
                print(" .. because classes changed")
            if config_changed:
                print(" .. because settings changed")
        else:
            print("Download images to " + dir)

        resize = bool(get_option(config, 'resize', True))
        if resize:
            resizeSize = (int(get_option(config, 'resizeWidth', 64)),
                          int(get_option(config, 'resizeHeight', 64)))
            print(" .. with resizing to %dx%d " % resizeSize)

        # we need to download all images
        shutil.rmtree(dataset_path)

        controller = {'running': True}
        try:
            for category in classes:
                max += len(category['images'])

            for i in range(concurrent):
                t = ImageDownloaderWorker(q, trainer, dataset, max, images,
                                          controller)
                t.daemon = True
                t.start()

            for category_idx, category in enumerate(classes):
                for image in category['images']:
                    q.put([image, category_idx])

            q.join()
            controller['running'] = False

            def move_image(image, category='training'):
                if image['id'] in images and os.path.isfile(
                        images[image['id']]):
                    target_path = dataset_path + \
                        '/%s/category_%s/%s' % (category, category_idx,
                                                os.path.basename(images[image['id']]))
                    ensure_dir(os.path.dirname(target_path))
                    os.rename(images[image['id']], target_path)

            for category_idx, category in enumerate(classes):
                random.shuffle(category['images'])
                position = int(
                    math.ceil(len(category['images']) * validationFactor))

                ensure_dir(dataset_path + '/training')
                ensure_dir(dataset_path + '/validation')

                for image in category['images'][position:]:  # training data
                    if image['id'] in images and os.path.isfile(
                            images[image['id']]):
                        move_image(image, 'training')

                for image in category['images'][:position]:  # validation data
                    if image['id'] in images and os.path.isfile(
                            images[image['id']]):
                        move_image(image, 'validation')

            with open(meta_information_file, 'w') as f:
                meta = {
                    'loaded_at': classes_md5,
                    'classes_md5': classes_md5,
                    'config': config
                }
                json.dump(meta, f, default=invalid_json_values)

        except KeyboardInterrupt:
            controller['running'] = False
            sys.exit(1)
    else:
        print("Downloaded images up2date in " + dir)
        print(
            " - Remove this directory if you want to re-download all images of your dataset and re-shuffle training/validation images."
        )

    trainer.output_size = len(classes)
    trainer.set_status('LOAD IMAGE DONE')

    # change to type local_images
    dataset_transformed = dataset.copy()
    dataset_transformed['config']['path'] = dir

    all_memory = get_option(dataset['config'], 'allMemory', False, 'bool')

    if all_memory:
        return read_images_in_memory(job_model, dataset_transformed, node,
                                     trainer)
    else:
        return read_images_keras_generator(job_model, dataset_transformed,
                                           node, trainer)
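
The bounded-queue download pattern above can be reduced to a short sketch; download_all and fetch are placeholder names, and Queue(maxsize) throttles the producer just as Queue(concurrent) does above:

from threading import Thread
from queue import Queue

def download_all(urls, fetch, workers=15):
    """Fetch every URL through a bounded queue so the producer cannot race far ahead."""
    q = Queue(workers)              # maxsize throttles the producer, like Queue(concurrent) above
    controller = {'running': True}
    results = {}

    def worker():
        while controller['running']:
            url = q.get()
            try:
                results[url] = fetch(url)
            except Exception as exc:  # record the failure; a real downloader might retry instead
                results[url] = exc
            finally:
                q.task_done()

    for _ in range(workers):
        t = Thread(target=worker)
        t.daemon = True
        t.start()

    for url in urls:
        q.put(url)                  # blocks while the queue is full
    q.join()
    controller['running'] = False
    return results

# e.g. download_all(['a', 'b'], lambda u: u.upper(), workers=2)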
Code example #14
File: auto_dataset.py  Project: bityangke/aetros-cli
def read_images_in_memory(job_model, dataset, node, trainer):
    """
    Reads all images into memory and applies augmentation if enabled
    """
    concurrent = psutil.cpu_count()

    dataset_config = dataset['config']
    controller = {'running': True}
    config = dataset['config']  # TODO: config not used
    q = Queue(concurrent)

    result = {'X_train': [], 'Y_train': [], 'X_test': [], 'Y_test': []}

    images = []
    max = 0

    path = trainer.job_model.get_dataset_downloads_dir(dataset)
    if 'path' in dataset['config']:
        path = dataset['config']['path']

    classes_count = 0
    category_map = {}
    classes = []

    try:
        for i in range(concurrent):
            t = ImageReadWorker(q, job_model, node, path, images, controller)
            t.daemon = True
            t.start()

        for validation_or_training in ['validation', 'training']:
            if os.path.isdir(path + '/' + validation_or_training):
                for category_name in os.listdir(path + '/' +
                                                validation_or_training):
                    if os.path.isdir(path + '/' + validation_or_training +
                                     '/' + category_name):

                        if category_name not in category_map:
                            category_map[category_name] = classes_count
                            if 'classes' in dataset_config and 'category_' in category_name:
                                category_idx = int(
                                    category_name.replace('category_', ''))
                                category_map[category_name] = category_idx
                                target_category = dataset_config['classes'][
                                    category_idx]
                                classes.append(target_category['title']
                                               or 'Class %s' %
                                               (category_idx, ))
                            else:
                                classes.append(category_name)

                            classes_count += 1

                        for id in os.listdir(path + '/' +
                                             validation_or_training + '/' +
                                             category_name):
                            file_path = os.path.join(path,
                                                     validation_or_training,
                                                     category_name, id)
                            q.put([
                                file_path,
                                validation_or_training == 'validation',
                                category_name
                            ])
                            max += 1

        q.join()
        controller['running'] = False

        train_images = []
        test_images = []

        for v in images:
            image, validation, category_dir = v
            if validation is True:
                test_images.append([image, category_map[category_dir]])
            else:
                train_images.append([image, category_map[category_dir]])

        train_datagen = None
        augmentation = bool(get_option(dataset_config, 'augmentation', False))
        if augmentation:
            train_datagen = get_image_data_augmentor_from_dataset(dataset)

        train = InMemoryDataGenerator(
            train_datagen, train_images, classes_count,
            job_model.job['config']['settings']['batchSize'])

        test = InMemoryDataGenerator(
            None, test_images, classes_count,
            job_model.job['config']['settings']['batchSize'])

        nb_sample = len(train_images)
        trainer.set_generator_training_nb(nb_sample)
        trainer.set_generator_validation_nb(len(test_images))

        print((
            "Found %d classes, %d images (%d in training [%saugmented], %d in validation). Read all images into memory from %s"
            %
            (classes_count, max, len(train_images),
             'not ' if augmentation is False else '', len(test_images), path)))

        if classes_count == 0:
            print(
                "Could not find any classes. Does the directory contains images?"
            )
            sys.exit(1)

        trainer.output_size = classes_count
        trainer.set_job_system_info('classes', classes)
        trainer.classes = classes

        result['X_train'] = train
        result['Y_train'] = train
        result['X_test'] = test
        result['Y_test'] = test

        return result

    except KeyboardInterrupt:
        controller['running'] = False
        sys.exit(1)
Code example #15
File: taskthreads.py  Project: cjwatson/zope.server
 def __init__(self):
     self.threads = {}  # { thread number -> 1 }
     self.queue = Queue()
     self.thread_mgmt_lock = threading.Lock()
Code example #16
File: follower.py  Project: wmsby/easytrader
    def __init__(self):
        self.trade_queue = Queue()
        self.expired_cmds = set()

        self.s = requests.Session()
Code example #17
File: audio.py  Project: bkerler/sahara_emulator
 def __init__(self, app, phy, tx_ep, rx_ep):
     self.app = app
     self.phy = phy
     self.tx_ep = tx_ep
     self.rx_ep = rx_ep
     self.txq = Queue()
Code example #18
File: collect_types.py  Project: elvslv/pyannotate
def type_consumer():
    # type: () -> None
    # Consumer loop (definition restored for context; the snippet referenced it via
    # Thread(target=type_consumer) below): drain _task_queue forever, recording
    # argument types and flushing signatures when returns arrive.
    while True:
        item = _task_queue.get()
        if isinstance(item, KeyAndTypes):
            if item.key in collected_args:
                # Previous call didn't get a corresponding return, perhaps because we
                # stopped collecting types in the middle of a call or because of
                # a recursive function.
                _flush_signature(item.key, UnknownType)
            collected_args[item.key] = ArgTypes(item.types)
        else:
            assert isinstance(item, KeyAndReturn)
            if item.key in collected_args:
                _flush_signature(item.key, item.return_type)
        _task_queue.task_done()


_task_queue = Queue()  # type: Queue[Union[KeyAndTypes, KeyAndReturn]]
_consumer_thread = Thread(target=type_consumer)
_consumer_thread.daemon = True
_consumer_thread.start()

running = False

TOP_DIR = os.path.join(os.getcwd(), '')     # current dir with trailing slash
TOP_DIR_DOT = os.path.join(TOP_DIR, '.')
TOP_DIR_LEN = len(TOP_DIR)


def _make_sampling_sequence(n):
    # type: (int) -> List[int]
    """
    Return a list containing the proposed call event sampling sequence.
Code example #19
File: scheduler.py  Project: zhaoduoyu/demo
 def __init__(self):
     self.q = Queue()
Code example #20
[server]
hostname = localhost
prefix = /
port = {port}
insecure = 1
proxy_hostname =

[rhsm]
consumerCertDir = {certdir}
""".format(port=self.port, certdir=base))

        rhsm_config.DEFAULT_CONFIG_PATH = config_name

        self.server.sam = self
        self.server.queue = queue

    def terminate(self):
        shutil.rmtree(self.tempdir)
        super(FakeSam, self).terminate()


if __name__ == '__main__':
    if len(sys.argv) >= 2:
        code = int(sys.argv[1])
    else:
        code = None
    from six.moves.queue import Queue
    q = Queue()
    f = FakeSam(q, port=8443, code=code, host='0.0.0.0')
    f.run()
Code example #21
def events_kinesis(network, access_key_id="", secret_access_key=""):
    """Yield a stream of events from a Parse.ly Kinesis Stream

    :param network: The Parse.ly network name for which to perform reads (eg
        "blog.parsely.com")
    :type network: str
    :param access_key_id: The AWS access key to use when consuming the stream
    :type access_key_id: str
    :param secret_access_key: The AWS secret key to use when consuming the stream
    :type secret_access_key: str
    """
    client = boto3.client(
        'kinesis',
        aws_access_key_id=access_key_id,
        aws_secret_access_key=secret_access_key
    )
    stream = "parsely-dw-{}".format(utils.clean_network(network))
    event_queue = Queue()

    def get_events(shard_id):
        response = client.get_shard_iterator(
            StreamName=stream,
            ShardId=shard_id,
            ShardIteratorType='LATEST'
        )
        iterator = response.get("ShardIterator", "")
        while True:
            response = {}
            try:
                response = client.get_records(ShardIterator=iterator)
            except (ClientError, ParamValidationError):
                time.sleep(2)
                continue
            iterator = response.get("NextShardIterator", "")
            records = response.get("Records", [])
            for record in records:
                event_data = record.get("Data")
                if event_data is not None:
                    try:
                        event_data = json.loads(event_data)
                    except ValueError:
                        continue
                    event_queue.put(event_data)

    workers = []
    description = {"HasMoreShards": True}
    while description.get("HasMoreShards", False):
        response = client.describe_stream(StreamName=stream)
        description = response.get('StreamDescription', {})
        shards = description.get('Shards', [])
        for shard in shards:
            worker = threading.Thread(target=get_events, args=(shard.get("ShardId"),))
            worker.daemon = True
            worker.start()
            workers.append(worker)

    while True:
        event = None
        try:
            event = event_queue.get(block=False, timeout=.01)
        except Empty:
            pass
        if event is not None:
            yield event
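
Note that get(block=False, timeout=.01) ignores the timeout, so the loop above polls as fast as it can; a hedged alternative sketch uses a blocking get with a short timeout instead (drain_forever is an illustrative name):

from queue import Queue, Empty

def drain_forever(event_queue):
    """Yield events as they arrive, sleeping inside get() instead of spinning."""
    while True:
        try:
            yield event_queue.get(block=True, timeout=0.25)
        except Empty:
            continue   # nothing arrived in the last 250 ms; try again

# e.g. q = Queue(); q.put({'action': 'pageview'}); next(drain_forever(q))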
Code example #22
def start_zeo_server(storage_conf=None,
                     zeo_conf=None,
                     port=None,
                     keep=False,
                     path='Data.fs',
                     protocol=None,
                     blob_dir=None,
                     suicide=True,
                     debug=False,
                     threaded=False,
                     start_timeout=33,
                     name=None,
                     log=None,
                     show_config=False):
    """Start a ZEO server in a separate process.

    Takes two positional arguments a string containing the storage conf
    and a ZEOConfig object.

    Returns the ZEO address, the test server address, the pid, and the path
    to the config file.
    """

    if not storage_conf:
        storage_conf = '<filestorage>\npath %s\n</filestorage>' % path

    if blob_dir:
        storage_conf = '<blobstorage>\nblob-dir %s\n%s\n</blobstorage>' % (
            blob_dir, storage_conf)

    if zeo_conf is None or isinstance(zeo_conf, dict):
        if port is None:
            port = 0

        if isinstance(port, int):
            addr = '127.0.0.1', port
        else:
            addr = port

        z = ZEOConfig(addr, log=log)
        if zeo_conf:
            z.__dict__.update(zeo_conf)
        zeo_conf = str(z)

    zeo_conf = str(zeo_conf) + '\n\n' + storage_conf
    if show_config:
        print(zeo_conf)

    # Store the config info in a temp file.
    tmpfile = tempfile.mktemp(".conf", dir=os.getcwd())
    fp = open(tmpfile, 'w')
    fp.write(zeo_conf)
    fp.close()

    if threaded:
        from threading import Thread
        from six.moves.queue import Queue
    else:
        from multiprocessing import Process as Thread
        Queue = ThreadlessQueue

    qin = Queue()
    qout = Queue()
    thread = Thread(
        target=runner,
        args=[tmpfile, qin, qout, 999 if suicide else None],
        kwargs=dict(debug=debug, name=name, protocol=protocol, keep=keep),
        name=None if name is None else name + '-server-runner',
    )
    thread.daemon = True
    thread.start()
    try:
        addr = qout.get(timeout=start_timeout)
    except Exception:
        whine("SERVER FAILED TO START")
        if thread.is_alive():
            whine("Server thread/process is still running")
        elif not threaded:
            whine("Exit status", thread.exitcode)
        raise

    def stop(stop_timeout=99):
        stop_runner(thread, tmpfile, qin, qout, stop_timeout)

    return addr, stop
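
A hedged usage sketch of the helper above; it assumes the surrounding ZODB test module (ZEOConfig, runner, stop_runner, whine, ThreadlessQueue) is importable, so treat it as an illustration rather than a drop-in script.

# Start a threaded ZEO server on an ephemeral port, then stop it via the
# returned callable.
addr, stop = start_zeo_server(path='Data.fs', port=0, threaded=True)
try:
    print("ZEO server listening on", addr)
finally:
    stop()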
Code example #23
File: log.py Project: zjjxxlgb/patroni
 def __init__(self):
     logging.Handler.__init__(self)
     self.queue = Queue()
     self._records_lost = 0
Code example #24
File: settings_view.py Project: wangyu78/piksi_tools
 def __init__(self, settings_view):
     self._settings_view = settings_view
     self._work_queue = Queue()
     self._worker = threading.Thread(target=self._work_thd)
     self._worker.daemon = True
     self._worker.start()
Code example #25
 def eventqueue(self, scheduler):
     from gevent.queue import Queue
     events = Queue()
     scheduler.add_listener(events.put)
     return events
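
A hedged sketch of how such a fixture is typically consumed in a test; the scheduler calls and the five-second timeout are assumptions, not part of the original code.

def test_listener_receives_event(scheduler, eventqueue):
    # The fixture wired scheduler.add_listener(events.put), so any event the
    # scheduler emits lands on the gevent Queue.
    scheduler.start()
    event = eventqueue.get(timeout=5)  # cooperative block, raises Empty on timeout
    assert event is not None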
Code example #26
 def __init__(self):
     self.received = Queue()
Code example #27
def test_instances_deployed(mock_get_paasta_api_client, mock__log):
    mock_paasta_api_client = Mock()
    mock_get_paasta_api_client.return_value = mock_paasta_api_client
    mock_paasta_api_client.service.status_instance.side_effect = \
        mock_status_instance_side_effect

    f = mark_for_deployment.instances_deployed
    e = Event()
    e.set()
    cluster_data = mark_for_deployment.ClusterData(cluster='cluster',
                                                   service='service1',
                                                   git_sha='somesha',
                                                   instances_queue=Queue())
    cluster_data.instances_queue.put('instance1')
    instances_out = Queue()
    f(cluster_data, instances_out, e)
    assert cluster_data.instances_queue.empty()
    assert instances_out.empty()

    cluster_data.instances_queue = Queue()
    cluster_data.instances_queue.put('instance1')
    cluster_data.instances_queue.put('instance2')
    instances_out = Queue()
    f(cluster_data, instances_out, e)
    assert cluster_data.instances_queue.empty()
    assert instances_out.get(block=True) == 'instance2'

    cluster_data.instances_queue = Queue()
    cluster_data.instances_queue.put('instance3')
    instances_out = Queue()
    f(cluster_data, instances_out, e)
    assert cluster_data.instances_queue.empty()
    assert instances_out.get(block=True) == 'instance3'

    cluster_data.instances_queue = Queue()
    cluster_data.instances_queue.put('instance4')
    instances_out = Queue()
    f(cluster_data, instances_out, e)
    assert cluster_data.instances_queue.empty()
    assert instances_out.get(block=True) == 'instance4'

    cluster_data.instances_queue = Queue()
    cluster_data.instances_queue.put('instance5')
    cluster_data.instances_queue.put('instance1')
    instances_out = Queue()
    f(cluster_data, instances_out, e)
    assert cluster_data.instances_queue.empty()
    assert instances_out.empty()

    cluster_data.instances_queue = Queue()
    cluster_data.instances_queue.put('instance6')
    instances_out = Queue()
    f(cluster_data, instances_out, e)
    assert cluster_data.instances_queue.empty()
    assert instances_out.get(block=False) == 'instance6'

    cluster_data.instances_queue = Queue()
    cluster_data.instances_queue.put('notaninstance')
    instances_out = Queue()
    f(cluster_data, instances_out, e)
    assert cluster_data.instances_queue.empty()
    assert instances_out.get(block=False) == 'notaninstance'

    cluster_data.instances_queue = Queue()
    cluster_data.instances_queue.put('api_error')
    instances_out = Queue()
    f(cluster_data, instances_out, e)
    assert cluster_data.instances_queue.empty()
    assert instances_out.get(block=False) == 'api_error'

    cluster_data.instances_queue = Queue()
    cluster_data.instances_queue.put('instance7')
    instances_out = Queue()
    f(cluster_data, instances_out, e)
    assert cluster_data.instances_queue.empty()
    assert instances_out.empty()

    cluster_data.instances_queue = Queue()
    cluster_data.instances_queue.put('instance8')
    instances_out = Queue()
    f(cluster_data, instances_out, e)
    assert cluster_data.instances_queue.empty()
    assert instances_out.empty()
Code example #28
File: detection_test.py Project: zhuMingXu/simpledet
    sym = pModel.test_symbol
    sym.save(pTest.model.prefix + "_test.json")

    image_sets = pDataset.image_set
    roidbs_all = [
        pkl.load(open("data/cache/{}.roidb".format(i), "rb"),
                 encoding="latin1") for i in image_sets
    ]
    roidbs_all = reduce(lambda x, y: x + y, roidbs_all)

    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval
    coco = COCO(pTest.coco.annotation)

    data_queue = Queue(100)
    result_queue = Queue()

    execs = []
    workers = []
    coco_result = []
    split_size = 1000

    for index_split in range(int(math.ceil(len(roidbs_all) / split_size))):
        print("evaluating [%d, %d)" % (index_split * split_size,
                                       (index_split + 1) * split_size))
        roidb = roidbs_all[index_split * split_size:(index_split + 1) *
                           split_size]
        roidb = pTest.process_roidb(roidb)
        for i, x in enumerate(roidb):
            x["rec_id"] = i
Code example #29
def daemon(args,
           restart=1,
           first_line=None,
           stream=True,
           timeout=5,
           buffer_size='line',
           **kwargs):
    '''
    This is the same as :py:class:`Subprocess`, but has a few additional checks.

    1. If we have already called :py:class:`Subprocess` with the same arguments,
       re-use the same instance.
    2. Send the process STDOUT and STDERR to this application's STDERR. This
       makes it easy to see what errors the application reports.
    3. Supports retry attempts.
    4. Checks whether the first line of output matches a string / regular
       expression -- ensuring that the application started properly.
    '''
    arg_str = args if isinstance(args, six.string_types) else ' '.join(args)
    try:
        key = cache_key(arg_str, kwargs)
    except (TypeError, ValueError):
        app_log.error('daemon args must be JSON serializable')
        raise
    # Send the stdout and stderr to (a) stderr AND to (b) a local queue we read
    queue = Queue(maxsize=10)
    for channel in ('stream_stdout', 'stream_stderr'):
        if channel not in kwargs:
            kwargs[channel] = []
        elif not isinstance(kwargs[channel], list):
            kwargs[channel] = [kwargs[channel]]
        if first_line:
            kwargs[channel].append(queue.put)
        if stream is True:
            kwargs[channel].append(_stderr_write)
        elif callable(stream):
            kwargs[channel].append(stream)
    # Buffer by line by default. This is required for the first_line check, but not otherwise.
    kwargs['buffer_size'] = buffer_size
    # started is set if we actually call Subprocess as part of this function
    started = False

    # If process was never started, start it
    if key not in _daemons:
        started = _daemons[key] = Subprocess(args, **kwargs)

    # Ensure that process is running. Restart if required
    proc = _daemons[key]
    restart = int(restart)
    while proc.proc.returncode is not None and restart > 0:
        restart -= 1
        proc = started = _daemons[key] = Subprocess(args, **kwargs)
    if proc.proc.returncode is not None:
        raise RuntimeError('Error %d starting %s' %
                           (proc.proc.returncode, arg_str))
    if started:
        app_log.info('Started: %s', arg_str)

    future = Future()
    # If process was started, wait until it has initialized. Else just return the proc
    if first_line and started:
        if isinstance(first_line, six.string_types):

            def check(proc):
                actual = queue.get(timeout=timeout).decode('utf-8')
                if first_line not in actual:
                    raise AssertionError('%s: wrong first line: %s (no "%s")' %
                                         (arg_str, actual, first_line))
        elif isinstance(first_line, _regex_type):

            def check(proc):
                actual = queue.get(timeout=timeout).decode('utf-8')
                if not first_line.search(actual):
                    raise AssertionError('%s: wrong first line: %s' %
                                         (arg_str, actual))
        elif callable(first_line):
            check = first_line
        loop = _get_current_ioloop()

        def checker(proc):
            try:
                check(proc)
            except Exception as e:
                loop.add_callback(future.set_exception, e)
            else:
                loop.add_callback(future.set_result, proc)

        proc._check_thread = t = Thread(target=checker, args=(proc, ))
        t.daemon = True  # Thread dies with the program
        t.start()
    else:
        future.set_result(proc)
    return future
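
A minimal usage sketch for daemon(); the command, the first_line pattern, and the coroutine context are hypothetical. The returned value is a Future that resolves to the underlying Subprocess once the first-line check passes.

async def start_dev_server():
    # Hypothetical call site inside a coroutine.
    proc = await daemon(
        ['python', '-m', 'http.server', '8000'],
        restart=2,                   # retry twice if the process has exited
        first_line='Serving HTTP',   # fail fast if the startup banner is wrong
    )
    return proc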
Code example #30
File: gadgetfs_phy.py Project: webstorage119/umap2
 def __init__(self, phy, ep):
     super(InEpThread, self).__init__(phy, ep)
     self.queue = Queue()