Example #1
    def __init__(self, env_fns, engine):
        # TODO: sharing CUDA tensors requires the 'spawn' or 'forkserver' start method, but these do not work with MPI
        # mp.set_start_method('spawn')
        self.engine = engine

        self.waiting = False
        self.closed = False
        self.nb_env = len(env_fns)

        self.remotes, self.work_remotes = zip(*[mp.Pipe() for _ in range(self.nb_env)])
        self.ps = [mp.Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
                   for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
        for p in self.ps:
            p.daemon = True  # if the main process crashes, we should not cause things to hang
            p.start()
        for remote in self.work_remotes:
            remote.close()

        self.remotes[0].send(('get_spaces', None))
        self._observation_space, self._action_space = self.remotes[0].recv()
        self.remotes[0].send(('get_processors', None))
        self._cpu_preprocessor, self._gpu_preprocessor = self.remotes[0].recv()

        shared_memories = []
        for remote in self.remotes:
            remote.send(('get_shared_memory', None))
            shared_memories.append(remote.recv())
        self.shared_memories = listd_to_dlist(shared_memories)
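
Example #1 references a worker function and a CloudpickleWrapper that are not shown. Below is a minimal sketch of the command loop implied by the pipe protocol above; the wrapper's .x attribute and the env attribute names (cpu_preprocessor, gpu_preprocessor, shared_memory) are assumptions, and the real worker likely also serves 'step'/'reset' commands:

def worker(remote, parent_remote, env_fn_wrapper):
    parent_remote.close()  # this end belongs to the parent process
    env = env_fn_wrapper.x()  # assumes CloudpickleWrapper stores the wrapped env_fn as .x
    while True:
        cmd, data = remote.recv()
        if cmd == 'get_spaces':
            remote.send((env.observation_space, env.action_space))
        elif cmd == 'get_processors':
            remote.send((env.cpu_preprocessor, env.gpu_preprocessor))  # assumed attributes
        elif cmd == 'get_shared_memory':
            remote.send(env.shared_memory)  # assumed attribute
        elif cmd == 'close':
            remote.close()
            break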
Example #2
    def load_model(self, model: Model, state: ModelState,
                   devices: list) -> RPCFuture[SetDeviceReturnType]:
        log_dir = model.config.get(LOGGING, {}).get(DIRECTORY, "")
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
            self.logger.info("log dir: %s", os.path.abspath(log_dir))

        self._start_logging_handler()
        incomplete_msg = get_error_msg_for_incomplete_config(model.config)
        if incomplete_msg:
            raise ValueError(incomplete_msg)

        # todo: move test_transforms elsewhere
        self.test_transforms = model.config.get(TESTING, {}).get(
            TRANSFORMS, {"Normalize": {}})

        if not devices:
            devices = ["cpu"]

        cuda_visible_devices, handler_devices = self.get_cuda_and_handler_device_names(
            devices)

        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(cuda_visible_devices)
        self.logger.info("Set CUDA_VISIBLE_DEVICES to '%s'",
                         os.environ["CUDA_VISIBLE_DEVICES"])

        server_conn, handler_conn = mp.Pipe()
        p = mp.Process(
            target=run_handler,
            name="Handler",
            kwargs={
                "conn": handler_conn,
                "config": model.config,
                "model_file": model.code,
                "model_state": state.model_state,
                "optimizer_state": state.optimizer_state,
                "log_queue": self.log_queue,
            },
        )
        try:
            p.start()
        except Exception as e:
            self.logger.error(e)
            err_fut = RPCFuture()
            err_fut.set_exception(e)
            return err_fut
        else:
            self.handler = create_client(IHandler, server_conn)
            try:
                tik_fut = self.handler.set_devices(handler_devices)
            except Exception as e:
                self.logger.exception("set_devices failed")
                err_fut = RPCFuture()
                err_fut.set_exception(e)
                return err_fut
            else:
                self.logger.info("got tik_fut")
                fut = tik_fut.map(convert_to_SetDeviceReturnType)
                self.logger.info("converted tik_fut")
                return fut
Example #3
def start(jobs: List[Job],
          cuda_device_ids: Union[List[int], List[List[int]]],
          order=True):
    """
    Start the given jobs on the list of cuda devices.
    Args:
        jobs (List[Job]): List of jobs to start.
        cuda_device_ids (Union[List[int], List[List[int]]]):
            List of available cuda devices on which the jobs will be distributed.
            The list can either consist of single devices ([0, 1, 2, 3]) or device pairs
            ([[0, 1], [2, 3]]).
    """
    assert len(jobs) > 0, "No jobs specified."
    assert len(cuda_device_ids) > 0, "No cuda devices specified."
    stdout = sys.__stdout__
    #mp.set_start_method('spawn') # THIS SHOULD BE IN THE MAIN FILE
    # Create queue
    logfile = 'multip.out'
    with open(logfile, "a") as f:
        print('\n', file=f)
        print('############## NEW SESSION ###############', file=f)
        print('DATE: ', datetime.datetime.now(), file=f)
        print('\n', file=f)
    queue = mp.Queue(maxsize=len(cuda_device_ids))

    # Put all devices into the queue
    for device_id in cuda_device_ids:
        queue.put(device_id)
    # Create processes
    processes, connections = [], []
    for i, job in enumerate(jobs):
        f = Target(job, logfile=logfile)
        parent_conn, child_conn = mp.Pipe()
        connections.append(parent_conn)
        p = mp.Process(target=f, args=(
            queue,
            child_conn,
        ))
        processes.append(p)
    # Start processes
    for i, p in enumerate(processes):
        p.start()
        with open(logfile, "a") as f:
            print(f"{i} Process {p.name} created, PID:{p.pid}", file=f)
    with open(logfile, "a") as f:
        print('- - - - -   Start Running - - - - -', file=f)
    for i, connection in enumerate(sysutil.progbar(connections)):
        cuda_device_id = queue.get()
        with open(logfile, "a") as f:
            print(
                f"{i} Process {processes[i].name} started, PID:{processes[i].pid}, CUDA:{cuda_device_id}",
                file=f)
        sys.stdout.flush()
        connection.send(cuda_device_id)

    # Join processes
    for p in processes:  #sysutil.progbar(processes):
        p.join()
    with open(logfile, "a") as f:
        print('############## SESSION ENDED ###############', file=f)
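
A call into start() might look like the following; the Job construction and the configs list are hypothetical, and, as the comment inside start() notes, the start method belongs in the main file:

if __name__ == '__main__':
    mp.set_start_method('spawn')
    jobs = [Job(cfg) for cfg in configs]  # hypothetical Job constructor
    start(jobs, cuda_device_ids=[[0, 1], [2, 3]])  # distribute the jobs over two device pairs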
Example #4
 def __init__(self, memory_size, batch_size):
     mp.Process.__init__(self)
     self.pipe, self.worker_pipe = mp.Pipe()
     self.memory_size = memory_size
     self.batch_size = batch_size
     self.cache_len = 2
     self.start()
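
This constructor (and the near-identical ones in Examples #5, #7, #9, #16 and #24 below) calls self.start() immediately, so the subclass must also define a run() that serves requests arriving on worker_pipe. A minimal sketch, assuming 'feed'/'sample'/'exit' opcodes and a hypothetical Replay storage class:

 def run(self):
     replay = Replay(self.memory_size, self.batch_size)  # hypothetical in-process storage
     while True:
         op, data = self.worker_pipe.recv()
         if op == 'feed':
             replay.feed(data)
         elif op == 'sample':
             self.worker_pipe.send(replay.sample())
         elif op == 'exit':
             self.worker_pipe.close()
             return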
Example #5
 def __init__(self, cls, *args, **kwargs):
     multip.Process.__init__(self)
     self.cls = cls
     self.args = args
     self.kwargs = kwargs
     self.pipe, self.worker_pipe = multip.Pipe()
     self.daemon = True
     self.start()
Example #6
    def init_network(self):
        for p_idx in range(self.num_processes):
            parent_conn, child_conn = mp.Pipe()
            self.parent_conns.append(parent_conn)
            self.child_conns.append(child_conn)

            process = mp.Process(target=self.step_task, args=(child_conn, self.tasks[p_idx]))
            self.processes.append(process)
Example #7
 def __init__(self, memory_size, batch_size, replay_type='default'):
     mp.Process.__init__(self)
     self.pipe, self.worker_pipe = mp.Pipe()
     self.memory_size = memory_size
     self.batch_size = batch_size
     self.cache_len = 2
     self.replay_type = replay_type
     self.start()
Example #8
def main():
    timestamp('frontend', 'start')

    # Load model list
    model_list_file_name = sys.argv[1]
    model_list = []
    with open(model_list_file_name) as f:
        for line in f.readlines():
            model_list.append(line.strip())

    # Warm up CUDA and allocate the shared memory cache
    # (allocate_shared_cache / send_shared_cache are provided by this project's
    # patched PyTorch build; they are not part of stock PyTorch)
    torch.randn(1024, device='cuda')
    torch.cuda.allocate_shared_cache()

    # Create workers
    num_workers = 2
    worker_list = []
    for _ in range(num_workers):
        p_parent, p_child = mp.Pipe()
        param_trans_parent, param_trans_child = mp.Pipe()
        term_parent, term_child = mp.Pipe()
        worker = WorkerProc(model_list, p_child, param_trans_child, term_child)
        worker.start()
        torch.cuda.send_shared_cache()
        worker_list.append((p_parent, worker, param_trans_parent, term_parent))
        timestamp('frontend', 'create_worker')

    # Create request queue and scheduler thread
    requests_queue = Queue()
    t_sch = FrontendScheduleThd(model_list, requests_queue, worker_list)
    t_sch.start()
    timestamp('frontend', 'start_schedule')

    # Accept connections
    server = TcpServer('localhost', 12345)
    timestamp('tcp', 'listen')
    while True:
        conn, _ = server.accept()
        agent = TcpAgent(conn)
        timestamp('tcp', 'connected')
        t_tcp = FrontendTcpThd(requests_queue, agent)
        t_tcp.start()

    # Wait for end (never reached: the accept loop above runs forever)
    t_sch.join()
Example #9
 def __init__(self, memory_size, batch_size, prioritize=False, alpha=0.5):
     mp.Process.__init__(self)
     self.pipe, self.worker_pipe = mp.Pipe()
     self.memory_size = memory_size
     self.batch_size = batch_size
     self.prioritize = prioritize
     self.alpha = alpha
     self.cache_len = 2
     self.start()
Example #10
 def start(self):
     self.conn, conn_child = mp.Pipe()
     self._process = mp.Process(target=self.worker,
                                args=(self.env_class,
                                      self._constructor_kwargs, conn_child))
     self._process.start()
     result = self.conn.recv()
     if isinstance(result, Exception):
         raise result
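
The worker side must send exactly one object right after construction, because start() blocks on conn.recv() and re-raises any Exception. A minimal sketch of that handshake; the command loop that would follow is omitted:

 def worker(self, env_class, constructor_kwargs, conn):
     try:
         env = env_class(**constructor_kwargs)
     except Exception as e:
         conn.send(e)  # the parent re-raises this
         return
     conn.send(None)  # any non-Exception object signals success
     # ... serve env commands over conn here ...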
Example #11
    def __init__(self, constructor):

        self._conn, conn = multiprocessing.Pipe()
        self._process = multiprocessing.Process(target=self._worker,
                                                args=(constructor, conn))
        atexit.register(self.close)
        self._process.start()
        self._states_shape = None
        self._observs_shape = None
Example #12
def run_parameter_sweep(parameters,data,args,Beta):
    output = []
    num_processes = torch.cuda.device_count()
    batches = int(len(parameters) / num_processes)
    idx = 0
    objectives = []
    bdivs = []
    val_objectives = []
    val_bdivs = []
    nsigs = []
    times = []
    while idx <= len(parameters)-num_processes:
        print(idx)
        pipe_list = []
        processes = []
        for rank in range(num_processes):
            recv_end, send_end = mp.Pipe(False)
            p = mp.Process(target=run_method_engine, args=(data, parameters.iloc[idx+rank]['a'], parameters.iloc[idx+rank]['phi'], parameters.iloc[idx+rank]['b'], Beta,
                                                   args.prior_on_W, args.prior_on_H, parameters.iloc[idx+rank]['K0'], args.tolerance, args.max_iter, args.use_val_set, send_end, rank,))
            pipe_list.append(recv_end)
            processes.append(p)
            p.start()

        result_list = [x.recv() for x in pipe_list]
        for p in processes:
            p.join()
        for i, res in enumerate(result_list):
            nsig = write_output(res[0], res[1], res[2], data.channel_names, data.sample_names,
                                args.output_dir, parameters['label'][idx + i])
            nsigs.append(nsig)
            objectives.append(res[3])
            bdivs.append(res[4])
            val_objectives.append(res[5])
            val_bdivs.append(res[6])
            times.append(res[7])
        idx += num_processes

    # Handle the remaining parameter rows that did not fill a full batch.
    while idx < len(parameters):
        print(idx)
        W, H, mask, cost, bdiv, val_cost, val_bdiv, time = run_method_engine(
            data, parameters.iloc[idx]['a'], parameters.iloc[idx]['phi'], parameters.iloc[idx]['b'],
            Beta, args.prior_on_W, args.prior_on_H, parameters.iloc[idx]['K0'], args.tolerance,
            args.max_iter, args.use_val_set)
        nsig = write_output(W, H, mask, data.channel_names, data.sample_names, args.output_dir,
                            parameters['label'][idx])
        times.append(time)
        nsigs.append(nsig)
        objectives.append(cost)
        val_objectives.append(val_cost)
        bdivs.append(bdiv)
        val_bdivs.append(val_bdiv)
        idx += 1
    parameters['nsigs'] = nsigs
    parameters['objective_trainset'] = objectives
    parameters['bdiv_trainset'] = bdivs
    parameters['objective_valset'] = val_objectives
    parameters['bdiv_valset'] = val_bdivs
    parameters['times'] = times
    parameters.to_csv(args.output_dir + '/parameters_with_results.txt',sep='\t',index=None)
Example #13
def train(args):
    num_workers = mp.cpu_count()
    print("Number of workers:", num_workers)

    net = learning.model.load_net('a3c')
    optim = torch.optim.Adam(net.parameters())

    start_sel = selectors.DefaultSelector()
    active = 0

    for i in range(num_workers):
        conn0, conn1 = mp.Pipe(True)
        mp.Process(target=worker, args=(conn1, i, args)).start()
        start_sel.register(conn0, selectors.EVENT_WRITE)
        active += 1

    iteration = 0

    running_sel = selectors.DefaultSelector()

    while active > 0:
        events = start_sel.select()

        for key, _ in events:
            conn = key.fileobj
            conn.send(net.state_dict())
            iteration += 1
            active -= 1

            start_sel.unregister(conn)
            running_sel.register(conn, selectors.EVENT_READ)

    active = num_workers

    while active > 0:
        print("Iteration:", iteration, "Active:", active)
        events = running_sel.select()

        for key, _ in events:
            conn = key.fileobj
            gradient = conn.recv()

            for param, grad in zip(net.parameters(), gradient):
                param.grad = grad

            optim.step()

            if iteration == args.rl_updates:
                conn.send(None)
                running_sel.unregister(conn)
                active -= 1
            else:
                conn.send(net.state_dict())
                iteration += 1

        learning.model.save_net(net, 'a3c')
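
For reference, a worker compatible with this driver alternates recv(state_dict) / send(gradients) and treats None as the shutdown signal. A minimal sketch; make_env and the rollout/backprop step are hypothetical:

def worker(conn, worker_id, args):
    env = make_env(args)  # hypothetical environment factory
    net = learning.model.load_net('a3c')
    while True:
        state_dict = conn.recv()
        if state_dict is None:  # shutdown signal sent by train()
            break
        net.load_state_dict(state_dict)
        run_episode_and_backprop(net, env, args)  # hypothetical: fills param.grad
        conn.send([p.grad for p in net.parameters()])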
Example #14
    def __init__(self, ReplicaClass, NumReplicas, MaxSamples, SwapInterval, MaxTemp, BetaLadderMethod = 'GEO'):
        
        """
        ReplicaClass : (class) The class which inherits PTReplicaBaseClass and implements all the needed abstract functions.
        NumReplicas : (int) The number of replicas to use for the algorithm.
        MaxSamples : (int) Maximum number of samples from each replica.
        SwapInterval : (float) If < 1, the fraction of MaxSamples after which the swap condition will be checked;
                               if >= 1, the number of samples to draw before checking for a swap.
        MaxTemp : Maximum temperature for the ladder.
        BetaLadderMethod : (str) The method by which the beta ladder will be constructed. Currently supports 'GEO' for geometric, 'LIN' for linear, 'HAR' for harmonic.
        """
        
        self.ReplicaClass = ReplicaClass
        self.NumReplicas = NumReplicas
        self.MaxSamples = MaxSamples
        self.MaxTemp = MaxTemp
        self.BetaLadderMethod = BetaLadderMethod

        #assert ((SwapInterval <= 1) and (SwapInterval >0)), "SwapInterval should be between 0 and 1"
        if 0 < SwapInterval < 1:
            self.NumReplicaSamples = int(SwapInterval * MaxSamples)  # Number of samples per replica between swap checks.
        else:
            assert isinstance(SwapInterval, int), "If SwapInterval >= 1, then it should be of type integer."
            self.NumReplicaSamples = SwapInterval

        self.SwapInterval = SwapInterval


        print("Swap Condition will be tested every {} samples.".format(self.NumReplicaSamples))

        self.Temperatures = torch.tensor([1 for _ in range(NumReplicas)], dtype = torch.float64) #Placeholder for Temperatures.
        self.ReplicaList = [None for _ in range(NumReplicas)]

        ################################################################ [DEPRECATED, NOW USES PIPES TO COMMUNICATE BACK] #########################################################
        #SamplesQueueList is an Important Variable in this Implementation, it holds (in the following order) Model's Weights(and Biases), Miscellaneous Param List
        #for all the samples for each replica.
        #self.SamplesQueueList = [mp.Queue() for _ in range(NumReplicas)] 

        #Stores the Samples collected from the Last iteration where each chain collected NumReplicaSamples amount of samples
        self.LastRunSamplesAllReplicas = [ [] for _ in range(NumReplicas) ]


        self.SwapHistory = []

        # Pipes to transfer the likelihood and prior probabilities back. TODO: use pipes to transfer everything to the main process.
        self.PipeList = [mp.Pipe() for _ in range(NumReplicas)]

        #Have Replicas been Initialized? 
        self.isInitReplicaCalled = False
    

        # Final samples placeholder; it stores all MaxSamples samples collected. It's updated only when Run is called.
        self.AllSamples = [ [] for _ in range(NumReplicas) ]
Example #15
 def _run_workers(self):
     assert len(self.workers) == 0
     for _ in range(self.num_workers):
         pipe_local, pipe_remote = mp.Pipe()
         p = mp.Process(target=generate_episodes_in_loop,
                        args=(self.env, self.player, pipe_remote,
                              self.zero_mode))
         p.start()
         self.workers.append(p)
         self.pipes.append(pipe_local)
Example #16
 def __init__(self,
              memory_size,
              batch_size,
              replay_type=Config.DEFAULT_REPLAY):
     mp.Process.__init__(self)
     self.pipe, self.worker_pipe = mp.Pipe()
     self.memory_size = memory_size
     self.batch_size = batch_size
     self.cache_len = 2
     self.replay_type = replay_type
     self.start()
Example #17
    def _inference(self, cand):
        # bn_statistic
        parent_conn, child_conn = mp.Pipe()
        args = {"local_rank": 0, "distributed": False}
        mp.spawn(bn_statistic,
                 nprocs=self.ngpus_per_node,
                 args=(self.ngpus_per_node, cfg, args, cand, child_conn))
        salt = parent_conn.recv()

        # fitness
        parent_conn, child_conn = mp.Pipe()
        args = {"local_rank": 0, "distributed": False}
        mp.spawn(fitness,
                 nprocs=self.ngpus_per_node,
                 args=(self.ngpus_per_node, cfg, args, cand, salt, child_conn))

        if os.path.isfile(os.path.join(cfg.OUTPUT_DIR, salt + ".pth")):
            os.remove(os.path.join(cfg.OUTPUT_DIR, salt + ".pth"))

        return parent_conn.recv()
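
Note that mp.spawn prepends the process rank to the args tuple, so bn_statistic must accept it as its first parameter. A sketch of the expected shape; the body is an assumption, and only one rank should write to the pipe since the parent does a single recv():

def bn_statistic(rank, ngpus_per_node, cfg, args, cand, conn):
    salt = recompute_bn_stats(cfg, cand)  # hypothetical per-rank work
    if rank == 0:
        conn.send(salt)  # exactly one message, matching parent_conn.recv() above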
Example #18
 def __init__(self, env, n_workers, seed=0):
     self.env = env
     self.seed = seed
     self.n_workers = n_workers
     self.pipes = [mp.Pipe() for rank in range(self.n_workers)]
     self.workers = [
         mp.Process(target=self.work, args=(rank, self.pipes[rank][1]))
         for rank in range(self.n_workers)
     ]
     for w in self.workers:
         w.start()
     self.dones = {rank: False for rank in range(self.n_workers)}
Example #19
 def test_dist(self, model, cal_bn, valid_iter=-1, ckpt=''):
     self.args.do_test = True
     self.args.EXPER.resume = ckpt
     # self.args.ngpu=2
     parent_conn, child_conn = mp.Pipe()
     avail_port = pick_avail_port()
     # mp.spawn(main_worker,nprocs=self.args.ngpu,args=(self.args.ngpu,self.args,self.newmodel,child_conn,avail_port,cal_bn))
     mp.spawn(main_worker,
              nprocs=self.args.ngpu,
              args=(self.args.ngpu, self.args, model, child_conn,
                    avail_port, cal_bn, valid_iter))
     return parent_conn.recv()[0]
Example #20
 def __init__(self, task_fn, num_workers):
     self.task_fn = task_fn
     self.task = task_fn()
     self.name = self.task.name
     self.pipes, worker_pipes = zip(
         *[mp.Pipe() for _ in range(num_workers)])
     args = [(p, wp, task_fn) for p, wp in zip(self.pipes, worker_pipes)]
     self.workers = [mp.Process(target=sub_task, args=arg) for arg in args]
     for p in self.workers:
         p.start()
     for p in worker_pipes:
         p.close()
Example #21
 def __init__(self, level, num_envs, output_path=None):
     self.agent_conns, self.env_conns = zip(
         *[mp.Pipe() for _ in range(num_envs)])
     env = create_train_env(level, output_path=output_path)
     self.num_states = env.observation_space.shape[0]
     env.close()
     self.num_actions = len(ACTION_MAPPING)
     for index in range(num_envs):
         process = mp.Process(target=self.run,
                              args=(index, level, output_path))
         process.start()
         self.env_conns[index].close()
Example #22
    def __init__(self, envs):
        mp.set_start_method('spawn')
        self.master_ends = []
        self.waiting = False

        for env in envs:
            master_end, worker_end = mp.Pipe()
            proc = mp.Process(target=subproc_worker, args=(worker_end, master_end, env))
            proc.daemon = True
            self.master_ends.append(master_end)
            proc.start()
            worker_end.close()
Example #23
	def __init__(self, config):
		# super(WorkerManager, self).__init__()
		self.num_rank = config.num_rank
		self.network = config.network
		self.method = config.method
		self.channels = {}
		if self.method == 'pipe':
			for pair in self.network:
				if pair not in self.channels:
					self.channels[pair] = mp.Pipe(duplex=False)
		elif self.method == 'queue':
			for pair in self.network:
				self.channels[pair] = mp.SimpleQueue()
Example #24
 def __init__(self, replay_cls, replay_kwargs, async_=True):
     mp.Process.__init__(self)
     self.replay_kwargs = replay_kwargs
     self.replay_cls = replay_cls
     self.cache_len = 2
     if async_:
         self.pipe, self.worker_pipe = mp.Pipe()
         self.start()
     else:
         self.replay = replay_cls(**replay_kwargs)
         self.sample = self.replay.sample
         self.feed = self.replay.feed
         self.update_priorities = self.replay.update_priorities
Example #25
 def __init__(self, args, trainer_maker):
     self.args = args
     self.comms = []
     self.trainer = trainer_maker()
     # the main process itself does the same job as the workers
     self.nworkers = args.nthreads - 1
     for i in range(self.nworkers):
         comm, comm_remote = mp.Pipe()
         self.comms.append(comm)
         worker = ThreadedWorker(i, trainer_maker, comm_remote)
         worker.start()
     self.grads = None
     self.worker_grads = None
Example #26
 def __init__(self, game, num_envs):
     self.agent_conns, self.env_conns = zip(
         *[mp.Pipe() for _ in range(num_envs)])
     # Create one training environment per worker
     self.envs = [create_train_env(game) for _ in range(num_envs)]
     # Get the number of observation channels (states)
     self.num_states = self.envs[0].observation_space.shape[0]
     # Get the number of actions
     self.num_actions = self.envs[0].action_space.n
     # Start the worker processes
     for index in range(num_envs):
         process = mp.Process(target=self.run, args=(index, ))
         process.start()
         self.env_conns[index].close()
Example #27
 def finetune_dist(self, savename='', load_last=False):
     assert savename != ''
     self.trainer.model = self.newmodel
     self.best_mAP = 0
     self.args.do_test = False
     self.args.EXPER.resume = savename if load_last else ''
     self.args.EXPER.save_ckpt = savename
     # self.args.ngpu=2
     parent_conn, child_conn = mp.Pipe()
     avail_port = pick_avail_port()
     mp.spawn(main_worker,
              nprocs=self.args.ngpu,
              args=(self.args.ngpu, self.args, self.newmodel, child_conn,
                    avail_port))
     return parent_conn.recv()[0]
Example #28
 def _calculate_max_pixel_value(self, num_process):
     """use multiprocessing to speed up the calculation of max_pixel_value
     for normalization"""
     pipes = []
     for i in range(num_process):
         recv_end, send_end = mp.Pipe(False)
         pipes.append(recv_end)
         p = mp.Process(target=self._update_max,
                        args=(range(
                            int(i * self.tot_frame / num_process),
                            int((i + 1) * self.tot_frame / num_process)),
                              send_end))
         p.daemon = True
         p.start()
     self.max_pixel_value = max(conn.recv() for conn in pipes)
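
The producer side only needs to push one number through the write end of the one-way pipe. A minimal sketch, assuming a hypothetical frame accessor (the real attribute is not shown):

 def _update_max(self, frame_range, send_end):
     local_max = 0
     for idx in frame_range:
         local_max = max(local_max, self._read_frame(idx).max())  # hypothetical accessor
     send_end.send(local_max)  # one message per worker; the parent maxes over all of them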
Example #29
 def __init__(self, args, trainer_maker):
     #mp.set_start_method('spawn')
     self.comms = []
     self.trainer = trainer_maker()
     # the main process itself does the same job as the workers
     self.nworkers = args.nprocesses - 1
     for i in range(self.nworkers):
         comm, comm_remote = mp.Pipe()
         self.comms.append(comm)
         worker = MultiProcessWorker(i, trainer_maker, comm_remote,
                                     args.seed)
         worker.start()
     self.grads = None
     self.worker_grads = None
     self.is_random = args.random
Example #30
 def __init__(self,
              n_processes: int = 1,
              process_cls=None,
              cancel=None,
              verbose: bool = False,
              regular_get: bool = True,
              tracker=None):
     store_attr(but='process_cls')
     self.process_cls = ifnone(process_cls, DataFitProcess)
     self.queue = mp.JoinableQueue(maxsize=self.n_processes)
     self.cancel = ifnone(self.cancel, mp.Event())
     self.pipe_in, self.pipe_out = mp.Pipe(False) if self.verbose else (
         None, None)
     self.cached_items = []
     self._place_holder_out = None
     self.step_idx = 0