def __init__(self, env_fns, engine):
    # TODO: sharing cuda tensors requires spawn or forkserver but these do not work with mpi
    # mp.set_start_method('spawn')
    self.engine = engine
    self.waiting = False
    self.closed = False
    self.nb_env = len(env_fns)

    self.remotes, self.work_remotes = zip(*[mp.Pipe() for _ in range(self.nb_env)])
    self.ps = [
        mp.Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
        for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)
    ]
    for p in self.ps:
        p.daemon = True  # if the main process crashes, we should not cause things to hang
        p.start()
    for remote in self.work_remotes:
        remote.close()

    self.remotes[0].send(('get_spaces', None))
    self._observation_space, self._action_space = self.remotes[0].recv()
    self.remotes[0].send(('get_processors', None))
    self._cpu_preprocessor, self._gpu_preprocessor = self.remotes[0].recv()

    shared_memories = []
    for remote in self.remotes:
        remote.send(('get_shared_memory', None))
        shared_memories.append(remote.recv())
    self.shared_memories = listd_to_dlist(shared_memories)
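# A minimal sketch (an assumption, not this repo's actual code) of the `worker`
# target the constructor above relies on: the child owns the environment and
# answers the (command, payload) tuples sent over its pipe. The `.x` attribute
# follows the common CloudpickleWrapper convention of storing the wrapped callable.
def worker(remote, parent_remote, env_fn_wrapper):
    parent_remote.close()     # this end belongs to the parent process
    env = env_fn_wrapper.x()  # build the environment inside the child
    while True:
        cmd, data = remote.recv()
        if cmd == 'step':
            remote.send(env.step(data))
        elif cmd == 'reset':
            remote.send(env.reset())
        elif cmd == 'get_spaces':
            remote.send((env.observation_space, env.action_space))
        # 'get_processors' and 'get_shared_memory' would be answered analogously
        elif cmd == 'close':
            remote.close()
            break
        else:
            raise NotImplementedError(cmd)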
def load_model(self, model: Model, state: ModelState, devices: list) -> RPCFuture[SetDeviceReturnType]:
    log_dir = model.config.get(LOGGING, {}).get(DIRECTORY, "")
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
        self.logger.info("log dir: %s", os.path.abspath(log_dir))
    self._start_logging_handler()
    incomplete_msg = get_error_msg_for_incomplete_config(model.config)
    if incomplete_msg:
        raise ValueError(incomplete_msg)

    # todo: move test_transforms elsewhere
    self.test_transforms = model.config.get(TESTING, {}).get(TRANSFORMS, {"Normalize": {}})

    if not devices:
        devices = ["cpu"]
    cuda_visible_devices, handler_devices = self.get_cuda_and_handler_device_names(devices)
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(cuda_visible_devices)
    self.logger.info("Set CUDA_VISIBLE_DEVICES to '%s'", os.environ["CUDA_VISIBLE_DEVICES"])

    server_conn, handler_conn = mp.Pipe()
    p = mp.Process(
        target=run_handler,
        name="Handler",
        kwargs={
            "conn": handler_conn,
            "config": model.config,
            "model_file": model.code,
            "model_state": state.model_state,
            "optimizer_state": state.optimizer_state,
            "log_queue": self.log_queue,
        },
    )
    try:
        p.start()
    except Exception as e:
        self.logger.error(e)
        err_fut = RPCFuture()
        err_fut.set_exception(e)
        return err_fut
    else:
        self.handler = create_client(IHandler, server_conn)
        try:
            tik_fut = self.handler.set_devices(handler_devices)
        except Exception as e:
            self.logger.exception("set_devices failed")
            err_fut = RPCFuture()
            err_fut.set_exception(e)
            return err_fut
        else:
            self.logger.info("got tik_fut")
            fut = tik_fut.map(convert_to_SetDeviceReturnType)
            self.logger.info("converted tik_fut")
            return fut
def start(jobs: List[Job], cuda_device_ids: Union[List[int], List[List[int]]], order=True):
    """
    Start the given jobs on the list of cuda devices.

    Args:
        jobs (List[Job]): List of jobs to start.
        cuda_device_ids (Union[List[int], List[List[int]]]): List of available
            cuda devices on which the jobs will be distributed. The list can
            either consist of single devices ([0, 1, 2, 3]) or device pairs
            ([[0, 1], [2, 3]]).
    """
    assert len(jobs) > 0, "No jobs specified."
    assert len(cuda_device_ids) > 0, "No cuda devices specified."
    stdout = sys.__stdout__
    # mp.set_start_method('spawn')  # THIS SHOULD BE IN THE MAIN FILE

    # Create queue
    logfile = 'multip.out'
    with open(logfile, "a") as f:
        print('\n', file=f)
        print('############## NEW SESSION ###############', file=f)
        print('DATE: ', datetime.datetime.now(), file=f)
        print('\n', file=f)
    queue = mp.Queue(maxsize=len(cuda_device_ids))
    # Put all devices into the queue
    for id in cuda_device_ids:
        queue.put(id)

    # Create processes
    processes, connections = [], []
    for job in jobs:
        target = Target(job, logfile=logfile)
        parent_conn, child_conn = mp.Pipe()
        connections.append(parent_conn)
        p = mp.Process(target=target, args=(queue, child_conn))
        processes.append(p)

    # Start processes
    for i, p in enumerate(processes):
        p.start()
        with open(logfile, "a") as f:
            print(f"{i} Process {p.name} created, PID:{p.pid}", file=f)

    with open(logfile, "a") as f:
        print('- - - - - Start Running - - - - -', file=f)
    for i, connection in enumerate(sysutil.progbar(connections)):
        cuda_device_id = queue.get()
        with open(logfile, "a") as f:
            print(f"{i} Process {processes[i].name} started, "
                  f"PID:{processes[i].pid}, CUDA:{cuda_device_id}", file=f)
        sys.stdout.flush()
        connection.send(cuda_device_id)

    # Join processes
    for p in processes:  # sysutil.progbar(processes):
        p.join()
    with open(logfile, "a") as f:
        print('############## SESSION ENDED ###############', file=f)
def __init__(self, memory_size, batch_size):
    mp.Process.__init__(self)
    self.pipe, self.worker_pipe = mp.Pipe()
    self.memory_size = memory_size
    self.batch_size = batch_size
    self.cache_len = 2
    self.start()
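# Hedged sketch of the run() loop such a pipe-backed replay process typically
# pairs with (an assumption, not the source): the child owns the storage and
# serves FEED/SAMPLE requests over `worker_pipe`, keeping up to `cache_len`
# batches pre-built so the trainer rarely waits. `Replay` is a hypothetical
# storage class standing in for whatever the project uses.
FEED, SAMPLE, EXIT = 0, 1, 2

def run(self):
    replay = Replay(self.memory_size, self.batch_size)  # hypothetical
    cache = []
    while True:
        op, data = self.worker_pipe.recv()
        if op == FEED:
            replay.feed(data)
        elif op == SAMPLE:
            while len(cache) < self.cache_len:
                cache.append(replay.sample())
            self.worker_pipe.send(cache.pop(0))
        elif op == EXIT:
            self.worker_pipe.close()
            return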
def __init__(self, cls, *args, **kwargs):
    multip.Process.__init__(self)
    self.cls = cls
    self.args = args
    self.kwargs = kwargs
    self.pipe, self.worker_pipe = multip.Pipe()
    self.daemon = True
    self.start()
def init_network(self):
    for p_idx in range(self.num_processes):
        parent_conn, child_conn = mp.Pipe()
        self.parent_conns.append(parent_conn)
        self.child_conns.append(child_conn)
        process = mp.Process(target=self.step_task, args=(child_conn, self.tasks[p_idx]))
        self.processes.append(process)
def __init__(self, memory_size, batch_size, replay_type='default'):
    mp.Process.__init__(self)
    self.pipe, self.worker_pipe = mp.Pipe()
    self.memory_size = memory_size
    self.batch_size = batch_size
    self.cache_len = 2
    self.replay_type = replay_type
    self.start()
def main():
    timestamp('frontend', 'start')

    # Load model list
    model_list_file_name = sys.argv[1]
    model_list = []
    with open(model_list_file_name) as f:
        for line in f.readlines():
            model_list.append(line.strip())

    # Warm up CUDA and allocate shared cache
    torch.randn(1024, device='cuda')
    torch.cuda.allocate_shared_cache()

    # Create workers
    num_workers = 2
    worker_list = []
    for _ in range(num_workers):
        p_parent, p_child = mp.Pipe()
        param_trans_parent, param_trans_child = mp.Pipe()
        term_parent, term_child = mp.Pipe()
        worker = WorkerProc(model_list, p_child, param_trans_child, term_child)
        worker.start()
        torch.cuda.send_shared_cache()
        worker_list.append((p_parent, worker, param_trans_parent, term_parent))
        timestamp('frontend', 'create_worker')

    # Create request queue and scheduler thread
    requests_queue = Queue()
    t_sch = FrontendScheduleThd(model_list, requests_queue, worker_list)
    t_sch.start()
    timestamp('frontend', 'start_schedule')

    # Accept connections
    server = TcpServer('localhost', 12345)
    timestamp('tcp', 'listen')
    while True:
        conn, _ = server.accept()
        agent = TcpAgent(conn)
        timestamp('tcp', 'connected')
        t_tcp = FrontendTcpThd(requests_queue, agent)
        t_tcp.start()

    # Wait for end
    t_sch.join()
def __init__(self, memory_size, batch_size, prioritize=False, alpha=0.5):
    mp.Process.__init__(self)
    self.pipe, self.worker_pipe = mp.Pipe()
    self.memory_size = memory_size
    self.batch_size = batch_size
    self.prioritize = prioritize
    self.alpha = alpha
    self.cache_len = 2
    self.start()
def start(self):
    self.conn, conn_child = mp.Pipe()
    self._process = mp.Process(target=self.worker,
                               args=(self.env_class, self._constructor_kwargs, conn_child))
    self._process.start()
    result = self.conn.recv()
    if isinstance(result, Exception):
        raise result
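# Implied worker side of the recv-then-raise handshake above (names assumed,
# not taken from the source): the child's first message is either a ready
# marker or the construction error, which the parent re-raises via `raise result`.
def worker(env_class, constructor_kwargs, conn):
    try:
        env = env_class(**constructor_kwargs)
    except Exception as e:
        conn.send(e)        # surfaces in the parent's start()
        return
    conn.send('ready')      # any non-Exception value unblocks the parent
    # ... then serve step/reset requests over `conn` ...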
def __init__(self, constructor):
    self._conn, conn = multiprocessing.Pipe()
    self._process = multiprocessing.Process(target=self._worker, args=(constructor, conn))
    atexit.register(self.close)
    self._process.start()
    self._states_shape = None
    self._observs_shape = None
def run_parameter_sweep(parameters, data, args, Beta):
    num_processes = torch.cuda.device_count()
    idx = 0
    objectives, bdivs, val_objectives, val_bdivs, nsigs, times = [], [], [], [], [], []

    # Run batches of num_processes parameter settings in parallel, one per GPU.
    while idx <= len(parameters) - num_processes:
        print(idx)
        pipe_list = []
        processes = []
        for rank in range(num_processes):
            recv_end, send_end = mp.Pipe(False)
            p = mp.Process(target=run_method_engine, args=(
                data, parameters.iloc[idx + rank]['a'], parameters.iloc[idx + rank]['phi'],
                parameters.iloc[idx + rank]['b'], Beta, args.prior_on_W, args.prior_on_H,
                parameters.iloc[idx + rank]['K0'], args.tolerance, args.max_iter,
                args.use_val_set, send_end, rank,))
            pipe_list.append(recv_end)
            processes.append(p)
            p.start()
        result_list = [x.recv() for x in pipe_list]
        for p in processes:
            p.join()
        for i, res in enumerate(result_list):
            nsigs.append(write_output(res[0], res[1], res[2], data.channel_names,
                                      data.sample_names, args.output_dir,
                                      parameters['label'][idx + i]))
            objectives.append(res[3])
            bdivs.append(res[4])
            val_objectives.append(res[5])
            val_bdivs.append(res[6])
            times.append(res[7])
        idx += num_processes

    # Run any leftover parameter settings sequentially. (The original
    # `for i in range(...): idx += i` skipped settings; a plain while loop
    # visits each remaining index exactly once.)
    while idx < len(parameters):
        print(idx)
        W, H, mask, cost, bdiv, val_cost, val_bdiv, time = run_method_engine(
            data, parameters.iloc[idx]['a'], parameters.iloc[idx]['phi'],
            parameters.iloc[idx]['b'], Beta, args.prior_on_W, args.prior_on_H,
            parameters.iloc[idx]['K0'], args.tolerance, args.max_iter, args.use_val_set)
        nsig = write_output(W, H, mask, data.channel_names, data.sample_names,
                            args.output_dir, parameters['label'][idx])
        times.append(time)
        nsigs.append(nsig)
        objectives.append(cost)
        val_objectives.append(val_cost)
        bdivs.append(bdiv)
        val_bdivs.append(val_bdiv)
        idx += 1

    parameters['nsigs'] = nsigs
    parameters['objective_trainset'] = objectives
    parameters['bdiv_trainset'] = bdivs
    parameters['objective_valset'] = val_objectives
    parameters['bdiv_valset'] = val_bdivs
    parameters['times'] = times
    parameters.to_csv(args.output_dir + '/parameters_with_results.txt', sep='\t', index=None)
def train(args):
    num_workers = mp.cpu_count()
    print("Number of workers:", num_workers)
    net = learning.model.load_net('a3c')
    optim = torch.optim.Adam(net.parameters())

    start_sel = selectors.DefaultSelector()
    active = 0
    for i in range(num_workers):
        conn0, conn1 = mp.Pipe(True)
        mp.Process(target=worker, args=(conn1, i, args)).start()
        start_sel.register(conn0, selectors.EVENT_WRITE)
        active += 1

    # Hand each worker its first copy of the weights as soon as its pipe is writable.
    iteration = 0
    running_sel = selectors.DefaultSelector()
    while active > 0:
        events = start_sel.select()
        for key, _ in events:
            conn = key.fileobj
            conn.send(net.state_dict())
            iteration += 1
            active -= 1
            start_sel.unregister(conn)
            running_sel.register(conn, selectors.EVENT_READ)

    # Main loop: apply gradients as they arrive, then send back fresh weights
    # (or None to stop a worker once the update budget is spent).
    active = num_workers
    while active > 0:
        print("Iteration:", iteration, "Active:", active)
        events = running_sel.select()
        for key, _ in events:
            conn = key.fileobj
            gradient = conn.recv()
            for param, grad in zip(net.parameters(), gradient):
                param.grad = grad
            optim.step()
            if iteration == args.rl_updates:
                conn.send(None)
                running_sel.unregister(conn)
                active -= 1
            else:
                conn.send(net.state_dict())
                iteration += 1
    learning.model.save_net(net, 'a3c')
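# Hedged counterpart sketch for the `worker` target above (an assumption, not
# the project's code): each child receives a state_dict, computes gradients on
# its own rollouts, and sends them back until the parent replies with None.
# `compute_rollout_loss` is a hypothetical helper standing in for the rollout
# and loss computation.
def worker(conn, worker_id, args):
    net = learning.model.load_net('a3c')
    while True:
        state_dict = conn.recv()
        if state_dict is None:
            break
        net.load_state_dict(state_dict)
        net.zero_grad()
        loss = compute_rollout_loss(net, worker_id, args)  # hypothetical
        loss.backward()
        conn.send([p.grad.clone() for p in net.parameters()])
    conn.close()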
def __init__(self, ReplicaClass, NumReplicas, MaxSamples, SwapInterval, MaxTemp, BetaLadderMethod='GEO'):
    """
    ReplicaClass     : (class) The class which inherits PTReplicaBaseClass and implements
                       all the needed abstract functions.
    NumReplicas      : (int) The number of replicas to use for the algorithm.
    MaxSamples       : (int) Maximum number of samples from each replica.
    SwapInterval     : (float) If < 1, the fraction of MaxSamples after which the swap
                       condition will be checked; if >= 1, the number of samples between
                       swap checks.
    MaxTemp          : Maximum temperature for the ladder.
    BetaLadderMethod : (str) The method by which the beta ladder will be constructed.
                       Currently supports 'GEO' (geometric), 'LIN' (linear), and
                       'HAR' (harmonic).
    """
    self.ReplicaClass = ReplicaClass
    self.NumReplicas = NumReplicas
    self.MaxSamples = MaxSamples
    self.MaxTemp = MaxTemp
    self.BetaLadderMethod = BetaLadderMethod

    # assert ((SwapInterval <= 1) and (SwapInterval > 0)), "SwapInterval should be between 0 and 1"
    if (SwapInterval < 1) and (SwapInterval > 0):
        # Number of iterations to run each replica for between swap checks.
        self.NumReplicaSamples = int(SwapInterval * MaxSamples)
    else:
        assert isinstance(SwapInterval, int), "If SwapInterval >= 1, then it should be of type integer."
        self.NumReplicaSamples = SwapInterval
    self.SwapInterval = SwapInterval
    print("Swap Condition will be tested every {} samples.".format(self.NumReplicaSamples))

    # Placeholder for temperatures.
    self.Temperatures = torch.tensor([1 for _ in range(NumReplicas)], dtype=torch.float64)
    self.ReplicaList = [None for _ in range(NumReplicas)]

    # [DEPRECATED, NOW USES PIPES TO COMMUNICATE BACK]
    # SamplesQueueList held the model's weights (and biases) and a miscellaneous
    # param list for all samples of each replica:
    # self.SamplesQueueList = [mp.Queue() for _ in range(NumReplicas)]

    # Stores the samples collected from the last iteration in which each chain
    # collected NumReplicaSamples samples.
    self.LastRunSamplesAllReplicas = [[] for _ in range(NumReplicas)]
    self.SwapHistory = []

    # Pipes to transfer the likelihood and prior probabilities back.
    # TODO: use pipes later on to transfer everything to the main process.
    self.PipeList = [mp.Pipe() for _ in range(NumReplicas)]

    # Have replicas been initialized?
    self.isInitReplicaCalled = False

    # Final samples placeholder; stores all MaxSamples samples collected.
    # Updated only on Run.
    self.AllSamples = [[] for _ in range(NumReplicas)]
def _run_workers(self):
    assert len(self.workers) == 0
    for _ in range(self.num_workers):
        pipe_local, pipe_remote = mp.Pipe()
        p = mp.Process(target=generate_episodes_in_loop,
                       args=(self.env, self.player, pipe_remote, self.zero_mode))
        p.start()
        self.workers.append(p)
        self.pipes.append(pipe_local)
def __init__(self, memory_size, batch_size, replay_type=Config.DEFAULT_REPLAY):
    mp.Process.__init__(self)
    self.pipe, self.worker_pipe = mp.Pipe()
    self.memory_size = memory_size
    self.batch_size = batch_size
    self.cache_len = 2
    self.replay_type = replay_type
    self.start()
def _inference(self, cand):
    # bn_statistic
    parent_conn, child_conn = mp.Pipe()
    args = dict({"local_rank": 0, "distributed": False})
    mp.spawn(bn_statistic, nprocs=self.ngpus_per_node,
             args=(self.ngpus_per_node, cfg, args, cand, child_conn))
    salt = parent_conn.recv()

    # fitness
    parent_conn, child_conn = mp.Pipe()
    args = dict({"local_rank": 0, "distributed": False})
    mp.spawn(fitness, nprocs=self.ngpus_per_node,
             args=(self.ngpus_per_node, cfg, args, cand, salt, child_conn))

    if os.path.isfile(os.path.join(cfg.OUTPUT_DIR, salt + ".pth")):
        os.remove(os.path.join(cfg.OUTPUT_DIR, salt + ".pth"))
    return parent_conn.recv()
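# Orientation note (an assumption, not the repo's code): torch.multiprocessing.spawn
# calls its target as fn(rank, *args), so `fitness` above receives the rank first
# and the pipe end last. Only rank 0 should write to the pipe, otherwise
# parent_conn.recv() may return a duplicate result from another rank.
# `evaluate_candidate` is a hypothetical stand-in for the actual evaluation.
def fitness(rank, world_size, cfg, args, cand, salt, conn):
    score = evaluate_candidate(rank, world_size, cfg, cand, salt)  # hypothetical
    if rank == 0:
        conn.send(score)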
def __init__(self, env, n_workers, seed=0):
    self.env = env
    self.seed = seed
    self.n_workers = n_workers
    self.pipes = [mp.Pipe() for rank in range(self.n_workers)]
    self.workers = [
        mp.Process(target=self.work, args=(rank, self.pipes[rank][1]))
        for rank in range(self.n_workers)
    ]
    for w in self.workers:
        w.start()
    self.dones = {rank: False for rank in range(self.n_workers)}
def test_dist(self, model, cal_bn, valid_iter=-1, ckpt=''):
    self.args.do_test = True
    self.args.EXPER.resume = ckpt
    # self.args.ngpu = 2
    parent_conn, child_conn = mp.Pipe()
    avail_port = pick_avail_port()
    # mp.spawn(main_worker, nprocs=self.args.ngpu,
    #          args=(self.args.ngpu, self.args, self.newmodel, child_conn, avail_port, cal_bn))
    mp.spawn(main_worker, nprocs=self.args.ngpu,
             args=(self.args.ngpu, self.args, model, child_conn, avail_port, cal_bn, valid_iter))
    return parent_conn.recv()[0]
def __init__(self, task_fn, num_workers):
    self.task_fn = task_fn
    self.task = task_fn()
    self.name = self.task.name
    self.pipes, worker_pipes = zip(*[mp.Pipe() for _ in range(num_workers)])
    args = [(p, wp, task_fn) for p, wp in zip(self.pipes, worker_pipes)]
    self.workers = [mp.Process(target=sub_task, args=arg) for arg in args]
    for p in self.workers:
        p.start()
    for p in worker_pipes:
        p.close()
def __init__(self, level, num_envs, output_path=None):
    self.agent_conns, self.env_conns = zip(*[mp.Pipe() for _ in range(num_envs)])
    env = create_train_env(level, output_path=output_path)
    self.num_states = env.observation_space.shape[0]
    env.close()
    self.num_actions = len(ACTION_MAPPING)
    for index in range(num_envs):
        process = mp.Process(target=self.run, args=(index, level, output_path))
        process.start()
        self.env_conns[index].close()
def __init__(self, envs):
    mp.set_start_method('spawn')
    self.master_ends = []
    self.waiting = False
    for env in envs:
        master_end, worker_end = mp.Pipe()
        proc = mp.Process(target=subproc_worker, args=(worker_end, master_end, env))
        proc.daemon = True
        self.master_ends.append(master_end)
        proc.start()
        worker_end.close()
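# Design note, with a sketch of a guard (not from the source): set_start_method()
# raises RuntimeError once the start method is already fixed, so constructing
# this class twice in one interpreter fails. A tolerant variant:
import multiprocessing as mp

try:
    mp.set_start_method('spawn')
except RuntimeError:
    pass  # the start method was already set; keep the first choice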
def __init__(self, config):
    # super(WorkerManager, self).__init__()
    self.num_rank = config.num_rank
    self.network = config.network
    self.method = config.method
    self.channels = {}
    if self.method == 'pipe':
        for pair in self.network:
            if pair not in self.channels:
                self.channels[pair] = mp.Pipe(duplex=False)
    elif self.method == 'queue':
        for pair in self.network:  # fixed: was bare `network`, an undefined name
            self.channels[pair] = mp.SimpleQueue()
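# Illustrative consumption of the channel table above (hypothetical names and
# pair keys; only the unpacking order is certain): mp.Pipe(duplex=False)
# returns (receive_end, send_end), so producer and consumer ranks must take
# opposite ends of the same entry.
recv_end, send_end = manager.channels[(0, 1)]  # `manager` is a WorkerManager-style instance
send_end.send(('ready', 0))                    # producer side
tag, rank = recv_end.recv()                    # consumer side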
def __init__(self, replay_cls, replay_kwargs, async_=True):
    mp.Process.__init__(self)
    self.replay_kwargs = replay_kwargs
    self.replay_cls = replay_cls
    self.cache_len = 2
    if async_:
        self.pipe, self.worker_pipe = mp.Pipe()
        self.start()
    else:
        # Synchronous mode: skip the child process and expose the replay's
        # methods directly under the same attribute names.
        self.replay = replay_cls(**replay_kwargs)
        self.sample = self.replay.sample
        self.feed = self.replay.feed
        self.update_priorities = self.replay.update_priorities
def __init__(self, args, trainer_maker):
    self.args = args
    self.comms = []
    self.trainer = trainer_maker()  # itself will do the same job as the workers
    self.nworkers = args.nthreads - 1
    for i in range(self.nworkers):
        comm, comm_remote = mp.Pipe()
        self.comms.append(comm)
        worker = ThreadedWorker(i, trainer_maker, comm_remote)
        worker.start()
    self.grads = None
    self.worker_grads = None
def __init__(self, game, num_envs):
    self.agent_conns, self.env_conns = zip(*[mp.Pipe() for _ in range(num_envs)])
    # Create multiple game environments
    self.envs = [create_train_env(game) for _ in range(num_envs)]
    # Get the number of observation frames
    self.num_states = self.envs[0].observation_space.shape[0]
    # Get the number of actions
    self.num_actions = self.envs[0].action_space.n
    # Start the worker processes
    for index in range(num_envs):
        process = mp.Process(target=self.run, args=(index,))
        process.start()
        self.env_conns[index].close()
def finetune_dist(self, savename='', load_last=False):
    assert savename != ''
    self.trainer.model = self.newmodel
    self.best_mAP = 0
    self.args.do_test = False
    self.args.EXPER.resume = savename if load_last else ''
    self.args.EXPER.save_ckpt = savename
    # self.args.ngpu = 2
    parent_conn, child_conn = mp.Pipe()
    avail_port = pick_avail_port()
    mp.spawn(main_worker, nprocs=self.args.ngpu,
             args=(self.args.ngpu, self.args, self.newmodel, child_conn, avail_port))
    return parent_conn.recv()[0]
def _calculate_max_pixel_value(self, num_process):
    """Use multiprocessing to speed up the calculation of max_pixel_value for normalization."""
    pipes = []
    for i in range(num_process):
        recv_end, send_end = mp.Pipe(False)
        pipes.append(recv_end)
        p = mp.Process(target=self._update_max, args=(
            range(int(i * self.tot_frame / num_process),
                  int((i + 1) * self.tot_frame / num_process)),
            send_end))
        p.daemon = True
        p.start()
    self.max_pixel_value = max(conn.recv() for conn in pipes)
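# Self-contained miniature of the same fan-out/reduce pattern (illustrative,
# not the source): one-way pipes from N children, parent reduces with max().
import multiprocessing as mp

def _part_max(values, send_end):
    send_end.send(max(values))

if __name__ == '__main__':
    data = list(range(100))
    pipes = []
    for i in range(4):
        recv_end, send_end = mp.Pipe(False)
        pipes.append(recv_end)
        mp.Process(target=_part_max, args=(data[i::4], send_end), daemon=True).start()
    print(max(conn.recv() for conn in pipes))  # 99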
def __init__(self, args, trainer_maker):
    # mp.set_start_method('spawn')
    self.comms = []
    self.trainer = trainer_maker()  # itself will do the same job as the workers
    self.nworkers = args.nprocesses - 1
    for i in range(self.nworkers):
        comm, comm_remote = mp.Pipe()
        self.comms.append(comm)
        worker = MultiProcessWorker(i, trainer_maker, comm_remote, args.seed)
        worker.start()
    self.grads = None
    self.worker_grads = None
    self.is_random = args.random
def __init__(self, n_processes: int = 1, process_cls=None, cancel=None,
             verbose: bool = False, regular_get: bool = True, tracker=None):
    store_attr(but='process_cls')
    self.process_cls = ifnone(process_cls, DataFitProcess)
    self.queue = mp.JoinableQueue(maxsize=self.n_processes)
    self.cancel = ifnone(self.cancel, mp.Event())
    # Only create the logging pipe when verbose output is requested.
    self.pipe_in, self.pipe_out = mp.Pipe(False) if self.verbose else (None, None)
    self.cached_items = []
    self._place_holder_out = None
    self.step_idx = 0