def start_worker(self):
    self.workers = []
    self.shared_que = self.manager.Queue(self.worker_nums)
    # One party per worker plus one for the main process, which also
    # waits on the barrier before training starts.
    self.start_barrier = mp.Barrier(self.worker_nums + 1)

    self.eval_workers = []
    self.eval_shared_que = self.manager.Queue(self.eval_worker_nums)
    self.eval_start_barrier = mp.Barrier(self.eval_worker_nums + 1)

    self.env_info.env_cls = self.env_cls
    self.env_info.env_args = self.env_args
    for i in range(self.worker_nums):
        self.env_info.env_rank = i
        p = mp.Process(
            target=self.__class__.train_worker_process,
            args=(self.__class__, self.shared_funcs, self.env_info,
                  self.replay_buffer, self.shared_que, self.start_barrier,
                  self.train_epochs))
        # Each worker collects training data for `train_epochs` epochs.
        p.start()
        self.workers.append(p)

    for i in range(self.eval_worker_nums):
        eval_p = mp.Process(
            target=self.__class__.eval_worker_process,
            args=(self.shared_funcs["pf"], self.env_info,
                  self.eval_shared_que, self.eval_start_barrier,
                  self.eval_epochs))
        eval_p.start()
        self.eval_workers.append(eval_p)
def start_worker(self):
    self.workers = []
    self.shared_que = self.manager.Queue(self.worker_nums)
    # No extra party here: only the workers synchronize among themselves.
    self.start_barrier = mp.Barrier(self.worker_nums)

    self.eval_workers = []
    self.eval_shared_que = self.manager.Queue(self.eval_worker_nums)
    self.eval_start_barrier = mp.Barrier(self.eval_worker_nums)

    self.env_info.env_cls = self.env_cls
    self.env_info.env_args = self.env_args
    for i in range(self.worker_nums):
        self.env_info.env_rank = i
        p = mp.Process(
            target=self.__class__.train_worker_process,
            args=(self.__class__, self.shared_funcs, self.env_info,
                  self.replay_buffer, self.shared_que, self.start_barrier,
                  self.train_epochs))
        p.start()
        self.workers.append(p)

    for i in range(self.eval_worker_nums):
        eval_p = mp.Process(
            target=self.__class__.eval_worker_process,
            args=(self.shared_funcs["pf"], self.env_info,
                  self.eval_shared_que, self.eval_start_barrier,
                  self.eval_epochs, self.reset_idx))
        eval_p.start()
        self.eval_workers.append(eval_p)
def start_worker(self):
    self.workers = []
    self.shared_que = self.manager.Queue()
    self.start_barrier = mp.Barrier(self.worker_nums + 1)
    self.terminate_mark = mp.Value('i', 0)

    self.eval_workers = []
    self.eval_shared_que = self.manager.Queue()
    self.eval_start_barrier = mp.Barrier(self.eval_worker_nums + 1)

    for i in range(self.worker_nums):
        self.env_info.env_rank = i
        p = mp.Process(
            target=self.__class__.train_worker_process,
            args=(self.__class__, self.funcs, self.env_info,
                  self.replay_buffer, self.shared_que, self.start_barrier,
                  self.terminate_mark))
        p.start()
        self.workers.append(p)

    for i in range(self.eval_worker_nums):
        eval_p = mp.Process(
            target=self.__class__.eval_worker_process,
            args=(self.pf, self.env_info, self.eval_shared_que,
                  self.eval_start_barrier, self.terminate_mark,
                  self.reset_idx))
        eval_p.start()
        self.eval_workers.append(eval_p)
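# --- Added example (not from any of the snippets above): a minimal,
# runnable sketch of the barrier party count the first three variants
# alternate between. When the main process also calls wait(), the barrier
# needs `worker_nums + 1` parties; when only the workers synchronize among
# themselves, `worker_nums` suffices. All names below are illustrative.
import multiprocessing as mp

def worker(rank, barrier):
    # ... per-worker setup would happen here ...
    barrier.wait()  # workers and the main process release together
    print("worker %d started" % rank)

if __name__ == "__main__":
    worker_nums = 4
    # worker_nums parties for the workers, plus one for the main process
    start_barrier = mp.Barrier(worker_nums + 1)
    workers = [mp.Process(target=worker, args=(i, start_barrier))
               for i in range(worker_nums)]
    for p in workers:
        p.start()
    start_barrier.wait()  # the main process is the extra party
    for p in workers:
        p.join()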
def mp_trainer(np, model, grad_buffer, optimizer, it_num=0):
    # NOTE: the parameter `np` is the number of processes, not numpy.
    if np is None:
        print("cannot get the number of processes!")
        sys.exit(-1)

    # np trainer processes plus one optimizer process share the barrier.
    Barrier = mp.Barrier(np + 1)
    Condition = mp.Condition()

    p_opt_args = (np, it_num, Barrier, optimizer, Condition, model,
                  grad_buffer)
    p_opt = mp.Process(target=optimizer_process, args=p_opt_args)
    p_opt.start()

    processes = [p_opt]

    shared_score = torch.FloatTensor([0])
    shared_score.share_memory_()

    for id in range(np):
        p_trainer_args = (id, it_num, Barrier, optimizer, Condition, model,
                          grad_buffer, shared_score, np)
        p_trainer = mp.Process(target=trainer_process, args=p_trainer_args)
        p_trainer.start()
        processes.append(p_trainer)

    for p in processes:
        p.join()
def spawn_cpu_servers(p, bounds_net):
    # Create child processes to parallelize the last-layer bounds
    # computations over the CPU, using torch.multiprocessing.
    servers_queue = mp.Queue()
    instruction_queue = mp.Queue()
    # p parties in total: the (p - 1) spawned servers plus the parent.
    barrier = mp.Barrier(p)
    cpu_servers = mp.spawn(last_bounds_cpu_server,
                           args=(copy.deepcopy(bounds_net), servers_queue,
                                 instruction_queue, barrier),
                           nprocs=(p - 1), join=False)
    return cpu_servers, servers_queue, instruction_queue, barrier
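# --- Added example: a minimal sketch (assumed, not from the snippet above)
# of the mp.spawn handshake spawn_cpu_servers sets up: `p` barrier parties
# in total, the parent plus (p - 1) spawned servers, with queues carrying
# instructions in and results out. All names are illustrative.
import torch.multiprocessing as mp

def cpu_server(rank, jobs, results, barrier):
    # mp.spawn prepends the process index as `rank`
    barrier.wait()  # ready handshake with the parent
    while True:
        job = jobs.get()
        if job is None:  # None is the shutdown signal
            break
        results.put((rank, job * job))

if __name__ == "__main__":
    p = 4
    ctx = mp.get_context("spawn")  # match mp.spawn's start method
    jobs, results = ctx.Queue(), ctx.Queue()
    barrier = ctx.Barrier(p)
    servers = mp.spawn(cpu_server, args=(jobs, results, barrier),
                       nprocs=p - 1, join=False)
    barrier.wait()  # the parent is the p-th party
    for j in range(6):
        jobs.put(j)
    print(sorted(results.get() for _ in range(6)))
    for _ in range(p - 1):
        jobs.put(None)
    servers.join()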
def start_worker(self):
    self.eval_workers = []
    self.eval_shared_que = self.manager.Queue(self.eval_worker_nums)
    self.eval_start_barrier = mp.Barrier(self.eval_worker_nums)

    # task_cls, task_args, env_params
    tasks = list(self.env_cls.keys())

    assert self.worker_nums == 0
    assert self.eval_worker_nums == self.env.num_tasks

    self.env_info.env = None
    self.env_info.num_tasks = self.env.num_tasks
    self.env_info.env_cls = generate_single_mt_env
    single_mt_env_args = {
        "task_cls": None,
        "task_args": None,
        "env_rank": 0,
        "num_tasks": self.env.num_tasks,
        "max_obs_dim": np.prod(self.env.observation_space.shape),
        "env_params": self.env_args[0],
        "meta_env_params": self.env_args[2],
    }

    for i, task in enumerate(tasks):
        env_cls = self.env_cls[task]

        self.env_info.env_rank = i
        self.env_info.env_args = single_mt_env_args
        self.env_info.env_args["task_cls"] = env_cls
        self.env_info.env_args["task_args"] = copy.deepcopy(
            self.env_args[1][task])

        start_epoch = 0
        if "start_epoch" in self.env_info.env_args["task_args"]:
            # start_epoch = self.env_info.env_args["task_args"]["start_epoch"]
            del self.env_info.env_args["task_args"]["start_epoch"]

        self.env_info.env_args["env_rank"] = i
        eval_p = mp.Process(
            target=self.__class__.eval_worker_process,
            args=(self.shared_funcs["pf"], self.env_info,
                  self.eval_shared_que, self.eval_start_barrier,
                  self.eval_epochs, start_epoch, task))
        eval_p.start()
        self.eval_workers.append(eval_p)
def __init__(self, cfg):
    super(ParallelExplorer, self).__init__()
    # mp.set_start_method('forkserver') must be called in the main module.
    self.processes = []
    self.comms = []
    self.followup = []
    self.replayBuffers = []
    self.curThread = 0
    self.nThreads = cfg.numEnv
    self.meanRewards = [-float('nan')] * self.nThreads
    self.numEps = [0] * self.nThreads
    self.nInBuffers = 0
    self.totSteps = 0
    self.maxBuffers = cfg.numFramesPerBuffer
    self.exploreSched = cfg.exploreSched
    self.model = cfg.model
    self.actionVec = torch.LongTensor(self.nThreads).zero_()
    self.actionVec.storage().share_memory_()
    self.threads = np.atleast_1d(np.arange(self.nThreads, dtype=np.int64))
    self.toTensorImg, self.toTensor, self.use_cuda = \
        TensorConfig.getTensorConfiguration()
    self.cfg = cfg
    # One party per explorer thread plus one for this (parent) process.
    self.barrier = mp.Barrier(self.nThreads + 1)
    # How to sample.
    self.sampleFn = self._sampleRandom
    if cfg.sampleLatest:
        self.sampleFn = self._sampleLatest
    # Sample from all threads.
    for idx in range(self.nThreads):
        print('Exploration: Actually set the seed properly.')  # TODO
        sendP, subpipe = mp.Pipe()
        explorer = ExploreProcess(subpipe, cfg, idx, idx, self.actionVec,
                                  self.barrier)
        explorer.daemon = True
        explorer.start()
        self.processes.append(explorer)
        self.comms.append(sendP)
        self.replayBuffers.append(
            ReplayBuffer(cfg.numFramesPerBuffer, cfg.stackFrameLen))
        self.followup.append(idx)
    self.nAct = self.processes[0].env.action_space.n
    self.imshape = self.processes[0].env.observation_space.shape
    print('Parent PID: %d' % os.getpid())
def __init__(self, config):
    super(TrainManager, self).__init__()
    device = torch.device('cuda') if torch.cuda.is_available() \
        else torch.device('cpu')
    self.config = config
    self.training_config = self.config['training']

    if self.training_config['transfer']:
        self.target_model = load_model(self.training_config['model_path'])
    else:
        self.target_model = create_model(self.config['model'])

    alt_model = None
    if self.training_config.get('init_actor', False):
        alt_model = load_model(self.training_config['model_path'])
        self.target_model.init_actor(alt_model)
    if self.training_config.get('init_critic', False):
        if alt_model is None:
            alt_model = load_model(self.training_config['model_path'])
        self.target_model.init_critic(alt_model)
    self._prime_model(self.target_model, device)

    self.models = []
    self.proxy_models = []
    for _ in range(self.training_config['num_threads_training']):
        model = copy.deepcopy(self.target_model)
        proxy_model = copy.deepcopy(self.target_model)
        self._prime_model(model, device)
        self._prime_model(proxy_model, device)
        self.models.append(model)
        self.proxy_models.append(proxy_model)

    self.processes = []
    self.episode_queues = [
        torch_mp.Queue(maxsize=128)
        for _ in range(self.training_config['num_threads_sampling'])
    ]
    self.sample_queues = [
        torch_mp.Queue(maxsize=self.training_config['sampling_queue_max_len'])
        for _ in range(self.training_config['num_threads_training'])
    ]
    self.action_conns = [
        torch_mp.Pipe(duplex=False)
        for _ in range(self.training_config['num_threads_exploring_virtual'])
    ]
    self.observation_conns = [
        torch_mp.Pipe(duplex=False)
        for _ in range(self.training_config['num_threads_exploring_virtual'])
    ]
    self.observation_queue = torch_mp.Queue()
    self.action_queue = torch_mp.Queue()

    self.start_barrier = torch_mp.Barrier(
        self.training_config['num_threads_training'])
    self.finish_barrier = torch_mp.Barrier(
        self.training_config['num_threads_training'])
    self.update_lock = torch_mp.Lock()

    self.best_reward = Value('f', 0.0)
    self.global_episode = Value('i', 0)
    self.global_update_step = Value('i', 0)
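# --- Added example: a minimal sketch of the Barrier/Lock/Value trio
# TrainManager wires up above: the barriers count only the training
# processes (no extra party for the parent), and the lock serializes
# updates to the shared counters. Illustrative names throughout.
import torch.multiprocessing as mp

def trainer(rank, start_barrier, finish_barrier, update_lock, global_step):
    start_barrier.wait()  # every trainer begins together
    for _ in range(100):
        with update_lock:  # `Value.value += 1` is not atomic on its own
            global_step.value += 1
    finish_barrier.wait()  # every trainer ends together

if __name__ == "__main__":
    n = 4
    start_barrier = mp.Barrier(n)
    finish_barrier = mp.Barrier(n)
    update_lock = mp.Lock()
    global_step = mp.Value('i', 0)
    procs = [mp.Process(target=trainer,
                        args=(i, start_barrier, finish_barrier,
                              update_lock, global_step))
             for i in range(n)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print(global_step.value)  # 400: no increments were lost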
        'Number of tasks provided does not match the number of batch sizes provided.')

n_gpus = int(args.n_gpus)
n_tasks = len(tasks) * n_jobs
shared_model = omninet.OmniNet(gpu_id=0)
if restore != -1:
    shared_model.restore(model_save_path, restore)
else:
    restore = 0
shared_model = shared_model.to(0)
shared_model.share_memory()
counters = [Counter(restore) for i in range(len(tasks))]
barrier = mp.Barrier(n_tasks)
start = int(restore / n_jobs)
# Declare training processes for multi-GPU hogwild training
processes = []
for i in range(n_tasks):
    # If more than one GPU is used, use the first GPU only for model sharing.
    if n_gpus > 1:
        gpu_id = i % n_gpus
    else:
        gpu_id = 0
    process = mp.Process(target=train,
                         args=(shared_model, tasks[i % len(tasks)],
                               batch_sizes[i % len(tasks)],
                               int(n_iters / n_jobs), gpu_id, start, restore,
                               counters[i % len(tasks)], barrier,
                               (save_interval if i == 0 else None),
def start_worker(args, logger):
    """Start kvclient for training."""
    init_time_start = time.time()
    time.sleep(WAIT_TIME)  # wait for the launch script

    server_namebook = dgl.contrib.read_ip_config(filename=args.ip_config)
    args.machine_id = get_local_machine_id(server_namebook)

    dataset, entity_partition_book, local2global = get_partition_dataset(
        args.data_path, args.dataset, args.format, args.machine_id)

    n_entities = dataset.n_entities
    n_relations = dataset.n_relations

    print('Partition %d n_entities: %d' % (args.machine_id, n_entities))
    print('Partition %d n_relations: %d' % (args.machine_id, n_relations))

    entity_partition_book = F.tensor(entity_partition_book)
    relation_partition_book = get_long_tail_partition(dataset.n_relations,
                                                      args.total_machine)
    relation_partition_book = F.tensor(relation_partition_book)
    local2global = F.tensor(local2global)

    relation_partition_book.share_memory_()
    entity_partition_book.share_memory_()
    local2global.share_memory_()

    train_data = TrainDataset(dataset, args, ranks=args.num_client)
    # If there is no cross-partition relation, we fall back to strict_rel_part.
    args.strict_rel_part = args.mix_cpu_gpu and (train_data.cross_part == False)
    args.soft_rel_part = args.mix_cpu_gpu and args.soft_rel_part and \
        train_data.cross_part

    if args.neg_sample_size_eval < 0:
        args.neg_sample_size_eval = dataset.n_entities
    args.batch_size = get_compatible_batch_size(args.batch_size,
                                                args.neg_sample_size)
    args.batch_size_eval = get_compatible_batch_size(args.batch_size_eval,
                                                     args.neg_sample_size_eval)

    args.num_workers = 8  # fix num_workers to 8

    train_samplers = []
    for i in range(args.num_client):
        train_sampler_head = train_data.create_sampler(
            args.batch_size, args.neg_sample_size, args.neg_sample_size,
            mode='head', num_workers=args.num_workers, shuffle=True,
            exclude_positive=False, rank=i)
        train_sampler_tail = train_data.create_sampler(
            args.batch_size, args.neg_sample_size, args.neg_sample_size,
            mode='tail', num_workers=args.num_workers, shuffle=True,
            exclude_positive=False, rank=i)
        train_samplers.append(
            NewBidirectionalOneShotIterator(train_sampler_head,
                                            train_sampler_tail,
                                            args.neg_sample_size,
                                            args.neg_sample_size, True,
                                            n_entities))

    dataset = None

    model = load_model(logger, args, n_entities, n_relations)
    model.share_memory()

    print('Total initialize time {:.3f} seconds'.format(
        time.time() - init_time_start))

    rel_parts = train_data.rel_parts \
        if args.strict_rel_part or args.soft_rel_part else None
    cross_rels = train_data.cross_rels if args.soft_rel_part else None

    procs = []
    barrier = mp.Barrier(args.num_client)
    for i in range(args.num_client):
        proc = mp.Process(target=dist_train_test,
                          args=(args, model, train_samplers[i],
                                entity_partition_book,
                                relation_partition_book, local2global, i,
                                rel_parts, cross_rels, barrier))
        procs.append(proc)
        proc.start()
    for proc in procs:
        proc.join()
def profile(run_config: RunConfig):
    # One party per model worker plus one for this (parent) process.
    barrier = mp.Barrier(run_config.total_models + 1)
    profile_data_path = os.path.join(run_config.data_path, "profile")
    os.makedirs(profile_data_path, exist_ok=True)

    for model_combination in gen_model_combinations(
        run_config.models_name, run_config.profiling_combinations
    ):
        print(model_combination)
        profile_filename = "_".join(model_combination) + ".csv"
        profile_file = open(os.path.join(profile_data_path, profile_filename),
                            "w+")
        wr = csv.writer(profile_file, dialect="excel")
        profile_head = [
            "model", "start", "end", "bs", "seq_len",
        ] * run_config.total_models + ["median", "mean", "var"]
        wr.writerow(profile_head)

        worker_list = []
        for worker_id, model_name in enumerate(model_combination):
            pipe_parent, pipe_child = mp.Pipe()
            model_worker = ProfilerWorker(
                run_config, model_name, run_config.supported_batchsize,
                run_config.supported_seqlen, pipe_child, barrier, worker_id,
            )
            model_worker.start()
            worker_list.append((model_worker, pipe_parent))
        barrier.wait()  # wait until every worker has loaded its model

        for bs_it in itertools.product(
            run_config.supported_batchsize, repeat=run_config.total_models
        ):
            model_ids = list(range(run_config.total_models))
            profiled_config = set()
            for test_i in range(run_config.total_test):
                model_config = []
                qos_query_cnt = random.randrange(1, run_config.total_models + 1)
                new_query_cnt = random.randrange(1, run_config.total_models + 1)
                qos_ids = random.sample(model_ids, qos_query_cnt)
                new_ids = random.sample(model_ids, new_query_cnt)
                for i in range(run_config.total_models):
                    start, end = gen_partition(
                        run_config.models_len[model_combination[i]],
                        i in qos_ids,
                        i in new_ids,
                    )
                    seq_len = (
                        random.choice(run_config.supported_seqlen)
                        if model_combination[i] == "bert"
                        else 0
                    )
                    model_config.append(
                        [model_combination[i], start, end, bs_it[i], seq_len]
                    )
                pending_profile_config = tuple(tuple(i) for i in model_config)
                if pending_profile_config in profiled_config:
                    print(
                        "Profiled model config: {}, {}, {}, {}, {}, "
                        "{}, {}, {}, {}, {}".format(
                            *model_config[0], *model_config[1]
                        )
                    )
                    continue
                profiled_config.add(pending_profile_config)

                for i in range(run_config.total_models):
                    _, model_pipe = worker_list[i]
                    model_pipe.send(
                        (model_config[i][0], "prepare", model_config[i][1],
                         model_config[i][2], model_config[i][3],
                         model_config[i][4])
                    )
                barrier.wait()  # every worker has prepared its inputs

                record = []
                with tqdm(range(run_config.test_loop)) as t:
                    for loop_i in t:
                        start_time = datetime.datetime.now()
                        for i in range(run_config.total_models):
                            _, model_pipe = worker_list[i]
                            model_pipe.send(
                                (model_config[i][0], "forward",
                                 model_config[i][1], model_config[i][2],
                                 model_config[i][3], model_config[i][4])
                            )
                        barrier.wait()  # released once every worker finishes
                        # Use the full timedelta; `.microseconds` alone would
                        # wrap around for steps longer than one second.
                        elapsed_time_us = (
                            datetime.datetime.now() - start_time
                        ) / datetime.timedelta(microseconds=1)
                        t.set_postfix(elapsed=elapsed_time_us)
                        t.update(1)
                        record.append(elapsed_time_us)

                profile_record = make_record(model_config, record)
                wr.writerow(profile_record)
                profile_file.flush()

        for i in range(run_config.total_models):
            _, model_pipe = worker_list[i]
            model_pipe.send(("none", "terminate", -1, -1, -1, -1))
        for worker, _ in worker_list:
            worker.join()
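# --- Added example: a minimal sketch of the Pipe-plus-Barrier lockstep that
# profile() relies on: the parent sends a command to every worker over its
# pipe, then the shared barrier releases only when all workers (and the
# parent) finish the step, so the elapsed-time measurement covers the
# slowest worker. Illustrative names throughout.
import multiprocessing as mp

def worker(pipe, barrier):
    barrier.wait()  # startup handshake with the parent
    while True:
        cmd = pipe.recv()
        if cmd == "terminate":
            break
        # ... one profiled forward pass would run here ...
        barrier.wait()  # the parent's timer stops at this release

if __name__ == "__main__":
    n_workers = 2
    barrier = mp.Barrier(n_workers + 1)  # workers plus the parent
    procs, pipes = [], []
    for _ in range(n_workers):
        parent_end, child_end = mp.Pipe()
        proc = mp.Process(target=worker, args=(child_end, barrier))
        proc.start()
        procs.append(proc)
        pipes.append(parent_end)
    barrier.wait()  # every worker is up
    for step in range(3):
        for pipe in pipes:
            pipe.send("forward")  # kick off one step everywhere
        barrier.wait()  # returns once all workers finish the step
    for pipe in pipes:
        pipe.send("terminate")
    for proc in procs:
        proc.join()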
def main():
    args = ArgParser().parse_args()
    prepare_save_path(args)
    assert args.dataset == 'wikikg90m'
    args.neg_sample_size_eval = 1000
    set_global_seed(args.seed)

    init_time_start = time.time()
    # load dataset and samplers
    dataset = get_dataset(args.data_path, args.dataset, args.format,
                          args.delimiter, args.data_files,
                          args.has_edge_importance)

    if args.neg_sample_size_eval < 0:
        args.neg_sample_size_eval = dataset.n_entities
    args.batch_size = get_compatible_batch_size(args.batch_size,
                                                args.neg_sample_size)
    args.batch_size_eval = get_compatible_batch_size(args.batch_size_eval,
                                                     args.neg_sample_size_eval)

    # We should turn on mix CPU-GPU training for multi-GPU training.
    if len(args.gpu) > 1:
        args.mix_cpu_gpu = True
        if args.num_proc < len(args.gpu):
            args.num_proc = len(args.gpu)
    # The number of processes should match the number of GPUs.
    if len(args.gpu) > 1 and args.num_proc > 1:
        assert args.num_proc % len(args.gpu) == 0, \
            'The number of processes needs to be divisible by the number of GPUs'
    # For multiprocessing training, the training processes need to be
    # synchronized periodically.
    if args.num_proc > 1:
        args.force_sync_interval = 1000

    args.eval_filter = not args.no_eval_filter
    if args.neg_deg_sample_eval:
        assert not args.eval_filter, \
            "if negative sampling is based on degree, we can't filter positive edges."

    args.soft_rel_part = args.mix_cpu_gpu and args.rel_part

    print("To build training dataset")
    t1 = time.time()
    train_data = TrainDataset(dataset, args, ranks=args.num_proc,
                              has_importance=args.has_edge_importance)
    print("Training dataset built, it takes %d seconds" % (time.time() - t1))
    # If there is no cross-partition relation, we fall back to strict_rel_part.
    args.strict_rel_part = args.mix_cpu_gpu and (train_data.cross_part == False)
    args.num_workers = 8  # fix num_workers to 8

    set_logger(args)
    with open(os.path.join(args.save_path, args.encoder_model_name), 'w') as f:
        f.write(args.encoder_model_name)

    if args.num_proc > 1:
        train_samplers = []
        for i in range(args.num_proc):
            print("Building training sampler for proc %d" % i)
            t1 = time.time()
            # for each GPU, allocate num_proc // num_GPU processes
            train_sampler_head = train_data.create_sampler(
                args.batch_size, args.neg_sample_size, args.neg_sample_size,
                mode='head', num_workers=args.num_workers, shuffle=True,
                exclude_positive=False, rank=i)
            train_sampler_tail = train_data.create_sampler(
                args.batch_size, args.neg_sample_size, args.neg_sample_size,
                mode='tail', num_workers=args.num_workers, shuffle=True,
                exclude_positive=False, rank=i)
            train_samplers.append(
                NewBidirectionalOneShotIterator(train_sampler_head,
                                                train_sampler_tail,
                                                args.neg_sample_size,
                                                args.neg_sample_size, True,
                                                dataset.n_entities,
                                                args.has_edge_importance))
            print("Training sampler for proc %d created, it takes %s seconds"
                  % (i, time.time() - t1))
        train_sampler = NewBidirectionalOneShotIterator(
            train_sampler_head, train_sampler_tail, args.neg_sample_size,
            args.neg_sample_size, True, dataset.n_entities,
            args.has_edge_importance)
    else:  # single process; used for debugging
        train_sampler_head = train_data.create_sampler(
            args.batch_size, args.neg_sample_size, args.neg_sample_size,
            mode='head', num_workers=args.num_workers, shuffle=True,
            exclude_positive=False)
        train_sampler_tail = train_data.create_sampler(
            args.batch_size, args.neg_sample_size, args.neg_sample_size,
            mode='tail', num_workers=args.num_workers, shuffle=True,
            exclude_positive=False)
        train_sampler = NewBidirectionalOneShotIterator(
            train_sampler_head, train_sampler_tail, args.neg_sample_size,
            args.neg_sample_size, True, dataset.n_entities,
            args.has_edge_importance)

    if args.valid or args.test:
        if len(args.gpu) > 1:
            args.num_test_proc = args.num_proc \
                if args.num_proc < len(args.gpu) else len(args.gpu)
        else:
            args.num_test_proc = args.num_proc
        print("To create eval_dataset")
        t1 = time.time()
        eval_dataset = EvalDataset(dataset, args)
        print("eval_dataset created, it takes %d seconds" % (time.time() - t1))

    if args.valid:
        if args.num_proc > 1:
            # Only tail samplers are used; the head samplers are disabled.
            valid_sampler_tails = []
            for i in range(args.num_proc):
                print("creating valid sampler for proc %d" % i)
                t1 = time.time()
                valid_sampler_tail = eval_dataset.create_sampler(
                    'valid', args.batch_size_eval, args.neg_sample_size_eval,
                    args.neg_sample_size_eval, args.eval_filter, mode='tail',
                    num_workers=args.num_workers, rank=i, ranks=args.num_proc)
                valid_sampler_tails.append(valid_sampler_tail)
                print("Valid sampler for proc %d created, it takes %s seconds"
                      % (i, time.time() - t1))
        else:  # single process; used for debugging
            valid_sampler_tail = eval_dataset.create_sampler(
                'valid', args.batch_size_eval, args.neg_sample_size_eval, 1,
                args.eval_filter, mode='tail', num_workers=args.num_workers,
                rank=0, ranks=1)

    if args.test:
        if args.num_test_proc > 1:
            test_sampler_tails = []
            for i in range(args.num_test_proc):
                print("creating test sampler for proc %d" % i)
                t1 = time.time()
                test_sampler_tail = eval_dataset.create_sampler(
                    'test', args.batch_size_eval, args.neg_sample_size_eval,
                    args.neg_sample_size_eval, args.eval_filter, mode='tail',
                    num_workers=args.num_workers, rank=i,
                    ranks=args.num_test_proc)
                test_sampler_tails.append(test_sampler_tail)
                print("Test sampler for proc %d created, it takes %s seconds"
                      % (i, time.time() - t1))
        else:
            test_sampler_tail = eval_dataset.create_sampler(
                'test', args.batch_size_eval, args.neg_sample_size_eval, 1,
                args.eval_filter, mode='tail', num_workers=args.num_workers,
                rank=0, ranks=1)

    # load model
    print("To create model")
    t1 = time.time()
    model = load_model(args, dataset.n_entities, dataset.n_relations,
                       dataset.entity_feat.shape[1],
                       dataset.relation_feat.shape[1])
    if args.encoder_model_name in ['roberta', 'concat']:
        model.entity_feat.emb = dataset.entity_feat
        model.relation_feat.emb = dataset.relation_feat
    print("Model created, it takes %s seconds" % (time.time() - t1))
    model.evaluator = WikiKG90MEvaluator()

    if args.num_proc > 1 or args.async_update:
        model.share_memory()

    emap_file = dataset.emap_fname
    rmap_file = dataset.rmap_fname
    # We need to free all memory referenced by dataset.
    eval_dataset = None
    dataset = None

    print('Total initialize time {:.3f} seconds'.format(
        time.time() - init_time_start))

    # train
    start = time.time()
    rel_parts = train_data.rel_parts \
        if args.strict_rel_part or args.soft_rel_part else None
    cross_rels = train_data.cross_rels if args.soft_rel_part else None

    if args.num_proc > 1:
        procs = []
        barrier = mp.Barrier(args.num_proc)
        for i in range(args.num_proc):
            # Head samplers are disabled; only the tail samplers are passed.
            valid_sampler = [valid_sampler_tails[i]] if args.valid else None
            test_sampler = [test_sampler_tails[i]] if args.test else None
            proc = mp.Process(target=train_mp,
                              args=(args, model, train_samplers[i],
                                    valid_sampler, test_sampler, i, rel_parts,
                                    cross_rels, barrier))
            procs.append(proc)
            proc.start()
        for proc in procs:
            proc.join()
    else:
        valid_samplers = [valid_sampler_tail] if args.valid else None
        test_samplers = [test_sampler_tail] if args.test else None
        train(args, model, train_sampler, valid_samplers, test_samplers,
              rel_parts=rel_parts)

    print('training takes {} seconds'.format(time.time() - start))
def start_worker(args, logger):
    """Start kvclient for training."""
    train_time_start = time.time()

    server_namebook = dgl.contrib.read_ip_config(filename=args.ip_config)
    args.machine_id = get_local_machine_id(server_namebook)

    dataset, entity_partition_book, local2global = get_partition_dataset(
        args.data_path, args.dataset, args.format, args.machine_id)

    n_entities = dataset.n_entities
    n_relations = dataset.n_relations

    print('Partition %d n_entities: %d' % (args.machine_id, n_entities))
    print('Partition %d n_relations: %d' % (args.machine_id, n_relations))

    entity_partition_book = F.tensor(entity_partition_book)
    relation_partition_book = get_long_tail_partition(dataset.n_relations,
                                                      args.total_machine)
    relation_partition_book = F.tensor(relation_partition_book)
    local2global = F.tensor(local2global)

    relation_partition_book.share_memory_()
    entity_partition_book.share_memory_()
    local2global.share_memory_()

    model = load_model(logger, args, n_entities, n_relations)
    model.share_memory()

    # When we generate a batch of negative edges from a set of positive
    # edges, we first divide the positive edges into chunks and corrupt the
    # edges in a chunk together. By default, the chunk size is equal to the
    # negative sample size. Usually this works well, but we also allow users
    # to specify the chunk size themselves.
    if args.neg_chunk_size < 0:
        args.neg_chunk_size = args.neg_sample_size

    num_workers = NUM_WORKER
    train_data = TrainDataset(dataset, args, ranks=args.num_client)
    train_samplers = []
    for i in range(args.num_client):
        train_sampler_head = train_data.create_sampler(
            args.batch_size, args.neg_sample_size, args.neg_chunk_size,
            mode='head', num_workers=num_workers, shuffle=True,
            exclude_positive=False, rank=i)
        train_sampler_tail = train_data.create_sampler(
            args.batch_size, args.neg_sample_size, args.neg_chunk_size,
            mode='tail', num_workers=num_workers, shuffle=True,
            exclude_positive=False, rank=i)
        train_samplers.append(
            NewBidirectionalOneShotIterator(train_sampler_head,
                                            train_sampler_tail,
                                            args.neg_chunk_size,
                                            args.neg_sample_size, True,
                                            n_entities))

    dataset = None
    print('Total data loading time {:.3f} seconds'.format(
        time.time() - train_time_start))

    rel_parts = train_data.rel_parts \
        if args.strict_rel_part or args.soft_rel_part else None
    cross_rels = train_data.cross_rels if args.soft_rel_part else None

    args.num_thread = NUM_THREAD
    procs = []
    barrier = mp.Barrier(args.num_client)
    for i in range(args.num_client):
        proc = mp.Process(target=dist_train_test,
                          args=(args, model, train_samplers[i],
                                entity_partition_book,
                                relation_partition_book, local2global, i,
                                rel_parts, cross_rels, barrier))
        procs.append(proc)
        proc.start()
    for proc in procs:
        proc.join()
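# --- Added example: a minimal sketch of the shared-memory (Hogwild-style)
# pattern the two kvclient launchers above use: the model is moved into
# shared memory once, every client process updates it lock-free, and a
# barrier sized to num_client lets the clients align with each other (the
# parent only joins). The model and loss are illustrative stand-ins.
import torch
import torch.multiprocessing as mp

def dist_train(rank, model, barrier, num_steps=10):
    barrier.wait()  # all clients start training together
    opt = torch.optim.SGD(model.parameters(), lr=0.01)
    for _ in range(num_steps):
        loss = model(torch.randn(8, 4)).pow(2).mean()
        opt.zero_grad()
        loss.backward()
        opt.step()  # lock-free update of the shared parameters

if __name__ == "__main__":
    num_client = 4
    model = torch.nn.Linear(4, 1)
    model.share_memory()  # parameters now live in shared memory
    barrier = mp.Barrier(num_client)  # clients only; no party for the parent
    procs = [mp.Process(target=dist_train, args=(i, model, barrier))
             for i in range(num_client)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()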
def train(
    self,
    args: Namespace,
    env_builder: Callable[[], Env],
    algo: RLAlgo
) -> None:
    """
    Trains the algorithm on the environment given, using the argument
    namespace as parameters. "args" must have the following attributes:
    {
        experiment_path (str): The path to save experiment results and
            models.
        render (bool): Render the environment.
        steps_per_episode (Optional[int]): The number of steps in each
            episode.
        silent (bool): Will run without standard output from agents.
        action_mask (Optional[Tuple[bool, ...]]): The action mask to mask
            or unmask.
        masked (Optional[bool]): If an action mask is given, should be
            True if the returned agent actions are already masked.
        default_action (Optional[Tuple[float, ...]]): If an action mask is
            given and going from masked -> unmasked, this should be the
            default values for the actions.
        decay (float): The gamma decay for the target Q-values.
        n_steps (int): The number of decay steps.
        num_agents (int): The number of agents to run concurrently,
            0 is single process.
        model_sync_interval (int): The number of training steps between
            agent model syncs; if 0, all processes will share the same
            model.
        num_prefetch_batches (int): The number of batches to prefetch to
            the learner in distributed learning.
        local_batch_size (int): The number of experiences the agent sends
            at once in distributed learning.
        vectorized (bool): If the environment is vectorized.
        recurrent (bool): Make the network recurrent (using LSTM).
        play (bool): Runs the environment using the model instead of
            training.
        exploration (str, ["rnd", "munchausen"]): The type of exploration
            to use.
        episodes (int): The number of episodes to play for if playing.
        er_capacity (int): The capacity of the experience replay buffer.
        batch_size (int): The batch size of the training set.
        training_steps (int): The number of training steps to train for.
        start_size (int): The size of the replay buffer before training.
        er_alpha (float): The alpha value for PER.
        er_beta (float): The beta value for PER.
        er_beta_increment (float): The increment of the beta value on each
            sample for PER.
        er_epsilon (float): The epsilon value for PER.
        burn_in_length (int): If recurrent, the number of burn-in samples
            for R2D2.
        sequence_length (int): If recurrent, the length of the sequence to
            train on.
        max_factor (int): If recurrent, the factor of max priority to mean
            priority for R2D2.
    }

    Args:
        args: The namespace of arguments for training.
        env_builder: The nullary function to create the environment.
        algo: The algorithm to train.
""" logs_path = None save_path = None if args.experiment_path is not None: logs_path = Path(args.experiment_path, "logs") logs_path.mkdir(parents=True, exist_ok=True) logs_path = str(logs_path) save_path = Path(args.experiment_path, "models") save_path.mkdir(parents=True, exist_ok=True) save_path = str(save_path) # Create agent class agent_builder = partial( OffPolicyAgent, algo=algo, render=args.render, silent=args.silent ) steps_per_episode = ( args.steps_per_episode if "steps_per_episode" in args else None ) agent_builder = compose( agent_builder, partial(TimeLimitAgent, max_steps=steps_per_episode) ) if not args.play: # Experience replay # Won't increment in multiple processes to keep it consistent # across actors er_beta_increment = ( args.er_beta_increment if args.num_agents == 0 else 0 ) if args.recurrent: experience_replay_func = partial( TorchR2D2, alpha=args.er_alpha, beta=args.er_beta, beta_increment=er_beta_increment, epsilon=args.er_epsilon, max_factor=args.max_factor ) else: experience_replay_func = partial( TorchPER, alpha=args.er_alpha, beta=args.er_beta, beta_increment=er_beta_increment, epsilon=args.er_epsilon ) if args.num_agents > 0: recv_pipes = [] send_pipes = [] prestart_func = None if args.model_sync_interval == 0: self._start_training(algo, args) algo.share_memory() recv_pipes = [None] * args.num_agents else: prestart_func = partial( self._start_training, algo=algo, args=args ) # Force CPU for now to avoid re-instantiating cuda in # subprocesses algo.device = torch.device("cpu") algo = algo.to(algo.device) for i in range(args.num_agents): param_pipe = mp.Pipe(duplex=False) recv_pipes.append(param_pipe[0]) send_pipes.append(param_pipe[1]) # Just needed to get the error/priority calculations dummy_experience_replay = experience_replay_func(capacity=1) # Must come before the other wrapper since there are infinite # recursion errors # TODO come up with a better way to implement wrappers agent_builder = compose( agent_builder, partial_iterator( QueueAgent, agent_id=(iter(range(args.num_agents)), True), experience_replay=(dummy_experience_replay, False), param_pipe=(iter(recv_pipes), True) ) ) agent_builder = compose( agent_builder, partial(TorchRLAgent, batch_state=not args.vectorized) ) if "action_mask" in args and args.action_mask: # TODO: Will have to add an action mask wrapper later if args.masked: agent_builder = compose( agent_builder, partial( UnmaskedActionAgent, action_mask=args.action_mask, default_action=args.default_action ) ) agent_builder = compose(agent_builder, TorchOffPolicyAgent) if args.recurrent: agent_builder = compose( agent_builder, SequenceInputAgent, TorchRecurrentAgent ) if args.play: algo = algo.to(args.device) algo.eval() agent_logger = ( None if logs_path is None else TensorboardLogger(logs_path + "/play-agent") ) agent = agent_builder(env=env_builder(), logger=agent_logger) agent.play(args.episodes) else: if args.exploration == "rnd": agent_builder = compose(agent_builder, IntrinsicRewardAgent) elif args.exploration == "munchausen": agent_builder = compose( agent_builder, partial(MunchausenAgent, alpha=0.9) ) algo.train() if args.recurrent: agent_builder = compose( agent_builder, partial( ExperienceSequenceAgent, sequence_length=( args.burn_in_length + args.sequence_length ), overlap=args.burn_in_length ) ) experience_replay = experience_replay_func( capacity=args.er_capacity ) base_agent_logs_path = None if logs_path is not None: base_agent_logs_path = logs_path + "/train-agent" # Single process if args.num_agents == 0: 
            self._start_training(algo, args)

            agent_logger = None
            if base_agent_logs_path is not None:
                agent_logger = TensorboardLogger(base_agent_logs_path)

            agent = agent_builder(env=env_builder(), logger=agent_logger)
            agent.train(
                args.episodes, 1, args.discount, args.n_steps,
                experience_replay, args.batch_size, args.start_size,
                save_path, args.save_interval
            )
        # Multiple processes
        else:
            done_event = mp.Event()
            # Number of agents + worker + learner
            queue_barrier = mp.Barrier(args.num_agents + 2)

            agent_queue = mp.Queue(
                maxsize=args.num_prefetch_batches * args.num_agents * 4
            )
            sample_queue = mp.Queue(maxsize=args.num_prefetch_batches)
            priority_queue = mp.Queue(maxsize=args.num_prefetch_batches)

            learner_args = (dummy_experience_replay,)
            learner_train_args = (
                algo, done_event, queue_barrier, args.training_steps,
                sample_queue, priority_queue, send_pipes,
                args.model_sync_interval, save_path, args.save_interval
            )

            worker = TorchApexWorker()
            worker_args = (
                experience_replay, done_event, queue_barrier, agent_queue,
                sample_queue, priority_queue, args.batch_size,
                args.start_size
            )

            agent_builders = []
            agent_train_args = []
            agent_train_kwargs = []

            for i in range(args.num_agents):
                agent_logger = None
                if base_agent_logs_path is not None:
                    agent_logs_path = (
                        base_agent_logs_path + "-" + str(i + 1)
                    )
                    agent_logger = TensorboardLogger(agent_logs_path)

                agent_builders.append(
                    partial(agent_builder, logger=agent_logger)
                )
                agent_train_args.append((
                    1, args.local_batch_size, args.discount, args.n_steps,
                    agent_queue, queue_barrier
                ))
                agent_train_kwargs.append({
                    "exit_condition": done_event.is_set
                })

            runner = ApexRunner(done_event)
            runner.start(
                learner_args, learner_train_args, worker, worker_args,
                env_builder, agent_builders, agent_train_args,
                agent_train_kwargs, prestart_func
            )
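# --- Added example: a minimal sketch of the `queue_barrier` counting used
# in the multi-process branch above: one party per queue endpoint (the
# agents, the worker, and the coordinating process), so nobody tears down a
# queue while another party is still using it. Payloads and counts are
# illustrative.
import multiprocessing as mp

def agent(rank, items_per_agent, agent_queue, queue_barrier):
    for i in range(items_per_agent):
        agent_queue.put((rank, i))  # produce experiences
    queue_barrier.wait()  # done sending; safe for consumers to finish

def worker(total_items, agent_queue, queue_barrier):
    for _ in range(total_items):
        agent_queue.get()  # consume (and here, discard) experiences
    queue_barrier.wait()

if __name__ == "__main__":
    num_agents, items_per_agent = 3, 100
    # agents + worker + this process, matching mp.Barrier(num_agents + 2)
    queue_barrier = mp.Barrier(num_agents + 2)
    agent_queue = mp.Queue()
    procs = [mp.Process(target=agent,
                        args=(i, items_per_agent, agent_queue, queue_barrier))
             for i in range(num_agents)]
    procs.append(mp.Process(target=worker,
                            args=(num_agents * items_per_agent, agent_queue,
                                  queue_barrier)))
    for p in procs:
        p.start()
    queue_barrier.wait()  # released once all queue traffic is finished
    for p in procs:
        p.join()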
    v.env = env
    v.env.canvas = v.canvas
    v.visualize()

    gen = env.run(True)
    gen.send(None)
    while True:
        try:
            gen.send((None, ))
        except StopIteration:
            # barrier.wait()
            torch.save(env.a3c_model.state_dict(),
                       "./tmp/model_%d_%d" % (env.game_no, os.getpid()))
            # barrier.wait()
            gen = env.run(True)
            gen.send(None)


if __name__ == '__main__':
    model = ActorCritic(5, 9, 64)

    import torch.nn.init as weight_init
    for name, param in model.named_parameters():
        weight_init.normal_(param)  # in-place init; `normal` is deprecated
    model.share_memory()

    barrier = mp.Barrier(2)
    parallel.start_parallel(DoubleA3CPPOEnv, model, np=2,
                            func=test_without_gui, args=barrier)