def __init__(self, loader):
    self.dataset = loader.dataset
    self.collate_fn = loader.collate_fn
    self.batch_sampler = loader.batch_sampler
    self.num_workers = loader.num_workers
    self.pin_memory = loader.pin_memory and torch.cuda.is_available()
    self.timeout = loader.timeout
    self.done_event = threading.Event()

    self.sample_iter = iter(self.batch_sampler)

    if self.num_workers > 0:
        self.worker_init_fn = loader.worker_init_fn
        self.index_queues = [
            multiprocessing.Queue() for _ in range(self.num_workers)
        ]
        self.worker_queue_idx = 0
        self.worker_result_queue = multiprocessing.SimpleQueue()
        self.batches_outstanding = 0
        self.worker_pids_set = False
        self.shutdown = False
        self.send_idx = 0
        self.rcvd_idx = 0
        self.reorder_dict = {}

        base_seed = torch.LongTensor(1).random_()[0]

        self.workers = [
            multiprocessing.Process(
                target=_ms_loop,
                args=(self.dataset, self.index_queues[i],
                      self.worker_result_queue, self.collate_fn,
                      base_seed + i, self.worker_init_fn, i))
            for i in range(self.num_workers)
        ]

        if self.pin_memory or self.timeout > 0:
            self.data_queue = queue.Queue()
            if self.pin_memory:
                maybe_device_id = torch.cuda.current_device()
            else:
                # do not initialize cuda context if not necessary
                maybe_device_id = None
            self.worker_manager_thread = threading.Thread(
                target=_worker_manager_loop,
                args=(self.worker_result_queue, self.data_queue,
                      self.done_event, self.pin_memory, maybe_device_id))
            self.worker_manager_thread.daemon = True
            self.worker_manager_thread.start()
        else:
            self.data_queue = self.worker_result_queue

        for w in self.workers:
            w.daemon = True  # ensure that the worker exits on process exit
            w.start()

        _update_worker_pids(id(self), tuple(w.pid for w in self.workers))
        _set_SIGCHLD_handler()
        self.worker_pids_set = True

        # prime the prefetch loop
        for _ in range(2 * self.num_workers):
            self._put_indices()
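# Hedged sketch (not the actual `_ms_loop`) of the kind of worker loop such a
# target typically implements: block on the per-worker index queue, assemble a
# batch from the dataset, and push (batch_index, batch) onto the shared result
# queue until a None sentinel signals shutdown.
import random
import torch

def example_worker_loop(dataset, index_queue, result_queue, collate_fn, seed,
                        init_fn, worker_id):
    torch.manual_seed(int(seed))
    random.seed(int(seed))
    if init_fn is not None:
        init_fn(worker_id)
    while True:
        job = index_queue.get()
        if job is None:  # shutdown sentinel from the parent iterator
            break
        idx, batch_indices = job  # (send_idx, indices chosen by the batch sampler)
        try:
            batch = collate_fn([dataset[i] for i in batch_indices])
            result_queue.put((idx, batch))
        except Exception as exc:  # surface worker errors to the parent
            result_queue.put((idx, exc))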
shared_average_model.load_state_dict(
    torch.load(args.model, map_location="cpu"))

if args.memory and os.path.isdir(args.memory):
    memory.load(args.memory)
    print("Load memory from CheckPoint {}, memory len: {}".format(
        args.memory, len(memory)))

if args.data and os.path.isfile(args.data):
    data = torch.load(args.data)  # load the checkpoint once instead of four times
    T.set(data[0])
    BEST.set(data[1])
    scores = data[2]
    m_scores = data[3]
    pre_best = BEST.value()
    print("Load data from CheckPoint {}, T: {}, BEST: {}".format(
        args.data, T.value(), BEST.value()))

memory_queue = mp.SimpleQueue()
model_queue = mp.SimpleQueue()
processes = []
p2_list = ["ReiwaThunder", "RHEA_PI", "Toothless", "FalzAI"]

if not args.evaluate:
    # Start training agents
    for rank in range(1, args.num_processes + 1):
        model_queue.put(
            (shared_model.state_dict(), shared_average_model.state_dict()))
        p2 = p2_list[(rank - 1) % len(p2_list)]
        p = mp.Process(target=actor,
                       args=(rank, args, T, BEST, memory_queue, model_queue, p2))
        p.start()
        print('Process ' + str(rank) + ' started')
        processes.append(p)
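# Hedged sketch: a self-contained illustration of the weight hand-off pattern
# used above (the learner puts state_dicts on a queue and each actor process
# takes one, then streams experience back on a second queue). The tiny linear
# model and random "transition" are placeholders, not the project's actual
# networks, environment, or `actor` function.
import torch
import torch.nn as nn
import torch.multiprocessing as mp

def toy_actor(rank, memory_queue, model_queue):
    model = nn.Linear(4, 2)
    state_dict, _avg_state_dict = model_queue.get()  # weights published by the learner
    model.load_state_dict(state_dict)
    with torch.no_grad():
        value = model(torch.randn(1, 4)).sum().item()
    memory_queue.put((rank, value))  # experience/statistics back to the learner

if __name__ == '__main__':
    memory_q, model_q = mp.SimpleQueue(), mp.SimpleQueue()
    shared, shared_avg = nn.Linear(4, 2), nn.Linear(4, 2)
    model_q.put((shared.state_dict(), shared_avg.state_dict()))
    p = mp.Process(target=toy_actor, args=(0, memory_q, model_q))
    p.start()
    print(memory_q.get())
    p.join()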
def train(
    rank: int,
    world_size: int,
    lr: float = 5e-4,
    batch_size: int = 1000,
    epochs: int = 500,
    interval: int = 10,
    save: int = 100,
    num_workers: int = 4,
    num_basis: int = 100,
    dataset: str = 'datasets',
    load_dir: Optional[str] = None,
    load_epoch: Optional[int] = None,
    coefficient_noise: bool = False,
    verbose: bool = False,
    use_zero: bool = False,
):
    assert 0 < batch_size, "batch_size must be a positive integer."
    assert 0 < epochs, "epochs must be a positive integer."
    assert (0 <= interval) and (interval <= epochs), \
        "Interval must be a non-negative integer between 0 and epochs."
    assert (0 <= save) and (save <= epochs), \
        "Save must be a non-negative integer between 0 and epochs."

    # setup data distributed parallel training
    setup_nccl(rank, world_size)  # world size is total gpus
    torch.cuda.set_device(rank)  # rank is gpu index

    # directories
    if rank == 0:
        print(f"Loading data from {dataset}...")

    data_dir = Path(f'/mnt/datahole/daniel/gravflows/{dataset}/train/')
    val_dir = Path(f'/mnt/datahole/daniel/gravflows/{dataset}/validation/')
    noise_dir = Path('/mnt/datahole/daniel/gwosc/O1')
    psd_dir = Path(f"/mnt/datahole/daniel/gravflows/{dataset}/train/PSD/")
    basis_dir = Path(f'/mnt/datahole/daniel/gravflows/{dataset}/basis/')

    log_dir = f"{datetime.now().strftime('%b%d_%H-%M-%S')}_{os.uname().nodename}"
    save_dir = Path('gwpe/model_weights/')
    experiment_dir = save_dir / log_dir
    experiment_dir.mkdir(parents=True, exist_ok=True)

    # config files
    waveform_params_ini = str(data_dir / 'config_files/parameters.ini')
    extrinsics_ini = 'gwpe/config_files/extrinsics.ini'
    static_args_ini = str(data_dir / 'config_files/static_args.ini')

    # training data
    # dataset = BasisCoefficientsDataset(
    #     data_dir=data_dir,
    #     basis_dir=basis_dir,
    #     static_args_ini=static_args_ini,
    #     parameters_ini=waveform_params_ini,
    # )

    # dataset = BasisEncoderDataset(
    #     n=num_basis,
    #     data_dir=data_dir,
    #     basis_dir=basis_dir,
    #     static_args_ini=static_args_ini,
    #     intrinsics_ini=waveform_params_ini,
    #     extrinsics_ini=extrinsics_ini,
    #     psd_dir=psd_dir,
    #     ifos=['H1','L1'],
    #     ref_ifo='H1',
    #     downcast=True,
    #     add_noise=True,
    #     coefficient_noise=coefficient_noise,
    # )

    dataset = LFIGWDataset(
        n=100,
        data_dir=data_dir,
        basis_dir=basis_dir,
        static_args_ini=static_args_ini,
        data_file='coefficients.npy',
        intrinsics_ini=waveform_params_ini,
        extrinsics_ini=extrinsics_ini,
        psd_dir=psd_dir,
        ifos=['H1', 'L1'],
        ref_ifo='H1',
        downcast=True,
        add_noise=True,
        distance_scale=True,
        time_shift=False,
    )

    sampler = DistributedSampler(
        dataset,
        shuffle=False,
        num_replicas=world_size,
        rank=rank,
        seed=rank,
    )

    dataloader = DataLoader(
        dataset,
        shuffle=False,
        num_workers=num_workers,
        batch_size=batch_size,
        sampler=sampler,
        pin_memory=True,
        persistent_workers=True,
        prefetch_factor=2,
        worker_init_fn=dataset._worker_init_fn,
        collate_fn=dataset._collate_fn,
    )

    # validation data
    val_dataset = LFIGWDataset(
        n=100,
        data_dir=data_dir,
        basis_dir=basis_dir,
        static_args_ini=static_args_ini,
        data_file='coefficients.npy',
        intrinsics_ini=waveform_params_ini,
        extrinsics_ini=extrinsics_ini,
        psd_dir=psd_dir,
        ifos=['H1', 'L1'],
        ref_ifo='H1',
        downcast=True,
        add_noise=True,
        coefficient_noise=coefficient_noise,
        distance_scale=True,
        time_shift=False,
    )

    # val_dataset = BasisCoefficientsDataset(
    #     data_dir=val_dir,
    #     basis_dir=basis_dir,
    #     static_args_ini=static_args_ini,
    #     parameters_ini=[waveform_params_ini, extrinsics_ini],
    #     coefficient_noise=coefficient_noise,
    # )

    val_sampler = DistributedSampler(
        val_dataset,
        shuffle=False,
        num_replicas=world_size,
        rank=rank,
        seed=rank,
    )
    val_loader = DataLoader(
        val_dataset,
        shuffle=False,
        num_workers=num_workers,
        batch_size=batch_size,
        sampler=val_sampler,
        pin_memory=True,
        prefetch_factor=4,
        worker_init_fn=val_dataset._worker_init_fn,
        collate_fn=val_dataset._collate_fn,
    )

    # validation data for posterior sampling figures
    if interval != 0:
        # specify indices in validation dataset to validate samples
        min_idx = val_dataset.parameters.distance.argmin()
        max_idx = val_dataset.parameters.distance.argmax()
        median_idx = val_dataset.parameters.loc[
            val_dataset.parameters.distance ==
            val_dataset.parameters.distance.quantile(interpolation='nearest')
        ].index[0]

        if rank == 0:
            figure_titles = [
                'GW150914', 'Min Distance', 'Median Distance', 'Max Distance'
            ]

            # validation ground truths for posterior sampling
            val_gts = torch.stack([
                torch.zeros(len(val_dataset.parameters.columns),
                            dtype=torch.float32),  # gw150914 dummy gt
                torch.tensor(val_dataset.parameters.iloc[min_idx].values,
                             dtype=torch.float32),  # rank 1
                torch.tensor(val_dataset.parameters.iloc[median_idx].values,
                             dtype=torch.float32),  # rank 2
                torch.tensor(val_dataset.parameters.iloc[max_idx].values,
                             dtype=torch.float32),  # rank 3
            ])

        with torch.no_grad():
            # load data from file manually (rather than using val_dataset._worker_init_fn)
            val_coefficients = np.load(val_dataset.data_dir / val_dataset.data_file,
                                       mmap_mode='c')

            # generate coefficients on cpu - we want to send this to tensorboard (rank 0)
            # before sending to gpus
            val_coefficients = torch.cat([
                torch.from_numpy(
                    generate_gw150914_context(num_basis, noise_dir, psd_dir,
                                              basis_dir, static_args_ini))[None],
                torch.tensor(val_coefficients[[min_idx, median_idx, max_idx]]),
            ], dim=0).to(dtype=torch.complex64)

            # place one of each stacked tensor onto corresponding gpu rank
            val_context = val_coefficients[rank] * val_dataset.standardization[:, :num_basis]
            val_context = val_context.to(device=rank)
            val_context = torch.cat([val_context.real, val_context.imag], dim=0)
            val_context = val_context.reshape(
                val_context.shape[0] * val_context.shape[1])[None]

    else:
        figure_titles = None
        val_gts = None
        val_coefficients = None

    # set torch profiling runs
    # wait = 1  # ignore first batch
    # warmup = 1
    # active = 4
    # repeat = 2

    # tensorboard
    if rank == 0:
        # tb = SummaryWriter(f'gwpe/runs/{log_dir}')
        queue = mp.SimpleQueue()
        tb_process = mp.Process(target=tensorboard_writer,
                                args=(
                                    queue,
                                    f'gwpe/runs/{log_dir}',
                                    val_dataset.generator.parameters,
                                    val_dataset.generator.latex,
                                    static_args_ini,
                                    basis_dir,
                                    num_basis,
                                    val_coefficients,
                                    val_gts,
                                    figure_titles,
                                ))
        tb_process.start()

    # instantiate neural spline coupling flow
    flow = flows.create_NDE_model(
        input_dim=14,  # we do not predict coalescence time
        context_dim=4 * num_basis,
        num_flow_steps=15,
        base_transform_kwargs={
            'base_transform_type': 'rq-coupling',
            'batch_norm': True,
            'num_transform_blocks': 10,
            'activation': 'elu',
        })

    flow = flow.to(rank)
    print_peak_memory("Max memory allocated after creating local model", rank)

    # sync_bn_flow = nn.SyncBatchNorm.convert_sync_batchnorm(flow)
    flow = DDP(flow, device_ids=[rank], output_device=rank)
    print_peak_memory("Max memory allocated after creating DDP", rank)

    if use_zero:
        # https://pytorch.org/tutorials/recipes/zero_redundancy_optimizer.html
        from torch.distributed.optim import ZeroRedundancyOptimizer
        optimizer = ZeroRedundancyOptimizer(
            flow.parameters(),
            optimizer_class=torch.optim.Adam,
            lr=lr,
            parameters_as_bucket_view=True,
        )
        # optimizer = torch.optim.Adam(flow.parameters(), lr=lr)
    else:
        optimizer = torch.optim.Adam(flow.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    if load_dir is not None and load_epoch is not None:
        print(f'Loading model from {load_dir} at epoch {load_epoch}.')
        flow.module.load_state_dict(
            torch.load(f'gwpe/model_weights/{load_dir}/flow_{load_epoch}.pt',
                       map_location=rank))
        optimizer.load_state_dict(
            torch.load(f'gwpe/model_weights/{load_dir}/optimizer_{load_epoch}.pt',
                       map_location=rank))
        if Path(f'gwpe/model_weights/{load_dir}/scheduler_{load_epoch}.pt').is_file():
            scheduler.load_state_dict(
                torch.load(f'gwpe/model_weights/{load_dir}/scheduler_{load_epoch}.pt',
                           map_location=rank))

    # run training loop
    flow.train()
    train_loss = torch.zeros((1, ), device=rank, requires_grad=False)
    val_loss = torch.zeros((1, ), device=rank, requires_grad=False)

    disable_pbar = False if verbose and (rank == 0) else True

    # tqdm progress bar
    with tqdm(total=len(dataloader) * epochs,
              disable=disable_pbar,
              desc=f'[{log_dir}] Training',
              postfix={'epoch': 0}) as progress:

        # with torch.profiler.profile(
        #     activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        #     schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=repeat),
        #     on_trace_ready=torch.profiler.tensorboard_trace_handler(f'gwpe/runs/{log_dir}'),
        #     record_shapes=True,
        #     with_stack=True
        # ) as profiler:

        for epoch in range(1, 1 + epochs):
            if rank == 0:
                progress.set_postfix({'epoch': epoch})
                progress.set_description(f'[{log_dir}] Training', refresh=True)

            # let all processes sync up before starting with a new epoch of training
            flow.train()
            distributed.barrier()

            iterator = iter(dataloader)
            coefficients, parameters = next(iterator)
            coefficients = coefficients.to(rank, non_blocking=True)
            parameters = parameters.to(rank, non_blocking=True)

            complete = False
            while not complete:
                optimizer.zero_grad()

                # if profile:
                # https://github.com/guyang3532/kineto/blob/readme/tb_plugin/docs/gpu_utilization.md
                ## WARNING: profiler may not handle async pinned memory transfer properly?
                # i.e. may not record CPU vs GPU wall times correctly
                # may be related to reported blocks per SM/achieved occupancy negative bug
                # this was an open issue for pytorch 1.9 as of july 9 - nightly may fix it
                # https://github.com/pytorch/kineto/issues/325#issuecomment-869362218
                # if (step >= (wait + warmup + active) * repeat):
                #     break

                # negative log-likelihood conditional on strain over mini-batch
                loss = -flow.module.log_prob(parameters, context=coefficients).mean()

                # record the size of the batch the loss was computed on
                # before prefetching the next one
                current_batch_size = coefficients.shape[0]

                try:
                    # async get data from CPU and move to GPU during model forward
                    coefficients, parameters = next(iterator)
                    coefficients = coefficients.to(rank, non_blocking=True)
                    parameters = parameters.to(rank, non_blocking=True)
                except StopIteration:
                    # exit while loop if iterator is complete
                    complete = True

                loss.backward()

                print_peak_memory("Max memory allocated before optimizer step()", rank)
                optimizer.step()
                print_peak_memory("Max memory allocated after optimizer step()", rank)

                # if profile: profiler.step()

                # total loss summed over each sample in batch
                train_loss += loss.detach() * current_batch_size

                if rank == 0:
                    progress.update(1)

            scheduler.step()

            # gather total loss during epoch between each GPU worker as list of tensors
            world_loss = [torch.ones_like(train_loss) for _ in range(world_size)]
            distributed.all_gather(world_loss, train_loss)
            train_loss *= 0.0  # reset loss for next epoch

            if (interval != 0) and (epoch % interval == 0):
                # evaluate model on validation dataset
                flow.eval()
                with torch.no_grad():
                    iterator = iter(enumerate(val_loader))
                    step, (coefficients, parameters) = next(iterator)
                    coefficients = coefficients.to(rank, non_blocking=True)
                    parameters = parameters.to(rank, non_blocking=True)

                    if rank == 0:
                        val_progress = int(100 * step / len(val_loader))
                        progress.set_description(
                            f'[{log_dir}] Validating ({val_progress}%)', refresh=True)

                    complete = False
                    while not complete:
                        # negative log-likelihood conditional on strain over mini-batch
                        loss = -flow.module.log_prob(parameters,
                                                     context=coefficients).mean()
                        current_batch_size = coefficients.shape[0]

                        try:
                            # async get data from CPU and move to GPU during model forward
                            step, (coefficients, parameters) = next(iterator)
                            coefficients = coefficients.to(rank, non_blocking=True)
                            parameters = parameters.to(rank, non_blocking=True)

                            if rank == 0:
                                val_progress = int(100 * step / len(val_loader))
                                progress.set_description(
                                    f'[{log_dir}] Validating ({val_progress}%)',
                                    refresh=True)
                        except StopIteration:
                            # exit while loop if iterator is complete
                            complete = True

                        # total loss summed over each sample in batch
                        val_loss += loss.detach() * current_batch_size

                    # gather total loss during epoch between each GPU worker as list of tensors
                    world_val_loss = [torch.ones_like(val_loss) for _ in range(world_size)]
                    distributed.all_gather(world_val_loss, val_loss)
                    val_loss *= 0.0  # reset loss for next epoch

                    # validation posteriors
                    if rank == 0:
                        progress.set_description(f'[{log_dir}] Sampling posteriors',
                                                 refresh=True)

                    samples = flows.sample_flow(
                        flow.module,
                        n=10000,
                        context=val_context,
                        output_device='cuda',
                        dtype=torch.float32,
                    )[0]

                    # gather samples from all gpus
                    world_samples = [torch.ones_like(samples) for _ in range(world_size)]
                    distributed.all_gather(world_samples, samples)

            if (rank == 0):
                progress.set_description(f'[{log_dir}] Sending to TensorBoard',
                                         refresh=True)

                scalars = {
                    'loss/train':
                    torch.cat(world_loss).sum().item() / len(dataloader.dataset)
                }

                # every "interval" we generate samples for vis, else None
                corner_samples = None  # reset to None for epochs where there is no corner plot

                if (interval != 0) and (epoch % interval == 0):
                    scalars['loss/validation'] = torch.cat(
                        world_val_loss).sum().item() / len(val_loader.dataset)

                    # convert gw150914 samples to cpu and undo standardization
                    corner_samples = torch.stack(world_samples).cpu()
                    corner_samples *= torch.from_numpy(val_dataset.std)
                    corner_samples += torch.from_numpy(val_dataset.mean)

                # send data to async process to generate matplotlib figures
                queue.put((epoch, scalars, corner_samples))

            if (save != 0) and (epoch % save == 0):
                # save checkpoint and write computationally expensive data to tb
                torch.save(flow.module.state_dict(),
                           experiment_dir / f'flow_{epoch}.pt')

                # if use_zero:
                #     # needs to be called on all ranks
                #     optimizer.consolidate_state_dict(to=0)
                torch.save(optimizer.state_dict(),
                           experiment_dir / f'optimizer_{epoch}.pt')

                if scheduler is not None:
                    torch.save(scheduler.state_dict(),
                               experiment_dir / f'scheduler_{epoch}.pt')

        # destroy processes from distributed training
        if rank == 0:
            # to do - graceful way to shutdown workers
            # need to send message back from child process
            sleep_time = 120
            for i in range(sleep_time):
                progress.set_description(
                    f'[{log_dir}] Shutting down in {sleep_time - i}s', refresh=True)
                time.sleep(1)

            tb_process.terminate()

    cleanup_nccl()
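# Hedged sketch of what the `setup_nccl` / `cleanup_nccl` helpers used above
# typically do (the project's own implementations are not shown here): create
# and tear down the NCCL process group that DDP and distributed.all_gather
# depend on. The rendezvous address and port below are placeholders.
import os
import torch.distributed as distributed

def setup_nccl_example(rank: int, world_size: int) -> None:
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')  # placeholder rendezvous address
    os.environ.setdefault('MASTER_PORT', '29500')      # placeholder port
    distributed.init_process_group('nccl', rank=rank, world_size=world_size)

def cleanup_nccl_example() -> None:
    distributed.destroy_process_group()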
def train_process(p_id, word_count_actual, word2idx, word_list, freq, args, model,
                  word2morph, word2morph_mask, ctx2morph, ctx2morph_mask):
    data_queue = mp.SimpleQueue()

    if args.opt == "Adagrad":
        optimizer = optim.Adagrad(model.parameters(), lr=args.lr)
    elif args.opt == "SGD":
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
    elif args.opt == 'SparseAdam':
        optimizer = optim.SparseAdam(model.parameters(), lr=args.lr)

    # producer process fills data_queue with training batches
    t = mp.Process(target=train_process_sent_producer,
                   args=(p_id, data_queue, word_count_actual, word2idx, word_list,
                         freq, args))
    t.start()

    # get batches from data_queue and feed them to the model
    prev_word_cnt = 0
    losses_cnt = 0
    total_loss = 0.0
    losses_file = open(args.losslog, 'w')
    lr = args.lr

    while True:
        d = data_queue.get()
        if d is None:  # producer signals completion with a None sentinel
            break

        # lr anneal
        if args.anneal:
            if word_count_actual.value - prev_word_cnt > 10000:
                lr = args.lr * (1 - word_count_actual.value /
                                (args.iter * args.train_words))
                if lr < 0.0001 * args.lr:
                    lr = 0.0001 * args.lr
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
        else:
            lr = args.lr

        if args.cuda:
            data = Variable(torch.LongTensor(d).cuda(), requires_grad=False)
        else:
            data = Variable(torch.LongTensor(d), requires_grad=False)

        if args.cbow == 1:
            optimizer.zero_grad()
            loss = model(data)
            loss.backward()
            optimizer.step()
            model.emb0_lookup.weight.data[args.vocab_size].fill_(0)
        elif args.cbow == 0:
            optimizer.zero_grad()
            loss = model(data,
                         word2morph[data[:, 0]],
                         word2morph_mask[data[:, 0]],
                         ctx2morph[data[:, 1:2 + args.negative]],
                         ctx2morph_mask[data[:, 1:2 + args.negative]])
            loss.backward()
            # model.emb0morph_lookup.weight.data.grad[args.morph_size+1].fill_(0)
            optimizer.step()
            # model.emb0morph_lookup.weight.data[args.morph_size+1].zero_()

        losses_cnt += data.shape[0]
        total_loss += loss.detach()  # detach so logging does not retain the graph

        # periodic progress output
        if word_count_actual.value - prev_word_cnt > 10000:
            avg_loss = total_loss / losses_cnt
            sys.stdout.write(
                "\rAlpha: %0.8f, Loss: %0.8f, Progress: %0.2f, Words/sec: %f" %
                (lr, avg_loss,
                 word_count_actual.value / (args.iter * args.train_words) * 100,
                 word_count_actual.value / (time.monotonic() - args.t_start)))
            sys.stdout.flush()
            prev_word_cnt = word_count_actual.value
            losses_cnt = 0
            total_loss = 0.0
            losses_file.write(str(avg_loss.item()) + '\n')

    losses_file.close()
    t.join()
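# Hedged sketch of the producer/consumer protocol used above: a producer
# process fills the SimpleQueue with batches and terminates the stream with a
# trailing None, which the consumer treats as its stop sentinel. The integer
# "batches" stand in for the real sentence batches.
import torch.multiprocessing as mp

def example_producer(data_queue, n_batches):
    for i in range(n_batches):
        data_queue.put([i, i + 1, i + 2])  # a fake batch of token ids
    data_queue.put(None)  # sentinel: tell the consumer to stop

if __name__ == '__main__':
    q = mp.SimpleQueue()
    producer = mp.Process(target=example_producer, args=(q, 5))
    producer.start()
    while True:
        d = q.get()
        if d is None:  # same stop condition as in train_process above
            break
        print('got batch', d)
    producer.join()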
def batch_training(fileprefix='', tasks=[]):
    if fileprefix:
        filename = '{}-main.out'.format(fileprefix)
        filepath = pathlib.Path(filename).resolve()
        if not filepath.parent.exists():
            filepath.parent.mkdir(parents=True)
        stdout_target = filepath.open('wt')
    else:
        stdout_target = sys.__stdout__

    with contextlib.redirect_stdout(stdout_target):
        print('System-wide logical CPUs:', psutil.cpu_count())
        print('System-wide physical CPUs:', psutil.cpu_count(logical=False))

        oversubscribe = 2
        ngpus = torch.cuda.device_count()
        nworkers = ngpus * oversubscribe

        curproc = psutil.Process()
        createtime = curproc.create_time()
        print('Main process {} on CPU {} with {} threads'.format(
            curproc.pid, curproc.cpu_num(), curproc.num_threads()))
        print('Presently available CPUs:', len(curproc.cpu_affinity()))
        print('Presently available GPUs:', ngpus)
        print('Worker processes:', nworkers)

        # load input tasks into queue
        task_queue = mp.SimpleQueue()
        for i, task in enumerate(tasks):
            print('Task', i + 1, task)
            task_queue.put(task)

        # worker locks
        locks = []
        active_processes = []
        for i in range(nworkers):
            locks.append(mp.Lock())
            active_processes.append(None)

        # results queue
        result_queue = mp.SimpleQueue()

        itask = 0
        while not task_queue.empty():
            for ilock, lock in enumerate(locks):
                if lock.acquire(timeout=1):
                    # acquired the lock, so this worker slot must be free (process == None)
                    assert active_processes[ilock] is None
                    if task_queue.empty():
                        lock.release()
                        continue
                    train_kwargs = task_queue.get()
                    igpu = ilock % ngpus
                    args = (itask, ilock, igpu, fileprefix, train_kwargs, result_queue)
                    p = mp.Process(target=gpu_worker, args=args)
                    print('  Launching task {}/{} on worker {} on GPU {}'.format(
                        itask, len(tasks), ilock, igpu))
                    itask += 1
                    p.start()
                    active_processes[ilock] = p
                else:
                    # slot is locked, so a process should be running (process != None)
                    existing_process = active_processes[ilock]
                    assert existing_process is not None
                    if existing_process.exitcode is not None:
                        # process is complete; close and release
                        print('  Process {} finished'.format(existing_process.pid))
                        active_processes[ilock] = None
                        lock.release()

        print('Finished task loop')
        still_running = True
        while still_running:
            still_running = False
            for i, process in enumerate(active_processes):
                if process is None:
                    continue
                if process.exitcode is None:
                    still_running = True
                    break
                else:
                    print('  Process {} finished'.format(process.pid))
                    active_processes[i] = None
            time.sleep(1)

        results = []
        while not result_queue.empty():
            results.append(result_queue.get())
        print('Tasks:', len(tasks), 'results:', len(results))

        def sort_func(element):
            return element[0]

        results = sorted(results, key=sort_func)
        for i, result in enumerate(results):
            print('Task {:3d} worker/GPU {:2d}/{:1d} dt {:5.1f}s '
                  'max/med acc {:5.1f}%/{:5.1f}% kw: {}'.format(
                      *result[0:4], result[4].max(), np.median(result[4]), result[6]))

        delta_seconds = time.time() - createtime
        print('Main execution: {:.1f} s'.format(delta_seconds))
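# Hedged sketch of a worker compatible with the argument tuple assembled above.
# The project's real `gpu_worker` is not shown; the random accuracies below are
# a placeholder for its actual training results.
import time
import numpy as np
import torch

def example_gpu_worker(itask, iworker, igpu, fileprefix, train_kwargs, result_queue):
    t0 = time.time()
    if torch.cuda.is_available():
        torch.cuda.set_device(igpu)  # pin this worker to its assigned GPU
    accuracies = np.random.uniform(50.0, 100.0, size=10)  # stand-in for real training
    # tuple layout mirrors what the parent unpacks: indices 0-3, 4 (accuracies), 6 (kwargs)
    result_queue.put((itask, iworker, igpu, time.time() - t0,
                      accuracies, None, train_kwargs))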
print("load training indicator") last_updated = 0 last_deliver = 0 last_saved = 0 test_t = 0 if args.cuda: global_ac.to(device) global_ac_targ.to(device) if args.cpc: global_cpc.to(device) for p in global_ac_targ.parameters(): p.requires_grad = False buffer_q = mp.SimpleQueue() model_q = [mp.SimpleQueue() for _ in range(args.n_process + args.opp_num) ] # + n opp test process evaluation_queue = list() processes = [] # Process n for evaluation for rank in range(args.n_process + args.opp_num): # + n opp test process # Test during training if rank < args.opp_num: # test processes p = mp.Process(target=test_func, args=(rank, E, T, args, model_q[rank], torch.device("cpu"), tensorboard_dir)) else: # actor processes model_q[rank].put(shared_ac.state_dict())
            axes[i_l, i_step].set_ylabel(l_names[i_l])
            if i_l == 0:
                axes[i_l, i_step].set_title(f"it {i_step}")


if __name__ == "__main__":
    # "fork" is the unix default and means the child process inherits all resources
    # from the parent process. in case problems occur, "forkserver" might be used instead.
    mp.set_start_method("fork")

    # create global network and pipeline
    g_net = DRRLnet(INP_W, INP_H, N_ACT, **NET_CONFIG)  # global network
    g_net.zero_grad()
    g_net.share_memory()  # share the global parameters in multiprocessing
    # todo: check whether this makes a difference

    stats_queue = mp.SimpleQueue()  # statistics about the episodes will be returned in this queue
    grads_queue = mp.SimpleQueue()  # the calculated gradients will be returned as dicts in this queue
    start_cond = mp.Event()  # event used to signal processes to perform another iteration
    # (so the worker process needs to be still alive when the queue is accessed)

    if config["optimizer"] == "RMSprop":
        # RMSprop was used for the large state space (with IMPALA instead of A3C), not
        # for the small ones. "Learning rate was tuned between 1e-5 and 2e-4" probably
        # means a hyperparameter search was done. Scheduling is also conveniently
        # possible via torch.optim.lr_scheduler; perhaps use a smaller decay term (0.9).
        optimizer = torch.optim.RMSprop(g_net.parameters(), eps=0.1, lr=config["lr"])
    else:
        # Adam was used for the StarCraft games, with the learning rate decaying
        # linearly over 1e10 steps from
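# Hedged sketch of the gradient hand-off the two queues above imply: a worker
# computes gradients on its local copy and returns them as a name->array dict
# on grads_queue (serialized as numpy arrays so they are pickled by value),
# while episode statistics go to stats_queue. The linear model and random batch
# are placeholders, not DRRLnet or the project's actual worker.
import torch
import torch.nn as nn
import torch.multiprocessing as mp

def example_worker(global_state, grads_queue, stats_queue):
    local_net = nn.Linear(4, 2)
    local_net.load_state_dict(global_state)
    x, y = torch.randn(8, 4), torch.randn(8, 2)  # placeholder rollout data
    loss = ((local_net(x) - y) ** 2).mean()
    loss.backward()
    grads = {name: p.grad.detach().numpy().copy()
             for name, p in local_net.named_parameters()}
    grads_queue.put(grads)  # gradients back to the parent
    stats_queue.put({"loss": float(loss.item())})  # episode statistics

if __name__ == "__main__":
    g_net = nn.Linear(4, 2)
    opt = torch.optim.SGD(g_net.parameters(), lr=1e-2)
    grads_q, stats_q = mp.SimpleQueue(), mp.SimpleQueue()
    p = mp.Process(target=example_worker, args=(g_net.state_dict(), grads_q, stats_q))
    p.start()
    grads = grads_q.get()
    for name, param in g_net.named_parameters():  # copy worker gradients onto the global net
        param.grad = torch.from_numpy(grads[name])
    opt.step()  # apply them to the shared parameters
    print(stats_q.get())
    p.join()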
from runner import Runner

if __name__ == "__main__":
    N_WORKERS = 1
    agent = Agent()

    if len(sys.argv) > 1:
        saveFile = sys.argv[1]
        print(f'Training agent from checkpoint: {saveFile}')
        checkpoint = torch.load(saveFile)
        agent.load_state_dict(checkpoint["model_state_dict"], strict=True)
        # agent.eval()
        # success = agent.load_state_dict(torch.load(saveFile))
        # print(f'Loading returned: {success}')
        # agent.eval()

        directory = './videos/car-racing/fromCheckpoint' + str(time.time())
        player = Player(agent=agent, directory=directory, train=True)
        # points = player.play()
        # print(f'loaded agent scored {points} Points')

        trainer = Trainer(gamma=0.99, agent=agent, workers=N_WORKERS)
        trainer.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    else:
        print('Training new agent')
        trainer = Trainer(gamma=0.99, agent=deepcopy(agent), workers=N_WORKERS)

    queue = mp.SimpleQueue()
    runner = Runner(agent=agent, ix=0)
    trainer.train_one(runner, queue)