def main():
    args = parse_arguments()
    results_dir = os.path.join('results', args.id)
    os.makedirs(results_dir, exist_ok=True)
    logger = Logger(results_dir)
    metrics = {'steps': [], 'rewards': [], 'Qs': [], 'best_avg_reward': -float('inf')}

    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if torch.cuda.is_available() and not args.disable_cuda:
        args.device = torch.device('cuda')
        torch.cuda.manual_seed(np.random.randint(1, 10000))
        torch.backends.cudnn.enabled = args.enable_cudnn
    else:
        args.device = torch.device('cpu')

    if args.tensorboard_dir is None:
        writer = SummaryWriter(os.path.join(results_dir, 'tensorboard', args.game, args.architecture))
    else:
        writer = SummaryWriter(os.path.join(args.tensorboard_dir, args.game, args.architecture))

    # Environment
    env = Env(args)
    env.train()
    action_space = env.action_space()

    # Agent
    dqn = Agent(args, env)

    # If a model is provided and evaluate is false, presumably we want to resume, so try to load memory
    if args.model is not None and not args.evaluate:
        if not args.memory:
            raise ValueError('Cannot resume training without memory save path. Aborting...')
        elif not os.path.exists(args.memory):
            raise ValueError('Could not find memory file at {path}. Aborting...'.format(path=args.memory))
        mem = load_memory(args.memory, args.disable_bzip_memory)
    else:
        mem = ReplayMemory(args, args.memory_capacity)

    priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)

    # Construct validation memory
    val_mem = ReplayMemory(args, args.evaluation_size)
    T, done = 0, True
    while T < args.evaluation_size:
        if done:
            state, done = env.reset(), False
        next_state, _, done = env.step(np.random.randint(0, action_space))
        val_mem.append(state, None, None, done)
        state = next_state
        T += 1

    if args.evaluate:
        dqn.eval()  # Set DQN (online network) to evaluation mode
        avg_reward, avg_Q = test(args, 0, dqn, val_mem, metrics, results_dir, evaluate=True)  # Test
        logger.info('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
    else:
        # Training loop
        dqn.train()
        T, done = 0, True
        accumulate_reward = 0
        for T in trange(1, args.T_max + 1):
            if done:
                state, done = env.reset(), False
                writer.add_scalar('Train/Reward', accumulate_reward, T)
                accumulate_reward = 0

            if T % args.replay_frequency == 0:
                dqn.reset_noise()  # Draw a new set of noisy weights

            action = dqn.act(state)  # Choose an action greedily (with noisy weights)
            next_state, reward, done = env.step(action)  # Step
            accumulate_reward += reward
            if args.reward_clip > 0:
                reward = max(min(reward, args.reward_clip), -args.reward_clip)  # Clip rewards
            mem.append(state, action, reward, done)  # Append transition to memory

            # Train and test
            if T >= args.learn_start:
                # Anneal importance sampling weight β to 1
                mem.priority_weight = min(mem.priority_weight + priority_weight_increase, 1)

                if T % args.replay_frequency == 0:
                    dqn.learn(mem)  # Train with n-step distributional double-Q learning

                if T % args.evaluation_interval == 0:
                    dqn.eval()  # Set DQN (online network) to evaluation mode
                    avg_reward, avg_Q = test(args, T, dqn, val_mem, metrics, results_dir)  # Test
                    writer.add_scalar('Eval/Reward', avg_reward, T)
                    writer.add_scalar('Eval/Q', avg_Q, T)
                    logger.info('T = ' + str(T) + ' / ' + str(args.T_max) +
                                ' | Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
                    dqn.train()  # Set DQN (online network) back to training mode

                    # If memory path provided, save it
                    if args.memory is not None:
                        save_memory(mem, args.memory, args.disable_bzip_memory)

                # Update target network
                if T % args.target_update == 0:
                    dqn.update_target_net()

                # Checkpoint the network
                if (args.checkpoint_interval != 0) and (T % args.checkpoint_interval == 0):
                    dqn.save(results_dir, 'checkpoint.pth')

            state = next_state

    env.close()
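# The main() above resumes training via load_memory() and persists the replay
# buffer via save_memory(), neither of which is defined in this file. A minimal
# sketch, assuming the same bz2-compressed pickle format as the save_memory
# helper defined inside train() further below; the real helpers may differ.
import bz2
import pickle


def load_memory(memory_path, disable_bzip):
    # Load a pickled ReplayMemory, optionally bz2-compressed.
    if disable_bzip:
        with open(memory_path, 'rb') as pickle_file:
            return pickle.load(pickle_file)
    with bz2.open(memory_path, 'rb') as zipped_pickle_file:
        return pickle.load(zipped_pickle_file)


def save_memory(memory, memory_path, disable_bzip):
    # Persist the ReplayMemory so training can later resume with load_memory().
    if disable_bzip:
        with open(memory_path, 'wb') as pickle_file:
            pickle.dump(memory, pickle_file)
    else:
        with bz2.open(memory_path, 'wb') as zipped_pickle_file:
            pickle.dump(memory, zipped_pickle_file)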
        action = dqn.act(state)  # Choose an action greedily (with noisy weights)
        next_state, reward, done = env.step(action)  # Step
        if args.reward_clip > 0:
            reward = max(min(reward, args.reward_clip), -args.reward_clip)  # Clip rewards
        mem.append(state, action, reward, done)  # Append transition to memory
        T += 1

        if T % args.log_interval == 0:
            log('T = ' + str(T) + ' / ' + str(args.T_max))

        # Train and test
        if T >= args.learn_start:
            # Anneal importance sampling weight β to 1
            mem.priority_weight = min(mem.priority_weight + priority_weight_increase, 1)

            if T % args.replay_frequency == 0:
                dqn.learn(mem)  # Train with n-step distributional double-Q learning

            if T % args.evaluation_interval == 0:
                dqn.eval()  # Set DQN (online network) to evaluation mode
                avg_reward, avg_Q = test(args, T, dqn, val_mem)  # Test
                log('T = ' + str(T) + ' / ' + str(args.T_max) +
                    ' | Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
                dqn.train()  # Set DQN (online network) back to training mode

            # Update target network
T, done = 0, True
for T in tqdm(range(args.T_max)):
    if done:
        state, done = env.reset(), False

    if T % args.replay_frequency == 0:
        dqn.reset_noise()  # Draw a new set of noisy weights

    action = dqn.act(state)  # Choose an action greedily
    next_state, reward, done = env.step(action)  # Step
    if args.reward_clip > 0:
        reward = max(min(reward, args.reward_clip), -args.reward_clip)
    mem.append(state, action, reward, done)  # Append transition to memory

    # Train and test
    if T >= args.learn_start:
        mem.priority_weight = min(mem.priority_weight + priority_weight_increase, 1)

        if T % args.replay_frequency == 0:
            dqn.learn(mem)

        if T % args.evaluation_interval == 0:
            dqn.eval()  # Set DQN (online network) to evaluation mode
            avg_reward, avg_Q = test(args, T, dqn, val_mem, metrics, results_dir)  # Test
            log('T = ' + str(T) + ' / ' + str(args.T_max) +
                ' | Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
            dqn.train()  # Set DQN (online network) back to training mode

        if T % args.target_update == 0:
            dqn.update_target_net()
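# Every training loop in this file anneals the prioritised-replay
# importance-sampling exponent β with the same linear schedule, via
# priority_weight_increase. A self-contained sketch of that schedule; the
# default values (0.4, 80000, 50000000) are illustrative, not taken from any
# particular config above.
def beta_schedule(T, priority_weight=0.4, learn_start=80000, T_max=50000000):
    # β grows linearly from its initial value to 1 between learn_start and T_max.
    increase = (1 - priority_weight) / (T_max - learn_start)
    steps_since_learning = max(0, T - learn_start)
    return min(priority_weight + steps_since_learning * increase, 1.0)


# β stays at its initial value until learning starts and reaches 1.0 at T_max.
assert beta_schedule(0) == 0.4
assert abs(beta_schedule(50000000) - 1.0) < 1e-9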
def main():
    args = parser.parse_args()
    print(' ' * 26 + 'Options')
    for k, v in vars(args).items():
        print(' ' * 26 + k + ': ' + str(v))

    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if torch.cuda.is_available() and not args.disable_cuda:
        args.device = torch.device('cuda')
        torch.cuda.manual_seed(np.random.randint(1, 10000))
        # Disable nondeterministic ops (not sure if critical but better safe than sorry)
        # torch.backends.cudnn.enabled = False
    else:
        args.device = torch.device('cpu')

    args.large = False
    args.skip_frames = 0
    args.random_aug = 0.

    # Environment
    train_env = create_env(args.environment_filename,
                           custom=True,
                           large=args.large,
                           skip_frames=args.skip_frames,
                           random_aug=args.random_aug,
                           docker=args.docker_training,
                           device=args.device)
    action_space = train_env.action_space
    test_env = create_env(args.environment_filename,
                          custom=True,
                          large=args.large,
                          custom_reward=False,
                          skip_frames=args.skip_frames,
                          docker=args.docker_training,
                          device=args.device,
                          worker_id=1)

    mem = ReplayMemory(args, args.memory_capacity, obs_space=train_env.observation_space)
    val_mem = ReplayMemory(args, args.evaluation_size, obs_space=test_env.observation_space)

    # For debugging environment issues
    if args.timeout_monitor:
        train_env = TimeoutMonitor(train_env, mem)
        test_env = TimeoutMonitor(test_env, val_mem)

    # Agent
    dqn = Agent(args, train_env)

    priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)

    time_step = 0
    done = True
    state = None
    while time_step < args.evaluation_size:
        if done:
            state = train_env.reset()
            done = False
        next_state, _, done, _ = train_env.step(action_space.sample())
        val_mem.append(state, None, None, done)
        state = next_state
        time_step += 1

    if args.evaluate:
        dqn.eval()  # Set DQN (online network) to evaluation mode
        avg_reward, avg_Q = test(args, 0, dqn, val_mem, evaluate=True)  # Test
        print('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
    else:
        # Training loop
        dqn.train()
        done = True
        for time_step in tqdm(range(args.T_max)):
            if done:
                state = train_env.reset()
                done = False

            if time_step % args.replay_frequency == 0:
                dqn.reset_noise()  # Draw a new set of noisy weights

            action = dqn.act(state)  # Choose an action greedily (with noisy weights)
            next_state, reward, done, info = train_env.step(action)  # Step
            if args.reward_clip > 0:
                reward = max(min(reward, args.reward_clip), -args.reward_clip)  # Clip rewards
            mem.append(state, action, reward, done)  # Append transition to memory

            # Train and test
            if time_step >= args.learn_start:
                # Anneal importance sampling weight β to 1
                mem.priority_weight = min(mem.priority_weight + priority_weight_increase, 1)

                if time_step % args.replay_frequency == 0:
                    dqn.learn(mem)  # Train with n-step distributional double-Q learning

                if time_step % args.evaluation_interval == 0:
                    dqn.eval()  # Set DQN (online network) to evaluation mode
                    avg_reward, avg_Q = test(args, time_step, dqn, val_mem, env=test_env)  # Test
                    log('T = ' + str(time_step) + ' / ' + str(args.T_max) +
                        ' | Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
                    dqn.train()  # Set DQN (online network) back to training mode

                # Update target network
                if time_step % args.target_update == 0:
                    dqn.update_target_net()

            state = next_state

    train_env.close()
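# All of the loops in this file clip rewards inline with the same max/min
# expression. A tiny helper expressing that rule, for reference; a value of
# reward_clip == 0 disables clipping.
def clip_reward(reward, reward_clip):
    # Clamp the reward to [-reward_clip, reward_clip] when clipping is enabled.
    if reward_clip > 0:
        return max(min(reward, reward_clip), -reward_clip)
    return reward


assert clip_reward(3.7, 1) == 1
assert clip_reward(-5.0, 1) == -1
assert clip_reward(3.7, 0) == 3.7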
def train_agent(env, args, config):
    """Train a Rainbow agent on the given environment.

    Args:
        env: environment to interact with
        args: parsed command-line arguments
        config: experiment configuration (seed, locexp, ...)
    """
    # create CNN: convert the [1, 3, 84, 84] observation to [1, 200]
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    torch.manual_seed(config["seed"])
    np.random.seed(config["seed"])
    if torch.cuda.is_available() and not args.disable_cuda:
        args.device = torch.device('cuda')
        torch.cuda.manual_seed(np.random.randint(1, 10000))
        torch.backends.cudnn.enabled = args.enable_cudnn

    pathname = dt_string + "_seed" + str(config["seed"])
    print("save tensorboard {}".format(config["locexp"]))
    tensorboard_name = str(config["locexp"]) + '/runs/' + pathname

    agent = Agent(args, env)
    # agent.load(str(args.locexp), "1/checkpoint-52038.pth")
    memory = ReplayMemory(args, args.memory_capacity)
    # memory.load_memory("memory_pacman")
    # memory = ReplayBuffer((3, config["size"], config["size"]), (1,), config["expert_buffer_size"],
    #                       int(config["image_pad"]), config["device"])
    priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)
    writer = SummaryWriter(tensorboard_name)
    results_dir = os.path.join(str(config["locexp"]), args.id)
    mkdir("", results_dir)

    scores_window = deque(maxlen=100)
    steps_window = deque(maxlen=100)
    scores = []
    t0 = time.time()

    # Training loop
    agent.train()
    T, done = 0, True
    print("result dir ", results_dir)
    agent.save(results_dir, 'checkpoint-{}.pth'.format(T))
    # eval_policy(env, agent, writer, T, config)
    episode = -1
    steps = 0
    score = 0
    print("save policy ", args.checkpoint_interval)
    # eval_policy(env, agent, writer, 0, config)
    for T in range(1, args.T_max + 1):
        # print("\r {} of {}".format(T, args.T_max), end='')
        if done:
            episode += 1
            # Checkpoint the network
            if episode % 100 == 0:
                memory.save_memory("memory_pacman")
                print("Eval policy")
                # eval_policy(env, agent, writer, T, config)
                agent.save(results_dir, 'checkpoint-{}.pth'.format(T))
            scores_window.append(score)  # Save most recent score
            scores.append(score)  # Save most recent score
            steps_window.append(steps)
            ave_steps = np.mean(steps_window)
            print('\rTime steps {} episode {} score {} Average Score: {:.2f} steps {} ave steps {:.2f} time: {}'
                  .format(T, episode, score, np.mean(scores_window), steps, ave_steps,
                          time_format(time.time() - t0)), end="")
            writer.add_scalar('Episode_reward', score, T)
            average_reward = np.mean(scores_window)
            writer.add_scalar('Average_reward', average_reward, T)
            state, done = env.reset("mediumClassic"), False
            steps = 0
            score = 0

        if T % args.replay_frequency == 0:
            agent.reset_noise()  # Draw a new set of noisy weights

        action = agent.act(state)  # Choose an action greedily (with noisy weights)
        next_state, reward, done, _ = env.step(action)  # Step
        score += reward
        steps += 1
        if steps == 125:
            done = True
        memory.append(state, action, reward, done)  # Append transition to memory

        # Train and test
        if T >= args.learn_start:
            # Anneal importance sampling weight β to 1
            memory.priority_weight = min(memory.priority_weight + priority_weight_increase, 1)

            if T % args.replay_frequency == 0:
                agent.learn(memory)  # Train with n-step distributional double-Q learning

            # Update target network
            if T % args.target_update == 0:
                agent.update_target_net()

        state = next_state
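# train_agent() above relies on time_format() and mkdir() helpers that are not
# part of this file. A minimal sketch under the assumption that time_format
# renders elapsed seconds as H:MM:SS and mkdir joins and creates a directory;
# the original helpers may differ.
import os


def time_format(seconds):
    # Render an elapsed time in seconds as H:MM:SS (assumed format).
    seconds = int(seconds)
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return '{:d}:{:02d}:{:02d}'.format(hours, minutes, secs)


def mkdir(base, name):
    # Create the directory base/name if it does not exist yet and return its path.
    path = os.path.join(base, name)
    os.makedirs(path, exist_ok=True)
    return path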
def worker(gpu, ngpus_per_node, args):
    args.gpu = gpu

    if args.distributed:
        args.seed += args.gpu
        torch.cuda.set_device(args.gpu)

        args.rank = int(os.environ['RANK']) if 'RANK' in os.environ else 0
        if args.multiprocessing_distributed:
            args.rank = args.rank * ngpus_per_node + args.gpu

        torch.distributed.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:8632',
                                             world_size=args.world_size, rank=args.rank)
    else:
        args.rank = 0

    args.use_cuda_env = args.use_cuda_env and torch.cuda.is_available()
    args.no_cuda_train = not torch.cuda.is_available()
    args.verbose = args.verbose and (args.rank == 0)

    env_device = torch.device('cuda', args.gpu) if args.use_cuda_env else torch.device('cpu')
    train_device = torch.device('cuda', args.gpu) if not args.no_cuda_train else torch.device('cpu')

    # Setup
    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if args.use_cuda_env or not args.no_cuda_train:
        torch.cuda.manual_seed(random.randint(1, 10000))

    if train_device.type == 'cuda':
        print('Train:\n' + cuda_device_str(train_device.index), flush=True)

    if args.use_openai:
        test_env = create_vectorize_atari_env(args.env_name, args.seed, args.evaluation_episodes,
                                              episode_life=False, clip_rewards=False)
        test_env.reset()
    else:
        test_env = AtariEnv(args.env_name, args.evaluation_episodes, color_mode='gray',
                            device='cpu', rescale=True, clip_rewards=False,
                            episodic_life=False, repeat_prob=0.0, frameskip=4)

    # Agent
    dqn = Agent(args, test_env.action_space)

    # Construct validation memory
    if args.rank == 0:
        print(dqn)
        print('Initializing evaluation memory with {} entries...'.format(args.evaluation_size), end='', flush=True)
        start_time = time.time()

    val_mem = initialize_validation(args, train_device)

    if args.rank == 0:
        print('complete ({})'.format(format_time(time.time() - start_time)), flush=True)

    if args.evaluate:
        if args.rank == 0:
            eval_start_time = time.time()
            dqn.eval()  # Set DQN (online network) to evaluation mode
            rewards, lengths, avg_Q = test(args, 0, dqn, val_mem, test_env, train_device)
            dqn.train()  # Set DQN (online network) back to training mode
            eval_total_time = time.time() - eval_start_time

            rmean, rmedian, rstd, rmin, rmax = vec_stats(rewards)
            lmean, lmedian, lstd, lmin, lmax = vec_stats(lengths)

            print('reward: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | '
                  'length: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | '
                  'Avg. Q: {:4.4f} | {}'
                  .format(rmean, rmin, rmax, rstd, lmean, lmin, lmax, lstd,
                          avg_Q, format_time(eval_total_time)),
                  flush=True)
    else:
        if args.rank == 0:
            print('Entering main training loop', flush=True)

            if args.output_filename:
                csv_file = open(args.output_filename, 'w', newline='')
                csv_file.write(json.dumps(vars(args)))
                csv_file.write('\n')
                csv_writer = csv.writer(csv_file, delimiter=',')
                csv_writer.writerow(['frames', 'total_time',
                                     'rmean', 'rmedian', 'rstd', 'rmin', 'rmax',
                                     'lmean', 'lmedian', 'lstd', 'lmin', 'lmax'])
            else:
                csv_writer, csv_file = None, None

            if args.plot:
                from tensorboardX import SummaryWriter
                current_time = datetime.now().strftime('%b%d_%H-%M-%S')
                log_dir = os.path.join(args.log_dir, current_time + '_' + socket.gethostname())
                writer = SummaryWriter(log_dir=log_dir)
                for k, v in vars(args).items():
                    writer.add_text(k, str(v))

        # Environment
        print('Initializing environments...', end='', flush=True)
        start_time = time.time()

        if args.use_openai:
            train_env = create_vectorize_atari_env(args.env_name, args.seed, args.num_ales,
                                                   episode_life=True, clip_rewards=args.reward_clip,
                                                   max_frames=args.max_episode_length)
            observation = torch.from_numpy(train_env.reset()).squeeze(1)
        else:
            train_env = AtariEnv(args.env_name, args.num_ales, color_mode='gray',
                                 device=env_device, rescale=True, clip_rewards=args.reward_clip,
                                 episodic_life=True, repeat_prob=0.0)
            train_env.train()
            observation = train_env.reset(initial_steps=args.ale_start_steps,
                                          verbose=args.verbose).clone().squeeze(-1)

        if args.rank == 0:
            print('complete ({})'.format(format_time(time.time() - start_time)), flush=True)

        # These variables are used to compute average rewards for all processes.
        episode_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
        episode_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
        final_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
        final_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
        has_completed = torch.zeros(args.num_ales, device=train_device, dtype=torch.bool)

        mem = ReplayMemory(args, args.memory_capacity, train_device)
        mem.reset(observation)
        priority_weight_increase = (1 - args.priority_weight) / (args.t_max - args.learn_start)

        state = torch.zeros((args.num_ales, args.history_length, 84, 84), device=mem.device, dtype=torch.float32)
        state[:, -1] = observation.to(device=mem.device, dtype=torch.float32).div(255.0)

        num_frames_per_iter = args.num_ales
        total_steps = math.ceil(args.t_max / (args.world_size * num_frames_per_iter))
        epsilons = np.linspace(args.epsilon_start, args.epsilon_final,
                               math.ceil(args.epsilon_frames / num_frames_per_iter))
        epsilon_offset = math.ceil(args.learn_start / num_frames_per_iter)

        prefetcher = data_prefetcher(args.batch_size, train_device, mem)

        avg_loss = 'N/A'
        eval_offset = 0
        target_update_offset = 0
        total_time = 0

        # Main loop
        iterator = range(total_steps)
        if args.rank == 0:
            iterator = tqdm(iterator)

        env_stream = torch.cuda.Stream()
        train_stream = torch.cuda.Stream()

        for update in iterator:
            T = args.world_size * update * num_frames_per_iter
            epsilon = epsilons[min(update - epsilon_offset, len(epsilons) - 1)] if T >= args.learn_start else epsilons[0]
            start_time = time.time()

            if update % args.replay_frequency == 0:
                dqn.reset_noise()  # Draw a new set of noisy weights

            dqn.eval()
            nvtx.range_push('train:select action')
            if args.noisy_linear:
                action = dqn.act(state)  # Choose an action greedily (with noisy weights)
            else:
                action = dqn.act_e_greedy(state, epsilon=epsilon)
            nvtx.range_pop()
            dqn.train()

            if args.use_openai:
                action = action.cpu().numpy()

            torch.cuda.synchronize()

            with torch.cuda.stream(env_stream):
                nvtx.range_push('train:env step')
                observation, reward, done, info = train_env.step(action)  # Step

                if args.use_openai:
                    # Convert back to PyTorch tensors
                    observation = torch.from_numpy(observation).squeeze(1)
                    reward = torch.from_numpy(reward.astype(np.float32))
                    done = torch.from_numpy(done.astype(np.bool))
                    action = torch.from_numpy(action)
                else:
                    observation = observation.clone().squeeze(-1)
                nvtx.range_pop()

                observation = observation.to(device=train_device)
                reward = reward.to(device=train_device)
                done = done.to(device=train_device, dtype=torch.bool)
                action = action.to(device=train_device)

                observation = observation.float().div_(255.0)
                not_done = 1.0 - done.float()

                state[:, :-1].copy_(state[:, 1:].clone())
                state *= not_done.view(-1, 1, 1, 1)
                state[:, -1].copy_(observation)

                # Update episodic reward counters
                has_completed |= done
                episode_rewards += reward.float()
                final_rewards[done] = episode_rewards[done]
                episode_rewards *= not_done
                episode_lengths += not_done
                final_lengths[done] = episode_lengths[done]
                episode_lengths *= not_done

            # Train and test
            if T >= args.learn_start:
                # Anneal importance sampling weight β to 1
                mem.priority_weight = min(mem.priority_weight + priority_weight_increase, 1)
                prefetcher.preload()

                avg_loss = 0.0
                num_minibatches = min(int(args.num_ales / args.replay_frequency), 8)
                for _ in range(num_minibatches):
                    # Sample transitions
                    nvtx.range_push('train:sample states')
                    idxs, states, actions, returns, next_states, nonterminals, weights = prefetcher.next()
                    nvtx.range_pop()

                    nvtx.range_push('train:network update')
                    loss = dqn.learn(states, actions, returns, next_states, nonterminals, weights)
                    nvtx.range_pop()

                    nvtx.range_push('train:update priorities')
                    mem.update_priorities(idxs, loss)  # Update priorities of sampled transitions
                    nvtx.range_pop()

                    avg_loss += loss.mean().item()
                avg_loss /= num_minibatches

                # Update target network
                if T >= target_update_offset:
                    dqn.update_target_net()
                    target_update_offset += args.target_update

            torch.cuda.current_stream().wait_stream(env_stream)
            torch.cuda.current_stream().wait_stream(train_stream)

            nvtx.range_push('train:append memory')
            mem.append(observation, action, reward, done)  # Append transition to memory
            nvtx.range_pop()

            total_time += time.time() - start_time

            if args.rank == 0:
                if args.plot and ((update % args.replay_frequency) == 0):
                    writer.add_scalar('train/epsilon', epsilon, T)
                    writer.add_scalar('train/rewards', final_rewards.mean(), T)
                    writer.add_scalar('train/lengths', final_lengths.mean(), T)

                if T >= eval_offset:
                    eval_start_time = time.time()
                    dqn.eval()  # Set DQN (online network) to evaluation mode
                    rewards, lengths, avg_Q = test(args, T, dqn, val_mem, test_env, train_device)
                    dqn.train()  # Set DQN (online network) back to training mode
                    eval_total_time = time.time() - eval_start_time
                    eval_offset += args.evaluation_interval

                    rmean, rmedian, rstd, rmin, rmax = vec_stats(rewards)
                    lmean, lmedian, lstd, lmin, lmax = vec_stats(lengths)

                    print('reward: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | '
                          'length: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | '
                          'Avg. Q: {:4.4f} | {}'
                          .format(rmean, rmin, rmax, rstd, lmean, lmin, lmax, lstd,
                                  avg_Q, format_time(eval_total_time)),
                          flush=True)

                    if args.output_filename and csv_writer and csv_file:
                        csv_writer.writerow([T, total_time,
                                             rmean, rmedian, rstd, rmin, rmax,
                                             lmean, lmedian, lstd, lmin, lmax])
                        csv_file.flush()

                    if args.plot:
                        writer.add_scalar('eval/rewards', rmean, T)
                        writer.add_scalar('eval/lengths', lmean, T)
                        writer.add_scalar('eval/avg_Q', avg_Q, T)

                loss_str = '{:4.4f}'.format(avg_loss) if isinstance(avg_loss, float) else avg_loss
                progress_data = 'T = {:,} epsilon = {:4.2f} avg reward = {:4.2f} loss: {}' \
                    .format(T, epsilon, final_rewards.mean().item(), loss_str)
                iterator.set_postfix_str(progress_data)

    if args.plot and (args.rank == 0):
        writer.close()

    if args.use_openai:
        train_env.close()
        test_env.close()
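# worker() above summarises evaluation rewards and lengths with vec_stats(),
# which is not defined in this file. A sketch of the assumed helper (summary
# statistics over a 1-D tensor); the real implementation may differ.
import torch


def vec_stats(values):
    # Mean, median, std, min and max of a vector of per-episode rewards/lengths.
    values = torch.as_tensor(values, dtype=torch.float32)
    return (values.mean().item(), values.median().item(), values.std().item(),
            values.min().item(), values.max().item())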
def train(args, env):
    action_space = env.action_space.n
    print("show action space", action_space)
    print("state space", env.observation_space)

    # Agent
    dqn_1 = Agent(args, env)
    dqn_2 = Agent(args, env)
    results_dir = os.path.join('results', args.id)
    print("result dir", results_dir)
    T, done = 0, True

    # If a model is provided and evaluate is false, presumably we want to resume, so try to load memory
    print("args.continue_training", args.continue_training)
    args.continue_training = False
    if args.continue_training:
        print("Continue Training Load buffer 1 ...")
        args.memory = results_dir + "/val_mem_1/memory.pkl"
        mem_1 = load_memory(args.memory, args.disable_bzip_memory)
        val_mem_1 = ReplayMemory(args, args.evaluation_size)
        print("loaded memory buffer 1")
        print("Continue Training Load buffer 2 ...")
        args.memory = results_dir + "/val_mem_2/memory.pkl"
        mem_2 = load_memory(args.memory, args.disable_bzip_memory)
        val_mem_2 = ReplayMemory(args, args.evaluation_size)
        print("loaded memory buffer 2")
    else:
        print("use empty buffers")
        args.memory = results_dir + "/val_mem_1/memory.pkl"
        path = results_dir + "/val_mem_1"
        print("save memory", args.memory)
        os.makedirs(path, exist_ok=True)
        val_mem_1 = ReplayMemory(args, args.evaluation_size)
        mem_1 = ReplayMemory(args, args.memory_capacity)
        args.memory = results_dir + "/val_mem_2/memory.pkl"
        path = results_dir + "/val_mem_2"
        print("save memory", args.memory)
        os.makedirs(path, exist_ok=True)
        val_mem_2 = ReplayMemory(args, args.evaluation_size)
        mem_2 = ReplayMemory(args, args.memory_capacity)

    priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)
    metrics = {'steps': [], 'rewards': [], 'Qs': [], 'step_rewards': [],
               'train_rewards': [], 'best_avg_reward': -float('inf')}
    args.continue_training = True

    def write_into_file(text, file_name='document.csv'):
        """Append a line of text to file_name."""
        with open(file_name, 'a', newline='\n') as fd:
            fd.write(str(text) + "\n")

    def log(s):
        text = '[' + str(datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + '] ' + s
        write_into_file(text)
        print(text)

    if torch.cuda.is_available():
        print("cuda")

    def save_memory(memory, memory_path, disable_bzip):
        if disable_bzip:
            with open(memory_path, 'wb') as pickle_file:
                pickle.dump(memory, pickle_file)
        else:
            with bz2.open(memory_path, 'wb') as zipped_pickle_file:
                pickle.dump(memory, zipped_pickle_file)

    print("Create eval memory of size {} ".format(args.evaluation_size))

    # Construct validation memory
    size = 84
    print("Fill eval memory")
    # Fill both memories at the same time, using the reward function for each
    try:
        while T < args.evaluation_size:
            T += 1
            print("steps ", T)
            if done:
                t = 0
                done = False
                state = env.reset()
                state = torch.tensor(state, dtype=torch.float32, device=args.device).div_(255)
                zeros = torch.zeros_like(state)
                state_buffer = deque([], maxlen=args.history_length)
                state_buffer.append(zeros)
                state_buffer.append(zeros)
                state_buffer.append(zeros)
                state_buffer.append(state)
                state = torch.stack(list(state_buffer), 0)
            t += 1
            if t == args.max_episode_length:
                # if t == 5:
                t = 0
                done = True
            next_state, _, _, _ = env.step(np.random.randint(0, action_space))
            val_mem_1.append(state, None, None, done)
            val_mem_2.append(state, None, None, done)
            next_state = torch.tensor(next_state, dtype=torch.float32, device=args.device).div_(255)
            state_buffer.append(next_state)
            state = torch.stack(list(state_buffer), 0)

        eps_1 = 1
        eps_end_1 = 0.05
        eps_decay_1 = 0.999978  # reaches 10% at 105000
        eps_2 = 1
        eps_end_2 = 0.05
        eps_decay_2 = 0.999978  # reaches 10% at 10500

        # args.evaluate = True
        if args.evaluate:
            print("Test")
            dqn_1.eval()  # Set DQN (online network) to evaluation mode
            # avg_reward, avg_Q = test(args, 0, dqn, val_mem, metrics, results_dir, env, evaluate=True)  # Test
            avg_reward, avg_Q = test(args, T, dqn_1, val_mem_1, metrics, results_dir, env)  # Test
            print('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
        else:
            if args.continue_training:
                print("Start Training")
                T = args.learn_start + 500

            # Training loop
            dqn_1.train()
            dqn_2.train()
            episode = 0
            episode_reward = 0
            mean_reward = deque(maxlen=100)
            plot_rewards = []
            print("Fill both memory buffers ")
            while T < args.learn_start:
                if T % args.max_episode_length == 0:
                    state, done = env.reset(), False
                    state = torch.tensor(state, dtype=torch.float32, device=args.device).div_(255)
                    zeros = torch.zeros_like(state)
                    state_buffer = deque([], maxlen=args.history_length)
                    state_buffer.append(zeros)
                    state_buffer.append(zeros)
                    state_buffer.append(zeros)
                    state_buffer.append(state)
                    state = torch.stack(list(state_buffer), 0)
                # Choose an action at random
                action = np.random.randint(0, action_space)
                next_state, reward, done, reward_2 = env.step(action)  # Step
                text = "Step {} of {} ".format(T, args.learn_start)
                print(text, end='\r', file=sys.stdout, flush=True)
                # Set done on the last transition
                if (T + 1) % args.max_episode_length == 0:
                    done = True
                mem_1.append(state, action, reward, done)
                mem_2.append(state, action, reward_2, done)
                next_state = torch.tensor(next_state, dtype=torch.float32, device=args.device).div_(255)
                state_buffer.append(next_state)
                state = torch.stack(list(state_buffer), 0)
                T += 1
                if T >= args.learn_start:
                    args.memory = results_dir + "/val_mem_1/memory.pkl"
                    print("save memory 1", args.memory)
                    save_memory(mem_1, args.memory, args.disable_bzip_memory)
                    args.memory = results_dir + "/val_mem_2/memory.pkl"
                    print("save memory 2", args.memory)
                    save_memory(mem_2, args.memory, args.disable_bzip_memory)
                    break

            print("Start Training")
            # for T in tqdm.trange(args.learn_start, args.T_max + 1):
            for T in tqdm.trange(0, args.T_max + 1):
                if T % args.max_episode_length == 0:
                    mean_reward.append(episode_reward)
                    print("Episode: {} Reward: {} Mean Reward: {} Goal1 {}".format(
                        episode, episode_reward, np.mean(mean_reward), env.goal_counter_1))
                    plot_rewards.append(np.mean(mean_reward))
                    save_and_plot(T, plot_rewards)
                    episode_reward = 0
                    episode += 1
                    state, done = env.reset(), False
                    state = torch.tensor(state, dtype=torch.float32, device=args.device).div_(255)
                    zeros = torch.zeros_like(state)
                    state_buffer = deque([], maxlen=args.history_length)
                    state_buffer.append(zeros)
                    state_buffer.append(zeros)
                    state_buffer.append(zeros)
                    state_buffer.append(state)
                    state = torch.stack(list(state_buffer), 0)
                    g = 0
                    set_input = True
                    secondTask = False

                if T % args.replay_frequency == 0:
                    pass
                    # dqn.reset_noise()  # Draw a new set of noisy weights

                """
                if env.task_one_complete or secondTask:
                    action = dqn_2.act_e_greedy(state, eps_2)  # Choose an action greedily (with noisy weights)
                    secondTask = True
                else:
                    action = dqn_1.act_e_greedy(state, eps_1)  # Choose an action greedily (with noisy weights)
                """
                if set_input:
                    set_input = False
                    g = input("Enter action : ")
                    action = int(g)
                    g = input("Enter steps : ")
                    g = int(g)
                if g <= 0:
                    set_input = True
                g -= 1
                # print("step : {} action: {} eps: {}".format(T, action, eps))
                next_state, reward, done, reward_2 = env.step(action)  # Step
                if args.reward_clip > 0:
                    reward = max(min(reward, args.reward_clip), -args.reward_clip)  # Clip rewards
                    reward_2 = max(min(reward_2, args.reward_clip), -args.reward_clip)  # Clip rewards
                if env.task_one_complete or secondTask:
                    episode_reward += reward_2
                    eps_2 = max(eps_end_2, eps_decay_2 * eps_2)
                    # Anneal importance sampling weight β to 1
                    mem_2.priority_weight = min(mem_2.priority_weight + priority_weight_increase, 1)
                else:
                    episode_reward += reward
                    eps_1 = max(eps_end_1, eps_decay_1 * eps_1)
                    # Anneal importance sampling weight β to 1
                    mem_1.priority_weight = min(mem_1.priority_weight + priority_weight_increase, 1)
                # print(reward)
                # print(reward_2)
                # Set done on the last transition of the episode
                if (T + 1) % args.max_episode_length == 0:
                    done = True
                mem_1.append(state, action, reward, done)  # Append transition to memory
                mem_2.append(state, action, reward_2, done)  # Append transition to memory

                # Train and test
                next_state = torch.tensor(next_state, dtype=torch.float32, device=args.device).div_(255)
                # print("Main shape of next_state", next_state.shape)
                state_buffer.append(next_state)
                state = torch.stack(list(state_buffer), 0)
                continue  # NOTE: skips the learning/evaluation/checkpoint code below
                # print("Main shape of state", state.shape)
                if T % args.replay_frequency == 0:
                    dqn_1.learn(mem_1)  # Train with n-step distributional double-Q learning
                    dqn_2.learn(mem_2)  # Train with n-step distributional double-Q learning
                if T % args.evaluation_interval == 0:
                    dqn_1.eval()  # Set DQN (online network) to evaluation mode
                    print("Eval epsilon 1 {} epsilon 2 {} ".format(eps_1, eps_2))
                    avg_reward, avg_Q = test(args, T, dqn_1, val_mem_1, metrics, results_dir, env, 1)  # Test
                    log('T = ' + str(T) + ' / ' + str(args.T_max) +
                        ' | Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
                    dqn_1.train()  # Set DQN (online network) back to training mode
                    dqn_2.eval()  # Set DQN (online network) to evaluation mode
                    avg_reward, avg_Q = test(args, T, dqn_2, val_mem_2, metrics, results_dir, env, 2)  # Test
                    log('T = ' + str(T) + ' / ' + str(args.T_max) +
                        ' | Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
                    dqn_2.train()  # Set DQN (online network) back to training mode
                # Update target network
                if T % args.target_update == 0:
                    dqn_1.update_target_net()
                    dqn_2.update_target_net()
                # Checkpoint the network
                if (args.checkpoint_interval != 0) and (T % args.checkpoint_interval == 0):
                    # print("save memory", args.memory)
                    # save_memory(mem, args.memory, args.disable_bzip_memory)
                    print("epsilon 1: ", eps_1)
                    print("epsilon 2: ", eps_2)
                    print("Save model at ", results_dir)
                    dqn_1.save(results_dir, '{}-checkpoint.pth'.format(T))
                    dqn_2.save(results_dir, '{}-2-checkpoint.pth'.format(T))
    except KeyboardInterrupt:
        print("Keyboard error")
    finally:
        print("save state....")
        print("Save model at ", results_dir)
        dqn_1.save(results_dir, '{}-checkpoint.pth'.format(T))
        dqn_2.save(results_dir, '{}-2-checkpoint.pth'.format(T))
        args.memory = results_dir + "/val_mem_1/memory.pkl"
        print("save memory 1 ...", args.memory)
        save_memory(mem_1, args.memory, args.disable_bzip_memory)
        args.memory = results_dir + "/val_mem_2/memory.pkl"
        print("save memory 2 ...", args.memory)
        save_memory(mem_2, args.memory, args.disable_bzip_memory)
        print("Save model at ", results_dir)
        dqn_1.save(results_dir, '{}-checkpoint.pth'.format(T))
        dqn_2.save(results_dir, '{}-2-checkpoint.pth'.format(T))
        print("... done Saving State")
        sys.exit()
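# train() above calls save_and_plot(T, plot_rewards), which is not defined in
# this file. A hedged sketch: save the running mean rewards and write a simple
# progress plot; the output directory and file names are illustrative only.
import numpy as np
import matplotlib
matplotlib.use('Agg')  # headless backend so plotting works without a display
import matplotlib.pyplot as plt


def save_and_plot(T, plot_rewards, out_dir='results'):
    # Persist the reward curve and render it as a PNG for quick inspection.
    np.save('{}/rewards-{}.npy'.format(out_dir, T), np.array(plot_rewards))
    plt.figure()
    plt.plot(plot_rewards)
    plt.xlabel('Episode')
    plt.ylabel('Mean reward (last 100 episodes)')
    plt.savefig('{}/rewards-{}.png'.format(out_dir, T))
    plt.close()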