import bz2
import os
import pickle
import random
import sys
import time
from collections import deque
from datetime import datetime

import numpy as np
import torch
import tqdm
from tqdm import trange
from torch.autograd import Variable
from torch.utils.tensorboard import SummaryWriter

# Project-local names (Agent, ReplayMemory, test, eval_policy, load_memory,
# mkdir, time_format, save_and_plot, env, args, dqn, ckptdir, save_mem, T_init)
# are defined elsewhere in the repo and assumed by the snippets below.

priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)

# Construct validation memory
val_mem = ReplayMemory(args, args.evaluation_size)
T, done = 0, True
while T < args.evaluation_size:
    if done:
        state, done = env.reset(), False
    action = random.randint(0, action_space - 1)
    if args.env == 'peg1-v0':
        # Decode the flat action index into the environment's factored action
        action = np.array([action // 16, action % 16 // 4, action % 4])
        next_state, _, done, _ = env.step(action)
    else:
        next_state, _, done = env.step(action)
    val_mem.append(state, None, None, done)
    state = next_state
    T += 1

if args.evaluate:
    dqn.eval()  # Set DQN (online network) to evaluation mode
    avg_reward, avg_Q = test(args, 0, dqn, val_mem, evaluate=True)  # Test
    print('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
else:
    # Training loop
    dqn.train()
    T, done = 0, True
    while T < args.T_max:
        if done:
            state, done = env.reset(), False
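# The 'peg1-v0' branch above splits a flat action index into three sub-actions.
# A quick, self-contained sanity check of that decode (the function name and
# the assumption that action_space == 4 * 4 * 4 == 64 are illustrative, not
# taken from the original code):
def decode_peg_action(index):
    """Split a flat index in [0, 64) into three sub-actions, each in [0, 4)."""
    return np.array([index // 16, index % 16 // 4, index % 4])

assert list(decode_peg_action(57)) == [3, 2, 1]  # 57 == 3*16 + 2*4 + 1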
# Save checkpoint: online network weights plus replay memory
torch.save({
    'online_net': dqn.online_net.state_dict()
}, ckptdir / 'last_ckpt.tar')
save_mem(mem, ckptdir)
log("Checkpoint successfully saved")

priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)

# Construct validation memory
val_mem = ReplayMemory(args, args.evaluation_size)
T, done = 0, True
while T < args.evaluation_size:
    if done:
        state = env.reset()
    next_state, _, done = env.step(np.random.randint(0, action_space))
    val_mem.append(state, -1, 0.0, done)
    state = next_state
    T += 1
done = True

if args.evaluate:
    dqn.eval()  # Set DQN (online network) to evaluation mode
    avg_reward, avg_Q = test(args, 0, dqn, val_mem, metrics, results_dir, evaluate=True)  # Test
    print('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
else:
    # Training loop
    dqn.train()
    T, done = 0, True
    for T in trange(T_init, args.T_max + 1):
        if done:
# Agent
dqn = Agent(args, env)
mem = ReplayMemory(args, args.memory_capacity)
priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)

# Construct validation memory
val_mem = ReplayMemory(args, args.evaluation_size)
T, done = 0, True
while T < args.evaluation_size - args.history_length + 1:
    if done:
        state, done = env.reset(), False
        val_mem.preappend()  # Set up memory for beginning of episode
    val_mem.append(state, None, None)
    state, _, done = env.step(random.randint(0, action_space - 1))
    T += 1
    # No need to postappend on done in validation memory

if args.evaluate:
    dqn.eval()  # Set DQN (policy network) to evaluation mode
    avg_reward, avg_Q = test(args, 0, dqn, val_mem, evaluate=True)  # Test
    print('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
else:
    # Training loop
    dqn.train()
    T, done = 0, True
    while T < args.T_max:
        if done:
            state, done = Variable(env.reset()), False
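# The importance-sampling exponent β (priority_weight) used by prioritised
# replay is annealed linearly from its initial value to 1 over the learning
# period, which is what priority_weight_increase implements above. A minimal
# sketch with small illustrative numbers (0.4 matches a common Rainbow
# default; the step counts here are made up for brevity):
beta, T_max_demo, learn_start_demo = 0.4, 100_000, 20_000
beta_increase = (1 - beta) / (T_max_demo - learn_start_demo)
for step in range(learn_start_demo, T_max_demo):
    beta = min(beta + beta_increase, 1)  # clamps at 1, reached at the final step
assert abs(beta - 1.0) < 1e-9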
def train_agent(env, args, config):
    """Train a Rainbow agent on env, logging scores and evaluations to TensorBoard."""
    # The agent's CNN encodes [1, 3, 84, 84] observations into a [1, 200] feature vector
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    torch.manual_seed(config["seed"])
    np.random.seed(config["seed"])
    if torch.cuda.is_available() and not args.disable_cuda:
        args.device = torch.device('cuda')
        torch.cuda.manual_seed(np.random.randint(1, 10000))
        torch.backends.cudnn.enabled = args.enable_cudnn
    pathname = dt_string + "_seed" + str(config["seed"])
    print("save tensorboard {}".format(config["locexp"]))
    tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
    agent = Agent(args, env)
    memory = ReplayMemory(args, args.memory_capacity)
    # memory = ReplayBuffer((3, config["size"], config["size"]), (1,), config["expert_buffer_size"], int(config["image_pad"]), config["device"])
    priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)
    writer = SummaryWriter(tensorboard_name)
    results_dir = os.path.join(str(config["locexp"]), args.id)
    mkdir("", results_dir)
    scores_window = deque(maxlen=100)
    scores = []
    t0 = time.time()
    # Training loop
    agent.train()
    T, done = 0, True
    print("result dir ", results_dir)
    agent.save(results_dir, 'checkpoint-{}.pth'.format(T))
    # eval_policy(env, agent, writer, T, config)
    episode = -1
    steps = 0
    score = 0
    print("save policy ", args.checkpoint_interval)
    # eval_policy(env, agent, writer, 0, config)
    for T in range(1, args.T_max + 1):
        # print("\r {} of {}".format(T, args.T_max), end='')
        if done:
            episode += 1
            scores_window.append(score)  # Save the most recent score
            scores.append(score)
            print('\rTime steps {} episode {} score {} Average Score: {:.2f} time: {}'.format(
                T, episode, score, np.mean(scores_window), time_format(time.time() - t0)), end="")
            writer.add_scalar('Episode_reward', score, T)
            average_reward = np.mean(scores_window)
            writer.add_scalar('Average_reward', average_reward, T)
            state, done = env.reset(), False
            steps = 0
            score = 0
        if T % args.replay_frequency == 0:
            agent.reset_noise()  # Draw a new set of noisy weights
        action = agent.act(state)  # Choose an action greedily (with noisy weights)
        next_state, reward, done, _ = env.step(action)  # Step
        score += reward
        steps += 1
        if steps == 30:  # Cap episodes at 30 steps
            done = True
        memory.append(state, action, reward, done)  # Append transition to memory
        # Train and test
        if T >= args.learn_start:
            # Anneal importance-sampling weight β to 1
            memory.priority_weight = min(memory.priority_weight + priority_weight_increase, 1)
            if T % args.replay_frequency == 0:
                agent.learn(memory)  # Train with n-step distributional double-Q learning
            # Update target network
            if T % args.target_update == 0:
                agent.update_target_net()
            # Checkpoint the network
            if (args.checkpoint_interval != 0) and (T % args.checkpoint_interval == 0):
                print("Eval policy")
                eval_policy(env, agent, writer, T, config)
                agent.save(results_dir, 'checkpoint-{}.pth'.format(T))
        state = next_state
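# train() below rebuilds its frame stack inline at every episode start:
# history_length - 1 zero frames plus the first observation, stacked along
# dim 0. A minimal helper capturing that repeated pattern (the name
# reset_state_buffer is illustrative, not part of the original code):
def reset_state_buffer(obs, history_length, device):
    """Return (stacked_state, buffer) seeded with zero frames plus obs."""
    frame = torch.tensor(obs, dtype=torch.float32, device=device).div_(255)
    buffer = deque([], maxlen=history_length)
    for _ in range(history_length - 1):
        buffer.append(torch.zeros_like(frame))
    buffer.append(frame)
    return torch.stack(list(buffer), 0), buffer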
def train(args, env):
    action_space = env.action_space.n
    print("show action space", action_space)
    print("state space", env.observation_space)
    # Agents: one DQN per task, each with its own replay and validation memory
    dqn_1 = Agent(args, env)
    dqn_2 = Agent(args, env)
    results_dir = os.path.join('results', args.id)
    print("result dir", results_dir)
    T, done = 0, True
    # If a model is provided and evaluate is false, presumably we want to resume,
    # so try to load the replay memories
    print("args continue_training", args.continue_training)
    args.continue_training = False  # NOTE: forces the fresh-buffer branch below
    if args.continue_training:
        print("Continue Training: load buffer 1 ...")
        args.memory = results_dir + "/val_mem_1/memory.pkl"
        mem_1 = load_memory(args.memory, args.disable_bzip_memory)
        val_mem_1 = ReplayMemory(args, args.evaluation_size)
        print("loaded memory buffer 1")
        print("Continue Training: load buffer 2 ...")
        args.memory = results_dir + "/val_mem_2/memory.pkl"
        mem_2 = load_memory(args.memory, args.disable_bzip_memory)
        val_mem_2 = ReplayMemory(args, args.evaluation_size)
        print("loaded memory buffer 2")
    else:
        print("use empty buffers")
        args.memory = results_dir + "/val_mem_1/memory.pkl"
        path = results_dir + "/val_mem_1"
        print("save memory", args.memory)
        os.makedirs(path, exist_ok=True)
        val_mem_1 = ReplayMemory(args, args.evaluation_size)
        mem_1 = ReplayMemory(args, args.memory_capacity)
        args.memory = results_dir + "/val_mem_2/memory.pkl"
        path = results_dir + "/val_mem_2"
        print("save memory", args.memory)
        os.makedirs(path, exist_ok=True)
        val_mem_2 = ReplayMemory(args, args.evaluation_size)
        mem_2 = ReplayMemory(args, args.memory_capacity)
    priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)
    metrics = {
        'steps': [],
        'rewards': [],
        'Qs': [],
        'step_rewards': [],
        'train_rewards': [],
        'best_avg_reward': -float('inf')
    }
    args.continue_training = True

    def write_into_file(text, file_name='document.csv'):
        """Append a line of text to file_name."""
        with open(file_name, 'a', newline='\n') as fd:
            fd.write(str(text) + "\n")

    def log(s):
        text = '[' + str(datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + '] ' + s
        write_into_file(text)
        print(text)

    if torch.cuda.is_available():
        print("cuda")

    def save_memory(memory, memory_path, disable_bzip):
        """Pickle the replay memory to disk, bz2-compressed unless disabled."""
        if disable_bzip:
            with open(memory_path, 'wb') as pickle_file:
                pickle.dump(memory, pickle_file)
        else:
            with bz2.open(memory_path, 'wb') as zipped_pickle_file:
                pickle.dump(memory, zipped_pickle_file)

    print("Create eval memory of size {}".format(args.evaluation_size))
    # Construct validation memory
    size = 84
    print("Fill eval memory")
    # Fill both validation memories at the same time, using the respective
    # reward function for each task
    try:
        while T < args.evaluation_size:
            T += 1
            print("steps ", T)
            if done:
                t = 0
                done = False
                state = env.reset()
                state = torch.tensor(state, dtype=torch.float32, device=args.device).div_(255)
                zeros = torch.zeros_like(state)
                state_buffer = deque([], maxlen=args.history_length)
                state_buffer.append(zeros)
                state_buffer.append(zeros)
                state_buffer.append(zeros)
                state_buffer.append(state)
                state = torch.stack(list(state_buffer), 0)
            t += 1
            if t == args.max_episode_length:
                # if t == 5:
                t = 0
                done = True
            next_state, _, _, _ = env.step(np.random.randint(0, action_space))
            val_mem_1.append(state, None, None, done)
            val_mem_2.append(state, None, None, done)
            next_state = torch.tensor(next_state, dtype=torch.float32, device=args.device).div_(255)
            state_buffer.append(next_state)
            state = torch.stack(list(state_buffer), 0)

        eps_1 = 1
        eps_end_1 = 0.05
        eps_decay_1 = 0.999978  # reaches ~10% after about 105000 steps
        eps_2 = 1
        eps_end_2 = 0.05
        eps_decay_2 = 0.999978  # reaches ~10% after about 105000 steps
        # args.evaluate = True
        if args.evaluate:
            print("Test")
            dqn_1.eval()  # Set DQN (online network) to evaluation mode
            # avg_reward, avg_Q = test(args, 0, dqn_1, val_mem_1, metrics, results_dir, env, evaluate=True)  # Test
            avg_reward, avg_Q = test(args, T, dqn_1, val_mem_1, metrics, results_dir, env)  # Test
            print('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
        else:
            if args.continue_training:
                print("Start Training")
                T = args.learn_start + 500  # Skip the random-fill phase below
            # Training loop
            dqn_1.train()
            dqn_2.train()
            episode = 0
            episode_reward = 0
            mean_reward = deque(maxlen=100)
            plot_rewards = []
            print("Fill both memory buffers")
            while T < args.learn_start:
                if T % args.max_episode_length == 0:
                    state, done = env.reset(), False
                    state = torch.tensor(state, dtype=torch.float32, device=args.device).div_(255)
                    zeros = torch.zeros_like(state)
                    state_buffer = deque([], maxlen=args.history_length)
                    state_buffer.append(zeros)
                    state_buffer.append(zeros)
                    state_buffer.append(zeros)
                    state_buffer.append(state)
                    state = torch.stack(list(state_buffer), 0)
                # Choose an action at random
                action = np.random.randint(0, action_space)
                next_state, reward, done, reward_2 = env.step(action)  # Step
                text = "Step {} of {}".format(T, args.learn_start)
                print(text, end='\r', file=sys.stdout, flush=True)
                # Set done on the last transition of an episode
                if (T + 1) % args.max_episode_length == 0:
                    done = True
                mem_1.append(state, action, reward, done)
                mem_2.append(state, action, reward_2, done)
                next_state = torch.tensor(next_state, dtype=torch.float32, device=args.device).div_(255)
                state_buffer.append(next_state)
                state = torch.stack(list(state_buffer), 0)
                T += 1
                if T >= args.learn_start:
                    args.memory = results_dir + "/val_mem_1/memory.pkl"
                    print("save memory 1", args.memory)
                    save_memory(mem_1, args.memory, args.disable_bzip_memory)
                    args.memory = results_dir + "/val_mem_2/memory.pkl"
                    print("save memory 2", args.memory)
                    save_memory(mem_2, args.memory, args.disable_bzip_memory)
                    break
            print("Start Training")
            # for T in tqdm.trange(args.learn_start, args.T_max + 1):
            for T in tqdm.trange(0, args.T_max + 1):
                if T % args.max_episode_length == 0:
                    mean_reward.append(episode_reward)
                    print("Episode: {} Reward: {} Mean Reward: {} Goal1 {}".format(
                        episode, episode_reward, np.mean(mean_reward), env.goal_counter_1))
                    plot_rewards.append(np.mean(mean_reward))
                    save_and_plot(T, plot_rewards)
                    episode_reward = 0
                    episode += 1
                    state, done = env.reset(), False
                    state = torch.tensor(state, dtype=torch.float32, device=args.device).div_(255)
                    zeros = torch.zeros_like(state)
                    state_buffer = deque([], maxlen=args.history_length)
                    state_buffer.append(zeros)
                    state_buffer.append(zeros)
                    state_buffer.append(zeros)
                    state_buffer.append(state)
                    state = torch.stack(list(state_buffer), 0)
                    g = 0
                    set_input = True
                    secondTask = False
                if T % args.replay_frequency == 0:
                    pass
                    # dqn.reset_noise()  # Draw a new set of noisy weights
                """
                if env.task_one_complete or secondTask:
                    action = dqn_2.act_e_greedy(state, eps_2)  # Choose an action ε-greedily
                    secondTask = True
                else:
                    action = dqn_1.act_e_greedy(state, eps_1)  # Choose an action ε-greedily
                """
                # Manual control for debugging: ask the user for an action and
                # how many steps to keep it; prompt again once the counter runs out
                if set_input:
                    set_input = False
                    g = input("Enter action : ")
                    action = int(g)
                    g = input("Enter steps : ")
                    g = int(g)
                if g <= 0:
                    set_input = True
                g -= 1
                # print("step : {} action: {} eps: {}".format(T, action, eps))
                next_state, reward, done, reward_2 = env.step(action)  # Step
                if args.reward_clip > 0:
                    reward = max(min(reward, args.reward_clip), -args.reward_clip)  # Clip rewards
                    reward_2 = max(min(reward_2, args.reward_clip), -args.reward_clip)
                if env.task_one_complete or secondTask:
                    episode_reward += reward_2
                    eps_2 = max(eps_end_2, eps_decay_2 * eps_2)
                    # Anneal importance-sampling weight β to 1
                    mem_2.priority_weight = min(mem_2.priority_weight + priority_weight_increase, 1)
                else:
                    episode_reward += reward
                    eps_1 = max(eps_end_1, eps_decay_1 * eps_1)
                    # Anneal importance-sampling weight β to 1
                    mem_1.priority_weight = min(mem_1.priority_weight + priority_weight_increase, 1)
                # print(reward)
                # print(reward_2)
                # In case the last action of the episode was taken, set done to True
                if (T + 1) % args.max_episode_length == 0:
                    done = True
                mem_1.append(state, action, reward, done)  # Append transition to memory
                mem_2.append(state, action, reward_2, done)
                # Train and test
                next_state = torch.tensor(next_state, dtype=torch.float32, device=args.device).div_(255)
                state_buffer.append(next_state)
                state = torch.stack(list(state_buffer), 0)
                continue  # NOTE: skips the learning/evaluation code below while in manual-input mode
                # print("Main shape of state", state.shape)
                if T % args.replay_frequency == 0:
                    dqn_1.learn(mem_1)  # Train with n-step distributional double-Q learning
                    dqn_2.learn(mem_2)
                if T % args.evaluation_interval == 0:
                    dqn_1.eval()  # Set DQN (online network) to evaluation mode
                    print("Eval epsilon 1 {} epsilon 2 {}".format(eps_1, eps_2))
                    avg_reward, avg_Q = test(args, T, dqn_1, val_mem_1, metrics, results_dir, env, 1)  # Test
                    log('T = ' + str(T) + ' / ' + str(args.T_max) + ' | Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
                    dqn_1.train()  # Set DQN (online network) back to training mode
                    dqn_2.eval()
                    avg_reward, avg_Q = test(args, T, dqn_2, val_mem_2, metrics, results_dir, env, 2)  # Test
                    log('T = ' + str(T) + ' / ' + str(args.T_max) + ' | Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
                    dqn_2.train()
                # Update target networks
                if T % args.target_update == 0:
                    dqn_1.update_target_net()
                    dqn_2.update_target_net()
                # Checkpoint the networks
                if (args.checkpoint_interval != 0) and (T % args.checkpoint_interval == 0):
                    # print("save memory", args.memory)
                    # save_memory(mem, args.memory, args.disable_bzip_memory)
                    print("epsilon 1: ", eps_1)
                    print("epsilon 2: ", eps_2)
                    print("Save model at ", results_dir)
                    dqn_1.save(results_dir, '{}-checkpoint.pth'.format(T))
                    dqn_2.save(results_dir, '{}-2-checkpoint.pth'.format(T))
    except KeyboardInterrupt:
        print("Keyboard interrupt")
    finally:
        print("save state ...")
        print("Save model at ", results_dir)
        dqn_1.save(results_dir, '{}-checkpoint.pth'.format(T))
        dqn_2.save(results_dir, '{}-2-checkpoint.pth'.format(T))
        args.memory = results_dir + "/val_mem_1/memory.pkl"
        print("save memory 1 ...", args.memory)
        save_memory(mem_1, args.memory, args.disable_bzip_memory)
        args.memory = results_dir + "/val_mem_2/memory.pkl"
        print("save memory 2 ...", args.memory)
        save_memory(mem_2, args.memory, args.disable_bzip_memory)
        print("... done saving state")
        sys.exit()
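# load_memory is called in train() above but not defined in this snippet.
# A minimal sketch of a counterpart to save_memory, assuming the same
# pickle/bz2 on-disk format:
def load_memory(memory_path, disable_bzip):
    """Unpickle a replay memory saved by save_memory."""
    if disable_bzip:
        with open(memory_path, 'rb') as pickle_file:
            return pickle.load(pickle_file)
    with bz2.open(memory_path, 'rb') as zipped_pickle_file:
        return pickle.load(zipped_pickle_file)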