def train():
    env = GameState()
    # num_inputs = env.observation_space.shape[0]
    num_inputs = 3136
    num_actions = 2
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = DRQN(num_inputs, num_actions)
    target_net = DRQN(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)

    if torch.cuda.is_available():
        # put on GPU if CUDA is available
        online_net = online_net.cuda()
        target_net = target_net.cuda()

    online_net.train()
    target_net.train()

    memory = Memory(replay_memory_capacity)
    epsilon = 1.0
    loss = 0
    iteration = 0

    while iteration < 2000000:
        done = False

        action = torch.zeros([2], dtype=torch.float32)
        action[0] = 1
        image_data, reward, done = env.frame_step(action)
        image_data = resize_and_bgr2gray(image_data)
        image_data = image_to_tensor(image_data)
        state = image_data
        state = torch.Tensor(state)
        if torch.cuda.is_available():
            state = state.cuda()

        hidden = None
        while not done:
            action, hidden, action_index = get_action(state, target_net, epsilon, env, hidden)

            image_data, reward, done = env.frame_step(action)
            image_data = resize_and_bgr2gray(image_data)
            image_data = image_to_tensor(image_data)
            next_state = image_data
            next_state = torch.Tensor(next_state)
            if torch.cuda.is_available():
                next_state = next_state.cuda()

            mask = 0 if done else 1
            reward = reward if not done else -1

            memory.push(state, next_state, action_index, reward, mask)
            state = next_state

            if iteration > initial_exploration and len(memory) > batch_size:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(batch_size)
                loss = DRQN.train_model(online_net, target_net, optimizer, batch)

                if iteration % update_target == 0:
                    print('iteration: {}, update model'.format(iteration))
                    update_target_model(online_net, target_net)

            iteration += 1
            if iteration % 25000 == 0:
                torch.save(online_net, "pretrained_model/current_model_" + str(iteration) + ".pth")
            print('iteration: {}'.format(iteration))
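This loop relies on helpers that are not shown in the listing, notably update_target_model. A minimal sketch, assuming both networks share the same architecture, is a hard copy of the online weights into the target network:

import torch.nn as nn


def update_target_model(online_net: nn.Module, target_net: nn.Module) -> None:
    # Hard update: overwrite the target network's weights with the online network's.
    target_net.load_state_dict(online_net.state_dict())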
def run():
    model_name = "drqn_pomdp_random"
    env_name = "MineRLNavigateDense-v0"
    seed = 1

    env = gym.make(env_name)
    # env.make_interactive(realtime=False, port=6666)
    device = torch.device("cuda")

    np.random.seed(seed)
    random.seed(seed)

    writer = SummaryWriter('runs/' + env_name + "_" + model_name)

    batch_size = 2
    learning_rate = 1e-3
    memory_size = 50000
    min_epi_num = 1
    target_update_period = 2
    eps_start = 0.1
    eps_end = 0.001
    eps_decay = 0.995
    tau = 1e-2
    random_update = True
    n_step = 4

    max_epi = 10000
    max_epi_len = 10000
    max_epi_step = 30000
    num_channels = 4
    batch_first = False

    policy_net = DRQN(num_channels=4, num_actions=6, batch_first=batch_first).cuda().float()
    target_net = DRQN(num_channels=4, num_actions=6, batch_first=batch_first).cuda().float()
    target_net.load_state_dict(policy_net.state_dict())

    optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)

    score = 0
    total_score = 0
    epsilon = eps_start

    memory_device = torch.device("cpu")
    memory = EpisodeMemory(random_update=random_update,
                           max_epi_num=20,
                           max_epi_len=max_epi_len,
                           batch_size=batch_size,
                           n_step=n_step)

    for e in range(max_epi):
        state = env.reset()
        obs = converter(env_name, state).to(memory_device)  # obs : [1, 4, 64, 64]
        done = False

        episode_record = EpisodeBuffer()
        hidden = policy_net.init_hidden_state(batch_first=batch_first,
                                              batch_size=batch_size,
                                              training=False)

        for t in range(max_epi_step):
            action_index, hidden = policy_net.sample_action(
                obs.to(device="cuda:0"), epsilon, hidden)
            action = make_6action(env, action_index)

            s_prime, reward, done, info = env.step(action)
            obs_prime = converter(env_name, s_prime).to(memory_device)

            done_mask = 0.0 if done else 1.0
            batch_action = torch.tensor([action_index]).unsqueeze(0).to(memory_device)
            batch_reward = torch.tensor([reward]).unsqueeze(0).to(memory_device)
            batch_done = torch.tensor([done_mask]).unsqueeze(0).to(memory_device)

            episode_record.put([obs, batch_action, batch_reward / 10.0, obs_prime, batch_done])

            obs = obs_prime
            score += reward
            total_score += reward

            if len(memory) > min_epi_num:
                train(writer, policy_net, target_net, memory, optimizer,
                      batch_size, gamma=0.99)

                if (t + 1) % target_update_period == 0:
                    # soft update of the target network
                    for target_param, local_param in zip(target_net.parameters(),
                                                         policy_net.parameters()):
                        target_param.data.copy_(tau * local_param.data +
                                                (1.0 - tau) * target_param.data)

            if done:
                print(f"Score of # {e} episode : {score}")
                break

        memory.put(episode_record)
        epsilon = max(eps_end, epsilon * eps_decay)

        # checkpoint every 5 episodes
        if e % 5 == 0:
            torch.save(policy_net, model_name + '.pth')

        writer.add_scalar('Rewards per episodes', score, e)
        score = 0

    writer.close()
    env.close()
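The inline Polyak update above can equivalently be factored into a helper. The sketch below performs the same interpolation with tau; the name soft_update is illustrative and not taken from the original code:

import torch.nn as nn


def soft_update(target_net: nn.Module, policy_net: nn.Module, tau: float) -> None:
    # Polyak averaging: target <- tau * policy + (1 - tau) * target
    for target_param, local_param in zip(target_net.parameters(),
                                         policy_net.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)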
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    # num_inputs = env.observation_space.shape[0]
    num_inputs = 2
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = DRQN(num_inputs, num_actions)
    target_net = DRQN(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()

    memory = Memory(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    loss = 0

    for e in range(30000):
        done = False
        score = 0

        state = env.reset()
        state = state_to_partial_observability(state)
        state = torch.Tensor(state).to(device)

        hidden = (torch.Tensor().new_zeros(1, 1, 16),
                  torch.Tensor().new_zeros(1, 1, 16))

        while not done:
            steps += 1

            action, new_hidden = get_action(state, target_net, epsilon, env, hidden)
            next_state, reward, done, _ = env.step(action)

            next_state = state_to_partial_observability(next_state)
            next_state = torch.Tensor(next_state)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1

            memory.push(state, next_state, action, reward, mask, hidden)
            hidden = new_hidden

            score += reward
            state = next_state

            if steps > initial_exploration and len(memory) > batch_size:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(batch_size)
                loss = DRQN.train_model(online_net, target_net, optimizer, batch)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        score = score if score == 500.0 else score + 1
        if running_score == 0:
            running_score = score
        else:
            running_score = 0.99 * running_score + 0.01 * score

        if e % log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score, epsilon))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > goal_score:
            break
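get_action is not shown in this listing. A minimal sketch of an epsilon-greedy policy for the CartPole-style loop above that still advances the recurrent hidden state on random actions, assuming the DRQN forward pass takes a (batch, sequence, features) tensor plus an LSTM hidden state and returns (q_values, hidden):

import random

import torch


def get_action(state, target_net, epsilon, env, hidden):
    # Run the network even when acting randomly so the hidden state keeps
    # tracking the observation history.
    with torch.no_grad():
        q_values, new_hidden = target_net(state.unsqueeze(0).unsqueeze(0), hidden)
    if random.random() <= epsilon:
        return env.action_space.sample(), new_hidden
    return q_values.argmax(dim=-1).item(), new_hidden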
def main():
    current_id = datetime.datetime.today().isoformat("-") + "-" + \
        os.path.splitext(os.path.basename(__file__))[0]

    parser = argparse.ArgumentParser(description='I-Maze with Block obs')
    parser.add_argument("-modelpath", type=str,
                        help="modelpath without extension(eg .model, .optimizer)")
    parser.add_argument("-vertical", type=int, default=2,
                        help="vertical corridor length")
    parser.add_argument("-horizontal", type=int, default=0,
                        help="horizontal corridor length")
    parser.add_argument("-validation", type=int, default=0,
                        help="validation flag, default:0")
    parser.add_argument("-outdir", type=str, default="log",
                        help="output dir for logging, default:'log'")
    parser.add_argument("-epsdelta", type=float, default=10**-6,
                        help="delta of epsilon, default:10**-6")
    parser.add_argument("-initexp", type=int, default=10**4,
                        help="initial exploration, default:10**4")
    parser.add_argument("-eps", type=float, default=1.0,
                        help="epsilon, default:1.0")
    parser.add_argument("-lr", type=float, default=k_default_lr,
                        help="learning rate, default:" + str(k_default_lr))
    parser.add_argument("-modeltype", type=str, default=k_default_modeltype,
                        help="ModelType, default:'" + k_default_modeltype + "'")
    parser.add_argument("-batchsize", type=int, default=k_default_replay_batch_size,
                        help="replay batch size, default:" + str(k_default_replay_batch_size))
    parser.add_argument("-updatefreq", type=int, default=k_default_update_freq,
                        help="update frequency, default:" + str(k_default_update_freq))
    parser.add_argument("-gpu", type=int, default=0,
                        help="gpu id, default:0 (cpu is -1)")
    parser.add_argument("-testoutput", type=int, default=0,
                        help="output only at test, default:0")
    parser.add_argument("-y", type=int, default=0, help="OK?, default:0")
    parser.add_argument("-framehistnum", type=int, default=12,
                        help="frame history num, default:12")
    args = parser.parse_args()
    print(args)
    if args.y == 0:
        input("OK?")

    ## Make directory and write setting log
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    with open(os.path.join(args.outdir, current_id + ".args"), "w") as argsf:
        argsf.write(str(args))

    env = I_MazeEnv(horizontal=args.horizontal, vertical=args.vertical,
                    max_step=k_max_step)

    ## Init model
    input_dim = k_ob_shape[0]
    output_dim = len(env.action_set)
    if args.modeltype == "DQN":
        model = DQN(input_dim * args.framehistnum, output_dim)
    elif args.modeltype == "DRQN":
        model = DRQN(input_dim, output_dim)
    elif args.modeltype == "MQN":
        model = MQN(input_dim, output_dim, max_buff_size=args.framehistnum - 1, m=256, e=256)
    elif args.modeltype == "RMQN":
        model = RMQN(input_dim, output_dim, max_buff_size=args.framehistnum - 1, m=256, e=256)
    elif args.modeltype == "FRMQN":
        model = FRMQN(input_dim, output_dim, max_buff_size=args.framehistnum - 1, m=256, e=256)
    else:
        print("not implemented", args.modeltype)
        exit(0)

    ## Use GPU
    if args.gpu >= 0:
        cuda.get_device(args.gpu).use()
        model.to_gpu()

    ## Init agent
    agent = Agent(k_ob_shape, len(env.action_set), args.framehistnum, model,
                  lr=args.lr, eps_delta=args.epsdelta, eps=args.eps,
                  batch_size=args.batchsize)
    if args.modelpath:
        print("load model from ", args.modelpath + ".model and " + args.modelpath + ".optimizer")
        agent.load(os.path.expanduser(args.modelpath))

    train_total_step = 0

    if args.validation:
        ## Run validation
        mode = run_mode.validation
        for vertical in [4, 5, 6, 8, 10, 15, 20, 25, 30, 35, 40]:
            env.vertical = vertical
            for _ in range(1):
                run_episode(current_id, args, env, agent, mode, vertical, train_total_step)
        exit(0)

    for episode_id in range(k_max_episode):
        try:
            if args.validation:
                assert (not "!!!")
            else:
                if episode_id % 100 == 0 and episode_id != 0:
                    ## Run test
                    mode = run_mode.test
                    for j in range(10):
                        run_episode(current_id, args, env, agent, mode,
                                    episode_id + j, train_total_step)
                    ## Save model
                    agent.save(os.path.join(args.outdir,
                                            current_id + "_episode" + str(episode_id)))

                ## Run train
                mode = run_mode.train
                train_total_step \
                    = run_episode(current_id, args, env, agent, mode,
                                  episode_id, train_total_step)
        except:
            ## On any failure, archive the run state (current_eps is presumably
            ## maintained elsewhere, e.g. by run_episode) and the replay dataset.
            ark = {}
            ark["args"] = vars(args)
            ark["episode_id"] = episode_id
            ark["train_total_step"] = train_total_step
            ark["eps"] = current_eps
            with open(os.path.join(args.outdir,
                                   current_id + "_episode" + str(episode_id) + "_ark.json"),
                      "w") as arkf:
                ark_str = json.dumps(ark, indent=4, sort_keys=True)
                arkf.write(ark_str)
            with open(os.path.join(args.outdir,
                                   current_id + "_episode" + str(episode_id) + "_dataset.pkl"),
                      "wb") as datasetf:
                pickle.dump(agent.dqn.dataset, datasetf)
            exit(0)
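The launcher refers to run_mode.train, run_mode.test, and run_mode.validation, whose definition is not included here. A plausible minimal sketch is a plain enum; the member values are an assumption, only the names are used above:

from enum import Enum


class run_mode(Enum):
    # Assumed definition; only the three member names appear in the launcher.
    train = 0
    test = 1
    validation = 2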
class Agent:
    def __init__(self, env, n_input, n_output):
        self.env = env
        self.epsilon = 1.0
        self.epsilon_decay = 0
        self.net = DRQN(n_input, n_output).to(cf.DEVICE)
        self.tgt_net = DRQN(n_input, n_output).to(cf.DEVICE)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=cf.LEARNING_RATE)

    def action(self, state, hidden):
        state = state.unsqueeze(0).unsqueeze(0)
        q_value, hidden = self.tgt_net.forward(state, hidden)
        _, action = torch.max(q_value, 2)
        self.epsilon_decay += 1
        self.update_epsilon()
        if np.random.rand() <= self.epsilon:
            return self.env.action_space.sample(), hidden
        else:
            return action.item(), hidden

    def update_epsilon(self):
        if self.epsilon_decay > 1000:
            self.epsilon = max(self.epsilon - 0.00005, 0.02)

    def update_tgt(self):
        self.tgt_net.load_state_dict(self.net.state_dict())

    def train_model(self, batch):
        current_states, rewards, actions, next_states, dones = batch

        states_v = torch.stack(current_states).view(cf.BATCH_SIZE, cf.l_sequence, self.net.n_input)
        next_states_v = torch.stack(next_states).view(cf.BATCH_SIZE, cf.l_sequence, self.net.n_input)
        actions_v = torch.stack(actions).view(cf.BATCH_SIZE, cf.l_sequence, -1).long()
        rewards_v = torch.stack(rewards).view(cf.BATCH_SIZE, cf.l_sequence, -1).to(cf.DEVICE)
        dones_v = torch.stack(dones).view(cf.BATCH_SIZE, cf.l_sequence, -1).to(cf.DEVICE)

        state_action_values, _ = self.net(states_v)
        state_action_values = state_action_values.gather(2, actions_v.to(cf.DEVICE))

        next_state_values, _ = self.tgt_net(next_states_v)
        next_state_values = next_state_values.max(2, keepdim=True)[0]
        next_state_values = next_state_values.detach()

        expected_state_action_values = dones_v * cf.gamma * next_state_values + rewards_v

        loss = torch.nn.functional.mse_loss(state_action_values, expected_state_action_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss
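train_model expects each field of the batch to stack into (cf.BATCH_SIZE, cf.l_sequence, ...) tensors. A minimal sketch of a replay buffer that stores whole episodes and samples fixed-length sub-sequences in that layout; the class name SequenceReplay is illustrative and not part of the original code:

import random
from collections import deque


class SequenceReplay:
    """Illustrative buffer: stores episodes, samples contiguous slices of length seq_len."""

    def __init__(self, capacity, seq_len):
        self.episodes = deque(maxlen=capacity)
        self.seq_len = seq_len

    def push(self, episode):
        # episode: list of (state, reward, action, next_state, done) tuples
        if len(episode) >= self.seq_len:
            self.episodes.append(episode)

    def __len__(self):
        return len(self.episodes)

    def sample(self, batch_size):
        # Flat lists of length batch_size * seq_len, in the order train_model unpacks them.
        states, rewards, actions, next_states, dones = [], [], [], [], []
        for episode in random.sample(list(self.episodes), batch_size):
            start = random.randint(0, len(episode) - self.seq_len)
            for s, r, a, ns, d in episode[start:start + self.seq_len]:
                states.append(s)
                rewards.append(r)
                actions.append(a)
                next_states.append(ns)
                dones.append(d)
        return states, rewards, actions, next_states, dones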
def one_hot_encode_obs(obs: int):
    one_hot_repr = np.zeros((env.observation_space_dim, ))
    one_hot_repr[obs] = 1
    return one_hot_repr

np.random.seed(seed)
torch.manual_seed(seed)

num_inputs = env.observation_space_dim
num_actions = env.action_space.n
print('observation size:', num_inputs)
print('action size:', num_actions)

online_net = DRQN(num_inputs, num_actions, use_deeper_net)
target_net = DRQN(num_inputs, num_actions, use_deeper_net)
update_target_model(online_net, target_net)

optimizer = optim.Adam(online_net.parameters(), lr=lr)

# if use_experts is False:
#     writer = SummaryWriter('logs/normal')
# else:
#     writer = SummaryWriter('logs/experts')

online_net.to(device)
target_net.to(device)
online_net.train()
target_net.train()

memory = Memory(replay_memory_capacity, sequence_length)
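Since the network's input size equals observation_space_dim, each discrete observation has to be one-hot encoded before it is handed to the DRQN. A small hypothetical usage snippet, assuming env.reset() returns an integer observation index:

import torch

# Hypothetical glue code: encode a discrete observation before feeding the network.
obs = env.reset()                                          # assumed to return an int index
state = torch.Tensor(one_hot_encode_obs(obs)).to(device)   # shape: (num_inputs,)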