Code Example #1
    def __init__(self, env, n_input, n_output):
        self.env = env
        self.epsilon = 1.0
        self.epsilon_decay = 0
        self.net = DRQN(n_input, n_output).to(cf.DEVICE)
        self.tgt_net = DRQN(n_input, n_output).to(cf.DEVICE)
        self.optimizer = torch.optim.Adam(self.net.parameters(),
                                          lr=cf.LEARNING_RATE)
Code Example #2
def train():
    env = GameState()

    # num_inputs = env.observation_space.shape[0]
    num_inputs = 3136
    num_actions = 2
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = DRQN(num_inputs, num_actions)
    target_net = DRQN(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)

    if torch.cuda.is_available():  # put on GPU if CUDA is available
        online_net = online_net.cuda()
        target_net = target_net.cuda()

    online_net.train()
    target_net.train()
    memory = Memory(replay_memory_capacity)
    epsilon = 1.0
    loss = 0
    iteration = 0

    while iteration < 2000000:
        done = False

        action = torch.zeros([2], dtype=torch.float32)
        action[0] = 1
        image_data, reward, done = env.frame_step(action)
        image_data = resize_and_bgr2gray(image_data)
        image_data = image_to_tensor(image_data)
        state = image_data
        state = torch.Tensor(state)
        if torch.cuda.is_available():
            state = state.cuda()

        hidden = None

        while not done:

            action, hidden, action_index = get_action(state, target_net, epsilon, env, hidden)
            image_data, reward, done = env.frame_step(action)
            image_data = resize_and_bgr2gray(image_data)
            image_data = image_to_tensor(image_data)

            next_state = image_data
            next_state = torch.Tensor(next_state)
            if torch.cuda.is_available():
                next_state = next_state.cuda()

            mask = 0 if done else 1
            reward = reward if not done else -1

            memory.push(state, next_state, action_index, reward, mask)

            state = next_state
            
            if iteration > initial_exploration and len(memory) > batch_size:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(batch_size)
                loss = DRQN.train_model(online_net, target_net, optimizer, batch)

                if iteration % update_target == 0:
                    print('iteration: {}, update model'.format(iteration))
                    update_target_model(online_net, target_net)

            iteration += 1

            if iteration % 25000 == 0:
                torch.save(online_net, "pretrained_model/current_model_" + str(iteration) + ".pth")

            print('iteration: {}'.format(iteration))
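
Example #2 calls two helpers that are not shown on this page, update_target_model and get_action. The sketch below is a plausible minimal version of each (not necessarily what the source project defines): the first hard-copies the online weights into the target network, and the second does epsilon-greedy selection while threading the DRQN's recurrent hidden state from step to step, assuming the network's forward pass takes (observation, hidden) and returns (q_values, hidden) as the other examples here suggest.

import random

import torch


def update_target_model(online_net, target_net):
    # Hard update: overwrite the target network with the online network's weights.
    target_net.load_state_dict(online_net.state_dict())


def get_action(state, net, epsilon, env, hidden):
    # env is kept only for signature compatibility with the caller above.
    with torch.no_grad():
        q_values, hidden = net(state.unsqueeze(0), hidden)  # adjust batching to the DRQN's expected input shape
    n_actions = q_values.size(-1)
    if random.random() <= epsilon:
        action_index = random.randrange(n_actions)
    else:
        action_index = q_values.argmax(dim=-1).item()
    action = torch.zeros(n_actions, dtype=torch.float32)
    action[action_index] = 1  # one-hot action vector expected by env.frame_step
    return action, hidden, action_index
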
Code Example #3
def run():
    model_name = "drqn_pomdp_random"
    env_name = "MineRLNavigateDense-v0"
    seed = 1

    env = gym.make(env_name)
    #env.make_interactive(realtime=False, port=6666)

    device = torch.device("cuda")
    np.random.seed(seed)
    random.seed(seed)
    writer = SummaryWriter('runs/' + env_name + "_" + model_name)

    batch_size = 2
    learning_rate = 1e-3
    memory_size = 50000
    min_epi_num = 1
    target_update_period = 2

    eps_start = 0.1
    eps_end = 0.001
    eps_decay = 0.995
    tau = 1e-2

    random_update = True
    n_step = 4
    max_epi = 10000
    max_epi_len = 10000
    max_epi_step = 30000

    num_channels = 4
    batch_first = False
    policy_net = DRQN(num_channels=4, num_actions=6,
                      batch_first=batch_first).cuda().float()
    target_net = DRQN(num_channels=4, num_actions=6,
                      batch_first=batch_first).cuda().float()
    target_net.load_state_dict(policy_net.state_dict())
    optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)

    score = 0
    total_score = 0

    epsilon = eps_start

    memory_device = torch.device("cpu")

    memory = EpisodeMemory(random_update=random_update,
                           max_epi_num=20,
                           max_epi_len=max_epi_len,
                           batch_size=batch_size,
                           n_step=n_step)

    for e in range(max_epi):
        state = env.reset()
        obs = converter(env_name,
                        state).to(memory_device)  # obs : [1, 4, 64, 64]
        done = False

        episode_record = EpisodeBuffer()
        hidden = policy_net.init_hidden_state(batch_first=batch_first,
                                              batch_size=batch_size,
                                              training=False)
        for t in range(max_epi_step):
            action_index, hidden = policy_net.sample_action(
                obs.to(device="cuda:0"), epsilon, hidden)
            action = make_6action(env, action_index)
            s_prime, reward, done, info = env.step(action)
            obs_prime = converter(env_name, s_prime).to(memory_device)
            done_mask = 0.0 if done else 1.0

            batch_action = torch.tensor([action_index
                                         ]).unsqueeze(0).to(memory_device)
            batch_reward = torch.tensor([reward
                                         ]).unsqueeze(0).to(memory_device)
            batch_done = torch.tensor([done_mask
                                       ]).unsqueeze(0).to(memory_device)
            episode_record.put([
                obs, batch_action, batch_reward / 10.0, obs_prime, batch_done
            ])
            obs = obs_prime
            score += reward
            total_score += reward

            if len(memory) > min_epi_num:
                train(writer,
                      policy_net,
                      target_net,
                      memory,
                      optimizer,
                      batch_size,
                      gamma=0.99)

                if (t + 1) % target_update_period == 0:
                    for target_param, local_param in zip(
                            target_net.parameters(),
                            policy_net.parameters()):  # <- soft update
                        target_param.data.copy_(tau * local_param.data +
                                                (1.0 - tau) *
                                                target_param.data)

            if done:
                print(f"Score of # {e} episode : {score}")
                break
        memory.put(episode_record)
        epsilon = max(eps_end, epsilon * eps_decay)

        if e % 5 == 0:
            torch.save(policy_net, model_name + '.pth')
        writer.add_scalar('Rewards per episodes', score, e)
        score = 0

    writer.close()
    env.close()
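
Example #3 stores whole episodes rather than individual transitions: each step is appended to an EpisodeBuffer and the finished buffer is pushed into an EpisodeMemory, which later serves random fixed-length chunks to the recurrent network. Neither class is shown above; the following is a minimal sketch under assumed interfaces, not the project's actual implementation.

import collections
import random


class EpisodeBuffer:
    # Accumulates one episode's transitions in order.
    def __init__(self):
        self.transitions = []

    def put(self, transition):
        # transition: [obs, action, reward, next_obs, done_mask]
        self.transitions.append(transition)

    def __len__(self):
        return len(self.transitions)


class EpisodeMemory:
    # Keeps a bounded deque of finished episodes and samples random
    # n_step-long windows from them when random_update is enabled.
    def __init__(self, random_update, max_epi_num, max_epi_len, batch_size, n_step):
        self.random_update = random_update
        self.batch_size = batch_size
        self.max_epi_len = max_epi_len
        self.n_step = n_step
        self.episodes = collections.deque(maxlen=max_epi_num)

    def put(self, episode_buffer):
        self.episodes.append(episode_buffer)

    def __len__(self):
        return len(self.episodes)

    def sample(self):
        # Requires at least batch_size finished episodes in memory.
        sampled = random.sample(list(self.episodes), self.batch_size)
        chunks = []
        for episode in sampled:
            start = random.randint(0, max(0, len(episode) - self.n_step))
            chunks.append(episode.transitions[start:start + self.n_step])
        return chunks
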
Code Example #4
File: train.py Project: zhihanyang2022/drqn
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    # num_inputs = env.observation_space.shape[0]
    num_inputs = 2
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = DRQN(num_inputs, num_actions)
    target_net = DRQN(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    loss = 0

    for e in range(30000):
        done = False

        score = 0
        state = env.reset()
        state = state_to_partial_observability(state)
        state = torch.Tensor(state).to(device)

        hidden = (torch.zeros(1, 1, 16), torch.zeros(1, 1, 16))

        while not done:
            steps += 1

            action, new_hidden = get_action(state, target_net, epsilon, env,
                                            hidden)
            next_state, reward, done, _ = env.step(action)

            next_state = state_to_partial_observability(next_state)
            next_state = torch.Tensor(next_state)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1

            memory.push(state, next_state, action, reward, mask, hidden)
            hidden = new_hidden

            score += reward
            state = next_state

            if steps > initial_exploration and len(memory) > batch_size:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(batch_size)
                loss = DRQN.train_model(online_net, target_net, optimizer,
                                        batch)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        score = score if score == 500.0 else score + 1
        if running_score == 0:
            running_score = score
        else:
            running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score, epsilon))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > goal_score:
            break
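
Example #4 runs a fully observable Gym environment but feeds the network only 2 of its observation dimensions (num_inputs = 2). The score handling capped at 500 suggests CartPole-v1; assuming that, a common way to induce partial observability, and a plausible reading of state_to_partial_observability (the project's actual definition may differ), is to keep the cart position and pole angle while hiding both velocity components, forcing the LSTM to infer the dynamics from history.

import numpy as np


def state_to_partial_observability(state):
    # Keep cart position and pole angle; drop the two velocity components
    # so the recurrent network must recover them from observation history.
    state = np.asarray(state)
    return state[[0, 2]]
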
Code Example #5
def main():
    current_id = datetime.datetime.today().isoformat(
        "-") + "-" + os.path.splitext(os.path.basename(__file__))[0]
    parser = argparse.ArgumentParser(description='I-Maze with Block obs')
    parser.add_argument(
        "-modelpath",
        type=str,
        help="modelpath without extension(eg .model, .optimizer)")
    parser.add_argument("-vertical",
                        type=int,
                        default=2,
                        help="vertical corridor length")
    parser.add_argument("-horizontal",
                        type=int,
                        default=0,
                        help="horizontal corridor length")
    parser.add_argument("-validation",
                        type=int,
                        default=0,
                        help="validation flag, default:0")
    parser.add_argument("-outdir",
                        type=str,
                        default="log",
                        help="output dir for loggin, default:'log'")
    parser.add_argument("-epsdelta",
                        type=float,
                        default=10**-6,
                        help="delta of epsilon, default:10**-6")
    parser.add_argument("-initexp",
                        type=int,
                        default=10**4,
                        help="initial exproration, default:10**4")
    parser.add_argument("-eps",
                        type=float,
                        default=1.0,
                        help="epsilon, default:1.0")
    parser.add_argument("-lr",
                        type=float,
                        default=k_default_lr,
                        help="epsilon, default:" + str(k_default_lr))
    parser.add_argument("-modeltype",
                        type=str,
                        default=k_default_modeltype,
                        help="ModelType, default:'" + k_default_modeltype +
                        "'")
    parser.add_argument("-batchsize",
                        type=int,
                        default=k_default_replay_batch_size,
                        help="replay batch size, default:" +
                        str(k_default_replay_batch_size))
    parser.add_argument("-updatefreq",
                        type=int,
                        default=k_default_update_freq,
                        help="update frequency, default:" +
                        str(k_default_update_freq))
    parser.add_argument("-gpu",
                        type=int,
                        default=0,
                        help="gpu id, default:0 (cpu is -1)")
    parser.add_argument("-testoutput",
                        type=int,
                        default=0,
                        help="output only at test, default:0")
    parser.add_argument("-y", type=int, default=0, help="OK?, default:0")
    parser.add_argument("-framehistnum",
                        type=int,
                        default=12,
                        help="frame history num, default:12")
    args = parser.parse_args()

    print(args)
    if args.y == 0:
        input("OK?")

    ## Make directory and write setting log
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    with open(os.path.join(args.outdir, current_id + ".args"), "w") as argsf:
        argsf.write(str(args))

    env = I_MazeEnv(horizontal=args.horizontal,
                    vertical=args.vertical,
                    max_step=k_max_step)

    ## Init model
    input_dim = k_ob_shape[0]
    output_dim = len(env.action_set)
    if args.modeltype == "DQN":
        model = DQN(input_dim * args.framehistnum, output_dim)
    elif args.modeltype == "DRQN":
        model = DRQN(input_dim, output_dim)
    elif args.modeltype == "MQN":
        model = MQN(input_dim,
                    output_dim,
                    max_buff_size=args.framehistnum - 1,
                    m=256,
                    e=256)
    elif args.modeltype == "RMQN":
        model = RMQN(input_dim,
                     output_dim,
                     max_buff_size=args.framehistnum - 1,
                     m=256,
                     e=256)
    elif args.modeltype == "FRMQN":
        model = FRMQN(input_dim,
                      output_dim,
                      max_buff_size=args.framehistnum - 1,
                      m=256,
                      e=256)
    else:
        print("not implemented", args.modeltype)
        exit(0)

    ## Use GPU
    if args.gpu >= 0:
        cuda.get_device(args.gpu).use()
        model.to_gpu()

    ## Init agent
    agent = Agent(k_ob_shape,
                  len(env.action_set),
                  args.framehistnum,
                  model,
                  lr=args.lr,
                  eps_delta=args.epsdelta,
                  eps=args.eps,
                  batch_size=args.batchsize)

    if args.modelpath:
        print("load model from ",
              args.modelpath + ".model and " + args.modelpath + ".optimizer")
        agent.load(os.path.expanduser(args.modelpath))

    train_total_step = 0
    if args.validation:
        ## Run validation
        mode = run_mode.validation
        for vertical in [4, 5, 6, 8, 10, 15, 20, 25, 30, 35, 40]:
            env.vertical = vertical
            for _ in range(1):
                run_episode(current_id, args, env, agent, mode, vertical,
                            train_total_step)
        exit(0)

    for episode_id in range(k_max_episode):
        try:
            if args.validation:
                assert (not "!!!")
            else:
                if episode_id % 100 == 0 and episode_id != 0:
                    ## Run test
                    mode = run_mode.test
                    for j in range(10):
                        run_episode(current_id, args, env, agent, mode,
                                    episode_id + j, train_total_step)

                    ## Save model
                    agent.save(
                        os.path.join(args.outdir, current_id + "_episode" +
                                     str(episode_id)))

                ## Run train
                mode = run_mode.train
                train_total_step \
                    = run_episode(current_id, args, env, agent, mode, episode_id, train_total_step)
        except:
            ark = {}
            ark["args"] = vars(args)
            ark["episode_id"] = episode_id
            ark["train_total_step"] = train_total_step
            ark["eps"] = current_eps
            with open(
                    os.path.join(
                        args.outdir, current_id + "_episode" +
                        str(episode_id) + "_ark.json"), "w") as arkf:
                ark_str = json.dumps(ark, indent=4, sort_keys=True)
                arkf.write(ark_str)
            with open(
                    os.path.join(
                        args.outdir, current_id + "_episode" +
                        str(episode_id) + "_dataset.pkl"), "wb") as datasetf:
                pickle.dump(agent.dqn.dataset, datasetf)
            exit(0)
Code Example #6
class Agent:
    def __init__(self, env, n_input, n_output):
        self.env = env
        self.epsilon = 1.0
        self.epsilon_decay = 0
        self.net = DRQN(n_input, n_output).to(cf.DEVICE)
        self.tgt_net = DRQN(n_input, n_output).to(cf.DEVICE)
        self.optimizer = torch.optim.Adam(self.net.parameters(),
                                          lr=cf.LEARNING_RATE)

    def action(self, state, hidden):

        state = state.unsqueeze(0).unsqueeze(0)
        q_value, hidden = self.tgt_net(state, hidden)
        _, action = torch.max(q_value, 2)
        self.epsilon_decay += 1
        self.update_epsilon()
        if np.random.rand() <= self.epsilon:
            return self.env.action_space.sample(), hidden
        else:
            return action.item(), hidden

    def update_epsilon(self):

        if self.epsilon_decay > 1000:

            self.epsilon = max(self.epsilon - 0.00005, 0.02)

    def update_tgt(self):

        self.tgt_net.load_state_dict(self.net.state_dict())

    def train_model(self, batch):
        current_states, rewards, actions, next_states, dones = batch

        states_v = torch.stack(current_states).view(cf.BATCH_SIZE,
                                                    cf.l_sequence,
                                                    self.net.n_input)
        next_states_v = torch.stack(next_states).view(cf.BATCH_SIZE,
                                                      cf.l_sequence,
                                                      self.net.n_input)
        actions_v = torch.stack(actions).view(cf.BATCH_SIZE, cf.l_sequence,
                                              -1).long()
        rewards_v = torch.stack(rewards).view(cf.BATCH_SIZE, cf.l_sequence,
                                              -1).to(cf.DEVICE)
        dones_v = torch.stack(dones).view(cf.BATCH_SIZE, cf.l_sequence,
                                          -1).to(cf.DEVICE)
        state_action_values, _ = self.net(states_v)

        state_action_values = state_action_values.gather(
            2, actions_v.to(cf.DEVICE))
        next_state_values, _ = self.tgt_net(next_states_v)
        next_state_values = next_state_values.max(2, keepdim=True)[0]
        next_state_values = next_state_values.detach()

        expected_state_action_values = dones_v * cf.gamma * next_state_values + rewards_v
        loss = torch.nn.functional.mse_loss(state_action_values,
                                            expected_state_action_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss
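
The Agent in Example #6 reads its hyperparameters from a module referenced as cf, which is not shown. A minimal stand-in exposing only the attributes this example actually touches might look as follows (written as a class for brevity; in the project it is presumably a config module, and the concrete values here are illustrative, not the project's settings).

import torch


class cf:
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    LEARNING_RATE = 1e-3   # Adam step size for the online DRQN
    BATCH_SIZE = 32        # number of sequences per training batch
    l_sequence = 8         # length of each sampled observation sequence
    gamma = 0.99           # discount factor used in the Bellman target
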
Code Example #7
def one_hot_encode_obs(obs: int):
    one_hot_repr = np.zeros((env.observation_space_dim, ))
    one_hot_repr[obs] = 1
    return one_hot_repr


np.random.seed(seed)
torch.manual_seed(seed)

num_inputs = env.observation_space_dim
num_actions = env.action_space.n
print('observation size:', num_inputs)
print('action size:', num_actions)

online_net = DRQN(num_inputs, num_actions, use_deeper_net)
target_net = DRQN(num_inputs, num_actions, use_deeper_net)
update_target_model(online_net, target_net)

optimizer = optim.Adam(online_net.parameters(), lr=lr)
# if use_experts is False:
#     writer = SummaryWriter('logs/normal')
# else:
#     writer = SummaryWriter('logs/experts')

online_net.to(device)
target_net.to(device)
online_net.train()
target_net.train()
memory = Memory(replay_memory_capacity, sequence_length)
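
Example #7, like Examples #2 and #4, depends on a Memory replay buffer that is not shown; here it is built as Memory(replay_memory_capacity, sequence_length). A minimal sketch consistent with that constructor, though not necessarily the source project's class, stores transitions in a bounded deque and samples contiguous windows of sequence_length transitions so the recurrent network can be trained on ordered history; the transition fields follow the push calls in the other examples.

import random
from collections import deque, namedtuple

Transition = namedtuple('Transition',
                        ('state', 'next_state', 'action', 'reward', 'mask'))


class Memory:
    def __init__(self, capacity, sequence_length):
        self.memory = deque(maxlen=capacity)
        self.sequence_length = sequence_length

    def push(self, state, next_state, action, reward, mask):
        self.memory.append(Transition(state, next_state, action, reward, mask))

    def sample(self, batch_size):
        # Each sample is a contiguous run of sequence_length transitions;
        # call only once at least sequence_length transitions are stored.
        transitions = list(self.memory)
        sequences = []
        for _ in range(batch_size):
            start = random.randint(0, max(0, len(transitions) - self.sequence_length))
            sequences.append(transitions[start:start + self.sequence_length])
        return sequences

    def __len__(self):
        return len(self.memory)
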