Example #1
import argparse
import os

import gym
import torch

# NECAgent, TensorboardLogger, load_json and the module-level `logger` are
# assumed to be provided elsewhere in this repository.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--name', '-n', type=str)
    args = parser.parse_args()
    experiment_name = args.name

    HYPARAMS = load_json(
        './hyparams/nec_hyparams.json')[experiment_name]['hyparams']
    logger.debug('experiment_name: {} hyparams: {}'.format(
        experiment_name, HYPARAMS))
    # make checkpoint path
    experiment_logdir = 'experiments/{}'.format(experiment_name)
    if not os.path.exists(experiment_logdir):
        os.makedirs(experiment_logdir)

    # write to tensorboard
    tensorboard_logdir = '{}/tensorboard'.format(experiment_logdir)
    if not os.path.exists(tensorboard_logdir):
        os.mkdir(tensorboard_logdir)
    writer = TensorboardLogger(logdir=tensorboard_logdir)

    env = gym.make('CartPole-v0')
    agent = NECAgent(input_dim=env.observation_space.shape[0],
                     encode_dim=32,
                     hidden_dim=64,
                     output_dim=env.action_space.n,
                     capacity=HYPARAMS['capacity'],
                     buffer_size=HYPARAMS['buffer_size'],
                     epsilon_start=HYPARAMS['epsilon_start'],
                     epsilon_end=HYPARAMS['epsilon_end'],
                     decay_factor=HYPARAMS['decay_factor'],
                     lr=HYPARAMS['lr'],
                     p=HYPARAMS['p'],
                     similarity_threshold=HYPARAMS['similarity_threshold'],
                     alpha=HYPARAMS['alpha'])
    global_steps = 0
    for episode in range(HYPARAMS['episodes']):
        state = env.reset()
        counter = 0
        while True:
            n_steps_q = 0
            start_state = state
            # N-steps Q estimate
            for step in range(HYPARAMS['horizon']):
                state_tensor = torch.from_numpy(state).float().unsqueeze(0)
                action_tensor, value_tensor, encoded_state_tensor = agent.epsilon_greedy_infer(
                    state_tensor)
                if step == 0:
                    start_action = action_tensor.item()
                    start_encoded_state = encoded_state_tensor
                # env.render()
                if global_steps > HYPARAMS['warmup_steps']:
                    action = action_tensor.item()
                    agent.epsilon_decay()
                else:
                    action = env.action_space.sample()
                logger.debug(
                    'episode: {} global_steps: {} value: {} action: {} state: {} epsilon: {}'
                    .format(episode, global_steps, value_tensor.item(), action,
                            state, agent.epsilon))
                next_state, reward, done, info = env.step(action)
                counter += 1
                global_steps += 1
                writer.log_training_v2(global_steps, {
                    'train/value': value_tensor.item(),
                })
                n_steps_q += (HYPARAMS['gamma']**step) * reward
                if done:
                    break
                state = next_state
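            # complete the N-step return with a bootstrapped tail:
            # n_steps_q = sum_{i<N} gamma^i * r_i + gamma^N * get_target_n_steps_q()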
            n_steps_q += (HYPARAMS['gamma']**HYPARAMS['horizon']
                          ) * agent.get_target_n_steps_q().item()
            writer.log_training_v2(global_steps, {
                'sampled/n_steps_q': n_steps_q,
            })
            logger.debug('sample n_steps_q: {}'.format(n_steps_q))
            # append to ReplayBuffer and DND
            agent.remember_to_replay_buffer(start_state, start_action,
                                            n_steps_q)
            agent.remember_to_dnd(start_encoded_state, start_action, n_steps_q)

            # start replaying once roughly batch_size N-step transitions have been collected
            if global_steps / HYPARAMS['horizon'] > HYPARAMS['batch_size']:
                agent.replay(batch_size=HYPARAMS['batch_size'])
            if done:
                # update dnd
                writer.log_episode(episode + 1, counter)
                logger.info('episode done! episode: {} score: {}'.format(
                    episode, counter))
                logger.debug('dnd[0] len: {}'.format(len(agent.dnd_list[0])))
                logger.debug('dnd[1] len: {}'.format(len(agent.dnd_list[1])))
                break
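
The NEC script reads all of its settings from ./hyparams/nec_hyparams.json, indexed first by experiment name and then by the key 'hyparams'. The sketch below shows that structure as a Python dict: the keys are exactly the ones the code above looks up, while the experiment name and every value are placeholders chosen only for illustration.

# Hypothetical shape of load_json('./hyparams/nec_hyparams.json');
# keys mirror the lookups above, values are illustrative placeholders.
nec_hyparams_sketch = {
    "my_nec_experiment": {
        "hyparams": {
            "episodes": 1000,
            "horizon": 100,
            "gamma": 0.99,
            "warmup_steps": 1000,
            "batch_size": 32,
            "capacity": 100000,
            "buffer_size": 100000,
            "epsilon_start": 1.0,
            "epsilon_end": 0.01,
            "decay_factor": 0.99,
            "lr": 1e-4,
            "p": 50,
            "similarity_threshold": 0.5,
            "alpha": 0.1,
        }
    }
}

With this layout, a new experiment is added as another top-level key and selected at runtime via --name/-n.
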
Example #2
import argparse
import os
from itertools import count

import gym
import torch

# DQNAgent, TensorboardLogger, load_json and the module-level `logger` are
# assumed to be provided elsewhere in this repository.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--name',
                        '-n',
                        required=True,
                        type=str,
                        help='name of experiment')
    parser.add_argument('--render', action='store_true', help='render gym')
    args = parser.parse_args()

    experiment_name = args.name
    is_render = args.render

    hyparams = load_json(
        './hyparams/dqn_hyparams.json')[experiment_name]['hyparams']
    # make checkpoint path
    experiment_logdir = 'experiments/{}'.format(experiment_name)
    if not os.path.exists(experiment_logdir):
        os.makedirs(experiment_logdir)

    # hyperparameters
    lr = hyparams['lr']
    buffer_size = hyparams['buffer_size']
    gamma = hyparams['gamma']
    epsilon_start = hyparams['epsilon_start']
    epsilon_end = hyparams['epsilon_end']
    decay_factor = hyparams['decay_factor']
    batch_size = hyparams['batch_size']
    replay_freq = hyparams['replay_freq']
    target_update_freq = hyparams['target_update_freq']
    episodes = hyparams['episodes']
    warmup_steps = hyparams['warmup_steps']
    # max_steps = 1e10
    logger.debug('experiment_name: {} hyparams: {}'.format(
        experiment_name, hyparams))

    # write to tensorboard
    tensorboard_logdir = '{}/tensorboard'.format(experiment_logdir)
    if not os.path.exists(tensorboard_logdir):
        os.mkdir(tensorboard_logdir)
    writer = TensorboardLogger(logdir=tensorboard_logdir)

    env = gym.make('CartPole-v0')
    env.reset()
    # logger.debug('observation_space.shape: {}'.format(env.observation_space.shape))
    agent = DQNAgent(buffer_size,
                     writer=writer,
                     input_dim=env.observation_space.shape[0],
                     output_dim=env.action_space.n,
                     gamma=gamma,
                     epsilon_start=epsilon_start,
                     epsilon_end=epsilon_end,
                     decay_factor=decay_factor)

    state, _, _, _ = env.step(
        env.action_space.sample())  # take a random action to start with
    writer.add_graph(
        agent.policy_network,
        torch.tensor([state],
                     dtype=torch.float32))  # add model graph to tensorboard
    # state, reward, done, info = env.step(env.action_space.sample()) # take a random action to start with
    # for i in range(50):
    #     agent.remember(state, reward, env.action_space.sample(), state, False)
    # for i in range(50):
    #     agent.remember(state, reward, env.action_space.sample(), state, True)
    # loss = agent.replay(batch_size=5)
    global_steps = 0
    for episode in range(episodes):
        score = 0.0
        total_loss = 0.0
        state = env.reset()  # start each episode from a fresh initial state
        logger.debug('env.reset() episode {} starts!'.format(episode))
        # update target_network
        if episode % target_update_freq == 0:
            # 1. test replay_bufer
            # logger.debug('step: {} number of samples in bufer: {} sample: {}'.format(step, len(agent.replay_buffer), agent.replay_buffer.get_batch(2)))
            agent.update_target_network()
        for step in count():
            if is_render:
                env.render()
            action_tensor, value_tensor = agent.epsilon_greedy_infer(
                torch.tensor([state], dtype=torch.float32))
            target_value_tensor = agent.evaluate_state(
                torch.tensor([state], dtype=torch.float32))  # temp: for debug
            next_state, reward, done, info = env.step(
                action_tensor.item())  # step the env with the epsilon-greedy action
            # action = env.action_space.sample()
            # next_state, reward, done, info = env.step(action) # take a random action
            # logger.debug('episode: {} state: {} reward: {} action: {} next_state: {} done: {}'.format(episode, state, reward, action, next_state, done))
            agent.remember(state, reward, action_tensor.item(), next_state,
                           done)
            # 2. test QNetwork
            # logger.debug('state_tensor: {} action_tensor: {} value_tensor: {}'.format(state_tensor, action_tensor, value_tensor))
            # logger.debug('state_tensor: {} action: {} value: {}'.format(state_tensor, action_tensor.item(), value_tensor.item()))
            # print('state: {} reward: {} action_tensor.item(): {} next_state: {} done: {}'.format(state, reward, action_tensor.item(), next_state, done))
            score += reward
            # experience replay: train every replay_freq steps, once past warmup
            # and with at least a full batch of transitions collected
            if global_steps > max(
                    batch_size,
                    warmup_steps) and global_steps % replay_freq == 0:
                loss = agent.replay(batch_size)
                total_loss += loss
                logger.debug(
                    'episode: {} done: {} global_steps: {} loss: {}'.format(
                        episode, done, global_steps, loss))
                writer.log_training(global_steps, loss, agent.lr,
                                    value_tensor.item(),
                                    target_value_tensor.item())
            writer.add_scalar('epsilon', agent.epsilon, global_steps)  # FIXME

            # if global_steps > max(batch_size, warmup_steps) and global_steps % 1000:
            #     writer.log_linear_weights(global_steps, 'encoder.0.weight', agent.policy_network.get_weights()['encoder.0.weight'])
            agent.epsilon_decay()
            state = next_state  # update state manually
            global_steps += 1
            if done:
                logger.info('episode done! episode: {} score: {}'.format(
                    episode, score))
                writer.log_episode(episode, score, total_loss / (step + 1))
                # save checkpoints
                if global_steps > max(batch_size,
                                      warmup_steps) and episode % 100 == 0:
                    agent.save_checkpoint(experiment_logdir)
                break
                # logger.debug('state_tensor: {} action_tensor: {} value_tensor: {}'.format(state_tensor, action_tensor, value_tensor))
                # logger.debug('output: {} state_tensor: {} state: {}'.format(output, state_tensor, state))
                # agent.remember(state, reward, action, next_state, done)

    env.close()
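
Like the NEC script, this one loads its configuration from ./hyparams/dqn_hyparams.json, keyed by experiment name and then 'hyparams'. The sketch below shows the expected shape as a Python dict; only the keys are dictated by the code above, the experiment name and all values are illustrative placeholders.

# Hypothetical shape of load_json('./hyparams/dqn_hyparams.json');
# keys mirror the lookups above, values are illustrative placeholders.
dqn_hyparams_sketch = {
    "my_dqn_experiment": {
        "hyparams": {
            "episodes": 1000,
            "warmup_steps": 1000,
            "batch_size": 32,
            "buffer_size": 100000,
            "replay_freq": 4,
            "target_update_freq": 10,
            "gamma": 0.99,
            "epsilon_start": 1.0,
            "epsilon_end": 0.01,
            "decay_factor": 0.99,
            "lr": 1e-4,
        }
    }
}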