Example #1
    def __init__(self,
                 env_name,
                 global_actor_critic,
                 lock,
                 eps=0.5,
                 anneal_rate=0.99,
                 t_max=10,
                 gamma=0.99,
                 lr=0.0005):
        Thread.__init__(self)
        self.env = gym.make(env_name)
        self.env_name = env_name
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n
        self.lock = lock
        self.eps = eps
        self.anneal_rate = anneal_rate
        self.global_actor_critic = global_actor_critic
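        # Local network for this worker thread; the shared global_actor_critic is kept separately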
        self.actor_critic = ActorCritic(self.state_dim, self.action_dim)
        self.t_max = t_max
        self.gamma = gamma

        self.optimizer = tf.keras.optimizers.Adam(lr)
        log_dir = 'logs'
        self.summary_writer = tf.summary.create_file_writer(log_dir)
Example #2
    def __init__(self):
        self.run_epochs = 0
        self.epochs_total = 0
        self.hybrid_loss_cumulative = []
        self.critic_loss_cumulative = []
        self.critic_target_loss_cumulative = []
        self.actor_loss_cumulative = []
        self.scores_cumulative = []
        self.critic_scores_cumulative = []
        self.actor_scores_cumulative = []
        self.winratio_cumulative = []
        self.epsilon_cumulative = []
        self.epsilon = 0.9
        self.last_lr_change = 0

        e = Map(self.grid_size[0], self.grid_size[1])
        e.USE_MAZE = self.use_maze
        e.curriculum = self.curriculum  # distance from goal player spawns at most
        self.environment = e
        self.action_count = e.action_space.n
        self.action_shape = (self.action_count, )
        self.buffer = ReplayBuffer(self.buffer_size)
        num_rewards = len(e.hybrid_rewards())
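        # Online actor-critic plus a target network with identical architecture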
        self.actor_critic = ActorCritic(self.input_shape, self.action_shape,
                                        num_rewards)
        self.actor_critic_target = ActorCritic(self.input_shape,
                                               self.action_shape, num_rewards)
        # One-hot vector for each discrete action
        self.possible_actions = np.eye(e.action_space.n)
Example #3
 def __init__(self, env, GAMMA=0.5):
     self.env = env
     self.states_dim = self.env.observation_space.shape[0]
     self.action_dim = self.env.action_space.shape[0]
     self.actor_critic = ActorCritic(self.states_dim,
                                     self.action_dim,
                                     lr=0.0000000001)
     self.all_observations = np.asarray([])
Example #4
 def __init__(self, env, GOAL_STATE, GAMMA=0.95, lr=0.001):
     self.env = env
     self.GOAL_STATE = GOAL_STATE
     self.states_dim = self.env.observation_space.shape[0]
     self.action_dim = self.env.action_space.shape[0]
     self.actor_critic = ActorCritic(
         self.states_dim, self.action_dim, GAMMA=GAMMA, lr=lr)
     self.min_spread_holder = MinSpreadHolder(self.states_dim)
Example #5
def learn(logger, episodes, render):
    env = Env()
    actor = ActorCritic(env, DISCOUNT)

    lr_policy = LearningRate(MAX_LEARNING_RATE_POLICY, MIN_LEARNING_RATE_POLICY, episodes)
    lr_value = LearningRate(MAX_LEARNING_RATE_VALUE, MIN_LEARNING_RATE_VALUE, episodes)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        saver = tf.train.Saver()

        scores = []
        for episode in range(episodes):
            lrp = lr_policy.get_lr(episode)
            lrv = lr_value.get_lr(episode)

            score = learn_episode(env, actor, lrp, lrv, render, sess)
            scores.append(score)

            # Mean score over the most recent STEPS_TO_WIN episodes
            mean = np.mean(scores[-STEPS_TO_WIN:])

            args = [episode, episodes, score, mean, lrp, lrv]
            logger.info('After {}/{} episodes: {}, mean: {:.2f}, lrp: {:.6f}, lrv: {:.6f}'.format(*args))

        save_path = saver.save(sess, os.path.join(os.getcwd(), 'model.ckpt'))
        logger.info('Model saved to {}'.format(save_path))
Example #6
    def __init__(self,
                 state_dim,
                 action_bound=1.0,
                 final_activation=tf.identity,
                 training_batch_size=32,
                 GAMMA=0.95,
                 lr=0.001,
                 replay_buffer_size=1024):

        self.AC = ActorCritic(state_dim,
                              state_dim,
                              final_activation=final_activation,
                              action_bound=action_bound,
                              training_batch_size=training_batch_size,
                              GAMMA=GAMMA,
                              lr=lr,
                              replay_buffer_size=replay_buffer_size)
Example #7
class GoalController(object):
    def __init__(self,
                 state_dim,
                 action_bound=1.0,
                 final_activation=tf.identity,
                 training_batch_size=32,
                 GAMMA=0.95,
                 lr=0.001,
                 replay_buffer_size=1024):

        self.AC = ActorCritic(state_dim,
                              state_dim,
                              final_activation=final_activation,
                              action_bound=action_bound,
                              training_batch_size=training_batch_size,
                              GAMMA=GAMMA,
                              lr=lr,
                              replay_buffer_size=replay_buffer_size)

    def add_to_replay_buffer(self, state, goal_state, reward, resulting_state):
        # Here, reward means exactly what it sounds like it does...
        self.AC.add_to_replay_buffer(state, goal_state, reward,
                                     resulting_state)

    def add_batch_to_replay_buffer(self, states, goal_states, rewards,
                                   resulting_states):
        for s, gs, r, rs in zip(states, goal_states, rewards,
                                resulting_states):
            self.AC.add_to_replay_buffer(s, gs, r, rs)

    def train_from_replay_buffer(self):
        self.AC.train_from_replay_buffer()

    def get_goal_state(self, current_states):
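        # A goal state is simply the actor's action output for the given current states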
        return self.AC.get_actions(current_states)
Example #8
def pd_test(env_fn, policy, load_path):

    env = env_fn()
    actions = env.unwrapped.action_list
    env._seed(int(time.time()))

    obs = env.reset()
    obs = np.expand_dims(obs, axis=0)

    action_list = []

    with tf.Session() as sess:

        actor_critic = ActorCritic(sess, policy, env.observation_space.shape,
                                   env.action_space, 1, 5)

        if load_path:
            actor_critic.load(load_path)
        else:
            sess.run(tf.global_variables_initializer())
            print('WARNING: No Model Loaded!')

        print(env.unwrapped.scramble_current)
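        # Roll out a single episode, printing the sampled action, value estimate,
        # and per-action logits at each step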
        d = False
        while not d:
            print('-------------------------------------------------')
            print('Current Observation')
            env.render()

            a, v, neg = actor_critic.act(obs, stochastic=True)
            print('')
            print('action: ', actions[a[0]])
            print('value: ', v)
            print('neglogp: ', neg)
            print('pd: ')
            for ac, pd in zip(actions,
                              actor_critic.step_model.logits(obs)[0][0]):
                print('\t', ac, pd)

            obs, r, d, _ = env.step(a[0])
            print('r: ', r)
            obs = np.expand_dims(obs, axis=0)
        env.render()

    env.close()
Example #9
 def setup_agents(self):
     agents = []
     for i in range(self.n_agents):
         model = ActorCritic(n_agents=self.n_agents,
                             state_size=self.state_size,
                             action_size=self.action_size,
                             seed=self.random_seed)
         agents.append(DDPG(i, model, self.action_size, self.random_seed))
     return agents
Example #10
    def __init__(self, device, state_dim, action_dim, action_std, lr, betas,
                 gamma, K_epochs, eps_clip):
        self.lr = lr
        self.device = device
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.policy = ActorCritic(state_dim, action_dim, action_std).to(device)
        #self.optimizer = RAdam(self.policy.parameters(), lr=lr, betas=betas)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)

        self.policy_old = ActorCritic(state_dim, action_dim,
                                      action_std).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()
Example #11
    def __init__(self,
                 state_dim,
                 action_dim,
                 action_bound=0.4,
                 training_batch_size=32,
                 GAMMA=0.95,
                 lr=0.001,
                 replay_buffer_size=1024):

        self.state_dim = state_dim
        # Assumed, mirroring the StateController in Example #18: the network input
        # is the state concatenated with a goal state, hence twice the raw size.
        new_state_dim = 2 * state_dim

        self.AC = ActorCritic(new_state_dim,
                              action_dim,
                              action_bound=action_bound,
                              training_batch_size=training_batch_size,
                              GAMMA=GAMMA,
                              lr=lr,
                              replay_buffer_size=replay_buffer_size)
Example #12
def run_tests():
    """ Runs tests from .yaml file, saves results plots and .csv file.

        Args:
            None.

        Returns:
            results: Test results dataframe. 

    """
    with open(FILENAME) as file:

        # Loads the test hyper-parameters as dictionaries.
        tests = yaml.safe_load(file)

    # create a dataframe to keep the results
    test_dict = tests['Tests']
    results = pd.DataFrame(test_dict)
    results["Episode"] = ""
    results['Max average score'] = ""

    for i, test in enumerate(tests['Tests']):

        env = gym.make(test['env'])
        env.reset()

        actor_critic = ActorCritic(env, test['episodes'], test['max_score'],
                                   test['hidden_size'], test['gamma'],
                                   test['save'])

        ## run training
        best_score, episode, rew_hist = actor_critic.train()

        results.loc[i, 'Episode'] = episode
        results.loc[i, 'Max average score'] = best_score

        plot_graphs(test, rew_hist)

        # save results to csv file
        filename = 'results/' + 'test_table.csv'
        results.to_csv(filename)

    return results
Example #13
    def __init__(self,
                 state_dim,
                 action_dim,
                 eps=0.2,
                 gamma=0.99,
                 lambda_=0.95,
                 K_epoch=80,
                 batch_size=64):
        super(PPO, self).__init__()
        self.eps = eps
        self.gamma = gamma
        self.lambda_ = lambda_
        self.K_epoch = K_epoch
        self.batch_size = batch_size

        self.model = ActorCritic(state_dim, action_dim)
        self.model_old = ActorCritic(state_dim, action_dim)
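        # Freeze the old-policy network's parameters so the optimizer never updates them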
        for param in self.model_old.parameters():
            param.requires_grad = False
        self.copy_weights()
Example #14
    def _create_actor_critic(self, is_target=False):
        name = 'target_actor_critic' if is_target else 'actor_critic'
        log_tensorboard = not is_target
        actor_critic = ActorCritic(name,
                                   self._args,
                                   self.env_info,
                                   self.action_size,
                                   reuse=self.reuse,
                                   log_tensorboard=log_tensorboard,
                                   is_target=is_target)

        return actor_critic
Example #15
    def __init__(self, parameters):

        self.parameters = parameters
        self.env = gym.make(self.parameters['env'])
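        # Infer the action and state sizes by sampling the action space and resetting the environment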
        self.nA = self.env.action_space.sample().shape[0]
        self.state_size = self.env.reset().shape[0]

        # Build our replay memory
        self.memory = Memory(replay_size=self.parameters['replay_size'],
                             action_size=self.nA,
                             state_size=self.state_size,
                             batch_size=self.parameters['batch_size'])

        # Create actor and critic
        self.actor_critic = ActorCritic(
            actor_lr=parameters['actor_learning_rate'],
            critic_lr=parameters['critic_learning_rate'],
            gamma=parameters['gamma'],
            state_size=self.state_size,
            action_size=self.nA,
            tau=parameters['tau'])
Example #16
def main():  
    human_model = ActorCritic()
    human_model.load_state_dict(torch.load('ac_para.pkl'))

    env = gym.make('CartPole-v1')
    model = AskActorCritic()    
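    # Action index 2 is the "ask" action: when sampled, the agent queries the
    # pretrained human_model for that step and records the human-chosen action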
    print_interval = 20
    score = 0.0

    for n_epi in range(10000):
        done = False
        s = env.reset()
        step,ask_step = 0,0
        while not done:
            for t in range(n_rollout):
                prob = model.pi(torch.from_numpy(s).float())
                m = Categorical(prob)
                a = m.sample().item()
                if a == 2: # human action
                    prob = human_model.pi(torch.from_numpy(s).float())
                    m = Categorical(prob)
                    a = m.sample().item()
                    model.put_human_data((s, a))
                    ask_step += 1
                
                s_prime, r, done, info = env.step(a)

                model.put_data((s,a,r,s_prime,done))
                
                s = s_prime
                score += r
                step += 1
                if done:
                    break                     
            
            model.train_net()
            
        if n_epi%print_interval==0 and n_epi!=0:
            print("# of episode :{}, avg score : {:.1f}, ask rate : {:.2f}".format(n_epi, score/print_interval, ask_step/step))
            score = 0.0
Example #17
def main():

    pixels = (
        (0.0, 1.0, 1.0),
        (0.0, 1.0, 0.0),
        (0.0, 0.0, 1.0),
        (1.0, 1.0, 1.0),
        (1.0, 1.0, 0.0),
        (0.0, 0.0, 0.0),
        (1.0, 0.0, 0.0),
    )
    pixel_to_categorical = {pix: i for i, pix in enumerate(pixels)}
    num_pixels = len(pixels)

    #For each mode in MiniPacman there are different rewards
    mode_rewards = {
        "regular": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        "avoid": [0.1, -0.1, -5, -10, -20],
        "hunt": [0, 1, 10, -20],
        "ambush": [0, -0.1, 10, -20],
        "rush": [0, -0.1, 9.9]
    }
    reward_to_categorical = {
        mode: {reward: i
               for i, reward in enumerate(mode_rewards[mode])}
        for mode in mode_rewards.keys()
    }

    mode = "regular"
    num_envs = 16

    def make_env():
        def _thunk():
            env = MiniPacman(mode, 1000)
            return env

        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape
    num_actions = envs.action_space.n

    env_model = EnvModel(envs.observation_space.shape, num_pixels,
                         len(mode_rewards["regular"]))
    actor_critic = ActorCritic(envs.observation_space.shape,
                               envs.action_space.n)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(env_model.parameters())
Example #18
class StateController(object):
    def __init__(self,
                 state_dim,
                 action_dim,
                 action_bound=0.4,
                 training_batch_size=32,
                 GAMMA=0.95,
                 lr=0.001,
                 replay_buffer_size=1024):

        new_state_dim = 2 * state_dim
        self.state_dim = state_dim
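        # The actor-critic operates on [state, goal] concatenated, hence the doubled input dimension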
        self.AC = ActorCritic(
            new_state_dim,
            action_dim,
            action_bound=action_bound,
            training_batch_size=training_batch_size,
            GAMMA=GAMMA,
            lr=lr,
            replay_buffer_size=replay_buffer_size)

    def get_reward(self, resulting_state, goal_state):
        return np.sum(((resulting_state - goal_state)**2), 1)

    def add_to_replay_buffer(self, state, goal_state, action, resulting_state):
        combined_state = np.concatenate(
            (state, goal_state))  # combined is state plus goal
        reward = self.get_reward(resulting_state,
                                 goal_state)  # But reward is result - goal
        real_resulting_state = np.concatenate((resulting_state, goal_state))
        self.AC.add_to_replay_buffer(combined_state, action, reward,
                                     real_resulting_state)

    def add_batch_to_replay_buffer(self, states, goal_states, actions,
                                   resulting_states):
        for s, gs, a, rs in zip(states, goal_states, actions,
                                resulting_states):
            self.AC.add_to_replay_buffer(s, gs, a, rs)

    def train_from_replay_buffer(self):
        self.AC.train_from_replay_buffer()

    def get_actions(self, states, goal_states):
        combined_states = np.concatenate((states, goal_states), 1)
        return self.AC.get_actions(combined_states)

    def get_random_visited_state(self):
        return self.AC.get_batch(1)[0][0][0:self.state_dim]
Example #19
    def create_network(self):
        # for actor network
        self.o_stats = Normalizer(size=self.dimo,
                                  eps=self.norm_eps,
                                  default_clip_range=self.norm_clip)
        if self.use_goal:
            self.g_stats = Normalizer(size=self.dimg,
                                      eps=self.norm_eps,
                                      default_clip_range=self.norm_clip)
        else:
            self.g_stats = None

        self.main = ActorCritic(self.o_stats, self.g_stats, self.input_dims,
                                self.use_goal).to(self.device)
        self.target = ActorCritic(self.o_stats, self.g_stats, self.input_dims,
                                  self.use_goal).to(self.device)
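        # Start the target network as an exact copy of the main network's actor and critic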
        self.target.actor = copy.deepcopy(self.main.actor)
        self.target.critic = copy.deepcopy(self.main.critic)

        self.actor_optimizer = optim.Adam(self.main.actor.parameters(),
                                          lr=self.pi_lr)
        self.critic_optimizer = optim.Adam(self.main.critic.parameters(),
                                           lr=self.Q_lr)
Example #20
    def __init__(self, env_id, input_shape, n_actions, icm, n_threads=8):
        names = [str(i) for i in range(1, n_threads + 1)]

        global_actor_critic = ActorCritic(input_shape, n_actions)
        global_actor_critic.share_memory()
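        # A single shared optimizer updates the global network's parameters across all workers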
        global_optim = SharedAdam(global_actor_critic.parameters())

        if not icm:
            global_icm = None
            global_icm_optim = None
        else:
            global_icm = ICM(input_shape, n_actions)
            global_icm.share_memory()
            global_icm_optim = SharedAdam(global_icm.parameters())

        self.ps = [
            mp.Process(target=worker,
                       args=(name, input_shape, n_actions, global_actor_critic,
                             global_icm, global_optim, global_icm_optim,
                             env_id, n_threads, icm)) for name in names
        ]

        [p.start() for p in self.ps]
        [p.join() for p in self.ps]
Example #21
def go(resolution):
    env = Env()
    actor = ActorCritic(env, DISCOUNT)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        saver = tf.train.Saver()
        saver.restore(sess, 'model.ckpt')

        ps = np.linspace(-1, 1, num=resolution)
        vs = np.linspace(-1, 1, num=resolution)

        states = []
        for v in vs:
            for p in ps:
                states.append([p, v])

        states = np.reshape(np.array(states), [resolution * resolution, 2])
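        # Query the critic's value estimate for every grid state and show it as an image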
        values = actor.get_value(states, sess)
        values = np.reshape(values, [resolution, resolution])
        plt.imshow(values, origin='lower')

        plt.title('Value(position, velocity)')

        minx, maxx = env.low[0], env.high[0]
        xticks = ['{:.2f}'.format(x) for x in np.linspace(minx, maxx, num=10)]
        plt.xticks(np.linspace(0, resolution, num=10), xticks)
        plt.xlabel('Position')

        miny, maxy = env.low[1], env.high[1]
        plt.ylabel('Velocity')
        yticks = ['{:.2f}'.format(y) for y in np.linspace(miny, maxy, num=10)]
        plt.yticks(np.linspace(0, resolution, num=10), yticks)

        plt.show()
Example #22
def go(logger, render):
    env = Env()
    actor = ActorCritic(env, DISCOUNT)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        saver = tf.train.Saver()
        saver.restore(sess, 'model.ckpt')
        logger.info('Model restored')

        steps = []
        for episode in range(STEPS_TO_WIN):
            steps.append(play_episode(env, actor, render, sess))

        logger.info('Mean score over {} episodes: {:.2f}'.format(STEPS_TO_WIN, np.mean(steps)))
Example #23
    def __init__(self, ids, env, session, global_optimizer,
                 global_max_timesteps, state_size, action_size):

        self.ids = ids
        self.env = env

        self.agent_name = "agent_id_" + str(ids)
        self.agent_scope = "agent_id_" + str(ids)

        self.tf_session = session
        self.state_size = state_size
        self.action_size = action_size

        self.local_agent = ActorCritic(
            self.state_size, self.action_size, self.agent_scope,
            global_optimizer)  #create the AC network
        self.initial_local_ops = self.swap_tf_ops(
            'global', self.agent_scope)  # get global ops to reset local agents

        self.global_max_timesteps = global_max_timesteps
        self.MAX_TIMESTEP_PER_EPISODE = 500
        self.buffer_length = 10  # len of the buffer
        self.gamma = 0.999
Example #24
    def __init__(self, GUI):
        self.GUI = GUI
        self.env = self.new_env()  # The pendulum model from OpenAI Gym

        self._S_LEN = self.env.observation_space.shape[0]
        self._A_BOUND = self.env.action_space.high

        # Available learning models
        self.learning_models_list = [
            QLearningModel(state_len=self._S_LEN,
                           action_len=1,
                           a_bound=self._A_BOUND),
            QLearningModel2(state_len=self._S_LEN,
                            action_len=1,
                            a_bound=self._A_BOUND),
            ActorCritic(state_len=self._S_LEN,
                        action_len=1,
                        a_bound=self._A_BOUND),
            ActorCriticDDPG(state_len=self._S_LEN,
                            action_len=1,
                            a_bound=self._A_BOUND)
        ]

        self.learning_model = self.learning_models_list[
            0]  # Reference to the model currently in use

        self.s = np.zeros(self._S_LEN)  # Previous pendulum state
        self.reset_env()  # Restart the pendulum

        self.is_learning = True  # Whether to train the model
        self.working = False  # Whether the pendulum is currently being controlled
        self.endless = False  # Do not restart the pendulum when an episode ends

        self.max_ep_steps = 200  # Number of steps per episode

        self.steps = 0  # Number of steps taken
Example #25
def train_model(config, gpu_id, save_dir, exp_name):

    env = gym.make(config['env_name'])

    env.seed(1234)
    torch.manual_seed(1234)
    np.random.seed(1234)

    actor = MLP(len(env.observation_space.sample()),
                config['hidden_layers'],
                env.action_space.n,
                "distribution",
                "relu",
                "standard",
                name="ActorNetwork",
                verbose=True)
    critic = MLP(len(env.observation_space.sample()),
                 config['hidden_layers'],
                 1,
                 "real_values",
                 "relu",
                 "standard",
                 name="CriticNetwork",
                 verbose=True)

    agent = ActorCritic(actor,
                        critic,
                        config['gamma'],
                        lr_critic=1e-3,
                        lr_actor=1e-5,
                        decay_critic=0.9,
                        decay_actor=0.9,
                        use_cuda=config['use_cuda'],
                        gpu_id=gpu_id)
    """
    if config['resume']:
        agent.load_policy(directory=os.path.join(save_dir, exp_name))
    """

    # TRAINING LOOP
    episode_number = 0
    running_average = None
    loss_tape, episode_lengths = [], []
    while episode_number < config['max_episodes']:

        # Book Keeping
        episode_number += 1
        observation = env.reset()
        reward_list = []
        agent.set_state(observation)

        done = False
        t = 0
        # RUN ONE EPISODE
        while not (done) and t < config['max_steps']:
            action = agent.select_action(observation)
            observation, reward, done, _ = env.step(action)

            if config['env_name'] == "MountainCar-v0":
                done = bool(observation[0] >= 0.5)

            if config['render']:
                env.render()

            if episode_number in config['video_ckpt']:
                image = env.render(mode='rgb_array')
                video_folder = os.path.join(
                    save_dir, exp_name, "video_ckpts".format(episode_number))
                if not os.path.exists(video_folder):
                    os.makedirs(video_folder)
                plt.imsave(
                    os.path.join(video_folder,
                                 "ep{}_{}.png".format(episode_number, t)),
                    image)

            # UPDATE THE PARAMETERS (for Temporal-Difference method)
            agent.compute_gradients(action)
            agent.update_parameters(observation, reward)

            reward_list.append(reward)
            agent.set_state(observation)
            t += 1

        # More book-keeping
        episode_lengths.append(len(reward_list))
        if running_average is None:
            running_average = np.sum(reward_list)
        else:
            running_average = running_average * 0.9 + np.sum(reward_list) * 0.1
        print("Episode: {}, reward: {}, average: {:.2f}".format(
            episode_number, np.sum(reward_list), running_average))

        if episode_number % config['chkp_freq'] == 0:
            #agent.save_policy(directory=os.path.join(save_dir, exp_name))
            utils.save_results_classicControl(save_dir, exp_name,
                                              episode_lengths, config)

    env.close()
Example #26
    def make_env():
        def _thunk():
            env = MiniPacman(mode, 1000)
            return env

        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape
    num_actions = envs.action_space.n

    env_model = EnvModel(envs.observation_space.shape, envs.action_space.n, num_pixels, len(mode_rewards["regular"]))
    actor_critic = ActorCritic(envs.observation_space.shape, envs.action_space.n)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(env_model.parameters())

    env_model = env_model.to(DEVICE)
    actor_critic = actor_critic.to(DEVICE)

    checkpoint = torch.load(os.path.join(ACTOR_CRITIC_PATH, "actor_critic_checkpoint"))
    actor_critic.load_state_dict(checkpoint['actor_critic_state_dict'])


    reward_coef = 0.1
    num_updates = args.epoch

    losses = []
Example #27
class DDPG():
    def __init__(self, parameters):

        self.parameters = parameters
        self.env = gym.make(self.parameters['env'])
        self.nA = self.env.action_space.sample().shape[0]
        self.state_size = self.env.reset().shape[0]

        # Build our replay memory
        self.memory = Memory(replay_size=self.parameters['replay_size'],
                             action_size=self.nA,
                             state_size=self.state_size,
                             batch_size=self.parameters['batch_size'])

        # Create actor and critic
        self.actor_critic = ActorCritic(
            actor_lr=parameters['actor_learning_rate'],
            critic_lr=parameters['critic_learning_rate'],
            gamma=parameters['gamma'],
            state_size=self.state_size,
            action_size=self.nA,
            tau=parameters['tau'])

    def train(self):

        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True

        # Create global step and increment operation
        global_step_tensor = tf.Variable(0,
                                         trainable=False,
                                         name='global_step')
        increment_global_step = tf.assign_add(global_step_tensor, 1)

        # Create model saver
        saver = tf.train.Saver()

        sess = tf.Session(config=config)

        if not self.parameters['restore']:
            sess.run(tf.global_variables_initializer())
        else:
            saver.restore(sess, tf.train.latest_checkpoint('./saves'))

        self.actor_critic.set_moving_to_target(sess)
        run_id = np.random.randint(10000)

        trainwriter = tf.summary.FileWriter(logdir='./logs/' + str(run_id),
                                            graph=sess.graph)

        # Get action noise
        action_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.nA),
            sigma=float(self.parameters['sigma']) * np.ones(self.nA))

        # Fill Replay Memory
        state = self.env.reset()
        fill_amount = 0
        while fill_amount < self.parameters['replay_init_size']:

            action = self.env.action_space.sample()
            next_state, reward, done, _ = self.env.step(action)

            if done:
                state = self.env.reset()
            else:
                fill_amount += 1
                self.memory.add(state, action, reward, done, next_state)
                state = next_state

        # Main Loop
        steps = 0

        for i in range(self.parameters['num_epochs']):

            avg_epoch_rewards = 0
            num_epochs = 1
            for e in range(self.parameters['num_episodes']):

                state = self.env.reset()

                ep_reward = 0

                # Perform rollout
                while True:
                    noise = action_noise()
                    action = self.actor_critic.pi(sess, state[None, ...])
                    action += noise
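                    # Clip the exploration-noised action to the environment's action bounds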
                    action = np.clip(action, self.env.action_space.low[0],
                                     self.env.action_space.high[0])

                    assert action.shape == self.env.action_space.shape
                    """
					# UNCOMMENT TO PRINT ACTIONS
					a0 = tf.Summary(value=[tf.Summary.Value(tag="action_0", simple_value=action[0,0])])
					trainwriter.add_summary(a0,steps)
					a1 = tf.Summary(value=[tf.Summary.Value(tag="action_1", simple_value=action[0,1])])
					trainwriter.add_summary(a1,steps)
					a2 = tf.Summary(value=[tf.Summary.Value(tag="action_2", simple_value=action[0,2])])
					trainwriter.add_summary(a2,steps)
					steps += 1
					"""

                    next_state, reward, done, _ = self.env.step(action)

                    self.memory.add(state, action, reward, done, next_state)

                    if self.parameters['render_train']:
                        self.env.render()

                    ep_reward += reward

                    if done:

                        reward_summary = tf.Summary(value=[
                            tf.Summary.Value(tag="ep_rewards",
                                             simple_value=ep_reward)
                        ])
                        trainwriter.add_summary(
                            reward_summary,
                            i * self.parameters['num_episodes'] + e)
                        action_noise.reset()
                        break

                    state = next_state

                # Running (incremental) mean of episode rewards within this epoch
                avg_epoch_rewards = avg_epoch_rewards + (
                    ep_reward - avg_epoch_rewards) / num_epochs
                num_epochs += 1

                # Perform train
                for t in range(self.parameters['num_train_steps']):
                    s_state, s_action, s_reward, s_next_state, s_terminal = self.memory.sample(
                    )
                    # Train actor critic model
                    self.actor_critic.update(sess=sess,
                                             filewriter=trainwriter,
                                             state_batch=s_state,
                                             next_state_batch=s_next_state,
                                             action_batch=s_action,
                                             reward_batch=s_reward,
                                             done_batch=s_terminal)
                    sess.run(increment_global_step)

            # Print out epoch stats here

            table_data = [['Epoch', 'Average Reward'],
                          [
                              str(i) + "/" +
                              str(self.parameters['num_epochs']),
                              str(avg_epoch_rewards)
                          ]]

            table = AsciiTable(table_data, "Training Run: " + str(run_id))

            save_path = saver.save(sess, "./saves/model.ckpt")

            os.system('clear')
            print("Model saved in path: %s" % save_path + "\n" + table.table)

    def test(self):
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True

        saver = tf.train.Saver()
        sess = tf.Session(config=config)

        saver.restore(sess, tf.train.latest_checkpoint('./saves'))

        while True:

            state = self.env.reset()

            # Perform rollout
            while True:
                action = self.actor_critic.pi(sess, state[None, ...])
                action = np.clip(action, self.env.action_space.low[0],
                                 self.env.action_space.high[0])

                assert action.shape == self.env.action_space.shape

                next_state, reward, done, _ = self.env.step(action)

                self.env.render()

                if done:

                    break

                state = next_state
Example #28
class DDPG():
    def __init__(self, parameters):

        self.parameters = parameters
        self.env = gym.make(
            self.parameters['env'][:self.parameters['env'].find('_')])
        self.nA = self.env.action_space.sample().shape[0]
        self.state_size = self.env.reset().shape[0]

        # Build our replay memory
        self.memory = Memory(replay_size=self.parameters['replay_size'],
                             action_size=self.nA,
                             state_size=self.state_size,
                             batch_size=self.parameters['batch_size'])

        # Create actor and critic
        self.actor_critic = ActorCritic(
            actor_lr=parameters['actor_learning_rate'],
            critic_lr=parameters['critic_learning_rate'],
            gamma=parameters['gamma'],
            state_size=self.state_size,
            action_size=self.nA,
            tau=parameters['tau'])

    def train(self):

        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True

        # Create global step and increment operation
        global_step_tensor = tf.Variable(0,
                                         trainable=False,
                                         name='global_step')
        increment_global_step = tf.assign_add(global_step_tensor, 1)

        # Create model saver
        saver = tf.train.Saver(max_to_keep=None)

        sess = tf.Session(config=config)

        if not self.parameters['restore']:
            sess.run(tf.global_variables_initializer())
        else:
            saver.restore(sess, tf.train.latest_checkpoint('./saves'))

        self.actor_critic.set_moving_to_target(sess)
        run_id = np.random.randint(10000)

        trainwriter = tf.summary.FileWriter(logdir='./logs/' + str(run_id),
                                            graph=sess.graph)

        # Get action noise
        action_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.nA),
            sigma=float(self.parameters['sigma']) * np.ones(self.nA))

        # Fill Replay Memory
        state = self.env.reset()
        fill_amount = 0
        while fill_amount < self.parameters['replay_init_size']:

            action = self.env.action_space.sample()
            next_state, reward, done, _ = self.env.step(action)

            if done:
                state = self.env.reset()
            else:
                fill_amount += 1
                self.memory.add(state, action, reward, done, next_state)
                state = next_state

        # Main Loop
        plots = {'critic_loss': [], 'actor_loss': [], 'episode_reward': []}

        plots_dir = './plots/'
        weights_dir = './weights/'
        graph_dir = './graph/'
        if not os.path.exists(plots_dir):
            os.makedirs(plots_dir)
        if not os.path.exists(weights_dir):
            os.makedirs(weights_dir)
        if not os.path.exists(graph_dir):
            os.makedirs(graph_dir)

        saver.export_meta_graph(graph_dir + self.parameters['env'] +
                                '/graph.meta')

        #cumulative step counter
        cumu_step = 0

        for i in range(self.parameters['num_epochs']):

            avg_epoch_rewards = 0
            n_epochs = 1

            for e in range(self.parameters['num_episodes']):

                state = self.env.reset()

                ep_reward = 0
                ep_n_action = 0

                # Perform rollout
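                # Cap each episode at 500 environment steps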
                for _ in range(500):
                    noise = action_noise()
                    action = self.actor_critic.pi(sess, state[None, ...])
                    action += noise
                    action = np.clip(action, self.env.action_space.low[0],
                                     self.env.action_space.high[0])

                    assert action.shape == self.env.action_space.shape

                    next_state, reward, done, _ = self.env.step(action)
                    # print(action)
                    # print(next_state)
                    # print(reward)

                    self.memory.add(state, action, reward, done, next_state)

                    if self.parameters['render_train']: self.env.render()

                    ep_reward += reward
                    ep_n_action += 1
                    cumu_step += 1
                    state = next_state

                    # Perform train
                    avg_critic_loss = 0.0
                    avg_actor_loss = 0.0
                    for t in range(self.parameters['num_train_steps']):
                        s_state, s_action, s_reward, s_next_state, s_terminal = self.memory.sample(
                        )
                        # Train actor critic model
                        _, _, critic_loss, actor_loss = self.actor_critic.update(
                            sess=sess,
                            filewriter=trainwriter,
                            state_batch=s_state,
                            next_state_batch=s_next_state,
                            action_batch=s_action,
                            reward_batch=s_reward,
                            done_batch=s_terminal)
                        avg_critic_loss += critic_loss
                        avg_actor_loss += actor_loss

                        sess.run(increment_global_step)

                    avg_critic_loss /= self.parameters['num_train_steps']
                    avg_actor_loss /= self.parameters['num_train_steps']

                    if done:
                        reward_summary = tf.Summary(value=[
                            tf.Summary.Value(tag="ep_rewards",
                                             simple_value=ep_reward)
                        ])
                        trainwriter.add_summary(
                            reward_summary,
                            i * self.parameters['num_episodes'] + e)
                        action_noise.reset()
                        break

                avg_epoch_rewards = avg_epoch_rewards + (
                    ep_reward - avg_epoch_rewards) / n_epochs
                n_epochs += 1


                print('Epoch: {:d} | Reward: {:d} | Avg_Q_loss: {:.4f} | Avg_a_loss: {:.4f} | Episode: {:d} | Step: {:d} | Cumu Step: {:d}'\
                 .format(i+1, int(ep_reward), avg_critic_loss, avg_actor_loss, e+1, ep_n_action, cumu_step))

                if e % 19 == 0:
                    save_path = saver.save(
                        sess,
                        weights_dir + self.parameters['env'] + '/model.ckpt',
                        global_step=i * e + 1)

                plots['episode_reward'].append(ep_reward)
                plots['critic_loss'].append(critic_loss)
                plots['actor_loss'].append(actor_loss)

                pickle.dump(
                    plots,
                    open(plots_dir + self.parameters['env'] + '_plot.pickle',
                         'wb'))

    def test(self):
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True

        saver = tf.train.Saver()
        sess = tf.Session(config=config)

        saver.restore(
            sess,
            tf.train.latest_checkpoint(
                './weights/HalfCheetah-v2_kirkiles_train50episode_noise_norm_bufsize1Mi1k'
            ))

        while True:
            state = self.env.reset()
            # Perform rollout
            while True:
                action = self.actor_critic.pi(sess, state[None, ...])
                action = np.clip(action, self.env.action_space.low[0],
                                 self.env.action_space.high[0])
                #print(action)
                assert action.shape == self.env.action_space.shape
                next_state, reward, done, _ = self.env.step(action)
                self.env.render()

                if done:
                    break

                state = next_state
Example #29
class Runner(object):
    def __init__(self, env, GOAL_STATE, GAMMA=0.95, lr=0.001):
        self.env = env
        self.GOAL_STATE = GOAL_STATE
        self.states_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.actor_critic = ActorCritic(
            self.states_dim, self.action_dim, GAMMA=GAMMA, lr=lr)
        self.min_spread_holder = MinSpreadHolder(self.states_dim)

    def render_if_true(self, render):
        if render:
            self.env.render()

    def get_reward(self, state):
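        # Reward is the negative mean squared difference between the state and the shifted goal state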
        shifted_goal_state = self.shift_observation(self.GOAL_STATE)
        diff = state - shifted_goal_state
        reward = -1 * np.mean(np.multiply(diff, diff))
        return reward

    def add_observed_batch(self, obs_batch):
        self.min_spread_holder.add_batch(obs_batch)

    def shift_observation(self, obs):
        return self.min_spread_holder.transform(obs)

    def play_random_game(self, render=True, add_to_all_observations=False):
        env = self.env
        observation = env.reset()
        games_observations = []

        for t in range(1000):
            games_observations.append(observation)
            self.render_if_true(render)
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                if add_to_all_observations:
                    self.add_observed_batch(np.asarray(games_observations))
                print('Episode finished after {} timesteps'.format(t + 1))
                break

    def play_game_from_actor_with_random(self,
                                         render=True,
                                         add_to_buffer=True,
                                         prob_random=0.0):
        env = self.env
        obs = env.reset()
        games_observations = []
        for t in range(1000):
            self.render_if_true(render)
            obs = np.asarray(obs)
            games_observations.append(obs)
            shifted_obs = self.shift_observation(obs)

            action = self.actor_critic.get_actions(
                np.asarray([shifted_obs]))[0]  # I think zero.
            if not render and (random.random() < prob_random):
                action = env.action_space.sample()
            # if not render:
            #     for i in range(len(action)):
            #         if random.random() < prob_random:
            #             action[i] = (random.random() * 0.8) - 0.4

            new_obs, reward, done, info = env.step(action)
            shifted_new_obs = self.shift_observation(new_obs)
            if add_to_buffer:
                # real_reward = 0.0 if not done else -1.0
                real_reward = self.get_reward(
                    shifted_new_obs) if not done else -2.0
                self.actor_critic.add_to_replay_buffer(
                    shifted_obs, action, real_reward, shifted_new_obs)
            if done:
                self.add_observed_batch(np.asarray(games_observations))
                print('Episode finished after {} timesteps'.format(t + 1))
                break

            obs = new_obs

    def train_from_replay_buffer(self, should_print):
        losses = self.actor_critic.train_from_replay_buffer(should_print)
        return np.mean(losses)
Example #30
class Runner(object):
    def __init__(self, env, GAMMA=0.5):
        self.env = env
        self.states_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.actor_critic = ActorCritic(self.states_dim,
                                        self.action_dim,
                                        lr=0.0000000001)
        self.all_observations = np.asarray([])

    def get_means_stddevs(self, num_games=100, min_std_dev=0.01):
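        # Play num_games with a random policy, then compute each observation
        # dimension's mean and a standard deviation floored at min_std_dev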
        observations = []
        env = self.env
        for i in range(num_games):
            obs = env.reset()
            while True:
                observations.append(obs)
                action = env.action_space.sample()
                obs, reward, done, info = env.step(action)
                if done:
                    print('game {} done'.format(i))
                    break
        observations = np.asarray(observations)
        mean = np.mean(observations, axis=0)
        stddev = np.maximum(np.std(observations, axis=0), min_std_dev)
        return mean, stddev

    def write_mean_stddev_to_file(self, num_games=100, min_std_dev=0.01):
        mean, stddev = self.get_means_stddevs(num_games, min_std_dev)
        with open('./mujoco_data/mean_state.json', 'w') as f:
            f.write(json.dumps(mean.tolist()))
        with open('./mujoco_data/stddev_state.json', 'w') as f:
            f.write(json.dumps(stddev.tolist()))
        print('written')

    def get_min_spread(self, num_games=100, min_spread=0.05):
        observations = []
        env = self.env
        for i in range(num_games):
            obs = env.reset()
            while True:
                observations.append(obs)
                action = env.action_space.sample()
                obs, reward, done, info = env.step(action)
                if done:
                    print('game {} done'.format(i))
                    break
        observations = np.asarray(observations)
        min_obs = observations.min(axis=0)
        max_obs = observations.max(axis=0)
        spread = np.maximum(max_obs - min_obs, min_spread)
        return min_obs, spread

    def write_min_spread_to_file(self, num_games=100, min_spread=0.05):
        min_obs, spread = self.get_min_spread(num_games, min_spread)
        print(min_obs)
        print(spread)
        print(min_obs.shape, spread.shape)
        with open('./mujoco_data/min_state.json', 'w') as f:
            f.write(json.dumps(min_obs.tolist()))
        with open('./mujoco_data/spread_state.json', 'w') as f:
            f.write(json.dumps(spread.tolist()))
        print('written')

    def play_random_game(self, render=True):
        env = self.env
        observation = env.reset()

        for t in range(1000):
            if render:
                env.render()
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                print('Episode finished after {} timesteps'.format(t + 1))
                break

    def play_game_from_actor(self, render=True, add_to_buffer=True):
        env = self.env
        obs = env.reset()
        for t in range(1000):
            if render:
                env.render()
                sleep(0.05)
            obs = np.asarray(obs)
            shifted_obs = shift_state(obs)
            action = self.actor_critic.get_actions(np.asarray(
                [shifted_obs]))[0]  # I think zero.
            new_obs, reward, done, info = env.step(action)

            if done:
                print('Episode finished after {} timesteps'.format(t + 1))
                break

            if add_to_buffer:
                shifted_new_obs = shift_state(new_obs)
                # real_reward = get_reward(shifted_obs, shifted_new_obs)
                real_reward = get_reward(shifted_new_obs)
                self.actor_critic.add_to_replay_buffer(shifted_obs, action,
                                                       real_reward,
                                                       shifted_new_obs)

            obs = new_obs

    def play_game_from_actor_with_random(self,
                                         render=True,
                                         add_to_buffer=True,
                                         prob_random=0.05):
        env = self.env
        obs = env.reset()
        for t in range(1000):
            if render:
                env.render()
                sleep(0.01)
            obs = np.asarray(obs)
            shifted_obs = shift_state(obs)

            action = self.actor_critic.get_actions(np.asarray(
                [shifted_obs]))[0]  # I think zero.
            if not render:
                for i in range(len(action)):
                    if random.random() < prob_random:
                        action[i] = (random.random() * 0.8) - 0.4

            # random_move = random.random() < prob_random
            # if random_move and not render:
            #     print('Random move!')
            #     action = env.action_space.sample()
            # else:
            #     action = self.actor_critic.get_actions(
            #         np.asarray([shifted_obs]))[0]  # I think zero.
            new_obs, reward, done, info = env.step(action)

            if done:
                print(obs, '\n')
                print(new_obs, '\n')
                print(shifted_obs, '\n')
                exit()
                if add_to_buffer:
                    real_reward = -0.10
                    self.actor_critic.add_to_replay_buffer(
                        shifted_obs, action, real_reward, shifted_obs)
                print('Episode finished after {} timesteps'.format(t + 1))
                break

            if add_to_buffer:
                shifted_new_obs = shift_state(new_obs)
                # real_reward = get_reward(shifted_obs, shifted_new_obs)
                real_reward = get_reward(shifted_new_obs)
                self.actor_critic.add_to_replay_buffer(shifted_obs, action,
                                                       real_reward, new_obs)

            obs = new_obs

    def train_from_replay_buffer(self, should_print):
        losses = self.actor_critic.train_from_replay_buffer(should_print)
        return np.mean(losses)