Example 1
    def __init__(self, state_size, action_size, random_seed, agent_size=1):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.agent_size = agent_size

        self.local_actor = ActorNetwork(state_size, action_size,
                                        random_seed).to(device)
        self.target_actor = ActorNetwork(state_size, action_size,
                                         random_seed).to(device)
        self.local_critic = CriticNetwork(state_size, action_size,
                                          random_seed).to(device)
        self.target_critic = CriticNetwork(state_size, action_size,
                                           random_seed).to(device)

        self.opt_actor = optim.Adam(self.local_actor.parameters(), lr=LR_ACTOR)
        self.opt_critic = optim.Adam(self.local_critic.parameters(),
                                     lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)

        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
Example 2
    def __init__(self, state_size, action_size, n_agents, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.stacked_state_size = state_size * n_agents
        self.stacked_action_size = action_size * n_agents

        # Actor networks
        self.actor_local = ActorNetwork(state_size, action_size,
                                        seed).to(device)
        self.actor_target = ActorNetwork(state_size, action_size,
                                         seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=ACTOR_LR)

        # Critic networks
        self.critic_local = CriticNetwork(self.stacked_state_size,
                                          self.stacked_action_size,
                                          seed).to(device)
        self.critic_target = CriticNetwork(self.stacked_state_size,
                                           self.stacked_action_size,
                                           seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=CRITIC_LR)

        # OUNoise
        self.exploration_noise = OUNoise(action_size, seed)
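
Examples 1 and 2 (and several of the later snippets) draw exploration noise from an OUNoise class whose definition is not included here. For reference, this is a minimal sketch of a standard Ornstein-Uhlenbeck noise process with a compatible constructor; the mu/theta/sigma defaults, the scale argument, and the reset/sample methods are assumptions, not taken from the examples.

import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2, scale=1.0):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.scale = scale
        np.random.seed(seed)
        self.reset()

    def reset(self):
        # start every episode from the long-run mean
        self.state = copy.copy(self.mu)

    def sample(self):
        # drift toward the mean plus Gaussian diffusion, then scale
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state * self.scale

A typical call site adds noise.sample() to the actor's raw action while training.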
Example 3
    def __init__(self, state_size, action_size, agent_id):

        self.state_size  = state_size
        self.action_size = action_size
        self.seed        = args['seed']
        self.device      = args['device']
        #self.args        = args

        # Actor networks
        self.actor_network    = ActorNetwork(state_size, action_size).to(self.device)
        self.actor_target     = ActorNetwork(state_size, action_size).to(self.device)
        self.actor_optimizer  = optim.Adam(self.actor_network.parameters(), lr=args['LR_ACTOR'])
        
        # Model takes too long to run --> load model weights from a previous run (took > 24 hours on my machine)
        #if not agent_id:
        #    self.actor_network.load_state_dict(torch.load(args['agent_p0_path']), strict=False)
        #    self.actor_target.load_state_dict(torch.load(args['agent_p0_path']), strict=False)
        #else:
        #    self.actor_network.load_state_dict(torch.load(args['agent_p1_path']), strict=False)
        #    self.actor_target.load_state_dict(torch.load(args['agent_p1_path']), strict=False)
        
        # Replay memory
        self.memory      = ReplayBuffer(action_size, args['BUFFER_SIZE'], args['BATCH_SIZE'], self.device, self.seed)
        
        # Noise process
        self.noise       = OUNoise(action_size, self.seed)
        
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step      = 0
        
        self.mCriticLoss = 0
        
        self.actorLoss   = 0
Example 4
    def __init__(self, state_size, obs_size, action_size, num_agents):
        super(DDPGAgent, self).__init__()

        #self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        #self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
        #self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        #self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)

        self.actor = ActorNetwork(obs_size, action_size).to(device)
        self.critic = CriticNetwork(state_size,
                                    action_size * num_agents).to(device)
        self.target_actor = ActorNetwork(obs_size, action_size).to(device)
        self.target_critic = CriticNetwork(state_size,
                                           action_size * num_agents).to(device)

        #self.noise = OUNoise(out_actor, scale=1.0 )
        self.noise = OUNoise(action_size, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=LR_ACTOR)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)
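
Examples 4, 7, and 9 initialize their target networks through a hard_update helper that is not shown. A minimal sketch, assuming it simply copies the source parameters into the target network (the same loop that Example 5 writes out inline):

def hard_update(target, source):
    # target <- source: copy every parameter verbatim
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)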
Example 5
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # create the local and target actor networks
        self.actor_local = ActorNetwork(state_size, action_size,
                                        seed).to(device)
        self.actor_target = ActorNetwork(state_size, action_size,
                                         seed).to(device)

        # create the local and target critic networks
        self.critic_local = CriticNetwork(state_size, action_size,
                                          seed).to(device)
        self.critic_target = CriticNetwork(state_size, action_size,
                                           seed).to(device)

        # optimizers for local actor and critic
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR,
                                           weight_decay=0.0)

        # MSE loss for updating the critic
        self.critic_loss_function = nn.MSELoss()

        # ensure that the local and target networks are initialized with the same random weights
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor_local.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic_local.parameters()):
            target_param.data.copy_(param.data)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # init the noise class to sample from
        self.noise = GaussianNoise(self.action_size)
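
Examples 5 and 8 sample exploration noise from a GaussianNoise class rather than an Ornstein-Uhlenbeck process. A minimal sketch, assuming zero-mean i.i.d. Gaussian noise with a fixed standard deviation (the sigma default and the sample method name are assumptions):

import numpy as np

class GaussianNoise:
    """Zero-mean Gaussian exploration noise of a fixed dimension."""

    def __init__(self, size, sigma=0.1):
        self.size = size
        self.sigma = sigma

    def sample(self):
        # independent N(0, sigma^2) noise for each action dimension
        return np.random.normal(0.0, self.sigma, self.size)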
Example 6
def create_model(context):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    context.sess = tf.InteractiveSession(config=config)

    context.actor = ActorNetwork(
        context.sess, [len(context.assets), context.n,
                       len(context.features)],
        len(context.assets) + 1, context.actor_learning_rate, context.tau,
        context.minibatch_size)

    context.critic = CriticNetwork(
        context.sess, [len(context.assets), context.n,
                       len(context.features)],
        len(context.assets) + 1, context.critic_learning_rate, context.tau,
        context.gamma, context.actor.get_num_trainable_vars())

    # Initialize the TensorFlow variables
    context.sess.run(tf.global_variables_initializer())

    context.saver = tf.train.Saver()

    # Initialize the target network weights
    context.actor.update_target_network()
    context.critic.update_target_network()
Example 7
    def __init__(self, state_size, action_size, random_seed, learning_rate,
                 noise, device):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.learning_rate = learning_rate

        self.actor_local = ActorNetwork(state_size, action_size,
                                        random_seed).to(device)
        self.actor_target = ActorNetwork(state_size, action_size,
                                         random_seed).to(device)
        hard_update(self.actor_target, self.actor_local)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.learning_rate)

        self.noise = noise
        self.device = device
Example 8
    def __init__(self, state_size, action_size, memory, seed=None):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        
        self.state_size = state_size
        self.action_size = action_size
        
        if seed is not None:
            self.seed = seed

        # create the local and target actor networks
        self.actor_local = ActorNetwork(state_size, action_size, seed).to(device)
        self.actor_target = ActorNetwork(state_size, action_size, seed).to(device)
        
        # create the local and target critic networks
        self.critic_local = CriticNetwork(state_size, action_size, seed).to(device)
        self.critic_target = CriticNetwork(state_size, action_size, seed).to(device)
        
        # optimizers for local actor and critic 
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR, weight_decay=0.0)
        
        # MSE loss for updating the critic
        # self.critic_loss_function = nn.MSELoss()
        self.critic_loss_function = nn.SmoothL1Loss()

        # copy the local networks weights to the target network 
        self.copy_weights_from_local_to_target()
        
        # Replay memory
        self.memory = memory
        
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        
        # init the noise class to sample from
        self.noise = GaussianNoise(self.action_size)
Example 9
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 hidden_in_actor=512,
                 hidden_out_actor=256,
                 lr_actor=1e-4,
                 hidden_in_critic=512,
                 hidden_out_critic=256,
                 lr_critic=3e-4,
                 weight_decay_critic=0,
                 seed=1,
                 device='cpu'):
        super(DDPGAgent, self).__init__()

        self.device = device

        # Actor
        self.actor = ActorNetwork(state_size, hidden_in_actor,
                                  hidden_out_actor, action_size,
                                  seed).to(device)
        self.target_actor = ActorNetwork(state_size, hidden_in_actor,
                                         hidden_out_actor, action_size,
                                         seed).to(device)
        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)

        # Critic
        self.critic = CriticNetwork(state_size, action_size, num_agents,
                                    hidden_in_critic, hidden_out_critic,
                                    seed).to(device)
        self.target_critic = CriticNetwork(state_size, action_size, num_agents,
                                           hidden_in_critic, hidden_out_critic,
                                           seed).to(device)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=weight_decay_critic)

        # Noise
        self.noise = OUNoise(action_size, seed, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
Example 10
def build_actor_critic(sess, env):
    with tf.variable_scope("model", reuse=None):
        with tf.name_scope("actor"):
            actor = ActorNetwork(sess, env, config, is_training=False)

        with tf.name_scope("critic"):
            critic = CriticNetwork(sess, env, config, is_training=False)

    sess.run(tf.global_variables_initializer())

    return actor, critic
Example 11
def build_actor_critic(sess, env, env_eval):
    w_init = tflearn.initializations.xavier_initializer()

    with tf.variable_scope("model", reuse=None, initializer=w_init):
        with tf.name_scope("actor"):
            actor = ActorNetwork(sess, env, config, is_training=True)

        with tf.name_scope("critic"):
            critic = CriticNetwork(sess, env, config, is_training=True)

        # if config.noise_std:
        #     actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(env.input_cardin * env.state_cardin),
        #                                                sigma=config.noise_std,
        #                                                sigma_dec=config.noise_dec)
        # else:
        #     actor_noise = None

    with tf.variable_scope("model", reuse=True):
        with tf.name_scope("actor"):
            actor_eval = ActorNetwork(sess,
                                      env_eval,
                                      config,
                                      is_training=False)

        with tf.name_scope("critic"):
            critic_eval = CriticNetwork(sess,
                                        env_eval,
                                        config,
                                        is_training=False)

    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    return actor, critic, actor_eval, critic_eval  #, actor_noise
Example 12
    def __init__(self,
        device,
        key,
        state_size, action_size, random_seed,
        memory, noise,
        lr, weight_decay,
        checkpoint_folder = './Saved_Model/'):

        self.DEVICE = device

        self.KEY = key

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Hyperparameters
        self.LR = lr
        self.WEIGHT_DECAY = weight_decay

        self.CHECKPOINT_FOLDER = checkpoint_folder

        # Actor Network (w/ Target Network)
        self.local = ActorNetwork(state_size, action_size, random_seed).to(self.DEVICE)
        self.target = ActorNetwork(state_size, action_size, random_seed).to(self.DEVICE)
        self.optimizer = optim.Adam(self.local.parameters(), lr=self.LR)

        self.checkpoint_full_name = self.CHECKPOINT_FOLDER + 'checkpoint_actor_' + str(self.KEY) + '.pth'
        if os.path.isfile(self.checkpoint_full_name):
            self.local.load_state_dict(torch.load(self.checkpoint_full_name))
            self.target.load_state_dict(torch.load(self.checkpoint_full_name))

        # Replay memory
        self.memory = memory

        # Noise process
        self.noise = noise
Example 13
def build_actor_critic(sess, env):
    w_init = tflearn.initializations.xavier_initializer()

    with tf.variable_scope("model", reuse=None, initializer=w_init):
        with tf.name_scope("actor"):
            actor = ActorNetwork(sess, env, config, is_training=True)

        with tf.name_scope("critic"):
            critic = CriticNetwork(sess, env, config, is_training=True)

    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    return actor, critic
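
The TensorFlow examples (6, 10, 11, and 13) finish by calling actor.update_target_network() and critic.update_target_network(). Those methods belong to the ActorNetwork/CriticNetwork classes, which are not shown; in typical TF1 DDPG implementations they run a list of assign ops that blend the online weights into the target weights with the tau factor, roughly as in the sketch below. The helper name and the variable-list arguments are assumptions, not taken from these examples.

import tensorflow as tf

def make_soft_update_ops(online_vars, target_vars, tau):
    """Assign ops implementing target <- tau * online + (1 - tau) * target."""
    return [target_var.assign(tau * online_var + (1.0 - tau) * target_var)
            for online_var, target_var in zip(online_vars, target_vars)]

# update_target_network() would then be roughly:
#     self.sess.run(self.update_target_ops)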
Example 14
    def __init__(self, opt, q_batch):
        self.opt = opt
        self.q_batch = q_batch

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = gym.make(self.opt.env)
        self.env.seed(self.opt.seed)
        self.n_state = self.env.observation_space.shape[0]
        self.n_act = self.env.action_space.n

        self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)
        self.critic = CriticNetwork(self.n_state).to(self.device)
        self.actor.share_memory()
        self.critic.share_memory()
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=opt.lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=opt.lr)
Example 15
    def __init__(self, path_to_weights, weights_id):

        print("Init ANN")

        self.init = False

        # avoid TF from allocating all of the GPU memory
        # https://stackoverflow.com/questions/34199233/how-to-prevent-tensorflow-from-allocating-the-totality-of-a-gpu-memory
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        #sess = tf.Session()
        K.set_session(sess)

        self.path_to_weights = path_to_weights
        self.next_weights_id = weights_id

        #check if path is valid
        if not os.path.isdir(self.path_to_weights):
            print("Folder with weights does not exist!")
            print(self.path_to_weights)
            self.init = False
        else:
            self.init = True

        if self.init:
            #create actor network
            self.actor = ActorNetwork.ActorNetwork(sess, 5, 2)
            #load first weights
            print("Loading first weights for ANN from " + str(self.path_to_weights) + "/" + str(self.next_weights_id) + "/actormodel.h5")
            try:
                self.actor.model.load_weights(
                    str(self.path_to_weights) + "/" + str(self.next_weights_id) + "/actormodel.h5")
                self.next_weights_id = self.next_weights_id + 1
            except:
                print("Cannot find the weight (.h5) file")
                print(str(self.path_to_weights) + "/" + str(self.next_weights_id) + "/actormodel.h5")
                self.init = False
Example 16
    def __init__(self, opt, actor_id, q_trace, learner):
        self.opt = opt
        self.q_trace = q_trace
        self.learner = learner
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = gym.make(self.opt.env)
        self.env.seed(self.opt.seed + actor_id)
        self.n_state = self.env.observation_space.shape[0]
        self.n_act = self.env.action_space.n

        self.n_episodes = 0
        self.n_steps = 0
        self.gamma = opt.gamma

        # epsilon
        self.eps_greedy = 0.4 ** (1 + actor_id * 7 / (opt.n_actors - 1)) \
            if opt.n_actors > 1 else 0.4

        # Model
        self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)
        self.critic = CriticNetwork(self.n_state).to(self.device)
Example 17
    def __init__(self, opt, actor_id,  q_trace, learner):
        self.opt = opt
        self.q_trace = q_trace
        self.learner = learner

        self.env = gym.make(self.opt.env)
        self.env_state = self.env.reset()
        self.n_state = self.env.observation_space.shape[0]
        self.n_act = self.env.action_space.n

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        ## Parameters
        self.batch_size = opt.batch_size
        self.roll_out_n_steps = opt.roll_out_n_steps
        self.gamma = opt.gamma

        self.eps_greedy = 0.4 ** (1 + actor_id * 7 / (opt.n_actors - 1)) \
            if opt.n_actors > 1 else 0.4

        self.n_episodes = 0

        self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)  # ActorNetwork
        self.critic = CriticNetwork(self.n_state).to(self.device)  # CriticNetwork
Example 18
def main(args):

    with tf.Session() as sess:
        env = gym.make(args['env'])
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env.seed(int(args['random_seed']))

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = env.action_space.high
        # Ensure action bound is symmetric
        # assert (env.action_space.high == -env.action_space.low)

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             float(args['actor_lr']), float(args['tau']),
                             int(args['minibatch_size']))

        critic = CriticNetwork(sess, state_dim, action_dim,
                               float(args['critic_lr']), float(args['tau']),
                               float(args['gamma']),
                               actor.get_num_trainable_vars())

        actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))

        if args['train']:
            if not os.path.exists(args['save_dir']):
                os.makedirs(args['save_dir'])
            with open(os.path.join(args['save_dir'], 'config.json'), 'w') as f:
                json.dump(args, f, indent=2)
            train(sess, env, args, actor, critic, actor_noise)
        else:
            # ddpg = []
            # indexes = [e for e in range(400) if e % 10 == 9]
            # indexes = [0] + indexes
            indexes = [399]
            num_test_tasks = 100
            buckets = 1
            successes = []
            directory = args['to_pickle']
            for index in indexes:
                # times = []
                task_success = []
                saver = tf.train.Saver()
                saver.restore(
                    sess, "../final_models/multitask/fixed/{0}/model-{1}.ckpt".
                    format(directory, index))
                for _ in range(buckets):
                    tasks = env.unwrapped.sample_tasks(num_test_tasks)
                    # tasks = [{'goal': np.array([0., 0.])} for e in range(num_test_tasks)]
                    success = 0
                    for task in tasks:
                        s = env.reset_task(task)
                        step = 0
                        d = False
                        while not d:
                            # env.render()
                            action = actor.predict_target(
                                np.reshape(s, (1, actor.s_dim)))[0]
                            step += 1
                            s, r, d, _ = env.step(action)
                        if r == 1:
                            success += 1
                        # times.append(step)
                    env.close()
                    task_success.append(success / num_test_tasks)
                successes.append(task_success)
                # ddpg.append(times)
            # out = [successes, ddpg]
            env.close()
            if not os.path.exists('./pkls'):
                os.makedirs('./pkls')
            with open('./pkls/{0}.pkl'.format(args['save_dir']), 'wb') as f:
                pickle.dump(successes, f)
Example 19
import torch
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
from torch.autograd import Variable

import gym
import numpy as np
import matplotlib.pyplot as plt

from model import ActorNetwork, CriticNetwork

actor = ActorNetwork(4, 2)
critic = CriticNetwork(4)
actor_optimizer = optim.Adam(actor.parameters(), lr=1e-4)
critic_optimizer = optim.Adam(critic.parameters(), lr=8e-4)
env = gym.make('CartPole-v0')
GAMMA = 0.99
N_EPISODES = 20000
LOG_STEPS = 100
SAVE_STEPS = 100


def select_action(S):
    '''
    select an action based on the current state
    args:
        S: current state
    returns:
        action to take, log probability of the chosen action
    '''