def __init__(
    self,
    obs_dim,
    action_dim,
    action_gain,
    actor_learning_rate=0.0001,
    critic_learning_rate=0.001,
    gamma=0.99,
    tau=0.001,
):
    self.obs_dim = obs_dim
    self.action_dim = action_dim
    self.gamma = gamma
    self.tau = tau

    # make main networks
    self.actor = Actor(obs_dim, action_dim, action_gain, actor_learning_rate)
    self.critic = Critic(obs_dim, action_dim, critic_learning_rate)

    # make target networks
    self.target_actor = Actor(obs_dim, action_dim, action_gain)
    self.target_actor.model.set_weights(self.actor.model.get_weights())
    self.target_critic = Critic(obs_dim, action_dim)
    self.target_critic.model.set_weights(self.critic.model.get_weights())
def __init__(self, state_size, action_size, num_agents, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.seed = random.seed(random_seed)
    self.eps = eps_start
    self.t_step = 0

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise((num_agents, action_size), random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
def __init__(self, state_size, action_size, random_seed, num_agents):
    """Initialize an Agent object."""
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, random_seed, sigma=0.1)

    # Replay buffer
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
    self.num_agents = num_agents
def __init__(self, state_size, action_size, random_seed):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.epsilon = EPS

    # --- actor ----- #
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=1e-3)

    # ---- critic ----- #
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=1e-3, weight_decay=0)

    self.noise = OUNoise(action_size, random_seed)
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
def __init__(
        self,
        state_size=24,
        action_size=2,
        BATCH_SIZE=128,
        BUFFER_SIZE=int(1e6),
        discount_factor=1,
        tau=1e-2,
        noise_coefficient_start=5,
        noise_coefficient_decay=0.99,
        LR_ACTOR=1e-3,
        LR_CRITIC=1e-3,
        WEIGHT_DECAY=1e-3,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")):
    """
    state_size (int): dimension of each state
    action_size (int): dimension of each action
    BATCH_SIZE (int): mini batch size
    BUFFER_SIZE (int): experience storing length, keep it as high as possible
    discount_factor (float): discount factor for calculating Q_target
    tau (float): interpolation parameter for updating the target network
    noise_coefficient_start (float): value the OUNoise sample is multiplied by
    noise_coefficient_decay (float): exponential decay factor applied to the noise coefficient
    LR_ACTOR (float): learning rate for the actor network
    LR_CRITIC (float): learning rate for the critic network
    WEIGHT_DECAY (float): weight decay for the critic network optimizer
    device : "cuda:0" if torch.cuda.is_available() else "cpu"
    """
    self.state_size = state_size
    print(device)
    self.action_size = action_size
    self.BATCH_SIZE = BATCH_SIZE
    self.BUFFER_SIZE = BUFFER_SIZE
    self.discount_factor = discount_factor
    self.tau = tau
    self.noise_coefficient = noise_coefficient_start
    self.noise_coefficient_decay = noise_coefficient_decay
    self.steps_completed = 0
    self.device = device

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size).to(self.device)
    self.actor_target = Actor(state_size, action_size).to(self.device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size).to(self.device)
    self.critic_target = Critic(state_size, action_size).to(self.device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise((1, action_size))

    # Replay memory
    self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE, self.BATCH_SIZE)
def __init__(self,
             env: gym.Env,
             memory_size: int,
             batch_size: int,
             ou_noise_theta: float,
             ou_noise_sigma: float,
             gamma: float = 0.99,
             tau: float = 5e-3,
             initial_random_episode: int = 1e4,
             name_cases='myproject'):
    """Initialize."""
    # Logger
    self.wandb = wandb.init(project=name_cases)

    obs_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    self.env = env
    self.memory = ReplayBuffer(memory_size)
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau
    self.initial_random_episode = initial_random_episode

    # noise
    self.noise = OUNoise(
        action_dim,
        theta=ou_noise_theta,
        sigma=ou_noise_sigma,
    )

    # device: cpu / gpu
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(self.device)

    # networks
    self.actor = Actor(obs_dim, action_dim).to(self.device)
    self.actor_target = Actor(obs_dim, action_dim).to(self.device)
    self.actor_target.load_state_dict(self.actor.state_dict())

    self.critic = Critic(obs_dim + action_dim).to(self.device)
    self.critic_target = Critic(obs_dim + action_dim).to(self.device)
    self.critic_target.load_state_dict(self.critic.state_dict())

    # optimizer
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=3e-4)
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

    # transition to store in memory
    self.transition = list()

    # total steps count
    self.total_step = 0

    # mode: train / test
    self.is_test = False

    self.populate(self.initial_random_episode)
def __init__(self, state_size, action_size, action_sigma=0.1, memory_size=1000000, batch=128,
             sigma=0.2, noise_clip=0.5, gamma=0.99, update_frequency=2, seed=0):
    '''
    TD3 Agent
    :param state_size: state dimension
    :param action_size: action dimension
    :param action_sigma: standard deviation of the noise added to the action
    :param memory_size:
    :param batch:
    :param sigma: standard deviation of the noise added to the target function (Chapter 5.3 of the TD3 paper)
    :param noise_clip: how much noise to allow
    :param gamma:
    :param update_frequency:
    :param seed:
    '''
    self.state_size = state_size
    self.action_size = action_size
    self.action_sigma = action_sigma
    self.sigma = sigma
    self.noise_clip = noise_clip
    self.gamma = gamma
    self.update_frequency = update_frequency
    self.seed = seed

    self.actor = Actor(self.state_size, self.action_size).to(device)
    self.critic0 = Critic(self.state_size, self.action_size).to(device)
    # second critic as described in the paper: https://arxiv.org/pdf/1802.09477.pdf
    self.critic1 = Critic(self.state_size, self.action_size).to(device)

    self.target_actor = Actor(self.state_size, self.action_size).to(device)
    self.target_critic0 = Critic(self.state_size, self.action_size).to(device)
    # second target critic as described in the paper
    self.target_critic1 = Critic(self.state_size, self.action_size).to(device)

    self.memory = ReplayBuffer(memory_size, batch, seed=seed)

    self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR)
    self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR)
    self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR)

    self.soft_update(self.actor, self.target_actor, 1)
    self.soft_update(self.critic0, self.target_critic0, 1)
    self.soft_update(self.critic1, self.target_critic1, 1)
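The three soft_update(..., 1) calls above rely on a helper that this excerpt does not show. A minimal sketch of such a soft-update (Polyak-averaging) helper, assuming PyTorch modules and the (local, target, tau) argument order used in the calls above; the body is an assumption, not the original implementation:

import torch

def soft_update(self, local_model, target_model, tau):
    # Blend the target parameters toward the local parameters:
    #   theta_target <- tau * theta_local + (1 - tau) * theta_target
    # With tau=1 this is a hard copy, which is how the constructor above
    # initialises the target networks.
    with torch.no_grad():
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)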
def create_actor(self, alpha, hidden_layers):
    params = {
        'input_shape': self.env.observation_space.shape,
        'output_shape': self.env.action_space.shape,
        'hidden_layers': hidden_layers
    }
    self.actor = OpenStruct()
    self.actor.online = Actor("{}.actor.online".format(self.name), **params)
    self.actor.target = Actor("{}.actor.target".format(self.name), **params)
def __init__(self, n, state_size, action_size, random_seed, params):
    """Initialize an Agent object.

    Params
    ======
        n (int): number of agents in env
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
        params (dict): dictionary of hyperparameter name-value pairs
    """
    self.n = n
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)

    self.BUFFER_SIZE = params["BUFFER_SIZE"]
    self.BATCH_SIZE = params["BATCH_SIZE"]
    self.GAMMA = params["GAMMA"]
    self.TAU = params["TAU"]
    self.LR_ACTOR = params["LR_ACTOR"]
    self.LR_CRITIC = params["LR_CRITIC"]
    self.WEIGHT_DECAY = params["WEIGHT_DECAY"]
    self.N_UPDATES = params["N_UPDATES"]
    self.UPDATE_STEP = params["UPDATE_STEP"]

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.LR_CRITIC, weight_decay=self.WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(self.n, action_size, random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE, self.BATCH_SIZE, random_seed)

    # Count timesteps
    self.timestep = 0
def __init__(self, init_pose=None, init_velocities=None, init_angle_velocities=None,
             runtime=5., target_pos=None, buffer_size=150000, batch_size=32,
             gamma=0.99, replay_alpha=0.5, beta_limit=10000):
    self.task = Task(init_pose, init_velocities, init_angle_velocities, runtime, target_pos)
    self.state_size = self.task.state_size
    self.action_size = self.task.action_size
    self.state = self.task.reset()

    self.memory = PrioritizedReplay(buffer_size, batch_size, replay_alpha, beta_limit)

    self.actor = Actor(self.state_size, self.action_size, self.task.action_low, self.task.action_high)
    self.actor_weights = self.actor.model.trainable_weights
    self.actor_target = Actor(self.state_size, self.action_size, self.task.action_low, self.task.action_high)

    self.critic = Critic(self.state_size, self.action_size)
    self.critic_weights = self.critic.model.trainable_weights
    self.critic_target = Critic(self.state_size, self.action_size)

    self.gamma = gamma
    # how much influence older weights have when updating the target
    self.tau = 0.03

    # noise
    # GENTLE LANDING
    # self.mu = 0
    # self.theta = 0.1
    # self.sigma = 25
    self.mu = 0
    self.theta = 0.1
    self.sigma = 9
    self.noise = Noise(self.action_size, self.mu, self.theta, self.sigma)

    self.episodes = 0
    self.training_step = 0
def __init__(self, env, act_dim, state_dim, goal_dim, act_range, buffer_size=int(1e6), gamma=0.98, lr=0.001, tau=0.95):
    """Initialization"""
    # Environment and A2C parameters
    self.act_dim = act_dim
    self.act_range = act_range
    self.env_dim = state_dim + goal_dim
    self.gamma = gamma
    self.lr = lr
    self.tau = tau
    self.env = env

    # Create actor and critic networks
    self.actor_network = Actor(self.env_dim, act_dim, act_range)
    self.actor_target_network = Actor(self.env_dim, act_dim, act_range)
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())

    self.critic_network = Critic(self.env_dim, act_dim, act_range)
    self.critic_target_network = Critic(self.env_dim, act_dim, act_range)
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())

    sync_networks(self.actor_network)
    sync_networks(self.critic_network)

    # Optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=lr)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=lr)

    # Replay buffer
    # self.buffer = MemoryBuffer(buffer_size)
    self.buffer = ReplayMemory(buffer_size)

    # Normalizers
    self.goal_normalizer = Normalizer(goal_dim, default_clip_range=5)  # Clip between [-5, 5]
    self.state_normalizer = Normalizer(state_dim, default_clip_range=5)
def __init__(self, env, time_steps, hidden_dim):
    self.name = 'DDPG'  # name for uploading results
    self.scale = env.asset
    self.unit = env.unit
    self.seed = env.rd_seed

    self.time_dim = time_steps
    self.state_dim = env.observation_space.shape[1]
    self.action_dim = env.action_space.shape[0]
    self.batch_size = 64
    self.memory_size = self.time_dim + self.batch_size * 10
    self.start_size = self.time_dim + self.batch_size * 2

    # Initialise actor & critic networks
    self.actor_network = Actor(self.time_dim, self.state_dim, self.action_dim, hidden_dim)
    self.critic_network = Critic(self.time_dim, self.state_dim, self.action_dim, hidden_dim)

    # Initialize replay buffer
    self.replay_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
    self.replay_next_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
    self.replay_action = torch.zeros((self.start_size - 1, 1, self.state_dim), device=cuda)
    self.replay_reward = torch.zeros((self.start_size - 1,), device=cuda)

    # Initialize a random process (Ornstein-Uhlenbeck) for action exploration
    self.exploration_noise = OUNoise(self.action_dim, sigma=0.01 / self.action_dim)

    self.initial()
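Several of the agents above construct an OUNoise process for exploration without showing its definition. A minimal sketch of an Ornstein-Uhlenbeck noise process of the kind these constructors appear to expect, assuming NumPy and a simplified (size, mu, theta, sigma) signature rather than any one of the exact signatures used above:

import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)   # long-run mean of the process
        self.theta = theta             # mean-reversion rate
        self.sigma = sigma             # scale of the random perturbation
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise value."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state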
def test_actor(self):
    Actor_obj = Actor(1, 16, 4)
    Critic_obj = Critic(4, 16, 1)
    # actor_optimizer = optim.SGD(Actor_obj.parameters(), lr=0.1, momentum=0.5)

    # Forward propagation
    y = Actor_obj.forward(torch.FloatTensor([1]))
    self.assertTrue(len(y) == 4)
def test_critic(self):
    Actor_obj = Actor(1, 16, 4)
    Critic_obj = Critic(4, 16, 1)
    # critic_optimizer = optim.SGD(Critic_obj.parameters(), lr=C_learning_rate)

    y = Actor_obj.forward(torch.FloatTensor([1]))
    # Forward propagation
    y_pred = Critic_obj.forward(y)
    self.assertTrue(len(y_pred) == 1)
def __init__(self,
             env=gym.make('Pendulum-v0'),
             s_dim=2,
             a_dim=1,
             gamma=0.99,
             episodes=100,
             tau=0.001,
             buffer_size=1e06,
             minibatch_size=64,
             actor_lr=0.001,
             critic_lr=0.001,
             save_name='final_weights',
             render=False):
    self.save_name = save_name
    self.render = render
    self.env = env
    self.upper_bound = env.action_space.high[0]
    self.lower_bound = env.action_space.low[0]
    self.EPISODES = episodes
    self.MAX_TIME_STEPS = 200
    self.s_dim = s_dim
    self.a_dim = a_dim
    self.GAMMA = gamma
    self.TAU = tau
    self.buffer_size = buffer_size
    self.minibatch_size = minibatch_size
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr

    self.ou_noise = OUNoise(mean=np.zeros(1))

    self.actor = Actor(self.s_dim, self.a_dim).model()
    self.target_actor = Actor(self.s_dim, self.a_dim).model()
    self.actor_opt = tf.keras.optimizers.Adam(learning_rate=self.actor_lr)
    self.target_actor.set_weights(self.actor.get_weights())

    self.critic = Critic(self.s_dim, self.a_dim).model()
    self.critic_opt = tf.keras.optimizers.Adam(learning_rate=self.critic_lr)
    self.target_critic = Critic(self.s_dim, self.a_dim).model()
    self.target_critic.set_weights(self.critic.get_weights())

    self.replay_buffer = ReplayBuffer(self.buffer_size)
def __init__(self, n_state, n_action, a_limit, model_folder=None, memory_size=10000,
             batch_size=32, tau=0.01, gamma=0.99, var=3.0):
    # Record the parameters
    self.n_state = n_state
    self.n_action = n_action
    self.a_limit = a_limit
    self.memory_size = memory_size
    self.model_folder = model_folder
    self.batch_size = batch_size
    self.tau = tau
    self.gamma = gamma
    self.var = var

    # Create the networks and related objects
    self.memory = np.zeros([self.memory_size, 2 * self.n_state + self.n_action + 1], dtype=np.float32)
    self.memory_counter = 0
    self.eval_actor = Actor(self.n_state, self.n_action, self.a_limit)
    self.eval_critic = Critic(self.n_state, self.n_action)
    self.target_actor = Actor(self.n_state, self.n_action, self.a_limit, trainable=False)
    self.target_critic = Critic(self.n_state, self.n_action, trainable=False)

    self.actor_optimizer = Adam(self.eval_actor.parameters(), lr=0.001)
    self.critic_optimizer = Adam(self.eval_critic.parameters(), lr=0.002)
    self.criterion = nn.MSELoss()

    # Make sure the target networks start with the same parameters as the evaluation networks
    self.hardCopy()
def __init__(self, env, memory_size=1000000, batch=128, sigma=0.2, noise_clip=0.5, gamma=0.99, update_frequency=2):
    self.states = env.observation_space
    self.state_size = env.observation_space.shape[0]
    self.actions = env.action_space
    self.action_size = env.action_space.shape[0]

    self.sigma = sigma
    self.noise_clip = noise_clip
    self.gamma = gamma
    self.update_frequency = update_frequency

    self.actor = Actor(self.state_size, self.action_size).to(device)
    self.critic0 = Critic(self.state_size, self.action_size).to(device)
    self.critic1 = Critic(self.state_size, self.action_size).to(device)

    self.target_actor = Actor(self.state_size, self.action_size).to(device)
    self.target_critic0 = Critic(self.state_size, self.action_size).to(device)
    self.target_critic1 = Critic(self.state_size, self.action_size).to(device)

    self.memory = ReplayBuffer(memory_size, batch)

    self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR)
    self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR)
    self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR)

    self.soft_update(self.actor, self.target_actor, 1)
    self.soft_update(self.critic0, self.target_critic0, 1)
    self.soft_update(self.critic1, self.target_critic1, 1)
def MountainCar():
    env = gym.make('MountainCar-v0')
    env = env.unwrapped
    env.reset()
    env.render()

    n_features = env.observation_space.shape[0]
    n_actions = env.action_space.n

    sess = tf.Session()
    actor = Actor(sess, n_features, n_actions, lr=LR_A)
    critic = Critic(sess, n_features, lr=LR_C)
    sess.run(tf.global_variables_initializer())

    game = Game(env, actor, critic)
    game.run_mountain_car()
def main(args):
    with tf.device(args['device']):
        # tf
        tf.set_random_seed(args['rand_seed'])
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

        # env
        env = gym.make('TestEnv-v0')
        env.seed(args['rand_seed'])
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]
        concat_dim = 2
        batched_s_dim = [None, s_dim, concat_dim]
        batched_a_dim = [None, a_dim]

        # agents
        actor = Actor(sess, args['actor_lr'], args['tau'], args['batch_size'],
                      args['clip_val'], batched_s_dim, batched_a_dim)
        critic = Critic(sess, args['critic_lr'], args['tau'], args['clip_val'],
                        batched_s_dim, batched_a_dim)

        # experience
        exp = Experience(args['buffer_size'], args['batch_size'], args['rand_seed'])

        # noise
        actor_noise = ActorNoise(actor.predict, a_dim, noise_type=args['noise_type'])

        # initialize
        init = tf.global_variables_initializer()
        sess.run(init)
        saver = Model(sess, args['restore_path'])
        saver.restore_model()

        # training
        her = HER(saver, exp, env, actor, critic, actor_noise)
        if args['mode'] == 'train':
            her.train(args['gamma'], args['her_k'], args['max_episodes'],
                      args['max_episode_len'], args['replay_len'])
        else:
            her.play(args['max_episodes'], args['max_episode_len'])
def __init__(self, env):
    LR_A = 0.001  # learning rate for actor
    LR_C = 0.01   # learning rate for critic

    num_features = env.observation_space.shape[0]
    # num_features = 14
    num_actions = env.action_space.shape[0]
    self.action_space = env.action_space

    sess = tf.Session()
    self.actor = Actor(
        sess,
        n_features=num_features,
        action_bound=[env.action_space.low[0], env.action_space.high[0]],
        lr=LR_A)
    # we need a good teacher, so the teacher should learn faster than the actor
    self.critic = Critic(sess, n_features=num_features, lr=LR_C)
    sess.run(tf.global_variables_initializer())
def CartPoleAC():
    env1 = gym.make('CartPole-v0')
    # env2 = gym.make('CartPole-v0')
    env1.seed(10)
    # env2.seed(2)
    env1 = env1.unwrapped
    env1.reset()
    # env2 = env2.unwrapped
    # env2.reset()

    n_features = env1.observation_space.shape[0]
    n_actions = env1.action_space.n

    sess = tf.Session()
    actor = Actor(sess, n_features, n_actions, lr=LR_A)
    critic = Critic(sess, n_features, lr=LR_C)
    sess.run(tf.global_variables_initializer())

    g = Game(env1, actor, critic)
    g.run()
import gym
import tensorflow as tf

from actor_critic import Actor, Critic

env = gym.make('CartPole-v0')
env.seed(1)
env = env.unwrapped

N_S = env.observation_space.shape[0]
N_A = env.action_space.n

DISPLAY_REWARD_THRESHOLD = 400  # renders environment if total episode reward is greater than this threshold
RENDER = False  # rendering wastes time

sess = tf.Session()
actor = Actor(s_dim=N_S, a_dim=N_A, learning_rate=0.01, sess=sess)
critic = Critic(s_dim=N_S, learning_rate=0.05, reward_decay=0.9, sess=sess)
sess.run(tf.global_variables_initializer())

for i_episode in range(3000):
    s = env.reset()
    t = 0
    track_r = []
    while True:
        if RENDER:
            env.render()

        a = actor.choose_action(s)
        s_, r, done, info = env.step(a)
EPISODES = 5000
GAMMA = 0.98
ALPHA = 0.005
EPSILON = 0.5
EPSILON_DECAY = 0.1

env = gym.make('Pendulum-v0')
# env = Pendulum()
a_dim = env.action_space.shape[0]
layer_size = [32, 32]
s_dim = env.observation_space.shape[0]

ddpg = DDPG(env, s_dim=s_dim, a_dim=a_dim)
# actor_trained.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mean_squared_error')
# x, y = np.random.rand(2, 3), np.array([0.8, 0.4])
# actor_trained.train_on_batch(x, y)
actor_trained = Actor(s_dim, a_dim).model()
actor_trained.load_weights('training/target_actor_weights')
# actor_untrained = ddpg.actor
print('hi')


def collect_data(act_net):
    a_all, states_all = [], []
    obs = env.reset()
    for t in range(1000):
        obs = np.squeeze(obs)
        if obs.ndim == 1:
            a = act_net(obs[None, :])
        else:
            a = act_net(obs)
        obs, _, done, _ = env.step(a)
        states_all.append(obs)
    + '_d' + str(config.lr_decay_step) + '_' + str(config.lr_decay_rate) \
    + '_T' + str(config.temperature) + '_steps' + str(config.nb_steps) + '_i' + str(config.init_B)
print(dir_)

###################################### TEST #################################

config.is_training = False
config.batch_size = 500  #####  #####
# config.max_length = 50  #####  #####
config.temperature = 1.2  #####  #####

tf.reset_default_graph()
actor = Actor(config)  # Build graph

variables_to_save = [v for v in tf.global_variables() if 'Adam' not in v.name]
# Save & restore all the variables.
saver = tf.train.Saver(var_list=variables_to_save, keep_checkpoint_every_n_hours=1.0)

with tf.Session() as sess:  # start session
    sess.run(tf.global_variables_initializer())  # Run initialize op

    save_path = "save/" + dir_

    predictions_length, predictions_length_w2opt, time_mmodel, time_l2opt = [], [], [], []
    pred_all_2opt, time_all_2opt = [], []
    for i in tqdm(range(1000)):  # test instance
        seed_ = 1 + i
def test_policy(output_dir, env_name, episodes, checkpoint_number):
    """
    Run a learned policy with visualisation in the environment.

    Args:
        output_dir: str. Directory containing a JSON file named 'config.json'
            with experiment metadata, and a subfolder named
            'training_checkpoints' containing model checkpoints.
        env_name: str. Name of the environment to run the policy in.
        episodes: int. Number of episodes to run the policy for.
        checkpoint_number: int or None. Index of the checkpoint to restore;
            if None, the latest checkpoint is used.
    """
    # Load experimental metadata from file
    with open(os.path.join(output_dir, 'config.json'), 'r') as f:
        exp_data = json.load(f)

    # Create environment
    env = gym.make(env_name)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Share information about action space with policy architecture
    ac_kwargs = dict(hidden_sizes=exp_data['ac_kwargs']['hidden_sizes'])
    ac_kwargs['action_space'] = env.action_space

    # Randomly initialise critic and actor networks
    critic = Critic(input_shape=(exp_data['batch_size'], obs_dim + act_dim), **ac_kwargs)
    actor = Actor(input_shape=(exp_data['batch_size'], obs_dim), **ac_kwargs)

    # Optimizers
    critic_optimizer = tf.keras.optimizers.Adam(exp_data['q_lr'])
    actor_optimizer = tf.keras.optimizers.Adam(exp_data['pi_lr'])

    checkpoint_dir = os.path.join(output_dir, 'training_checkpoints')
    checkpoint = tf.train.Checkpoint(critic_optimizer=critic_optimizer,
                                     actor_optimizer=actor_optimizer,
                                     critic=critic,
                                     actor=actor)
    if checkpoint_number is not None:
        checkpoint.restore(
            os.path.join(checkpoint_dir, f'ckpt-{checkpoint_number}')).expect_partial()
    else:
        checkpoint.restore(
            tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()

    # Run policy for specified number of episodes, recording return
    ep_rets = np.zeros(episodes)
    for i in range(episodes):
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        while not (d or (ep_len == exp_data['max_ep_len'])):
            env.render()
            o, r, d, _ = env.step(actor(o.reshape(1, -1)))
            if type(r) == np.ndarray:
                r = r[0]
            ep_ret += r
            ep_len += 1
        ep_rets[i] = ep_ret
        print(f'Episode {i}: return={ep_ret:.0f} length={ep_len}')

    # Summary stats
    print(f'avg={ep_rets.mean():.0f} std={ep_rets.std():.0f} '
          f'min={ep_rets.min():.0f} max={ep_rets.max():.0f}')

    env.close()
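A hypothetical invocation of test_policy, assuming an experiment directory laid out the way the ddpg() training function later in this section saves it; the directory name and environment here are placeholders, not values from the original code:

# Visualise the latest checkpoint of a hypothetical experiment for 5 episodes.
test_policy(output_dir='output/ddpg-pendulum',  # placeholder experiment directory
            env_name='Pendulum-v0',
            episodes=5,
            checkpoint_number=None)             # None -> restore the latest checkpoint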
from job_generator import JobGenerator
from element import Machine, Job
import plot
import tensorflow as tf
import numpy as np
from actor_critic import Actor, Critic
import os

LOG_DIR = "./log"
LOG_FILE = "log_rl"
MODEL_DIR = "./model"

if __name__ == '__main__':
    sess = tf.Session()
    pa = Parameter()
    actor = Actor(sess, pa)
    critic = Critic(sess, pa)
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter(LOG_DIR, sess.graph)
    saver = tf.train.Saver()
    logger = open(LOG_FILE, "w")  # file to record the logs

    if not os.path.exists(MODEL_DIR):
        os.mkdir(MODEL_DIR)

    Machine.reset()
    Job.reset()

    env = Environment(pa)
    mac_gen = MacGenerator(pa)
    job_gen = JobGenerator(pa)
    return inputs


if __name__ == '__main__':
    # load the model param
    model_path = 'saved_models/%s/%s/model.pt' % (args.env, args.her_strat)
    o_mean, o_std, g_mean, g_std, model = torch.load(model_path, map_location=lambda storage, loc: storage)

    # create the environment
    env = gym.make(args.env)

    # get the env param
    obs = env.reset()

    # get the environment params
    act_dim = env.action_space.shape[0]
    env_dim = obs['observation'].shape[0] + obs['desired_goal'].shape[0]
    act_range = env.action_space.high[0]

    # create the actor network
    actor_network = Actor(env_dim, act_dim, act_range)
    actor_network.load_state_dict(model)
    actor_network.eval()

    for i in range(DEMO_LENGHT):
        observation = env.reset()
        # start to do the demo
        obs = observation['observation']
        g = observation['desired_goal']
        for t in range(env._max_episode_steps):
            env.render()
            inputs = process_inputs(obs, g, o_mean, o_std, g_mean, g_std)
            with torch.no_grad():
                pi = actor_network(inputs)
            action = pi.detach().numpy().squeeze()
            # put actions into the environment
            observation_new, reward, done, info = env.step(action)
def train(self, max_episode=10, max_path_length=200, verbose=0):
    env = self.env
    avg_reward_sum = 0.

    # f_eps = open("episode.csv", "w")
    # write_eps = csv.write(f_eps)

    for e in range(max_episode):
        env._reset()
        observation = env._reset()
        game_over = False
        reward_sum = 0

        inputs = []
        outputs = []
        predicteds = []
        rewards = []

        # f_iter = open("episode_{0}.csv".format(e), "w")
        # write_iter = csv.writer(f_iter)
        f_episode = "episode_{0}.csv".format(e)
        os.system("rm -rf {0}".format(f_episode))

        print(observation[0].shape, observation[1].shape)

        sess = tf.Session()
        actor = Actor(sess,
                      n_actions=self.env.action_space.n
                      # output_graph=True,
                      )
        # we need a good teacher, so the teacher should learn faster than the actor
        critic = Critic(sess, n_actions=self.env.action_space.n)
        sess.run(tf.global_variables_initializer())

        while not game_over:
            action, aprob = actor.choose_action(observation)

            inputs.append(observation)
            predicteds.append(aprob)

            y = np.zeros([self.env.action_space.n])
            y[action] = 1.
            outputs.append(y)

            observation_, reward, actual_reward, game_over, info = self.env._step(action)
            reward_sum += float(actual_reward)
            print(reward)
            # rewards.append(float(reward))
            rewards.append(float(reward))

            # After env.step
            td_error = critic.learn(observation, reward_sum, observation_)  # gradient = grad[r + gamma * V(s_) - V(s)]
            actor.learn(observation, action, td_error)  # true_gradient = grad[logPi(s,a) * td_error]

            # check memory for RNN model
            if len(inputs) > self.max_memory:
                del inputs[0]
                del outputs[0]
                del predicteds[0]
                del rewards[0]

            if verbose > 0:
                if env.actions[action] == "LONG" or env.actions[action] == "SHORT":
                    # if env.actions[action] == "LONG" or env.actions[action] == "SHORT" or env.actions[action] == "HOLD":
                    color = bcolors.FAIL if env.actions[action] == "LONG" else bcolors.OKBLUE
                    print("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], color + env.actions[action] + bcolors.ENDC,
                                                     reward_sum, info["cum"])
                          + ("\t".join(["%s:%.2f" % (l, i) for l, i in zip(env.actions, aprob.tolist())])))
                    # write_iter.writerow("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], env.actions[action], reward_sum, info["cum"]) + ("\t".join(["%s:%.2f" % (l, i) for l, i in zip(env.actions, aprob.tolist())])))
                    os.system("echo %s >> %s" % ("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], env.actions[action],
                                                                            reward_sum, info["cum"])
                                                 + ("\t".join(["%s:%.2f" % (l, i) for l, i in zip(env.actions, aprob.tolist())])),
                                                 f_episode))

        avg_reward_sum = avg_reward_sum * 0.99 + reward_sum * 0.01
        toPrint = "%d\t%s\t%s\t%.2f\t%.2f" % (e, info["code"],
                                              (bcolors.FAIL if reward_sum >= 0 else bcolors.OKBLUE)
                                              + ("%.2f" % reward_sum) + bcolors.ENDC,
                                              info["cum"], avg_reward_sum)
        print(toPrint)
        if self.history_filename != None:
            os.system("echo %s >> %s" % (toPrint, self.history_filename))

        dim = len(inputs[0])
        inputs_ = [[] for i in range(dim)]
        for obs in inputs:
            for i, block in enumerate(obs):
                inputs_[i].append(block[0])
        inputs_ = [np.array(inputs_[i]) for i in range(dim)]
        outputs_ = np.vstack(outputs)
        predicteds_ = np.vstack(predicteds)
        rewards_ = np.vstack(rewards)

        print("shape: ", np.shape(rewards))
        print("fit model input.shape %s, output.shape %s" % ([inputs_[i].shape for i in range(len(inputs_))],
                                                             outputs_.shape))

        np.set_printoptions(linewidth=200, suppress=True)
        print("currentTargetIndex:", env.currentTargetIndex)
MAX_EPISODE = 1000
MAX_EP_STEPS = 2000
LR_A = 0.001
LR_C = 0.01

env = gym.make('MountainCar-v0')
env = env.unwrapped
env.seed(1)

n_features = env.observation_space.shape[0]
n_actions = env.action_space.n

sess0 = tf.Session()
sess1 = tf.Session()

rl = [
    [
        Actor(sess0, n_features, n_actions, name='actor0', lr=LR_A),
        Critic(sess0, n_features, name='critic0', lr=LR_C)
    ],
    [
        Actor(sess1, n_features, n_actions, name='actor1', lr=LR_A),
        Critic(sess1, n_features, name='critic1', lr=LR_C)
    ]
]

sess0.run(tf.global_variables_initializer())
sess1.run(tf.global_variables_initializer())

episode_positive = []
episode_negative = []
episode_mix = []

for episode in range(MAX_EPISODE):
    for i in range(len(rl)):
        step = 0
def ddpg(env_fn, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100,
         replay_size=int(1e6), discount=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3,
         batch_size=100, start_steps=10000, act_noise=0.1, max_ep_len=1000,
         logger_kwargs=dict(), save_freq=1):
    """
    Implements the deep deterministic policy gradient algorithm.

    Performance statistics are logged to stdout and to file in CSV format, and
    models are saved regularly during training.

    Args:
        env_fn: callable. Must load an instance of an environment that
            implements the OpenAI Gym API.
        ac_kwargs: dict. Additional keyword arguments to be passed to the
            Actor and Critic constructors.
        seed: int. Random seed.
        steps_per_epoch: int. Number of training steps or environment
            interactions that make up one epoch.
        epochs: int. Number of epochs for training.
        replay_size: int. Maximum number of transitions that can be stored in
            the replay buffer.
        discount: float. Rate of discounting on future reward, usually denoted
            with the Greek letter gamma. Normally between 0 and 1.
        polyak: float. Weighting of target estimator parameters in the target
            update (which is a "polyak" average).
        pi_lr: float. Learning rate for the policy or actor estimator.
        q_lr: float. Learning rate for the Q or critic estimator.
        batch_size: int. Number of transitions to sample from the replay
            buffer per gradient update of the estimators.
        start_steps: int. Number of initial training steps where actions are
            chosen at random instead of from the policy, as a means of
            increasing exploration.
        act_noise: float. Scale (standard deviation) of the Gaussian noise
            added to the policy for exploration during training.
        max_ep_len: int. Maximum number of steps for one episode in the
            environment. Episode length may be shorter if there are terminal
            states.
        logger_kwargs: dict. Keyword arguments to be passed to the logger.
            Can be set up using utils.setup_logger_kwargs().
        save_freq: int. Models are saved per this number of epochs.
    """
    # Set up logging
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Set random seed for relevant modules
    tf.random.set_seed(seed)
    np.random.seed(seed)

    # Create environment
    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    if env._max_episode_steps < max_ep_len:
        max_ep_len = env._max_episode_steps
    if steps_per_epoch % max_ep_len != 0:
        """
        Training steps are batched at the end of a trajectory, so if episode
        length does not divide steps per epoch, the size of training step log
        arrays can be inconsistent. This takes the upper bound on size, which
        wastes some memory but is easy.
""" max_logger_steps = steps_per_epoch + max_ep_len - (steps_per_epoch % max_ep_len) else: max_logger_steps = steps_per_epoch # Action limit for clipping # Assumes all dimensions have the same limit act_limit = env.action_space.high[0] # Give actor-critic model access to action space ac_kwargs['action_space'] = env.action_space # Randomly initialise critic and actor networks critic = Critic(input_shape=(batch_size, obs_dim + act_dim), lr=q_lr, **ac_kwargs) actor = Actor(input_shape=(batch_size, obs_dim), lr=pi_lr, **ac_kwargs) # Initialise target networks with the same weights as main networks critic_target = Critic(input_shape=(batch_size, obs_dim + act_dim), **ac_kwargs) actor_target = Actor(input_shape=(batch_size, obs_dim), **ac_kwargs) critic_target.set_weights(critic.get_weights()) actor_target.set_weights(actor.get_weights()) # Initialise replay buffer for storing and getting batches of transitions replay_buffer = ReplayBuffer(obs_dim, act_dim, size=replay_size) # Set up model checkpointing so we can resume training or test separately checkpoint_dir = os.path.join(logger.output_dir, 'training_checkpoints') checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt") checkpoint = tf.train.Checkpoint(critic=critic, actor=actor) def get_action(o, noise_scale): """ Computes an action from the policy (as a function of the observation `o`) with added noise (scaled by `noise_scale`), clipped within the bounds of the action space. """ a = actor(o.reshape(1, -1)) a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) @tf.function def train_step(batch): """ Performs a gradient update on the actor and critic estimators from the given batch of transitions. Args: batch: dict. A batch of transitions. Must store valid values for 'obs1', 'acts', 'obs2', 'rwds', and 'done'. Obtained from ReplayBuffer.sample_batch(). Returns: A tuple of the Q values, critic loss, and actor loss. """ with tf.GradientTape(persistent=True) as tape: # Critic loss q = critic(batch['obs1'], batch['acts']) q_pi_targ = critic_target(batch['obs2'], actor_target(batch['obs2'])) backup = tf.stop_gradient(batch['rwds'] + discount * (1 - batch['done']) * q_pi_targ) q_loss = tf.reduce_mean((q - backup)**2) # Actor loss pi = actor(batch['obs1']) q_pi = critic(batch['obs1'], pi) pi_loss = -tf.reduce_mean(q_pi) # Q learning update critic_gradients = tape.gradient(q_loss, critic.trainable_variables) critic.optimizer.apply_gradients(zip(critic_gradients, critic.trainable_variables)) # Policy update actor_gradients = tape.gradient(pi_loss, actor.trainable_variables) actor.optimizer.apply_gradients(zip(actor_gradients, actor.trainable_variables)) return q, q_loss, pi_loss def test_agent(n=10): """ Evaluates the deterministic (noise-free) policy with a sample of `n` trajectories. """ for _ in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(n, TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs for t in range(total_steps): """ Start with `start_steps` number of steps with random actions, to improve exploration. Then use the learned policy with some noise added to keep up exploration (but less so). 
""" if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Execute a step in the environment o2, r, d, _ = env.step(a) o2 = np.squeeze(o2) # bug fix for Pendulum-v0 environment, where act_dim == 1 ep_ret += r ep_len += 1 """ Ignore the "done" signal if it comes from hitting the time horizon (that is, when it's an artificial terminal signal that isn't based on the agent's state) """ d = False if ep_len==max_ep_len else d # Store transition in replay buffer replay_buffer.store(o, a, r, o2, d) # Advance the stored state o = o2 if d or (ep_len == max_ep_len): """ Perform all DDPG updates at the end of the trajectory, in accordance with tuning done by TD3 paper authors. """ for _ in range(ep_len): batch = replay_buffer.sample_batch(batch_size) # Actor-critic update q, q_loss, pi_loss = train_step(batch) logger.store((max_logger_steps, batch_size), QVals=q.numpy()) logger.store(max_logger_steps, LossQ=q_loss.numpy(), LossPi=pi_loss.numpy()) # Target update critic_target.polyak_update(critic, polyak) actor_target.polyak_update(actor, polyak) logger.store(max_logger_steps // max_ep_len, EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Post-training for this epoch: save, test and write logs if t > 0 and (t+1) % steps_per_epoch == 0: epoch = (t+1) // steps_per_epoch # Save the model if (epoch % save_freq == 0) or (epoch == epochs - 1): checkpoint.save(file_prefix=checkpoint_prefix) # Test the performance of the deterministic policy test_agent() # Log info about the epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t+1) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()