def __init__(self, env_name='Hopper-v2', total_episodes=1000, learning_steps=1000,
             gpu=0, update_time=1, gamma=1, episode_length=1000, total_steps=int(1e6),
             lr=1e-3, action_bound=1, num_samples=10, noise=0.02, std_dev=0.03,
             batch_size=100, elite_percentage=0.2, mutate=0.9, crossover=0.2,
             hidden_size=64, seed=1, namescope='default'):
    self.env = gym.make(env_name)
    np.random.seed(seed)
    self.env.seed(seed)
    tf.set_random_seed(seed)
    self.input_size = self.env.observation_space.shape[0]
    self.output_size = self.env.action_space.shape[0]
    self.total_episodes = total_episodes
    self.episode_length = episode_length
    self.total_steps = total_steps
    self.update_time = update_time
    self.lr = lr
    self.gamma = gamma
    self.action_bound = action_bound
    self.num_samples = num_samples
    self.noise = noise
    self.stddev = std_dev
    self.batch_size = batch_size
    self.elite_percentage = elite_percentage
    self.mutate = mutate
    self.crossover = crossover
    self.hidden_size = hidden_size
    self.normalizer = utils.Normalizer(self.input_size)
    self.namescope = namescope
    # config = tf.ConfigProto(device_count={'GPU': gpu})
    self.learning_steps = learning_steps
    self.td3_agent = ddpg.DDPG(self.input_size, self.output_size, 1,
                               namescope=self.namescope, hidden_size=hidden_size,
                               seed=seed)
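Both this constructor and the similar one later in this collection wire up a utils.Normalizer(self.input_size) for observation normalization, but the utils module itself is not part of the excerpt. Below is a minimal sketch of what such a running-statistics normalizer commonly looks like (Welford-style online updates; the class and method names are assumptions, not the repository's actual code):

import numpy as np

class Normalizer:
    def __init__(self, num_inputs):
        self.n = 0                              # number of observations seen
        self.mean = np.zeros(num_inputs)
        self.mean_diff = np.zeros(num_inputs)   # running sum of squared deviations
        self.var = np.ones(num_inputs)

    def observe(self, x):
        # Welford's online update of the per-dimension mean and variance.
        self.n += 1
        last_mean = self.mean.copy()
        self.mean += (x - self.mean) / self.n
        self.mean_diff += (x - last_mean) * (x - self.mean)
        self.var = np.maximum(self.mean_diff / self.n, 1e-2)

    def normalize(self, x):
        return (x - self.mean) / np.sqrt(self.var)

A typical rollout would call observe() on every state it sees and normalize() before feeding the state to the policy.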
def run(total_eps=2, message=False, render=False, map_path="Maps/map.png",
        model_path="save/", gif_path="out/", gif_name="test.gif"):
    if not os.path.exists(gif_path):
        os.makedirs(gif_path)
    images = []
    RL = ddpg.DDPG(
        model=[models.PolicyNet, models.QNet],
        learning_rate=[0.0001, 0.0001],
        reward_decay=0.99,
        memory_size=10000,
        batch_size=64)
    RL.save_load_model("load", model_path)
    env = NavigationEnv(path=map_path)

    for eps in range(total_eps):
        step = 0
        max_success_rate = 0
        success_count = 0
        state = env.initialize()
        r_eps = []
        acc_reward = 0.
        while True:
            # Choose action and run
            action = RL.choose_action(state, eval=True)
            state_next, reward, done = env.step(action)
            im = env.render(gui=render)
            im_pil = Image.fromarray(cv2.cvtColor(np.uint8(im * 255), cv2.COLOR_BGR2RGB))
            images.append(im_pil)

            # Record and print information
            r_eps.append(reward)
            acc_reward += reward
            if message:
                print('\rEps: {:2d}| Step: {:4d} | action:{:+.2f}| R:{:+.2f}| Reps:{:.2f} '
                      .format(eps, step, action[0], reward, acc_reward), end='')
            state = state_next.copy()
            step += 1
            if done or step > 600:
                if message:
                    print()
                break

    print("Save evaluation GIF ...")
    images[0].save(gif_path + gif_name, save_all=True, append_images=images[1:],
                   optimize=True, duration=40, loop=0)
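Since Pillow interprets duration as milliseconds per frame, duration=40 plays the evaluation rollout back at 25 fps, and loop=0 makes the resulting GIF repeat indefinitely.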
def main(arglist):
    env = gym.make(arglist.scenario)
    writer = SummaryWriter(log_dir='./logs/')
    critic = agent.Critic(env.observation_space.shape[0], env.action_space.shape[0]).to(device)
    actor = agent.Actor(env.observation_space.shape[0], 2).to(device)
    target_critic = agent.Critic(env.observation_space.shape[0], env.action_space.shape[0], arglist.tau).to(device)
    target_actor = agent.Actor(env.observation_space.shape[0], 2, arglist.tau).to(device)
    actor.eval()
    critic.eval()
    target_actor.eval()
    target_critic.eval()
    ddpg_algo = ddpg.DDPG(actor, critic, target_actor, target_critic,
                          arglist.gamma, arglist.batch_size, arglist.eval)
    ddpg_algo.load('./saved/actor_' + str(arglist.load_episode_saved),
                   './saved/critic_' + str(arglist.load_episode_saved))

    for episode in range(arglist.max_episode):
        obs = env.reset()
        done = False
        j = 0
        ep_ave_max_q_value = 0
        total_reward = 0
        while not done:
            if not arglist.eval:
                env.render()
            action = ddpg_algo.act(obs)
            obs2, reward, done, info = env.step(action)
            total_reward += reward
            # Note: in this script arglist.eval gates training, checkpointing,
            # and logging, so it effectively acts as a "train" switch.
            if arglist.eval:
                ep_ave_max_q_value += ddpg_algo.train(action, [reward], obs, obs2, [done])
            obs = obs2
            j += 1
        if arglist.eval and episode % arglist.saved_episode == 0 and episode > 0:
            critic.save_model('./saved/critic_' + str(episode))
            actor.save_model('./saved/actor_' + str(episode))
        if arglist.eval:
            print('average_max_q: ', ep_ave_max_q_value / float(j),
                  'reward: ', total_reward, 'episode:', episode)
            writer.add_scalar('Average_max_q', ep_ave_max_q_value / float(j), episode)
            writer.add_scalar('Reward', total_reward, episode)
    env.close()
def __init__(self, env_name='Hopper-v2', total_episodes=1000, action_bound=1,
             episode_length=1000, learning_rate=0.02, weight=0.01, learning_steps=100,
             num_samples=8, noise=0.02, bc_index=[], std_dev=0.03, syn_step=1,
             meta_population_size=5, seed=1, hidden_size=300):
    self.env = gym.make(env_name)
    np.random.seed(seed)
    self.env.seed(seed)
    self.action_bound = action_bound
    self.input_size = self.env.observation_space.shape[0]
    self.output_size = self.env.action_space.shape[0]
    self.total_episodes = total_episodes
    self.episode_length = episode_length
    self.lr = learning_rate
    self.num_samples = num_samples
    self.noise = noise
    self.meta_population_size = meta_population_size
    self.seed = seed
    self.syn_step = syn_step
    self.learning_steps = learning_steps
    self.bc_index = bc_index
    self.weight = weight
    self.normalizer = utils.Normalizer(self.env.observation_space.shape[0])
    self.hidden_size = hidden_size
    self.stddev = std_dev
    self.td3_agent = ddpg.DDPG(self.input_size, self.output_size, 1,
                               hidden_size=self.hidden_size, seed=seed)
    self.num_best_deltas = 4
def create_agents(self, env, arglist):
    # workers = []
    algo_mode = self._algo_mode_from_agents(env)
    obs_shapes = [env.get_env().observation_space[i].shape for i in range(env.get_env().n)]
    actions_shape_n = [env.get_env().action_space[i].n for i in range(env.get_env().n)]
    actions_n = 0
    obs_shape_n = 0
    for actions in actions_shape_n:
        actions_n += actions
    for obs_shape in obs_shapes:
        obs_shape_n += obs_shape[0]
    for i, action_space, observation_space, algo in zip(range(len(env.get_env().action_space)),
                                                        env.get_env().action_space,
                                                        env.get_env().observation_space,
                                                        algo_mode):
        discrete_action = not isinstance(action_space, Box)
        if algo == ddpg.MADDPG:
            print('MADDPG load.')
            critic = agent.Critic(obs_shape_n, actions_n).to(device)
            actor = agent.Actor(observation_space.shape[0], action_space.n).to(device)
            target_critic = agent.Critic(obs_shape_n, actions_n, arglist.tau).to(device)
            target_actor = agent.Actor(observation_space.shape[0], action_space.n, arglist.tau).to(device)
        else:
            print('DDPG load.')
            critic = agent.Critic(observation_space.shape[0], action_space.n).to(device)
            actor = agent.Actor(observation_space.shape[0], action_space.n).to(device)
            target_critic = agent.Critic(observation_space.shape[0], action_space.n, arglist.tau).to(device)
            target_actor = agent.Actor(observation_space.shape[0], action_space.n, arglist.tau).to(device)
        actor.eval()
        critic.eval()
        target_actor.eval()
        target_critic.eval()
        ddpg_algo = ddpg.DDPG(i, actor, critic, target_actor, target_critic,
                              arglist.gamma, arglist.batch_size, arglist.eval,
                              discrete_action, alg_mode=algo)
        ddpg_algo.load('./saved/actor' + str(i) + '_' + str(arglist.load_episode_saved),
                       './saved/critic' + str(i) + '_' + str(arglist.load_episode_saved))
        self.workers.append(ddpg_algo)
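The branch above is the defining difference between the two algorithms: in the MADDPG case the critic is centralized, taking the summed observation and action dimensions of every agent (obs_shape_n and actions_n computed above), while each actor is conditioned only on its own observation; in the plain DDPG case both networks stay local to the single agent.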
def __init__(self, numChans, states, numSteps):
    self.actions = np.zeros((numChans + 1, numChans))
    for k in range(numChans):
        self.actions[k + 1, k] = 1
    self.numChans = numChans
    self.numActions = np.shape(self.actions)[0]
    self.actionTally = np.zeros(numChans + 1)
    self.actionHist = np.zeros((numSteps, numChans))
    self.actionHistInd = np.zeros(numSteps)
    self.goodChans = np.ones(numChans)
    self.states = states
    self.numStates = np.shape(states)[0]
    self.stateHist = np.zeros((numSteps, numChans))
    self.stateTally = np.zeros(self.numStates)
    self.rewardHist = np.zeros(numSteps)
    self.rewardTally = np.zeros(numChans + 1)
    self.cumulativeReward = np.zeros(numSteps)
    self.rewardTrans = np.zeros((self.numActions, self.numStates, self.numStates))
    self.exploreHist = []
    self.policy = np.zeros(numChans)
    self.n_actions = numChans + 1
    self.n_features = numChans
    self.ddpg_ = ddpg.DDPG(self, self.n_actions, self.n_features, self.n_actions + 1)
    self.type = "ddpg"
    self.hyperType = "learning"
    self.var = 1
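The first row of the action table is all zeros (select no channel) and row k + 1 is one-hot on channel k, so for numChans = 3 the table is [0,0,0], [1,0,0], [0,1,0], [0,0,1]. That is also why numActions, actionTally, and rewardTally are all sized numChans + 1.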
def __init__(self):
    # Create the environment and render
    self._env = Env(True)
    # Extract the dimensions of states and actions
    observation_dim = self._env.observation_space.low.size
    action_dim = self._env.action_space.low.size
    self._device = 'cpu'
    # Uncomment if you do training on the GPU
    # self._device = 'cuda:0'
    hidden_sizes = [256] * 2
    self._q_net = networks.QvalueNetwork(
        hidden_sizes=hidden_sizes,
        input_size=observation_dim + action_dim).to(device=self._device)
    self._target_q_net = networks.QvalueNetwork(
        hidden_sizes=hidden_sizes,
        input_size=observation_dim + action_dim).to(device=self._device)
    self._policy_net = networks.PolicyNetwork(
        hidden_sizes=hidden_sizes,
        input_size=observation_dim,
        output_size=action_dim).to(device=self._device)
    self._target_policy_net = networks.PolicyNetwork(
        hidden_sizes=hidden_sizes,
        input_size=observation_dim,
        output_size=action_dim).to(device=self._device)
    # Target update rate
    tau = 0.001
    # Set to True if you want to slow down the simulator
    self._slow_simulation = False
    # Create the DDPG agent
    self.agent = ddpg.DDPG(q_net=self._q_net,
                           target_q_net=self._target_q_net,
                           policy_net=self._policy_net,
                           target_policy_net=self._target_policy_net,
                           tau=tau,
                           device=self._device)
    # Create a replay buffer - we use one from a popular framework here
    self._replay = replay.SimpleReplayBuffer(
        max_replay_buffer_size=1000000,
        observation_dim=observation_dim,
        action_dim=action_dim,
        env_info_sizes={},
    )
    # Stores the cumulative rewards
    self._rewards = []
    self._rewards_test = []
    # The following logging currently works only on Linux and might cause issues on Windows
    folder = 'experiment_data_test_runs'
    # Generate a random hash string - a unique identifier in case we start
    # multiple experiments at the same time
    rand_id = hashlib.md5(os.urandom(128)).hexdigest()[:8]
    self._file_path = './' + folder + '/' + time.ctime().replace(' ', '_') + '__' + rand_id
    # Create experiment folder
    if not os.path.exists(self._file_path):
        os.makedirs(self._file_path)
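The tau = 0.001 passed to ddpg.DDPG above is the target update rate, which in DDPG conventionally means a soft (polyak) update of the target networks after each learning step. The agent's implementation is not shown in this excerpt; the sketch below illustrates the update under that assumption (the function name and loop are illustrative, not this repository's code):

import torch

def soft_update(target_net, net, tau):
    # target <- tau * source + (1 - tau) * target, parameter by parameter
    with torch.no_grad():
        for t_param, param in zip(target_net.parameters(), net.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * param)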
import gym
import ddpg

test = False

ddpg_alg = ddpg.DDPG(
    env_fn=lambda: gym.make("HalfCheetah-v2"),
    pi_hidden_sizes=(64,),
    q_hidden_sizes=(64,),
    gamma=0.99,
    rho=0.995,
    action_noise=0.1,
    replay_buffer_size=3 if test else 1_000_000,
    log_dir="/Users/harrygiles/tmp/my-rl/ddpg",
    exploration_period=10_000,
)
ddpg_alg.train(
    epochs=2 if test else 50,
    epoch_size=10 if test else 5_000,
    batch_size=10 if test else 100,
    max_episode_length=1_000,
)
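Here rho = 0.995 plays the role of the polyak averaging coefficient for the target networks (the counterpart of tau = 1 - rho in the other snippets), and exploration_period = 10_000 presumably warms up the replay buffer with exploratory actions before learned-policy rollouts begin. The test flag shrinks the buffer, epochs, and batches so the whole pipeline can be smoke-tested in seconds.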
from nav_environment import NavigationEnv
import ddpg
import models
import numpy as np
import os
import eval_ddpg

batch_size = 64
eval_eps = 50
RL = ddpg.DDPG(
    model=[models.PolicyNet, models.QNet],
    learning_rate=[0.0001, 0.0001],
    reward_decay=0.99,
    memory_size=10000,
    batch_size=batch_size)

is_train = True
render = True
load_model = False
'''
is_train = False
render = True
load_model = True
'''

gif_path = "out/"
model_path = "save/"
if not os.path.exists(model_path):
    os.makedirs(model_path)
if load_model:
    print("Load model ...", model_path)
import matplotlib.pyplot as plt
import json
import cv2
import numpy as np

import GSlamContBot2DWrapper
import ddpg
import models

#%%
env = GSlamContBot2DWrapper.Bot2DEnv(obs_size=128,
                                     grid_size=3,
                                     map_path="Image/map9.png",
                                     task="Exploration")
memory_size = 1000
RL = ddpg.DDPG(
    actor_net=models.ActorExp,
    critic_net=models.CriticExp,
    n_actions=2,
    learning_rate=[0.0001, 0.0002],
    reward_decay=0.95,
    memory_size=memory_size,
    batch_size=64,
    var=2,
    var_decay=0.9999)

#%%
seq_size = 3
if __name__ == '__main__':
    total_step = 0
    reward_rec = []
    for eps in range(1000):
        state = env.reset()
        state_m = cv2.resize(state["map"], (64, 64), interpolation=cv2.INTER_LINEAR)
        state_m = np.tile(np.expand_dims(state_m, -1), (1, 1, seq_size))
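The np.tile(np.expand_dims(state_m, -1), (1, 1, seq_size)) call turns the single 64x64 map into a (64, 64, 3) stack by repeating it, so the episode starts with a full frame history rather than zero padding; subsequent steps presumably shift new frames into this stack, though that part of the loop is not shown here.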
if __name__ == "__main__":
    env = gym.make(ENV)
    with tf.Session() as sess:
        training = "-n" in sys.argv
        actor = ddpg.DDPG(3, 1,
                          memory=0.99,
                          actor_lr=0.001,
                          critic_lr=0.001,
                          tau=0.1,
                          exp_batch=1024,
                          training=training)
        saver = tf.train.Saver()
        if "-n" in sys.argv:
            sess.run(tf.global_variables_initializer())
        else:
            saver.restore(sess, "model/pendelum")
            print("Restored...")
        try:
            if "-p" in sys.argv:
                print("Playing...")
                gym_wrapper.play(env, actor, a_mod=action_modifier)
def train_PG(
        exp_name,
        env_name,
        n_iters,
        gamma,
        min_timesteps_per_batch,
        max_path_length,
        lr,
        normalize_advantages,
        nn_baseline,
        seed,
        n_layers,
        hidden_size,
        discrete,
        logdir,
        method,
        method_args):
    start = time.time()

    # env
    # env = gym.make(env_name)
    # TODO:
    env = ChallengeSeqDecEnvironment(experimentCount=3005, userID="jingw2",
                                     timeout=5, realworkercount=4)
    env.state_size = 1
    env.action_size = 2

    # set up logger
    setup_logger(logdir, locals())

    # random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)
    if hasattr(env, 'seed'):
        env.seed(seed)

    # set attributes
    if isinstance(env, gym.Env):
        max_path_length = max_path_length or env.spec.max_episode_steps
        discrete = isinstance(env.action_space, gym.spaces.Discrete)
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n if discrete else env.action_space.shape[0]
    else:
        if hasattr(env, 'state_size'):
            state_size = env.state_size
        else:
            raise Exception("Environment must have attribute state_size or be a gym.Env!")
        if hasattr(env, 'action_size'):
            action_size = env.action_size
        else:
            raise Exception("Environment must have attribute action_size or be a gym.Env!")

    net_args = {
        "n_layers": n_layers,
        "state_size": state_size,
        "action_size": action_size,
        "discrete": discrete,
        "hidden_size": hidden_size,
        "learing_rate": lr,
        "output_activation": None
    }
    trajectory_args = {
        "max_path_length": max_path_length,
        "min_timesteps_per_batch": min_timesteps_per_batch
    }
    reward_args = {
        "gamma": gamma,
        "nn_baseline": nn_baseline,
        "normalize_advantage": normalize_advantages
    }

    if method == "sac":
        agent = sac.SAC(net_args, trajectory_args, reward_args, method_args)
    elif method == "ddpg":
        agent = ddpg.DDPG(net_args, trajectory_args, reward_args, method_args)
    elif method == "vpg":
        agent = Agent(net_args, trajectory_args, reward_args)

    # create networks
    agent.build_net()

    total_timesteps = 0
    for it in range(n_iters):
        print("=============Iteration {}==============".format(it))
        paths, timesteps_this_batch = agent.sample_trajectories(it, env)
        # TODO:
        env = ChallengeSeqDecEnvironment(experimentCount=3005, userID="jingw2",
                                         timeout=5, realworkercount=4)
        total_timesteps += timesteps_this_batch

        states = np.concatenate([path["state"] for path in paths])
        actions = np.concatenate([path["action"] for path in paths])
        rewards = [path["reward"] for path in paths]
        # next_states = np.concatenate([path["next_state"] for path in paths])

        states_input = torch.Tensor(states).float()
        actions_input = torch.Tensor(actions).float()
        if method == "vpg":
            q_n, adv = agent.estimate_return(states_input, rewards)
            agent.train_op(states_input, actions_input, q_n, adv)
        else:
            agent.train_op()

        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        best_idx = np.argmax(returns)
        best_path = paths[best_idx]
        best_policy = {}
        for i in range(5):
            best_policy[str(i + 1)] = best_path["action"][i].tolist()
        data = {"method": method,
                "best_policy": [best_policy],
                "best_reward": returns[best_idx]}
        data = pd.DataFrame(data)
        if os.path.exists("best_policy_pg.csv"):
            policy_df = pd.read_csv("best_policy_pg.csv")
            policy_df.loc[len(policy_df)] = [method, best_policy, returns[best_idx]]
        else:
            policy_df = data
        policy_df.to_csv("best_policy_pg.csv", index=False)

        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", it)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
epochs = 1000
steps = 1000
updateTargetNetwork = 5000
explorationRate = 0
minibatch_size = 32
learnStart = 32
learningRate = 0.0001
discountFactor = 0.99
memorySize = 1000000
network_inputs = 12
network_outputs = 3
network_structure = [30, 30]
current_epoch = 0
max_margin = 2

agent = ddpg.DDPG(network_inputs, network_outputs, memorySize,
                  learningRate, discountFactor, learnStart, max_margin)
agent.init_net_works(network_structure)

restart_cnt = 0
stepCounter = 0
for epoch in range(1, epochs + 1):
    restart_cnt += 1
    random_des = random.randint(0, 3)
    env.env.set_des(des_list[random_des])
    print('set des: ' + str(des_list[random_des]))
    observation = env.reset()
    done = False
    episode_step = 0
if __name__ == '__main__':
    env = gym.make(env_name)
    env.seed(0)
    random.seed(0)
    np.random.seed(0)

    # Make a directory to store the learned policies
    dirname = datetime.datetime.now().isoformat()
    os.mkdir(dirname)

    buffer = replay_buffer.ReplayBuffer(buffer_size)
    sample_batch = buffer.get_batch
    agent = ddpg.DDPG(env, buffer, sample_batch, train_iter, gamma, tau,
                      batch_size, n_train, n_episode)

    for epoch in range(n_epoch):
        print("Start training epoch", epoch)
        for cycle in range(n_cycles):
            for episode in range(n_episode):
                state = env.reset()
                state = np.concatenate((state['observation'],
                                        state['achieved_goal'],
                                        state['desired_goal']))
                tot_reward = 0
                agent.reset_noise()
                for step in range(env.spec.timestep_limit):
                    # 20% uniformly random actions for exploration
                    if random.random() < 0.2:
                        action = env.action_space.sample()
                    else:
                        action = agent.noise_action(state)
                    obs, reward, done, info = env.step(action)
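The rollout mixes two exploration sources: with probability 0.2 a uniformly random action is taken, otherwise the agent's own noise_action is used. The goal-conditioned input is built by concatenating observation, achieved_goal, and desired_goal, the usual state layout for goal-based Gym robotics environments trained with DDPG and hindsight-style relabeling.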