def main(): """ DDPG run """ en_nm = 'InvertedPendulum-v2' env = gym.make(en_nm) test_env = gym.make(en_nm) ac_kwargs = {'hidden_sizes': [256, 256], 'actor_critic': MLPActorCritic} agent_args = {'env_name': 'HCv2'} train_args = { 'eval_episodes': 5, 'seed': 0, 'save_frequency': 120, 'load_model': False, 'device': 'cpu', 'max_eps_len': 150, 'test_env': test_env, 'evaluate_agent': False, 'q_lr': 1e-4, 'pi_lr': 1e-4, 'exploration_steps': 10000, 'steps_per_epoch': 1000, 'batch_size': 128 } args = {'ac_kwargs': ac_kwargs, **agent_args, **train_args} ddpg(env, **args)
def launch(args):
    # create the ddpg_agent
    # env = gym.make(args.env_name)
    rend = False
    discreteAction = 0
    numControlledJoints = 9
    actionRepeat = 1
    fixed = False
    maxStep = 1000
    # env = kukaReachGymEnvHer(urdfRoot=robot_data.getDataPath(), renders=rend, useIK=0,
    #                          isDiscrete=discreteAction, numControlledJoints=numControlledJoints,
    #                          fixedPositionObj=fixed, includeVelObs=True)
    env = kukaPickGymEnvHer(urdfRoot=robot_data.getDataPath(), maxSteps=maxStep,
                            renders=rend, useIK=0, isDiscrete=discreteAction,
                            actionRepeat=actionRepeat,
                            numControlledJoints=numControlledJoints,
                            fixedPositionObj=fixed, includeVelObs=True,
                            reward_type=1)
    # get the environment parameters
    env_params = get_env_params(env)
    # create the ddpg agent to interact with the environment
    args.replay_strategy = 'normal'
    ddpg_trainer = ddpg(args, env, env_params)
    ddpg_trainer.learn()
def launch(args):
    # create the ddpg_agent
    # env = gym.make(args.env_name)
    rend = False
    discreteAction = 0
    numControlledJoints = 6
    fixed = False
    actionRepeat = 1
    reward_type = args.reward_type
    if args.env_name in ('reach', 'Reach'):
        env = kukaReachGymEnvHer(urdfRoot=robot_data.getDataPath(), actionRepeat=actionRepeat,
                                 renders=rend, useIK=0, isDiscrete=discreteAction,
                                 numControlledJoints=numControlledJoints,
                                 fixedPositionObj=fixed, includeVelObs=True,
                                 reward_type=reward_type)
    elif args.env_name in ('push', 'Push'):
        env = kukaPushGymEnvHer(urdfRoot=robot_data.getDataPath(), actionRepeat=actionRepeat,
                                renders=rend, useIK=0, isDiscrete=discreteAction,
                                numControlledJoints=numControlledJoints,
                                fixedPositionObj=fixed, includeVelObs=True,
                                reward_type=reward_type)
    elif args.env_name in ('reachob', 'Reachob'):
        env = kukaReachGymEnvOb(urdfRoot=robot_data.getDataPath(),
                                renders=rend, useIK=0, isDiscrete=discreteAction,
                                numControlledJoints=numControlledJoints,
                                fixedPositionObj=fixed, includeVelObs=True,
                                reward_type=reward_type)
    else:
        # default to the reach task
        env = kukaReachGymEnvHer(urdfRoot=robot_data.getDataPath(), actionRepeat=actionRepeat,
                                 renders=rend, useIK=0, isDiscrete=discreteAction,
                                 numControlledJoints=numControlledJoints,
                                 fixedPositionObj=fixed, includeVelObs=True,
                                 reward_type=reward_type)
    # get the environment parameters
    env_params = get_env_params(env, actionRepeat)
    # create the ddpg agent to interact with the environment
    ddpg_trainer = ddpg(args, env, env_params)
    ddpg_trainer.learn()
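# A minimal sketch (an assumption, not this repo's verbatim helper) of the
# HER-style get_env_params used above: it probes a goal-based env once and
# records the shapes the DDPG networks need. The TimeLimit-style attribute
# _max_episode_steps and the 1000-step fallback are also assumptions.
def get_env_params(env, actionRepeat=1):
    obs = env.reset()  # goal envs return a dict observation
    params = {
        'obs': obs['observation'].shape[0],
        'goal': obs['desired_goal'].shape[0],
        'action': env.action_space.shape[0],
        'action_max': env.action_space.high[0],
    }
    # per-episode budget, shortened when each action is repeated
    params['max_timesteps'] = getattr(env, '_max_episode_steps', 1000) // actionRepeat
    return params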
def __init__(self, action_size=2, buffer_size=buffer_size, n_agents=2,
             batch_size=batch_size, seed=2, update_every=1, gamma=1):
    # two DDPG agents (state_size=24, action_size=2) with 256/128/64 hidden units
    self.madagents = [ddpg.ddpg(24, 2, 256, 128, 64),
                      ddpg.ddpg(24, 2, 256, 128, 64)]
    self.update_every = update_every
    self.batch_size = batch_size
    self.buffer_size = buffer_size
    # pass the seed argument through instead of hard-coding seed=2
    self.memory = buffer.ReplayBuffer(action_size, buffer_size, batch_size, seed=seed)
    # self.t_step = 0
    self.n_agents = n_agents
    self.gamma = gamma
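# A minimal sketch, under assumptions, of the shared buffer.ReplayBuffer used
# above (uniform sampling of experience tuples); the real class may differ.
import random
from collections import deque, namedtuple
import numpy as np

class ReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, seed=0):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple('Experience',
                                     ['state', 'action', 'reward', 'next_state', 'done'])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        # uniform random minibatch, returned as one array per field
        batch = random.sample(self.memory, k=self.batch_size)
        return tuple(np.array([getattr(e, f) for e in batch])
                     for f in self.experience._fields)

    def __len__(self):
        return len(self.memory)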
def train():
    agent, scores = ddpg()
    plt.plot(np.arange(1, len(scores) + 1), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()
    return agent
def main(trainable=True):
    env = gym_carla_car_following("127.0.0.1", 2000, 15)
    agent = ddpg(env.observation_space.shape[0], env.action_space.shape[0], trainable)
    try:
        agent.load()
    except Exception:  # was a bare except, which also swallows KeyboardInterrupt
        traceback.print_exc()
    while True:
        try:
            interactive_with_environment(agent, env)
        except Exception:
            traceback.print_exc()
        finally:
            agent.save(-1)
parser.add_argument('env_name')
parser.add_argument('--exp_name', default=None)
parser.add_argument('--exp_variant', default=None)
parser.add_argument('--logdir', default='out')
parser.add_argument('--seeds', type=int, default=0, nargs='*')
parser.add_argument('--epochs', type=int, default=20)
parser.add_argument('--steps_per_epoch', type=int, default=1000)
parser.add_argument('--discount', type=float, default=.99)
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--polyak', type=float, default=0.001)
parser.add_argument('--weight_decay', type=float, default=0.01)
parser.add_argument('--exploration_steps', type=int, default=0)
parser.add_argument('--rand_proc', default='ou')
parser.add_argument('--rand_proc_kwargs', type=json.loads, default=dict())
args = parser.parse_args()

seeds = args.seeds if isinstance(args.seeds, list) else [args.seeds]
rand_proc_dir = {'normal': core.NormalProcess,
                 'ou': core.OrnsteinUhlenbeckProcess}
rand_proc = rand_proc_dir[args.rand_proc]

for seed in seeds:
    print("\nNEW EXPERIMENT: SEED {}\n".format(seed))
    ddpg(env_name=args.env_name,
         exp_name=args.exp_name,
         exp_variant=args.exp_variant,
         logdir=args.logdir,  # was hard-coded to 'out', silently ignoring --logdir
         seed=seed,
         epochs=args.epochs,
         steps_per_epoch=args.steps_per_epoch,
         batch_size=args.batch_size,
         discount=args.discount,
         polyak=args.polyak,
         weight_decay=args.weight_decay,
         exploration_steps=args.exploration_steps,
         rand_proc=rand_proc,
         rand_proc_kwargs=args.rand_proc_kwargs)
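# A minimal sketch, under assumptions, of the Ornstein-Uhlenbeck exploration
# process selected above as core.OrnsteinUhlenbeckProcess; the parameter names
# (theta, sigma, dt) are illustrative, not the module's actual signature.
import numpy as np

class OrnsteinUhlenbeckProcess:
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.size, self.mu, self.theta, self.sigma, self.dt = size, mu, theta, sigma, dt
        self.reset()

    def reset(self):
        self.x = np.ones(self.size) * self.mu

    def sample(self):
        # Mean-reverting drift plus Gaussian diffusion: temporally correlated
        # noise that suits inertial control tasks better than white noise.
        dx = self.theta * (self.mu - self.x) * self.dt \
             + self.sigma * np.sqrt(self.dt) * np.random.randn(self.size)
        self.x = self.x + dx
        return self.x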
def main():
    # Creating necessary directories
    collect_track_no = 5
    experiment_name = "tensorboard-4"
    experiment_dir = "experiment-%s/" % experiment_name
    models_dir = experiment_dir + "model/"
    datas_dir = experiment_dir + "datas-track-no-%d/" % collect_track_no
    if not os.path.exists(experiment_dir):
        print("%s doesn't exist" % experiment_dir)
        return
    if not os.path.exists(models_dir):
        print("%s doesn't exist" % models_dir)
        return
    if not os.path.exists(datas_dir):
        os.mkdir(datas_dir)

    action_dim = 1
    state_dim = 30
    env_name = 'torcs'
    sess = tf.InteractiveSession()
    agent = ddpg(env_name, sess, state_dim, action_dim, models_dir)
    agent.load_network()
    vision = True
    env = TorcsEnv(vision=vision, throttle=True, text_mode=False,
                   track_no=collect_track_no, random_track=False, track_range=(0, 3))

    print("Collecting Start.")
    max_data_entry_count = 2000
    data_entry_count = 0
    start_time = time.time()
    i = 0
    step = 0
    try:
        file = open(datas_dir + 'state-action-scalar', 'w')
        while data_entry_count < max_data_entry_count:
            # relaunch TORCS every third episode to work around its memory leak
            if np.mod(i, 3) == 0:
                ob = env.reset(relaunch=True)
            else:
                ob = env.reset()
            s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                             ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm, 0.0))
            pre_a_t = 0.0
            while data_entry_count < max_data_entry_count:
                a_t = agent.action(s_t)
                ob, r_t, done, info = env.step([a_t[0], 0.16, 0])
                print("Step", step, "Action", a_t, "Reward", r_t)
                s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                                  ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm, a_t[0]))
                image = ob.img
                if step > 20:
                    plt.imsave(datas_dir + ("%d-%d.jpg" % (collect_track_no, data_entry_count)),
                               image)
                    ret = file.write("%f %f %f %f %f\n" %
                                     (ob.speedX, ob.speedY, ob.speedZ, pre_a_t, a_t[0]))
                    if ret == 0:
                        print("File Write error")
                    data_entry_count += 1
                s_t = s_t1
                step += 1
                pre_a_t = a_t[0]
                if done:
                    break
            print(("TOTAL REWARD @ " + str(i) + " Collect", data_entry_count))
            print("Total Step: " + str(step))
            print("")
            i += 1  # advance the episode counter (missing in the original, so the relaunch schedule never varied)
    except Exception:
        traceback.print_exc()
        # write to a separate handle so the data file above is not clobbered
        with open(datas_dir + "exception", 'w') as exc_file:
            exc_file.write(str(traceback.format_exc()))
    finally:
        file.close()
        env.end()
        end_time = time.time()
        with open(datas_dir + "log", 'w') as log_file:
            log_file.write("total_step = %d\n" % step)
            log_file.write("total_time = %s (s)\n" % str(end_time - start_time))
        print("Finish.")
from unityagents import UnityEnvironment
import numpy as np
from ddpg import ddpg
import matplotlib.pyplot as plt
from ddpg_agent import Agent

env = UnityEnvironment(file_name='env/Reacher_Linux/Reacher_Linux/Reacher.x86')

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]

# number of actions and states
action_size = brain.vector_action_space_size
state = env_info.vector_observations[0]
state_size = len(state)

agent = Agent(state_size=state_size, action_size=action_size, random_seed=2)
scores = ddpg(env, brain_name, agent, n_episodes=300, max_t=1000, print_every=100)
print(scores)
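# A sketch of what the imported ddpg training loop typically looks like in this
# Udacity-style Unity ML-Agents setup. The agent.act/step/reset interface is an
# assumption about ddpg_agent.Agent, not confirmed by the snippet above.
from collections import deque
import numpy as np

def ddpg(env, brain_name, agent, n_episodes=300, max_t=1000, print_every=100):
    scores_deque = deque(maxlen=print_every)  # rolling window for progress logging
    scores = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        agent.reset()
        score = 0.0
        for t in range(max_t):
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)  # store + learn
            state = next_state
            score += reward
            if done:
                break
        scores_deque.append(score)
        scores.append(score)
        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
    return scores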
discountFactor = d.get('discountFactor')
explorationRate = d.get('explorationRate')
learnStart = d.get('learnStart')
memorySize = d.get('memorySize')
current_epoch = d.get('current_epoch')
stepCounter = d.get('stepCounter')
loadsim_seconds = d.get('loadsim_seconds')

clear_monitor_files(outdir)
copy_tree(monitor_path, outdir)
env = gym.wrappers.Monitor(env, outdir, resume=True)

# note: this rebinds the name `ddpg` from the module to the agent instance
ddpg = ddpg.ddpg(S_DIM=S_DIM, A_DIM=A_DIM, EP_MAX=epochs, EP_LEN=episode_steps,
                 GAMMA=discountFactor, A_LR=A_learningRate, C_LR=C_learningRate,
                 BATCH=minibatch_size, propeller_hovering_speed=0.0)

last100Rewards = [0] * 100
last100RewardsIndex = 0
last100Filled = False
all_ep_r = []
start_time = time.time()

# start iterating from 'current_epoch'
for epoch in range(current_epoch + 1, epochs + 1, 1):
    observation = env.reset()
    cumulated_reward = 0
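# Aside: a small sketch (an assumption, not code from this script) of how the
# last-100 ring buffer above is typically advanced and averaged per episode:
def record_reward(last100Rewards, index, filled, reward):
    last100Rewards[index] = reward
    index = (index + 1) % 100
    filled = filled or index == 0  # the buffer is full once the index wraps
    window = last100Rewards if filled else last100Rewards[:index]
    avg = sum(window) / max(1, len(window))
    return index, filled, avg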
import gym  # needed for gym.make below
import numpy as np
import matplotlib.pyplot as plt
from ddpg import ddpg

def smooth(x):
    # running mean over the last 100 entries
    n = len(x)
    y = np.zeros(n)
    for i in range(n):
        start = max(0, i - 99)
        y[i] = float(x[start:(i + 1)].sum()) / (i - start + 1)
    return y

returns, q_losses, mu_losses = ddpg(lambda: gym.make('Pendulum-v0'),
                                    num_train_episodes=50)

plt.plot(returns)
plt.plot(smooth(np.array(returns)))
plt.title("Train returns")
plt.show()

# plt.plot(test_returns)
# plt.plot(smooth(np.array(test_returns)))
# plt.title("Test returns")
# plt.show()

plt.plot(q_losses)
plt.title('q_losses')
plt.show()
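# An equivalent vectorized form of smooth() above (a sketch; verify against the
# loop version before swapping it in): cumulative sums give every trailing
# 100-sample mean in O(n) instead of O(n * window).
import numpy as np

def smooth_fast(x, window=100):
    x = np.asarray(x, dtype=float)
    c = np.cumsum(np.insert(x, 0, 0.0))  # c[i] = sum of x[:i]
    starts = np.maximum(0, np.arange(len(x)) - (window - 1))
    ends = np.arange(1, len(x) + 1)
    return (c[ends] - c[starts]) / (ends - starts)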
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

agents = Agents(num_agents=num_agents, state_size=state_size,
                action_size=action_size, random_seed=0)
scores = ddpg(env, brain_name, agents, n_episodes=n_episodes, eps_start=eps_start,
              eps_end=eps_end, eps_decay=eps_decay, resume=resume)

# plot the scores, averaged across agents
plt.plot(np.arange(1, len(scores) + 1), np.mean(scores, axis=-1))
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

# close the environment
env.close()
seed = 239
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

gamma = 0.99
max_episodes = 50
buffer_maxlen = 5000
batch_size = 640
critic_lr = 1e-3
actor_lr = 1e-4
tau = 1e-3

agent = ddpg(state_dim, action_dim, gamma, tau, buffer_maxlen, batch_size,
             critic_lr, actor_lr)
noise = OUNoise(env.action_space)

step = 0
for episode in range(max_episodes):
    state = env.reset()
    total = 0
    done = False
    while not done:  # was `while True`; `done` was computed but never consulted
        action = agent.act(state)
        action = noise.get_action(action, step)
        next_state, reward, done, _ = env.step(action)
        total += reward
        state = next_state  # advance the state; the original fragment stopped short
        step += 1  # the noise schedule above depends on the step counter
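# A minimal sketch (an assumption about this agent's internals) of the Polyak
# soft update that tau=1e-3 above typically drives in DDPG target networks:
import torch

def soft_update(target: torch.nn.Module, source: torch.nn.Module, tau: float):
    # target <- tau * source + (1 - tau) * target, parameter by parameter
    with torch.no_grad():
        for t_p, s_p in zip(target.parameters(), source.parameters()):
            t_p.data.mul_(1.0 - tau).add_(tau * s_p.data)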
def main():
    EXPLORE = total_explore
    MAX_STEPS = max_steps
    MAX_STEPS_EP = max_steps_ep
    epsilon = epsilon_start

    # Creating necessary directories
    experiment_name = "img-0"
    experiment_dir = "experiment-%s/" % experiment_name
    models_dir = experiment_dir + "model/"
    logs_train_dir = experiment_dir + "logs-train/"
    if not os.path.exists(experiment_dir):
        os.mkdir(experiment_dir)
    if not os.path.exists(logs_train_dir):
        os.mkdir(logs_train_dir)
    if not os.path.exists(models_dir):
        os.mkdir(models_dir)

    # the original built this with dangling line continuations (a syntax error);
    # parenthesized implicit concatenation is the fix
    description = (
        'Using raw pixels as input, output (steer)\n'
        'Training from scratch\n\n'
        'throttle = 0.16\n\n'
        'brake = 0\n\n'
        'sp*np.cos(obs["angle"]) - np.abs(sp*np.sin(obs["angle"])) '
        '- sp * np.abs(obs["trackPos"]) - sp * np.abs(action_torcs["steer"]) * 4\n\n'
        'env = TorcsEnv(vision=False, throttle=True, text_mode=False, track_no=5, '
        'random_track=False, track_range=(5, 8))\n\n'
        'abs(trackPos) > 0.9 is out of track\n\n'
    )
    with open(experiment_dir + "README.md", 'w') as file:
        file.write(description)
        file.write("\n\n")
        file.write(formatted_timestamp())

    action_dim = 1
    state_dim = 4
    img_dim = [304, 412, 3]
    env_name = 'torcs'
    sess = tf.InteractiveSession()
    agent = ddpg(env_name, sess, state_dim, action_dim, models_dir, img_dim)
    agent.load_network()
    vision = True
    env = TorcsEnv(vision=vision, throttle=True, text_mode=False, track_no=5,
                   random_track=False, track_range=(5, 8))

    rewards_every_steps = np.zeros([MAX_STEPS])
    actions_every_steps = np.zeros([MAX_STEPS, action_dim])
    # sess.run(tf.initialize_all_variables())

    # Using tensorboard to visualize data
    with tf.name_scope('summary'):
        critic_cost = tf.placeholder(dtype=tf.float32)
        actor_action = tf.placeholder(dtype=tf.float32)
        reward = tf.placeholder(dtype=tf.float32)
        state = tf.placeholder(dtype=tf.float32, shape=(state_dim,))
        img = tf.placeholder(dtype=tf.float32,
                             shape=(1, img_dim[0], img_dim[1], img_dim[2]))
        tf.summary.scalar("critic_cost", critic_cost)
        tf.summary.scalar('actor_action', actor_action)
        tf.summary.scalar('reward', reward)
        tf.summary.histogram('state', state)
        tf.summary.image("img", img)
        merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(logs_train_dir, sess.graph)

    print("Training Start.")
    start_time = time.time()
    i = 0
    step = 0
    try:
        while step < MAX_STEPS:
            # if ((np.mod(i, 10) == 0) and (i > 20)):
            #     train_indicator = 0
            # else:
            #     train_indicator = is_training

            # restart because of memory leak bug in torcs
            if np.mod(i, 3) == 0:
                ob = env.reset(relaunch=True)
            else:
                ob = env.reset()

            # Early episode annealing for out-of-track driving and small progress:
            # during early training, out-of-track and slow driving is allowed, as
            # humans do (margin of error); as one learns to drive, the constraints
            # become stricter.
            # random_number = random.random()
            # eps_early = max(epsilon, 0.10)
            # if (random_number < (1.0 - eps_early)) and (train_indicator == 1):
            #     early_stop = 1
            # else:
            #     early_stop = 0

            print("Episode : " + str(i) + " Replay Buffer " + str(agent.replay_buffer.count()))

            # s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
            #                  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm, 0.0))
            s_t = np.hstack((ob.speedX, ob.speedY, ob.speedZ, 0.0))
            i_t = ob.img
            # cv2.imshow("img", ob.img)
            # cv2.waitKey(0)
            # x_t = np.hstack((ob.angle, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, 0.0))
            # s_t = np.hstack((x_t, x_t, x_t, x_t))

            total_reward = 0
            step_ep = 0
            while (step < MAX_STEPS) and (step_ep < MAX_STEPS_EP):
                # Take noisy actions during training; anneal epsilon toward zero
                epsilon -= 1.0 / EXPLORE
                epsilon = max(epsilon, 0.0)
                a_t = agent.noise_action(s_t, epsilon, i_t)

                # ob, r_t, done, info = env.step(a_t[0], early_stop)
                ob, r_t, done, info = env.step([a_t[0], 0.16, 0])

                # s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                #                   ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm, a_t[0]))
                s_t1 = np.hstack((ob.speedX, ob.speedY, ob.speedZ, a_t[0]))
                i_t1 = ob.img
                # x_t1 = np.hstack((ob.angle, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, a_t[0]))
                # s_t1 = np.hstack((np.roll(s_t, -6)[:18], x_t1))
                # s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, a_t[0]))

                cost = agent.perceive(s_t, a_t, r_t, s_t1, done, i_t, i_t1)
                summary = sess.run(merged_summary,
                                   feed_dict={
                                       critic_cost: cost,
                                       actor_action: a_t[0],
                                       reward: r_t,
                                       state: s_t,
                                       img: i_t.reshape(1, img_dim[0], img_dim[1], img_dim[2])
                                   })
                writer.add_summary(summary, step)

                total_reward += r_t
                s_t = s_t1
                i_t = i_t1

                print("Ep", i, "Total steps", step, "Reward", r_t, " Actions ", a_t,
                      " Epsilon ", epsilon, "Step ep", step_ep)
                rewards_every_steps[step] = r_t
                actions_every_steps[step] = a_t
                step += 1
                step_ep += 1
                if done:
                    break

                if np.mod(step + 1, 10000) == 0:
                    print("Now we save model with step = ", step)
                    agent.save_network(step + 1)

            print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
            print("Total Step: " + str(step))
            print("")
            i += 1
    except Exception:
        traceback.print_exc()
        with open(logs_train_dir + "exception", 'w') as file:
            file.write(str(traceback.format_exc()))
    finally:
        env.end()
        end_time = time.time()
        np.save(logs_train_dir + "reward.npy", rewards_every_steps)
        np.save(logs_train_dir + "action.npy", actions_every_steps)
        with open(logs_train_dir + "log", 'w') as file:
            file.write("epsilon_start = %d\n" % epsilon_start)
            file.write("total_explore = %d\n" % total_explore)
            file.write("total_episode = %d\n" % i)
            file.write("total_step = %d\n" % step)
            file.write("total_time = %s (s)\n" % str(end_time - start_time))
        print("Finish.")
def main():
    MAX_EP = 1
    MAX_STEPS_EP = 2000

    # Creating necessary directories
    test_track_no = 6
    experiment_name = "tensorboard-11"
    experiment_dir = "experiment-%s/" % experiment_name
    models_dir = experiment_dir + "model/"
    logs_test_dir = experiment_dir + "logs-test-track-no-%d/" % test_track_no
    if not os.path.exists(experiment_dir):
        print("%s doesn't exist" % experiment_dir)
        return
    if not os.path.exists(models_dir):
        print("%s doesn't exist" % models_dir)
        return
    if not os.path.exists(logs_test_dir):
        os.mkdir(logs_test_dir)

    action_dim = 1
    state_dim = 25
    env_name = 'torcs'
    sess = tf.InteractiveSession()
    agent = ddpg(env_name, sess, state_dim, action_dim, models_dir)
    agent.load_network()
    vision = False
    env = TorcsEnv(vision=vision, throttle=True, text_mode=False,
                   track_no=test_track_no, random_track=False, track_range=(0, 3))

    # rewards_every_steps = np.zeros([MAX_EP, MAX_STEPS_EP])
    # actions_every_steps = np.zeros([MAX_EP, MAX_STEPS_EP, action_dim])

    # Using tensorboard to visualize data
    with tf.name_scope('summary'):
        actor_action = tf.placeholder(dtype=tf.float32)
        reward = tf.placeholder(dtype=tf.float32)
        state = tf.placeholder(dtype=tf.float32, shape=(state_dim,))
        tf.summary.scalar('actor_action', actor_action)
        tf.summary.scalar('reward', reward)
        tf.summary.histogram('state', state)
        merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(logs_test_dir, sess.graph)

    print("Testing Start.")
    start_time = time.time()
    step = 0
    try:
        for i in range(MAX_EP):
            # relaunch TORCS every third episode to work around its memory leak
            if np.mod(i, 3) == 0:
                ob = env.reset(relaunch=True)
            else:
                ob = env.reset()

            print("Episode : " + str(i) + " Replay Buffer " + str(agent.replay_buffer.count()))

            # s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
            #                  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm, 0.0))
            s_t = np.hstack((ob.angle, ob.track, ob.trackPos,
                             ob.speedX, ob.speedY, ob.speedZ, 0.0))
            # x_t = np.hstack((ob.angle, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, 0.0))
            # s_t = np.hstack((x_t, x_t, x_t, x_t))

            total_reward = 0
            step_ep = 0
            while step_ep < MAX_STEPS_EP:
                a_t = agent.action(s_t)
                ob, r_t, done, info = env.step([a_t[0], 0.16, 0])

                # s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                #                   ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm, a_t[0]))
                s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos,
                                  ob.speedX, ob.speedY, ob.speedZ, a_t[0]))
                # x_t1 = np.hstack((ob.angle, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, a_t[0]))
                # s_t1 = np.hstack((np.roll(s_t, -6)[:18], x_t1))

                summary = sess.run([merged_summary],
                                   feed_dict={actor_action: a_t[0], reward: r_t, state: s_t})
                writer.add_summary(summary[0], step)

                total_reward += r_t
                s_t = s_t1
                print("Ep", i, "Total steps", step, "Reward", r_t, " Actions ", a_t,
                      "Step ep", step_ep)
                # rewards_every_steps[step] = r_t
                # actions_every_steps[step] = a_t
                step += 1
                step_ep += 1
                if done:
                    break

            print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
            print("Total Step: " + str(step))
            print("")
    except Exception:
        traceback.print_exc()
        with open(logs_test_dir + "exception", 'w') as file:
            file.write(str(traceback.format_exc()))
    finally:
        env.end()
        end_time = time.time()
        # np.save(logs_test_dir + "reward.npy", rewards_every_steps)
        # np.save(logs_test_dir + "action.npy", actions_every_steps)
        with open(logs_test_dir + "log", 'w') as file:
            file.write("total_episode = %d\n" % MAX_EP)
            file.write("max_steps_ep = %d\n" % MAX_STEPS_EP)
            file.write("total_step = %d\n" % step)
            file.write("total_time = %s (s)\n" % str(end_time - start_time))
        print("Finish.")