import csv

import gym


def train(name, num_episodes=20000):
    """Train a DDPG agent on the named gym environment and log metrics to CSV."""
    # Create the task environment.
    env = gym.make(name)
    # Create the DDPG agent in the task environment (the DDPG class is defined
    # elsewhere in this project).
    agent = DDPG(env)
    # Metric names for the CSV header and the in-memory results dict.
    labels = ['episode', 'step', 'loss_critic', 'avg_reward']
    results = {label: [] for label in labels}
    with open(name + '.csv', 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(labels)
        i_step = 0
        for i_episode in range(1, num_episodes + 1):
            # Start a new episode.
            state = agent.reset()
            sum_reward = 0.0
            N = 0
            while True:
                # env.render()
                # Actor commands the action.
                action = agent.act(state)
                # Environment reacts with next state, reward, and done for
                # end-of-episode.
                next_state, reward, done, info = env.step(action)
                # Agent (actor-critic) learns.
                losses = agent.step(action, reward, next_state, done)
                # S <- S'
                state = next_state
                sum_reward += reward
                N += 1
                i_step += 1
                # if i_step % 1000 == 0 and losses is not None:
                if done and losses is not None:
                    loss_critic = losses
                    # End of episode: show metrics.
                    to_write = (i_episode, i_step, loss_critic, sum_reward / N)
                    print('\rEpisode: {:4d}, '
                          'Step: {:7d}, '
                          'Loss-crit: {:10.4f}, '
                          'Av Rwd: {:10.4f}'.format(*to_write),
                          end='')  # Re-use the same line to print on.
                    # sys.stdout.flush()
                    # Write CSV row and keep the metrics in memory.
                    for i, label in enumerate(labels):
                        results[label].append(to_write[i])
                    writer.writerow(to_write)
                if done:
                    break
    # Post-processing / plotting can be done from the collected metrics.
    return results
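# A minimal sketch of plotting the metrics returned by train() (matplotlib is
# assumed; the label names match the `labels` list above, and the environment
# id in the usage example is only a placeholder):
import matplotlib.pyplot as plt


def plot_results(results):
    """Plot per-episode critic loss and average reward collected by train()."""
    fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
    ax1.plot(results['episode'], results['loss_critic'])
    ax1.set_ylabel('critic loss')
    ax2.plot(results['episode'], results['avg_reward'])
    ax2.set_ylabel('average reward')
    ax2.set_xlabel('episode')
    plt.show()


# Example usage:
# results = train('Pendulum-v1', num_episodes=500)
# plot_results(results)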
def main(argv):
    env_name = FLAGS.env_name
    env = gym.make(env_name)
    # Load a trained agent and run it greedily (no exploration noise).
    agent = DDPG(env, load_path=FLAGS.load_path, training=False)
    for episode in range(FLAGS.num_episodes):
        done = False
        obs = env.reset()
        episode_reward = 0
        while not done:
            env.render()
            action = agent.act(obs, noise=False).flatten()
            obs, rew, done, info = env.step(action)
            obs = obs.flatten()
            episode_reward += rew
        print(f'Episode Reward: {episode_reward}')
    env.close()
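# main() reads env_name, load_path, and num_episodes from FLAGS. A minimal
# sketch of the surrounding flag definitions, assuming absl is used (the
# default values below are placeholders, not taken from the original script):
from absl import app, flags

FLAGS = flags.FLAGS
flags.DEFINE_string('env_name', 'Pendulum-v1', 'Gym environment id to evaluate on.')
flags.DEFINE_string('load_path', None, 'Path to the saved agent checkpoint.')
flags.DEFINE_integer('num_episodes', 10, 'Number of evaluation episodes to run.')

if __name__ == '__main__':
    app.run(main)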
# Assumes `env`, `agent`, LOG_DIR, and the parsed `args` are created earlier in the script.
writer = SummaryWriter(logdir=LOG_DIR)
total_numsteps = 0
n_updates = 0  # number of policy updates

for i_episode in itertools.count(1):
    episode_reward = 0
    episode_steps = 0
    done = False
    state = env.reset()

    while not done:
        if total_numsteps < args.start_steps:
            action = env.action_space.sample()  # Sample a random action
        else:
            action = agent.act(state)  # Sample an action from the policy

        next_state, reward, done, _ = env.step(action)  # Step the environment
        episode_steps += 1
        total_numsteps += 1
        episode_reward += reward

        # Ignore the "done" signal if it comes from hitting the time horizon.
        # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
        mask = 1 if episode_steps == env._max_episode_steps else float(not done)

        agent.step(state, action, reward, next_state, mask)

        if total_numsteps >= args.start_steps and total_numsteps % args.update_freq == 0:
            critic_loss, actor_loss = agent.update()

        state = next_state
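# The `mask` computed above enters the bootstrapped TD target inside the
# agent's update. A minimal sketch of how such a target is typically formed
# (the names below are illustrative; the agent's actual internals are not
# shown in this excerpt):
def td_target(reward, next_q, mask, gamma=0.99):
    """One-step TD target: mask is 0.0 only for true terminal states, so
    episodes cut off by the time limit still bootstrap from next_q."""
    return reward + gamma * mask * next_q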
# The start of the agent constructor call is truncated in the original snippet;
# only its trailing arguments survive here.
    action_shape, batch_size=128, gamma=0.995, tau=0.001, actor_lr=0.0001,
    critic_lr=0.001, use_layer_norm=True)
print('DDPG agent configured')

# Restore a previously trained model and its replay memory.
agent.load_model(agent.current_path + '/model/model.ckpt')
agent.load_memory()

max_episode = 10000
tot_rewards = []

print('env reset')
observation, done = env.reset()
action = agent.act(observation)
print(action)
rospy.sleep(0.8)
observation, reward, done = env.step(action)
rospy.sleep(0.8)

# Exploration-noise schedule and bookkeeping for saving the best model.
noise_sigma = 0.15
save_cutoff = 1
cutoff_count = 0
save_count = 0
curr_highest_eps_reward = -1000.0

for i in range(max_episode):
    # Halve the OU noise scale every 100 episodes until it falls to ~0.03.
    if i % 100 == 0 and noise_sigma > 0.03:
        agent.noise = OUNoise(agent.nb_actions, sigma=noise_sigma)
        noise_sigma /= 2.0
    step_num = 0
    while not done:  # (episode loop body continues beyond this excerpt)
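# OUNoise is used above to regenerate the exploration-noise process, but its
# definition is not part of this excerpt. A minimal sketch of an
# Ornstein-Uhlenbeck noise class matching the call OUNoise(size, sigma=...)
# (the parameter defaults here are common choices, not taken from the original):
import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the long-running mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state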
# `fout1` and the first `labels` list are created earlier in the original
# script (not shown in this excerpt).
writer1 = csv.writer(fout1)
writer1.writerow(labels)

# Second log file: per-step physical state of the quadcopter simulation.
fout2 = open("physical_info.csv", 'w')
labels = [
    'time', 'x', 'y', 'z', 'phi', 'theta', 'psi',
    'x_velocity', 'y_velocity', 'z_velocity',
    'phi_velocity', 'theta_velocity', 'psi_velocity',
    'rotor_speed1', 'rotor_speed2', 'rotor_speed3', 'rotor_speed4'
]
writer2 = csv.writer(fout2)
writer2.writerow(labels)

for i_episode in range(1, num_episodes + 1):
    state = agent.reset()  # start a new episode
    while True:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state

        # Write info to file. Note: only time, x, y, z and the four rotor
        # speeds are written here, so the rows are narrower than the
        # 17-column header above.
        to_write = [task.sim.time] + list(
            task.sim.pose[:3])  # + list(task.sim.v) + list(task.sim.angular_v) + list(action)
        fout2.write(
            "{:4.2f}, {:7.3f}, {:7.3f}, {:7.3f}, {:7.3f}, {:7.3f}, {:7.3f}, {:7.3f}\n"
            .format(to_write[0], to_write[1], to_write[2], to_write[3],
                    action[0], action[1], action[2], action[3]))

        if done:
            # (The print call and loop exit were truncated in the source; the
            # score arguments and the break below are assumed.)
            print("\rEpisode = {:4d}, score = {:7.3f} (best = {:7.3f})".format(
                i_episode, agent.score, agent.best_score), end="")
            break
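# A possible way to inspect the logged flight path afterwards (numpy and
# matplotlib assumed; the columns follow the reduced row layout written above:
# time, x, y, z, then the four rotor speeds):
import numpy as np
import matplotlib.pyplot as plt

data = np.genfromtxt('physical_info.csv', delimiter=',', skip_header=1)
t, x, y, z = data[:, 0], data[:, 1], data[:, 2], data[:, 3]
plt.plot(t, x, label='x')
plt.plot(t, y, label='y')
plt.plot(t, z, label='z')
plt.xlabel('time')
plt.ylabel('position')
plt.legend()
plt.show()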