def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render,
          param_noise, actor, critic, normalize_returns, normalize_observations,
          critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma,
          clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size,
          memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape,
                 env.action_space.shape, gamma=gamma, tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    new_obs, r, done, info = env.step(max_action * action)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
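# --- Hypothetical invocation sketch (not part of the original snippet) ---
# The train() loop above expects pre-built actor/critic networks, a replay
# memory and optional action/parameter noise. The commented sketch below
# assumes the OpenAI-baselines-style Actor, Critic, Memory and
# OrnsteinUhlenbeckActionNoise classes this code is written against; exact
# import paths and constructor signatures may differ between versions.
#
# import gym
# import numpy as np
# from baselines.ddpg.models import Actor, Critic
# from baselines.ddpg.memory import Memory
# from baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
#
# env = gym.make('Pendulum-v0')
# nb_actions = env.action_space.shape[-1]
# memory = Memory(limit=int(1e6),
#                 action_shape=env.action_space.shape,
#                 observation_shape=env.observation_space.shape)
# actor = Actor(nb_actions, layer_norm=True)
# critic = Critic(layer_norm=True)
# action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
#                                             sigma=0.2 * np.ones(nb_actions))
#
# train(env=env, nb_epochs=500, nb_epoch_cycles=20, render_eval=False,
#       reward_scale=1.0, render=False, param_noise=None, actor=actor,
#       critic=critic, normalize_returns=False, normalize_observations=True,
#       critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3,
#       action_noise=action_noise, popart=False, gamma=0.99, clip_norm=None,
#       nb_train_steps=50, nb_rollout_steps=100, nb_eval_steps=100,
#       batch_size=64, memory=memory)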
class FCMADRL:
    def __init__(self):
        self.observation_space = SA_OBS_SPACE
        self.action_space = SA_ACTION_SPACE
        # self.agent = agent
        self.agent = DDPG(env, is_batch_norm, CA_OBS_SPACE, CA_ACTION_SPACE,
                          CA_ACTION_BOUND)
        self.dqn_solver = DQNSolver(SA_OBS_SPACE, SA_ACTION_SPACE)
        logging.basicConfig(filename="logs/log.log",
                            format='%(asctime)s %(message)s',
                            filemode='w+')
        self.logger = logging.getLogger()
        self.logger.setLevel(logging.DEBUG)

    def use_existing_dqn(self, dqn_model):
        self.dqn_solver.model = dqn_model

    def get_ddpg(self):
        return self.agent

    def get_dqn(self):
        return self.dqn_solver

    def get_dqn_model(self, dqn_solver):
        return dqn_solver.model

    def ca_step(self, action):
        """ca_step() is just for testing purposes."""
        return (np.random.choice(SA_ACTION_SPACE, CA_OBS_SPACE),
                np.random.choice(10),
                np.random.choice([True, False]),
                {})

    def sa_state(self, x, obs, i):
        """sa_state(): merge the two states received by an individual agent
        (one from the central agent and one from the environment) into one vector."""
        one = x
        two = obs[i]
        three = np.array([i])
        f = np.append(one, two)
        f = np.append(f, three)
        return f

    def fcmadrl(self):
        # Randomly initialize critic, actor, target critic, target actor network and replay buffer
        exploration_noise = OUNoise(CA_ACTION_SPACE)
        counter = 0
        reward_per_episode = 0
        total_reward = 0
        num_states = CA_OBS_SPACE
        num_actions = CA_ACTION_SPACE

        self.logger.debug("Number of States:" + str(num_states))
        self.logger.debug("Number of Actions:" + str(num_actions))
        self.logger.debug("Number of Steps per episode:" + str(steps))

        # saving reward:
        reward_st = np.array([0])

        score_logger = ScoreLogger(ENV_NAME)
        # run = 0

        for i in xrange(episodes):
            print "==== Starting episode no:", i, "====", "\n"
            # observation = env.reset()
            observation = ca_reset()
            reward_per_episode = 0
            # run += 1
            obs = env.reset()
            # step = 0
            for t in xrange(steps):
                # rendering environment (optional)
                # env.render()
                print "Step: ", t
                x_arr = []
                observation_arr = []
                action_arr = []
                action_n = []
                state_arr = []
                next_state_arr = []
                action_n_arr = []

                for z in range(env.n):
                    self.take_action(action_arr, action_n, action_n_arr,
                                     exploration_noise, num_states, obs,
                                     observation, observation_arr, state_arr,
                                     x_arr, z)

                next_obs, reward_n, done_n, info_n = env.step(action_n)
                reward = reward_n[0]
                done = all(done_n)
                print "Reward_n: ", reward_n

                self.update_next_state(action_arr, next_obs, next_state_arr)
                self.memory_store(action_arr, action_n_arr, done_n,
                                  next_state_arr, observation_arr, reward_n,
                                  state_arr, x_arr)

                obs = next_obs

                # train critic and actor network
                if counter > 64:
                    self.agent.train()
                reward_per_episode += reward
                counter += 1

                # check if episode ends:
                if done or (t == steps - 1):
                    print 'EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode
                    print "Printing reward to file"
                    exploration_noise.reset()  # reinitializing random noise for action exploration
                    reward_st = np.append(reward_st, reward_per_episode)
                    np.savetxt('rewards/episode_reward.txt', reward_st, newline="\n")
                    print "Run: " + str(i) + ", exploration: " + str(
                        self.dqn_solver.exploration_rate) + ", score: " + str(
                        reward_per_episode / t)
                    score_logger.add_score(reward_per_episode / t, i)
                    print '\n\n'
                    break

                self.dqn_solver.experience_replay()

            if (i % CHECKPOINT == 0):
                self.dqn_solver.save_dqn_model(i)

            total_reward += reward_per_episode

        print "Average reward per episode {}".format(total_reward / episodes)
        return total_reward

    def update_next_state(self, action_arr, next_obs, next_state_arr):
        for z in range(env.n):
            ns = self.sa_state(action_arr[z], next_obs, z)
            ns = np.reshape(ns, [1, self.observation_space])
            next_state_arr.append(ns)

    def take_action(self, action_arr, action_n, action_n_arr, exploration_noise,
                    num_states, obs, observation, observation_arr, state_arr,
                    x_arr, z):
        action = self.get_message(action_arr, exploration_noise, num_states,
                                  observation, x_arr)
        state = self.sa_state(action, obs, z)
        state = np.reshape(state, [1, self.observation_space])
        state_arr.append(state)
        act = self.get_final_action(action_n, action_n_arr, state)
        self.logger.debug("SA_Action: " + str(act))
        # print "CA State: ", x
        # print "CA Action: ", action
        # print "SA State: ", state
        # print "SA Action: ", act
        observation[z] = act
        observation_arr.append(np.array(list(observation)))

    def memory_store(self, action_arr, action_n_arr, done_n, next_state_arr,
                     observation_arr, reward_n, state_arr, x_arr):
        for z in range(env.n):
            # add s_t, s_t+1, action, reward to experience memory
            # print x_arr[z], observation_arr[z], action_arr[z], reward_n[z], done_n[z]
            self.agent.add_experience(x_arr[z], observation_arr[z],
                                      action_arr[z], reward_n[z], done_n[z])
            self.dqn_solver.remember(state_arr[z], action_n_arr[z], reward_n[z],
                                     next_state_arr[z], done_n[z])

    def get_final_action(self, action_n, action_n_arr, state):
        act = self.dqn_solver.act(state)
        a = np.zeros(SA_ACTION_SPACE)
        a[act] = 1.0
        action_n.append(a)
        action_n_arr.append(act)
        return act

    def get_message(self, action_arr, exploration_noise, num_states,
                    observation, x_arr):
        x = observation
        # x_arr.append(x)
        x_arr.append(np.array(list(x)))
        action = self.agent.evaluate_actor(np.reshape(x, [1, num_states]))
        noise = exploration_noise.noise()
        # Select action according to current policy and exploration noise
        action = action[0] + noise
        action_arr.append(action)
        self.logger.debug("Action at Step: " + str(action))
        # print "Action at step", t, " :", action, "\n"
        return action
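# --- Hypothetical usage sketch (not part of the original snippet) ---
# FCMADRL relies on module-level globals (env, steps, episodes, SA_OBS_SPACE,
# CA_OBS_SPACE, OUNoise, DQNSolver, ScoreLogger, ca_reset, ...) being defined
# in the enclosing script. Assuming those exist, a run would look like:
#
# fcmadrl_runner = FCMADRL()
# total_reward = fcmadrl_runner.fcmadrl()
# print "Total reward over all episodes:", total_reward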
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render,
          param_noise, actor, critic, normalize_returns, normalize_observations,
          critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma,
          clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size,
          memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50):
    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape,
                 env.action_space.shape, gamma=gamma, tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    saver = tf.train.Saver()

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if render:
                        env.render()
                    assert max_action.shape == action.shape
                    # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    new_obs, r, done, info = env.step(max_action * action)
                    t += 1
                    if render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
def run_pusher3dof(args, sim=True, vanilla=False):
    try:
        from hyperdash import Experiment
        hyperdash_support = True
    except ImportError:
        hyperdash_support = False

    env = NormalizedEnv(gym.make(args.env))

    torques = [1.0] * 3  # if real
    colored = False

    if sim:
        torques = [args.t0, args.t1, args.t2]
        colored = True

    if not vanilla:
        env.env._init(
            torques=torques,
            colored=colored
        )

    if args.seed > 0:
        np.random.seed(args.seed)
        env.seed(args.seed)

    nb_states = env.observation_space.shape[0]
    nb_actions = env.action_space.shape[0]

    agent = DDPG(nb_states, nb_actions, args)
    evaluate = Evaluator(
        args.validate_episodes,
        args.validate_steps,
        args.output,
        max_episode_length=args.max_episode_length
    )

    exp = None

    if args.mode == 'train':
        if hyperdash_support:
            prefix = "real"
            if sim:
                prefix = "sim"

            exp = Experiment("s2r-pusher3dof-ddpg-{}".format(prefix))
            import socket
            exp.param("host", socket.gethostname())
            exp.param("type", prefix)  # sim or real
            exp.param("vanilla", vanilla)  # vanilla or not
            exp.param("torques", torques)
            exp.param("folder", args.output)

            for arg in ["env", "max_episode_length", "train_iter", "seed", "resume"]:
                arg_val = getattr(args, arg)
                exp.param(arg, arg_val)

        train(args, args.train_iter, agent, env, evaluate,
              args.validate_steps, args.output,
              max_episode_length=args.max_episode_length,
              debug=args.debug, exp=exp)

        # when done
        exp.end()

    elif args.mode == 'test':
        test(args.validate_episodes, agent, env, evaluate, args.resume,
             visualize=args.vis, debug=args.debug, load_best=args.best)

    else:
        raise RuntimeError('undefined mode {}'.format(args.mode))
"fthigh": 120, "fshin": 60, "ffoot": 30 }, colored=False ) if args.seed > 0: np.random.seed(args.seed) env.seed(args.seed) nb_states = env.observation_space.shape[0] nb_actions = env.action_space.shape[0] agent = DDPG(nb_states, nb_actions, args) evaluate = Evaluator(args.validate_episodes, args.validate_steps, args.output, max_episode_length=args.max_episode_length) exp = None if args.mode == 'train': exp = Experiment("sim2real-ddpg-real-cheetah") for arg in ["env", "rate", "prate", "hidden1", "hidden2", "warmup", "discount", "bsize", "rmsize", "window_length", "tau", "ou_theta", "ou_sigma", "ou_mu", "validate_episodes", "max_episode_length", "validate_steps", "init_w", "train_iter", "epsilon", "seed", "resume"]: arg_val = getattr(args, arg) import socket exp.param("host", socket.gethostname())
es = OUStrategy(env_spec=env.spec)

qf = ContinuousMLPQFunction(
    env_spec=env.spec,
    hidden_sizes=(100, 100),
    hidden_nonlinearity=tf.nn.relu,
)

algo = DDPG(
    env=env,
    policy=policy,
    es=es,
    qf=qf,
    batch_size=64,
    max_path_length=env.horizon,
    epoch_length=1000,
    min_pool_size=10000,
    n_epochs=args.num_epochs,
    discount=0.99,
    scale_reward=args.reward_scale,
    qf_learning_rate=1e-3,
    policy_learning_rate=1e-4,
    plot=False,
)

run_experiment_lite(
    algo.train(),
    log_dir=None if args.use_ec2 else args.data_dir,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
class Base:
    def __init__(self, env, device, model_dir, args):
        self.env = env
        self.env_name = args.env_name
        self.seed = args.seed
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.max_action = float(self.env.action_space.high[0])
        self.batch_size = args.batch_size
        self.max_timesteps = args.max_timesteps
        self.gaussian_std = args.gaussian_std
        self.start_timesteps = args.start_timesteps
        self.eval_freq = args.eval_freq
        self.rand_action_p = args.rand_action_p
        self.model_dir = os.path.join(model_dir, f"{args.env_name}_{args.seed}")

        self.algo = DDPG(self.state_dim, self.action_dim, self.max_action, device)
        self.storage = ReplayBuffer(self.state_dim, self.action_dim, device)

        self.eval_rewards = []
        self.total_steps = 0
        self.episodes = 0
        self.episode_steps = 0
        self.episode_rewards = 0
        self.state = None

    def iterate(self):
        assert self.state is not None
        self.episode_steps += 1

        if self.is_random_action():
            action = self.env.action_space.sample()
        else:
            action = (self.algo.select_action(np.array(self.state)) +
                      np.random.normal(0, self.max_action * self.gaussian_std,
                                       size=self.action_dim)
                      ).clip(-self.max_action, self.max_action)

        next_state, reward, done, _ = self.env.step(action)
        done_bool = float(done) if self.episode_steps < self.env._max_episode_steps else 0

        self.storage.add(self.state, action, next_state, reward, done_bool)

        self.state = next_state
        self.episode_rewards += reward

        if done:
            print(f"Total T: {self.total_steps + 1} "
                  f"Episode Num: {self.episodes + 1} "
                  f"Episode T: {self.episode_steps} "
                  f"Reward: {self.episode_rewards:.3f}")
            # Reset environment
            self.state = self.env.reset()
            self.episode_rewards = 0
            self.episode_steps = 0
            self.episodes += 1

        self.total_steps += 1

    def evaluate(self, eval_episodes=10):
        eval_env = gym.make(self.env_name)
        eval_env.seed(self.seed + 100)

        avg_reward = 0.
        for _ in range(eval_episodes):
            state, done = eval_env.reset(), False
            while not done:
                action = self.algo.select_action(np.array(state))
                state, reward, done, _ = eval_env.step(action)
                avg_reward += reward

        avg_reward /= eval_episodes

        print("---------------------------------------")
        print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
        print("---------------------------------------")
        return avg_reward
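# --- Hypothetical driver loop (not part of the original snippet) ---
# Base exposes iterate()/evaluate() but no run loop, and it calls
# is_random_action(), which a subclass is expected to provide. A minimal
# sketch, assuming a TD3-style `self.algo.train(replay_buffer, batch_size)`
# update signature (an assumption, not shown above):
#
# class Runner(Base):
#     def is_random_action(self):
#         # Pure exploration during warm-up; a variant could use self.rand_action_p instead.
#         return self.total_steps < self.start_timesteps
#
#     def run(self):
#         self.state = self.env.reset()
#         while self.total_steps < self.max_timesteps:
#             self.iterate()
#             if self.total_steps >= self.start_timesteps:
#                 self.algo.train(self.storage, self.batch_size)  # assumed signature
#             if (self.total_steps + 1) % self.eval_freq == 0:
#                 self.eval_rewards.append(self.evaluate())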
def main():
    if sys.platform.startswith('win'):
        # Add the _win_handler function to the windows console's handler function list
        win32api.SetConsoleCtrlHandler(_win_handler, True)

    if os.path.exists(os.path.join(config_file.config['config_file'], 'config.yaml')):
        config = sth.load_config(config_file.config['config_file'])
    else:
        config = config_file.config
        print('load config from default config.')
    hyper_config = config['hyper parameters']
    train_config = config['train config']
    record_config = config['record config']

    basic_dir = record_config['basic_dir']
    last_name = record_config['project_name'] + '/' \
                + record_config['remark'] \
                + record_config['run_id']
    cp_dir = record_config['checkpoint_basic_dir'] + last_name
    cp_file = cp_dir + '/rb'
    log_dir = record_config['log_basic_dir'] + last_name
    excel_dir = record_config['excel_basic_dir'] + last_name
    config_dir = record_config['config_basic_dir'] + last_name
    sth.check_or_create(basic_dir, 'basic')
    sth.check_or_create(cp_dir, 'checkpoints')
    sth.check_or_create(log_dir, 'logs(summaries)')
    sth.check_or_create(excel_dir, 'excel')
    sth.check_or_create(config_dir, 'config')

    logger = create_logger(
        name='logger',
        console_level=logging.INFO,
        console_format='%(levelname)s : %(message)s',
        logger2file=record_config['logger2file'],
        file_name=os.path.join(log_dir, 'log.txt'),
        file_level=logging.WARNING,
        file_format='%(lineno)d - %(asctime)s - %(module)s - %(funcName)s - %(levelname)s - %(message)s'
    )
    if train_config['train']:
        sth.save_config(config_dir, config)

    if train_config['unity_mode']:
        env = UnityEnvironment()
    else:
        env = UnityEnvironment(
            file_name=train_config['unity_file'],
            no_graphics=True if train_config['train'] else False,
            base_port=train_config['port'])
    brain_name = env.external_brain_names[0]
    brain = env.brains[brain_name]
    # set the memory use proportion of GPU
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # tf_config.gpu_options.per_process_gpu_memory_fraction = 0.5
    tf.reset_default_graph()
    graph = tf.Graph()
    with graph.as_default() as g:
        with tf.Session(graph=g, config=tf_config) as sess:
            logger.info('Algorithm: {0}'.format(train_config['algorithm'].name))
            if train_config['algorithm'] == config_file.algorithms.ppo_sep_ac:
                from ppo.ppo_base import PPO_SEP
                model = PPO_SEP(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                logger.info('PPO_SEP initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.ppo_com:
                from ppo.ppo_base import PPO_COM
                model = PPO_COM(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                logger.info('PPO_COM initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.sac:
                from sac.sac import SAC
                model = SAC(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                logger.info('SAC initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.sac_no_v:
                from sac.sac_no_v import SAC_NO_V
                model = SAC_NO_V(sess=sess,
                                 s_dim=brain.vector_observation_space_size,
                                 a_counts=brain.vector_action_space_size[0],
                                 hyper_config=hyper_config)
                logger.info('SAC_NO_V initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.ddpg:
                from ddpg.ddpg import DDPG
                model = DDPG(sess=sess,
                             s_dim=brain.vector_observation_space_size,
                             a_counts=brain.vector_action_space_size[0],
                             hyper_config=hyper_config)
                logger.info('DDPG initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.td3:
                from td3.td3 import TD3
                model = TD3(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                logger.info('TD3 initialize success.')
            recorder = Recorder(log_dir, excel_dir, record_config, logger,
                                max_to_keep=5, pad_step_number=True, graph=g)
            episode = init_or_restore(cp_dir, sess, recorder, cp_file)
            try:
                if train_config['train']:
                    if not train_config['use_replay_buffer']:
                        train_OnPolicy(sess=sess,
                                       env=env,
                                       brain_name=brain_name,
                                       begin_episode=episode,
                                       model=model,
                                       recorder=recorder,
                                       cp_file=cp_file,
                                       hyper_config=hyper_config,
                                       train_config=train_config)
                    else:
                        train_OffPolicy(sess=sess,
                                        env=env,
                                        brain_name=brain_name,
                                        begin_episode=episode,
                                        model=model,
                                        recorder=recorder,
                                        cp_file=cp_file,
                                        hyper_config=hyper_config,
                                        train_config=train_config)
                    tf.train.write_graph(g, cp_dir, 'raw_graph_def.pb', as_text=False)
                    export_model(cp_dir, g)
                else:
                    inference(env, brain_name, model, train_config)
            except Exception as e:
                logger.error(e)
            finally:
                env.close()
                recorder.close()
                sys.exit()
def main():
    if sys.platform.startswith('win'):
        win32api.SetConsoleCtrlHandler(_win_handler, True)

    if train_config['unity_mode']:
        env = UnityEnvironment()
    else:
        env = UnityEnvironment(
            file_name=train_config['unity_file'],
            no_graphics=True if train_config['train'] else False,
            base_port=train_config['port'])
    brain_name = env.external_brain_names[0]
    brain = env.brains[brain_name]
    # set the memory use proportion of GPU
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # tf_config.gpu_options.per_process_gpu_memory_fraction = 0.5
    tf.reset_default_graph()
    graph = tf.Graph()
    with graph.as_default() as g:
        with tf.Session(graph=g, config=tf_config) as sess:
            print('Algorithm: {0}'.format(train_config['algorithm'].name))
            if train_config['algorithm'] == algorithms.ppo_sep_ac:
                from ppo.ppo_base import PPO_SEP
                model = PPO_SEP(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                print('PPO_SEP initialize success.')
            elif train_config['algorithm'] == algorithms.ppo_com:
                from ppo.ppo_base import PPO_COM
                model = PPO_COM(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                print('PPO_COM initialize success.')
            elif train_config['algorithm'] == algorithms.sac:
                from sac.sac import SAC
                model = SAC(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                print('SAC initialize success.')
            elif train_config['algorithm'] == algorithms.sac_no_v:
                from sac.sac_no_v import SAC_NO_V
                model = SAC_NO_V(sess=sess,
                                 s_dim=brain.vector_observation_space_size,
                                 a_counts=brain.vector_action_space_size[0],
                                 hyper_config=hyper_config)
                print('SAC_NO_V initialize success.')
            elif train_config['algorithm'] == algorithms.ddpg:
                from ddpg.ddpg import DDPG
                model = DDPG(sess=sess,
                             s_dim=brain.vector_observation_space_size,
                             a_counts=brain.vector_action_space_size[0],
                             hyper_config=hyper_config)
                print('DDPG initialize success.')
            elif train_config['algorithm'] == algorithms.td3:
                from td3.td3 import TD3
                model = TD3(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                print('TD3 initialize success.')
            sess.run(tf.global_variables_initializer())
            try:
                if train_config['train']:
                    if not train_config['use_replay_buffer']:
                        train_OnPolicy(sess=sess,
                                       env=env,
                                       brain_name=brain_name,
                                       begin_episode=0,
                                       model=model,
                                       hyper_config=hyper_config,
                                       train_config=train_config)
                    else:
                        train_OffPolicy(sess=sess,
                                        env=env,
                                        brain_name=brain_name,
                                        begin_episode=0,
                                        model=model,
                                        hyper_config=hyper_config,
                                        train_config=train_config)
                else:
                    inference(env, brain_name, model, train_config)
            except Exception as e:
                print(e)
            finally:
                env.close()
                sys.exit()
def run_reacher(args, sim=True):
    try:
        from hyperdash import Experiment
        hyperdash_support = True
    except ImportError:
        hyperdash_support = False

    env = NormalizedEnv(gym.make(args.env))

    torques = [200, 200]  # if real
    colors = None
    if sim:
        torques = [args.t0, args.t1]
        colors = {
            "arenaBackground": ".27 .27 .81",
            "arenaBorders": "1.0 0.8 0.4",
            "arm0": "0.9 0.6 0.9",
            "arm1": "0.9 0.9 0.6"
        }

    env.env.env._init(  # real robot
        torque0=torques[0],  # torque of joint 1
        torque1=torques[1],  # torque of joint 2
        topDown=True,
        colors=colors)

    if args.seed > 0:
        np.random.seed(args.seed)
        env.seed(args.seed)

    nb_states = env.observation_space.shape[0]
    nb_actions = env.action_space.shape[0]

    agent = DDPG(nb_states, nb_actions, args)
    evaluate = Evaluator(args.validate_episodes,
                         args.validate_steps,
                         args.output,
                         max_episode_length=args.max_episode_length)

    exp = None

    if args.mode == 'train':
        if hyperdash_support:
            prefix = "real"
            if sim:
                prefix = "sim"

            exp = Experiment("s2r-reacher-ddpg-{}".format(prefix))
            import socket
            exp.param("host", socket.gethostname())
            exp.param("type", prefix)  # sim or real
            exp.param("torques", [torques[0], torques[1]])
            exp.param("folder", args.output)

            for arg in ["env", "max_episode_length", "train_iter", "seed", "resume"]:
                arg_val = getattr(args, arg)
                exp.param(arg, arg_val)

        train(args, args.train_iter, agent, env, evaluate,
              args.validate_steps, args.output,
              max_episode_length=args.max_episode_length,
              debug=args.debug, exp=exp)

        # when done
        exp.end()

    elif args.mode == 'test':
        test(args.validate_episodes, agent, env, evaluate, args.resume,
             visualize=args.vis, debug=args.debug, load_best=args.best)

    else:
        raise RuntimeError('undefined mode {}'.format(args.mode))
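# --- Hypothetical invocation (not part of the original snippet) ---
# run_reacher() is driven by an argparse namespace built elsewhere in the
# script; the attribute names it reads (env, t0, t1, seed, mode, output, ...)
# must all be present on `args`.
#
# if __name__ == '__main__':
#     args = parser.parse_args()  # argparse parser assumed to be defined elsewhere in the script
#     run_reacher(args, sim=True)  # sim=False would use the fixed real-robot torques instead of args.t0/args.t1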