def __init__(self, env, device, model_dir, args):
    """Wire up the environment, run options, DDPG learner and replay storage.

    State/action dimensions and the action bound are read from the first
    entry of the environment's observation/action space attributes; the
    remaining options are copied verbatim from ``args``.
    """
    self.env = env
    self.env_name = args.env_name
    self.seed = args.seed

    # Quantities derived from the environment's spaces.
    self.state_dim = self.env.observation_space.shape[0]
    action_space = self.env.action_space
    self.action_dim = action_space.shape[0]
    self.max_action = float(self.env.action_space.high[0])

    # Plain run options mirrored one-to-one from the argument namespace.
    for option in (
        "batch_size",
        "max_timesteps",
        "gaussian_std",
        "start_timesteps",
        "eval_freq",
        "rand_action_p",
    ):
        setattr(self, option, getattr(args, option))

    # Per-run output directory: <model_dir>/<env_name>_<seed>.
    run_tag = f"{args.env_name}_{args.seed}"
    self.model_dir = os.path.join(model_dir, run_tag)

    self.algo = DDPG(self.state_dim, self.action_dim, self.max_action, device)
    self.storage = ReplayBuffer(self.state_dim, self.action_dim, device)

    # Progress bookkeeping, all starting from scratch.
    self.eval_rewards = []
    self.total_steps = self.episodes = 0
    self.episode_steps = self.episode_rewards = 0
    self.state = None
def __init__(self):
    """Create the DDPG agent, the DQN solver and a DEBUG-level file logger.

    Relies on module-level names that are not defined in this method:
    ``env``, ``is_batch_norm`` and the ``SA_*`` / ``CA_*`` constants.

    The root logger is configured to write to ``logs/log.log``; the file is
    opened with mode ``'w+'`` on each run.
    """
    # Local import so this method does not depend on the file's import block.
    import os

    self.observation_space = SA_OBS_SPACE
    self.action_space = SA_ACTION_SPACE
    self.agent = DDPG(env, is_batch_norm, CA_OBS_SPACE, CA_ACTION_SPACE, CA_ACTION_BOUND)
    self.dqn_solver = DQNSolver(SA_OBS_SPACE, SA_ACTION_SPACE)

    log_path = "logs/log.log"
    # basicConfig opens the file immediately, so the directory must exist.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    # The keywords are ``filename`` / ``filemode``; the previous
    # ``file_name`` / ``file_mode`` spellings are rejected by Python 3 with
    # "ValueError: Unrecognised argument(s)".
    logging.basicConfig(filename=log_path,
                        format='%(asctime)s %(message)s',
                        filemode='w+')
    self.logger = logging.getLogger()
    self.logger.setLevel(logging.DEBUG)
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render,
          param_noise, actor, critic, normalize_returns,
          normalize_observations, critic_l2_reg, actor_lr, critic_lr,
          action_noise, popart, gamma, clip_norm, nb_train_steps,
          nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01,
          eval_env=None, param_noise_adaption_interval=50):
    """Run the DDPG rollout / train / evaluate loop and log per-epoch stats.

    For each of ``nb_epochs`` epochs, ``nb_epoch_cycles`` cycles are run.
    Each cycle collects ``nb_rollout_steps`` environment steps with noisy
    actions, performs ``nb_train_steps`` agent updates, and, when
    ``eval_env`` is given, runs ``nb_eval_steps`` noise-free evaluation
    steps. After each epoch the statistics are summed across MPI workers
    with ``allreduce``, divided by the number of workers and written through
    ``logger``. Rank 0 additionally pickles ``env.get_state()`` /
    ``eval_env.get_state()`` into the logger directory when those methods
    exist.

    Actions produced by the agent are multiplied by ``env.action_space.high``
    before being passed to ``env.step``; the action space is asserted to be
    symmetric.

    Most keyword-like parameters (``gamma``, ``tau``, ``actor_lr``, ...) are
    forwarded unchanged to the ``DDPG`` constructor.
    ``param_noise_adaption_interval`` is the number of training steps between
    calls to ``agent.adapt_param_noise()``.
    """

    def as_scalar(x):
        # Accept numpy arrays holding exactly one element (any rank,
        # including 0-d) as well as plain scalars.
        if isinstance(x, np.ndarray):
            assert x.size == 1
            return x.flat[0]
        elif np.isscalar(x):
            return x
        else:
            raise ValueError('expected scalar, got %s' % x)

    rank = MPI.COMM_WORLD.Get_rank()
    mpi_size = MPI.COMM_WORLD.Get_size()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape,
                 env.action_space.shape, gamma=gamma, tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr,
                 enable_popart=popart, clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    # NOTE(review): `saver` is never used below. It is kept because the
    # constructor runs before the graph is finalized and may register ops;
    # confirm whether it can be dropped.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        # These accumulators are intentionally not reset between epochs,
        # matching the original behaviour.
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for _ in range(nb_epoch_cycles):
                # Perform rollouts.
                for _ in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True,
                                         compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    # Scale for execution in env (as far as DDPG is
                    # concerned, every action is in [-1, 1]).
                    new_obs, r, done, _ = env.step(max_action * action)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary. The interval is
                    # counted in training steps: the rollout counter `t` is
                    # constant inside this loop, so testing it would adapt on
                    # every training step or on none.
                    if (memory.nb_entries >= batch_size
                            and t_train % param_noise_adaption_interval == 0):
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for _ in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        # Scale for execution in env (as far as DDPG is
                        # concerned, every action is in [-1, 1]).
                        eval_obs, eval_r, eval_done, _ = eval_env.step(
                            max_action * eval_action)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                # Reduce the per-step lists to scalars: `as_scalar` rejects
                # anything that is not a single value, so storing the raw
                # lists raised ValueError whenever an eval env was supplied.
                combined_stats['eval/return'] = np.mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = np.mean(
                    eval_episode_rewards_history)
                combined_stats['eval/Q'] = np.mean(eval_qs)
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            # Sum every statistic across workers, then average.
            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
def run_pusher3dof(args, sim=True, vanilla=False):
    """Train or test a DDPG agent on the 3-DOF pusher environment.

    Args:
        args: Parsed options. Read here: ``env``, ``t0``/``t1``/``t2`` (only
            when ``sim`` is true), ``seed``, ``mode``, ``output``,
            ``train_iter``, ``resume``, ``debug``, ``vis``, ``best`` and the
            ``validate_*`` / ``max_episode_length`` settings. The whole
            object is also forwarded to ``DDPG`` and ``train``.
        sim: When true, use the torques from ``args`` and the coloured
            variant; otherwise use torques of 1.0 and no colouring.
        vanilla: When true, skip the custom ``_init`` call on the wrapped
            environment.

    Raises:
        RuntimeError: If ``args.mode`` is neither ``'train'`` nor ``'test'``.
    """
    # hyperdash is an optional dependency; run without experiment tracking
    # when it is not installed.
    try:
        from hyperdash import Experiment
        hyperdash_support = True
    except ImportError:
        hyperdash_support = False

    env = NormalizedEnv(gym.make(args.env))

    torques = [1.0] * 3  # if real
    colored = False

    if sim:
        torques = [args.t0, args.t1, args.t2]
        colored = True

    if not vanilla:
        env.env._init(
            torques=torques,
            colored=colored
        )

    # Seeds that are zero or negative leave the RNGs unseeded.
    if args.seed > 0:
        np.random.seed(args.seed)
        env.seed(args.seed)

    nb_states = env.observation_space.shape[0]
    nb_actions = env.action_space.shape[0]

    agent = DDPG(nb_states, nb_actions, args)
    evaluate = Evaluator(
        args.validate_episodes,
        args.validate_steps,
        args.output,
        max_episode_length=args.max_episode_length
    )

    exp = None

    if args.mode == 'train':
        if hyperdash_support:
            prefix = "sim" if sim else "real"

            exp = Experiment("s2r-pusher3dof-ddpg-{}".format(prefix))
            import socket

            exp.param("host", socket.gethostname())
            exp.param("type", prefix)  # sim or real
            exp.param("vanilla", vanilla)  # vanilla or not
            exp.param("torques", torques)
            exp.param("folder", args.output)

            for arg in ["env", "max_episode_length", "train_iter", "seed",
                        "resume"]:
                exp.param(arg, getattr(args, arg))

        try:
            train(args, args.train_iter, agent, env, evaluate,
                  args.validate_steps, args.output,
                  max_episode_length=args.max_episode_length,
                  debug=args.debug, exp=exp)
        finally:
            # `exp` stays None when hyperdash is unavailable; calling
            # `.end()` unconditionally crashed after training finished.
            if exp is not None:
                exp.end()

    elif args.mode == 'test':
        test(args.validate_episodes, agent, env, evaluate, args.resume,
             visualize=args.vis, debug=args.debug, load_best=args.best)

    else:
        raise RuntimeError('undefined mode {}'.format(args.mode))
"fthigh": 120, "fshin": 60, "ffoot": 30 }, colored=False ) if args.seed > 0: np.random.seed(args.seed) env.seed(args.seed) nb_states = env.observation_space.shape[0] nb_actions = env.action_space.shape[0] agent = DDPG(nb_states, nb_actions, args) evaluate = Evaluator(args.validate_episodes, args.validate_steps, args.output, max_episode_length=args.max_episode_length) exp = None if args.mode == 'train': exp = Experiment("sim2real-ddpg-real-cheetah") for arg in ["env", "rate", "prate", "hidden1", "hidden2", "warmup", "discount", "bsize", "rmsize", "window_length", "tau", "ou_theta", "ou_sigma", "ou_mu", "validate_episodes", "max_episode_length", "validate_steps", "init_w", "train_iter", "epsilon", "seed", "resume"]: arg_val = getattr(args, arg) import socket exp.param("host", socket.gethostname())
es = OUStrategy(env_spec=env.spec) qf = ContinuousMLPQFunction( env_spec=env.spec, hidden_sizes=(100, 100), hidden_nonlinearity=tf.nn.relu, ) algo = DDPG(env=env, policy=policy, es=es, qf=qf, batch_size=64, max_path_length=env.horizon, epoch_length=1000, min_pool_size=10000, n_epochs=args.num_epochs, discount=0.99, scale_reward=args.reward_scale, qf_learning_rate=1e-3, policy_learning_rate=1e-4, plot=False) run_experiment_lite( algo.train(), log_dir=None if args.use_ec2 else args.data_dir, # Number of parallel workers for sampling n_parallel=1, # Only keep the snapshot parameters for the last iteration snapshot_mode="last", # Specifies the seed for the experiment. If this is not provided, a random seed
def main():
    """Load the configuration, build the selected model and train or run it.

    Steps, in order:
      1. On Windows, register ``_win_handler`` as a console control handler.
      2. Load ``config.yaml`` from the configured directory when it exists,
         otherwise fall back to ``config_file.config``.
      3. Create the checkpoint / log / excel / config directories and the
         logger, and save the configuration when training.
      4. Start the Unity environment, build the model that matches
         ``train_config['algorithm']`` and restore any checkpoint.
      5. Train (on- or off-policy depending on ``use_replay_buffer``) and
         export the graph, or run inference.

    The environment and recorder are always closed, and the process exits
    via ``sys.exit()``. Exceptions raised during training or inference are
    logged with their traceback instead of being propagated.
    """
    if sys.platform.startswith('win'):
        # Add the _win_handler function to the windows console's handler function list
        win32api.SetConsoleCtrlHandler(_win_handler, True)

    config_root = config_file.config['config_file']
    if os.path.exists(os.path.join(config_root, 'config.yaml')):
        config = sth.load_config(config_root)
    else:
        config = config_file.config
        print('load config from config.')

    hyper_config = config['hyper parameters']
    train_config = config['train config']
    record_config = config['record config']

    basic_dir = record_config['basic_dir']
    last_name = record_config['project_name'] + '/' \
        + record_config['remark'] \
        + record_config['run_id']
    cp_dir = record_config['checkpoint_basic_dir'] + last_name
    cp_file = cp_dir + '/rb'
    log_dir = record_config['log_basic_dir'] + last_name
    excel_dir = record_config['excel_basic_dir'] + last_name
    config_dir = record_config['config_basic_dir'] + last_name
    sth.check_or_create(basic_dir, 'basic')
    sth.check_or_create(cp_dir, 'checkpoints')
    sth.check_or_create(log_dir, 'logs(summaries)')
    sth.check_or_create(excel_dir, 'excel')
    sth.check_or_create(config_dir, 'config')

    logger = create_logger(
        name='logger',
        console_level=logging.INFO,
        console_format='%(levelname)s : %(message)s',
        logger2file=record_config['logger2file'],
        # os.path.join instead of `log_dir + '\log.txt'`: the literal held an
        # invalid escape sequence and only formed a real path on Windows.
        file_name=os.path.join(log_dir, 'log.txt'),
        file_level=logging.WARNING,
        file_format=
        '%(lineno)d - %(asctime)s - %(module)s - %(funcName)s - %(levelname)s - %(message)s'
    )
    if train_config['train']:
        sth.save_config(config_dir, config)

    if train_config['unity_mode']:
        env = UnityEnvironment()
    else:
        env = UnityEnvironment(
            file_name=train_config['unity_file'],
            no_graphics=bool(train_config['train']),
            base_port=train_config['port'])
    brain_name = env.external_brain_names[0]
    brain = env.brains[brain_name]
    # set the memory use proportion of GPU
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    tf.reset_default_graph()
    graph = tf.Graph()
    with graph.as_default() as g:
        with tf.Session(graph=g, config=tf_config) as sess:
            logger.info('Algorithm: {0}'.format(
                train_config['algorithm'].name))
            algorithm = train_config['algorithm']
            # Every model class is constructed with the same arguments.
            model_kwargs = dict(
                sess=sess,
                s_dim=brain.vector_observation_space_size,
                a_counts=brain.vector_action_space_size[0],
                hyper_config=hyper_config)
            # Imports stay inside the branches so that only the selected
            # algorithm's module is loaded.
            if algorithm == config_file.algorithms.ppo_sep_ac:
                from ppo.ppo_base import PPO_SEP
                model = PPO_SEP(**model_kwargs)
                logger.info('PPO_SEP initialize success.')
            elif algorithm == config_file.algorithms.ppo_com:
                from ppo.ppo_base import PPO_COM
                model = PPO_COM(**model_kwargs)
                logger.info('PPO_COM initialize success.')
            elif algorithm == config_file.algorithms.sac:
                from sac.sac import SAC
                model = SAC(**model_kwargs)
                logger.info('SAC initialize success.')
            elif algorithm == config_file.algorithms.sac_no_v:
                from sac.sac_no_v import SAC_NO_V
                model = SAC_NO_V(**model_kwargs)
                logger.info('SAC_NO_V initialize success.')
            elif algorithm == config_file.algorithms.ddpg:
                from ddpg.ddpg import DDPG
                model = DDPG(**model_kwargs)
                logger.info('DDPG initialize success.')
            elif algorithm == config_file.algorithms.td3:
                from td3.td3 import TD3
                model = TD3(**model_kwargs)
                logger.info('TD3 initialize success.')
            recorder = Recorder(log_dir,
                                excel_dir,
                                record_config,
                                logger,
                                max_to_keep=5,
                                pad_step_number=True,
                                graph=g)
            episode = init_or_restore(cp_dir, sess, recorder, cp_file)
            try:
                if train_config['train']:
                    # Off-policy training is used only when a replay buffer
                    # is requested.
                    if train_config['use_replay_buffer']:
                        run_training = train_OffPolicy
                    else:
                        run_training = train_OnPolicy
                    run_training(
                        sess=sess,
                        env=env,
                        brain_name=brain_name,
                        begin_episode=episode,
                        model=model,
                        recorder=recorder,
                        cp_file=cp_file,
                        hyper_config=hyper_config,
                        train_config=train_config)
                    tf.train.write_graph(g,
                                         cp_dir,
                                         'raw_graph_def.pb',
                                         as_text=False)
                    export_model(cp_dir, g)
                else:
                    inference(env, brain_name, model, train_config)
            except Exception as e:
                # Keep the best-effort behaviour (do not re-raise) but record
                # the traceback so failures can be diagnosed.
                logger.exception(e)
            finally:
                env.close()
                recorder.close()
                sys.exit()
def main():
    """Start the Unity environment, build the configured model and run it.

    Uses the module-level ``train_config``, ``hyper_config`` and
    ``algorithms``. The model is trained (on- or off-policy depending on
    ``use_replay_buffer``) or used for inference; any exception from that
    phase is printed, the environment is closed and the process exits.
    """
    on_windows = sys.platform.startswith('win')
    if on_windows:
        win32api.SetConsoleCtrlHandler(_win_handler, True)

    if train_config['unity_mode']:
        env = UnityEnvironment()
    else:
        headless = bool(train_config['train'])
        env = UnityEnvironment(file_name=train_config['unity_file'],
                               no_graphics=headless,
                               base_port=train_config['port'])
    brain_name = env.external_brain_names[0]
    brain = env.brains[brain_name]

    # Let TensorFlow grow its GPU memory usage on demand.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    tf.reset_default_graph()
    graph = tf.Graph()

    with graph.as_default() as g, \
            tf.Session(graph=g, config=tf_config) as sess:
        print('Algorithm: {0}'.format(train_config['algorithm'].name))

        # All model classes take the same constructor arguments.
        build_args = {
            'sess': sess,
            's_dim': brain.vector_observation_space_size,
            'a_counts': brain.vector_action_space_size[0],
            'hyper_config': hyper_config,
        }
        chosen = train_config['algorithm']
        # Each branch imports lazily so only the selected module is loaded.
        if chosen == algorithms.ppo_sep_ac:
            from ppo.ppo_base import PPO_SEP
            model = PPO_SEP(**build_args)
            print('PPO_SEP initialize success.')
        elif chosen == algorithms.ppo_com:
            from ppo.ppo_base import PPO_COM
            model = PPO_COM(**build_args)
            print('PPO_COM initialize success.')
        elif chosen == algorithms.sac:
            from sac.sac import SAC
            model = SAC(**build_args)
            print('SAC initialize success.')
        elif chosen == algorithms.sac_no_v:
            from sac.sac_no_v import SAC_NO_V
            model = SAC_NO_V(**build_args)
            print('SAC_NO_V initialize success.')
        elif chosen == algorithms.ddpg:
            from ddpg.ddpg import DDPG
            model = DDPG(**build_args)
            print('DDPG initialize success.')
        elif chosen == algorithms.td3:
            from td3.td3 import TD3
            model = TD3(**build_args)
            print('TD3 initialize success.')

        sess.run(tf.global_variables_initializer())
        try:
            if not train_config['train']:
                inference(env, brain_name, model, train_config)
            else:
                # The replay-buffer flag selects the off-policy trainer.
                if train_config['use_replay_buffer']:
                    trainer = train_OffPolicy
                else:
                    trainer = train_OnPolicy
                trainer(sess=sess,
                        env=env,
                        brain_name=brain_name,
                        begin_episode=0,
                        model=model,
                        hyper_config=hyper_config,
                        train_config=train_config)
        except Exception as e:
            print(e)
        finally:
            env.close()
            sys.exit()
def run_reacher(args, sim=True):
    """Train or test a DDPG agent on the reacher environment.

    Args:
        args: Parsed options. Read here: ``env``, ``t0``/``t1`` (only when
            ``sim`` is true), ``seed``, ``mode``, ``output``, ``train_iter``,
            ``resume``, ``debug``, ``vis``, ``best`` and the ``validate_*`` /
            ``max_episode_length`` settings. The whole object is also
            forwarded to ``DDPG`` and ``train``.
        sim: When true, use the torques from ``args`` and a custom colour
            scheme; otherwise use torques of 200 and no colour override.

    Raises:
        RuntimeError: If ``args.mode`` is neither ``'train'`` nor ``'test'``.
    """
    # hyperdash is an optional dependency; run without experiment tracking
    # when it is not installed.
    try:
        from hyperdash import Experiment
        hyperdash_support = True
    except ImportError:
        hyperdash_support = False

    env = NormalizedEnv(gym.make(args.env))

    torques = [200, 200]  # if real
    colors = None

    if sim:
        torques = [args.t0, args.t1]
        colors = {
            "arenaBackground": ".27 .27 .81",
            "arenaBorders": "1.0 0.8 0.4",
            "arm0": "0.9 0.6 0.9",
            "arm1": "0.9 0.9 0.6"
        }

    env.env.env._init(  # real robot
        torque0=torques[0],  # torque of joint 1
        # Was `torques[0]`, which silently ignored the second torque
        # (args.t1) even though both values are reported to the experiment.
        torque1=torques[1],  # torque of joint 2
        topDown=True,
        colors=colors)

    # Seeds that are zero or negative leave the RNGs unseeded.
    if args.seed > 0:
        np.random.seed(args.seed)
        env.seed(args.seed)

    nb_states = env.observation_space.shape[0]
    nb_actions = env.action_space.shape[0]

    agent = DDPG(nb_states, nb_actions, args)
    evaluate = Evaluator(args.validate_episodes,
                         args.validate_steps,
                         args.output,
                         max_episode_length=args.max_episode_length)

    exp = None

    if args.mode == 'train':
        if hyperdash_support:
            prefix = "sim" if sim else "real"

            exp = Experiment("s2r-reacher-ddpg-{}".format(prefix))
            import socket

            exp.param("host", socket.gethostname())
            exp.param("type", prefix)  # sim or real
            exp.param("torques", [torques[0], torques[1]])
            exp.param("folder", args.output)

            for arg in [
                    "env", "max_episode_length", "train_iter", "seed",
                    "resume"
            ]:
                exp.param(arg, getattr(args, arg))

        try:
            train(args,
                  args.train_iter,
                  agent,
                  env,
                  evaluate,
                  args.validate_steps,
                  args.output,
                  max_episode_length=args.max_episode_length,
                  debug=args.debug,
                  exp=exp)
        finally:
            # `exp` stays None when hyperdash is unavailable; calling
            # `.end()` unconditionally crashed after training finished.
            if exp is not None:
                exp.end()

    elif args.mode == 'test':
        test(args.validate_episodes,
             agent,
             env,
             evaluate,
             args.resume,
             visualize=args.vis,
             debug=args.debug,
             load_best=args.best)

    else:
        raise RuntimeError('undefined mode {}'.format(args.mode))