def main():
    """Restore a saved DeepQ model and evaluate it with the wang2015 protocol."""
    set_global_seeds(1)
    args = parse_args()
    with U.make_session(4):
        _, env = make_env(args.env)
        # Pick the network architecture to match how the model was trained.
        q_func = dueling_model if args.dueling else model
        act = deepq.build_act(
            make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
            q_func=q_func,
            num_actions=env.action_space.n)
        # Restore the trained weights before running the evaluation episodes.
        U.load_state(os.path.join(args.model_dir, "saved"))
        wang2015_eval(args.env, act, stochastic=args.stochastic)
def run(env_id, seed, noise_type, num_cpu, layer_norm, logdir, gym_monitor, evaluation, bind_to_core, **kwargs):
    """Fork `num_cpu` MPI workers and train DDPG on `env_id`.

    Only rank 0 keeps its logger and (optionally) gym video monitors; every
    other rank logs to a throwaway temp directory and is silenced.
    """
    kwargs['logdir'] = logdir
    whoami = mpi_fork(num_cpu, bind_to_core=bind_to_core)
    if whoami == 'parent':
        sys.exit(0)

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Non-master workers: redirect all logging to a temp dir and mute it.
        actual_dir = None
        Logger.CURRENT.close()
        Logger.CURRENT = Logger(dir=mkdtemp(), output_formats=[])
        logger.set_level(logger.DISABLED)

    # Build the training env (and, on rank 0, optionally an evaluation env).
    if rank == 0:
        env = gym.make(env_id)
        if gym_monitor and logdir:
            env = gym.wrappers.Monitor(env, os.path.join(logdir, 'gym_train'), force=True)
        env = SimpleMonitor(env)
        if evaluation:
            eval_env = gym.make(env_id)
            if gym_monitor and logdir:
                eval_env = gym.wrappers.Monitor(eval_env, os.path.join(logdir, 'gym_eval'), force=True)
            eval_env = SimpleMonitor(eval_env)
        else:
            eval_env = None
    else:
        env = gym.make(env_id)
        eval_env = gym.make(env_id) if evaluation else None

    # Translate the comma-separated noise spec into noise objects.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for spec in noise_type.split(','):
        spec = spec.strip()
        if spec == 'none':
            pass
        elif 'adaptive-param' in spec:
            _, stddev = spec.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in spec:
            _, stddev = spec.split('_')
            mu = np.zeros(nb_actions)
            sigma = float(stddev) * np.ones(nb_actions)
            action_noise = NormalActionNoise(mu=mu, sigma=sigma)
        elif 'ou' in spec:
            _, stddev = spec.split('_')
            mu = np.zeros(nb_actions)
            sigma = float(stddev) * np.ones(nb_actions)
            action_noise = OrnsteinUhlenbeckActionNoise(mu=mu, sigma=sigma)
        else:
            raise RuntimeError('unknown noise type "{}"'.format(spec))

    # Replay buffer and actor/critic networks.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Per-rank seeding: reproducible, yet decorrelated across workers.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    Logger.CURRENT.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
# NOTE(review): fragment of a DeepQ training script's main() — picks the save
# directory (optionally backed by an Azure blob container), creates and seeds
# the env, attaches an optional gym monitor, dumps the CLI args, then opens a
# TF session and starts building the training graph. The chunk ends mid-call
# to deepq.build_train(...), so it is incomplete in this view; kept verbatim.
savedir = os.getenv('OPENAI_LOGDIR', None) if args.save_azure_container is not None: account_name, account_key, container_name = args.save_azure_container.split(":") container = Container(account_name=account_name, account_key=account_key, container_name=container_name, maybe_create=True) if savedir is None: # Careful! This will not get cleaned up. Docker spoils the developers. savedir = tempfile.TemporaryDirectory().name else: container = None # Create and seed the env. env, monitored_env = make_env(args.env) if args.seed > 0: set_global_seeds(args.seed) env.unwrapped.seed(args.seed) if args.gym_monitor and savedir: env = gym.wrappers.Monitor(env, os.path.join(savedir, 'gym_monitor'), force=True) if savedir: with open(os.path.join(savedir, 'args.json'), 'w') as f: json.dump(vars(args), f) with U.make_session(4) as sess: # Create training graph and replay buffer def model_wrapper(img_in, num_actions, scope, **kwargs): actual_model = dueling_model if args.dueling else model return actual_model(img_in, num_actions, scope, layer_norm=args.layer_norm, **kwargs) act, train, update_target, debug = deepq.build_train(
def run(env_id, seed, noise_type, layer_norm, evaluation, share_top_layer, **kwargs):
    """Train DDPG on `env_id`, optionally sharing the actor/critic top layer.

    The episode-reward CSV filename records which of the two configurations
    produced the run.
    """
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Training env, monitored per rank.
    env = gym.make(env_id)
    env = bench.Monitor(
        env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        # NOTE(review): this re-wraps the already-monitored training env, as in
        # the upstream script — kept to preserve behavior; confirm intent.
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse the comma-separated noise specification.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for spec in noise_type.split(','):
        spec = spec.strip()
        if spec == 'none':
            pass
        elif 'adaptive-param' in spec:
            _, stddev = spec.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in spec:
            _, stddev = spec.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in spec:
            _, stddev = spec.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(spec))

    # Replay buffer and networks; the sharing flag is threaded through both.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm, share_top_layer=share_top_layer)
    actor = Actor(nb_actions, layer_norm=layer_norm, share_top_layer=share_top_layer)

    # Per-rank seeding keeps runs reproducible yet decorrelated across workers.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    if rank == 0:
        start_time = time.time()
    reward_filename = ('episode_reward_share_top_layer.csv' if share_top_layer
                       else 'episode_reward_normal.csv')
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, reward_filename=reward_filename, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
# NOTE(review): fragment of a DDPG launcher — the tail of the noise-spec
# parser, component construction (memory/critic/actor), per-rank seeding
# (tf.reset_default_graph is commented out here, unlike sibling scripts),
# and derivation of nb_epochs from num_timesteps // (rollout * cycles).
# No enclosing `def` is visible, so this chunk is incomplete; kept verbatim.
action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) # Configure components of DDPG memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) critic = Critic(layer_norm=args.layer_norm) actor = Actor(nb_actions, layer_norm=args.layer_norm) # Seed everything to make things reproducible. seed = args.seed + 1000000 * rank logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir())) # tf.reset_default_graph() set_global_seeds(seed) env.seed(seed) if eval_env is not None: eval_env.seed(seed) # Disable logging for rank != 0 to avoid noise. if rank == 0: start_time = time.time() # Derive the different numbers for the training process num_timesteps = args.num_timesteps nb_rollout_steps = args.nb_rollout_steps nb_epoch_cycles = args.nb_epoch_cycles nb_epochs = num_timesteps//(nb_rollout_steps*nb_epoch_cycles) # Just train
def run(env_id, seed, noise_type, num_cpu, layer_norm, logdir, gym_monitor, evaluation,
        bind_to_core, hidden_size, nb_layers, portnum, **kwargs):
    """Fork MPI workers and train DDPG with configurable network width/depth.

    Each worker claims its own port (portnum + rank) via the shared `utils`
    module so simulator backends do not collide.
    """
    kwargs['logdir'] = logdir
    whoami = mpi_fork(num_cpu, bind_to_core=bind_to_core)
    if whoami == 'parent':
        sys.exit(0)

    rank = MPI.COMM_WORLD.Get_rank()
    utils.portnum = portnum + rank
    if rank != 0:
        # Non-master workers: redirect all logging to a temp dir and mute it.
        actual_dir = None
        Logger.CURRENT.close()
        Logger.CURRENT = Logger(dir=mkdtemp(), output_formats=[])
        logger.set_level(logger.DISABLED)

    # Build the training env (and, on rank 0, optionally an evaluation env).
    if rank == 0:
        env = gym.make(env_id)
        if gym_monitor and logdir:
            env = gym.wrappers.Monitor(env, os.path.join(logdir, 'gym_train'), force=True)
        env = SimpleMonitor(env)
        if evaluation:
            eval_env = gym.make(env_id)
            if gym_monitor and logdir:
                eval_env = gym.wrappers.Monitor(eval_env, os.path.join(logdir, 'gym_eval'), force=True)
            eval_env = SimpleMonitor(eval_env)
        else:
            eval_env = None
    else:
        env = gym.make(env_id)
        eval_env = gym.make(env_id) if evaluation else None

    # Parse the comma-separated noise specification.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for spec in noise_type.split(','):
        spec = spec.strip()
        if spec == 'none':
            pass
        elif 'adaptive-param' in spec:
            _, stddev = spec.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in spec:
            _, stddev = spec.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in spec:
            _, stddev = spec.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(spec))

    # Replay buffer and networks sized by the CLI-provided width/depth.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_size=hidden_size, nb_layers=nb_layers, layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_size=hidden_size, nb_layers=nb_layers, layer_norm=layer_norm)

    # Per-rank seeding keeps runs reproducible yet decorrelated across workers.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    Logger.CURRENT.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
# NOTE(review): fragment of an episodic-control/DQN evaluation script —
# registers and builds a MonsterKong PLE env (else branch falls back to an
# Atari env), seeds it, wraps it in a GIF recorder, and prepares TF summary
# objects for reward/QEC statistics. The chunk begins with a stray `)` from a
# call that started before this view, so it is incomplete; kept verbatim.
) from gym.envs.registration import register register( id='MonsterKong-v0', entry_point='baselines.ple.gym_env.monsterkong:MonsterKongEnv', kwargs={'map_config': map_config}, ) env = gym.make('MonsterKong-v0') env = ProcessFrame(env) else: env = create_atari_environment(args.env, sticky_actions=False) if args.seed > 0: set_global_seeds(args.seed) env.unwrapped.seed(args.seed) print("obs shape", env.observation_space.shape) env = GIFRecorder(video_path=args.video_path + "/{}/".format(args.comment), record_video=True, env=env) subdir = ( datetime.datetime.now()).strftime("%m-%d-%Y-%H:%M:%S") + " " + args.comment # tf_writer = tf.summary.FileWriter(os.path.join(args.log_dir, subdir), tf.get_default_graph()) value_summary = tf.Summary() qec_summary = tf.Summary() value_summary.value.add(tag='discount_reward_mean') value_summary.value.add(tag='non_discount_reward_mean') # value_summary.value.add(tag='episode') qec_summary.value.add(tag='qec_mean')
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """Train DDPG on the rear-wheel path-tracking feedback environment.

    A classical rear-wheel feedback controller is built alongside the env and
    handed to the training loop (the RL agent works with/around it there).
    """
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Custom env (not a gym.make id) plus the analytic feedback controller.
    env = Rear_Wheel_Path_Tracking_Feedback()
    env = bench.Monitor(
        env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    controller = rear_wheel_feedback_control()
    if evaluation and rank == 0:
        eval_env = Rear_Wheel_Path_Tracking_Feedback()
        eval_env.eval_flag = True  # switch the eval env into evaluation mode
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        # NOTE(review): re-wraps the already-monitored training env, mirroring
        # the upstream baselines script — kept to preserve behavior.
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse the comma-separated noise specification.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for spec in noise_type.split(','):
        spec = spec.strip()
        if spec == 'none':
            pass
        elif 'adaptive-param' in spec:
            _, stddev = spec.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in spec:
            _, stddev = spec.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in spec:
            _, stddev = spec.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(spec))

    # Replay buffer and actor/critic networks.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Per-rank seeding keeps runs reproducible yet decorrelated across workers.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    if rank == 0:
        start_time = time.time()
    training_controller.train(env=env, eval_env=eval_env, controller=controller,
                              param_noise=param_noise, action_noise=action_noise,
                              actor=actor, critic=critic, memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def run(env_id, seed, noise_type, layer_norm, evaluation, custom_log_dir, **kwargs):
    """Train DDPG on `env_id` while recording full traces of every episode.

    Train and (optional) eval envs are wrapped in TraceRecordingWrapper, each
    writing to a timestamped directory under `custom_log_dir`.
    """
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Timestamped recording directory for the training env.
    train_recording_path = os.path.join(
        custom_log_dir, env_id, 'train', datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
    os.makedirs(train_recording_path)
    env = gym.make(env_id)
    env = TraceRecordingWrapper(env, directory=train_recording_path, buffer_batch_size=10)
    logger.info('TraceRecordingWrapper dir: {}'.format(env.directory))
    if evaluation and rank == 0:
        # Separate timestamped directory for evaluation traces.
        eval_recording_path = os.path.join(
            custom_log_dir, env_id, 'eval', datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
        os.makedirs(eval_recording_path)
        eval_env = gym.make(env_id)
        eval_env = TraceRecordingWrapper(eval_env, directory=eval_recording_path,
                                         buffer_batch_size=10)
        logger.info('TraceRecordingWrapper eval dir: {}'.format(eval_env.directory))
    else:
        eval_env = None

    # Parse the comma-separated noise specification.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for spec in noise_type.split(','):
        spec = spec.strip()
        if spec == 'none':
            pass
        elif 'adaptive-param' in spec:
            _, stddev = spec.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in spec:
            _, stddev = spec.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in spec:
            _, stddev = spec.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(spec))

    # Replay buffer and actor/critic networks.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Per-rank seeding keeps runs reproducible yet decorrelated across workers.
    seed = seed + 1000000 * rank
    logger.info('DDPG: rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """Train DDPG on a stock-portfolio environment built from historical prices.

    Builds three PortfolioEnv instances: a training env with bounded episode
    length, plus full-length inference envs over the train and test splits.
    """
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # ---------------------------- default data ----------------------------
    history, abbreviation = read_stock_history(filepath='utils/datasets/stocks_history_target.h5')
    history = history[:, :, :4]
    # Replace each day's open with the previous day's close ("correct opens").
    history[:, 1:, 0] = history[:, 0:-1, 3]
    target_stocks = abbreviation
    num_training_time = 1095  # days in the training split
    # Training split: first num_training_time days of every target stock.
    target_history = np.empty(shape=(len(target_stocks), num_training_time, history.shape[2]))
    for idx, stock in enumerate(target_stocks):
        target_history[idx] = history[abbreviation.index(stock), :num_training_time, :]
    print("target:", target_history.shape)
    # Test split: the remaining days of the same stocks.
    testing_stocks = abbreviation
    test_history = np.empty(shape=(len(testing_stocks),
                                   history.shape[1] - num_training_time,
                                   history.shape[2]))
    for idx, stock in enumerate(testing_stocks):
        test_history[idx] = history[abbreviation.index(stock), num_training_time:, :]
    print("test:", test_history.shape)
    window_length = kwargs['window_length']
    max_rollout_steps = kwargs['nb_rollout_steps']
    # -----------------------------------------------------------------------
    train_env = PortfolioEnv(
        target_history, target_stocks,
        steps=min(max_rollout_steps, target_history.shape[1] - window_length - 2),
        window_length=window_length)
    infer_train_env = PortfolioEnv(
        target_history, target_stocks,
        steps=target_history.shape[1] - window_length - 2,
        window_length=window_length)
    infer_test_env = PortfolioEnv(
        test_history, testing_stocks,
        steps=test_history.shape[1] - window_length - 2,
        window_length=window_length)
    # Evaluation lengths are dictated by the inference envs.
    kwargs['nb_eval_steps'] = infer_train_env.steps
    kwargs['nb_eval_test_steps'] = infer_test_env.steps
    print("SPACE:", train_env.observation_space.shape)

    # Parse the comma-separated noise specification.
    action_noise = None
    param_noise = None
    nb_actions = train_env.action_space.shape[-1]
    for spec in noise_type.split(','):
        spec = spec.strip()
        if spec == 'none':
            pass
        elif 'adaptive-param' in spec:
            _, stddev = spec.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in spec:
            _, stddev = spec.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in spec:
            _, stddev = spec.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(spec))

    # Replay buffer and networks; both networks see the asset-feature layout.
    memory = Memory(limit=int(1e6), action_shape=train_env.action_space.shape,
                    observation_shape=train_env.observation_space.shape)
    critic = Critic(nb_actions, layer_norm=layer_norm,
                    asset_features_shape=train_env.asset_features_shape)
    actor = Actor(nb_actions, layer_norm=layer_norm,
                  asset_features_shape=train_env.asset_features_shape)

    # Per-rank seeding keeps runs reproducible yet decorrelated across workers.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    train_env.seed(seed)
    infer_train_env.seed(seed)
    infer_test_env.seed(seed)

    if rank == 0:
        start_time = time.time()
    training.train(env=train_env, train_eval_env=infer_train_env,
                   test_eval_env=infer_test_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    train_env.close()
    infer_train_env.close()
    infer_test_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """
    Run DDPG training.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial random seed
    :param noise_type: (str) the wanted noises ('adaptive-param', 'normal' or 'ou');
        multiple noise types can be combined by separating them with commas
    :param layer_norm: (bool) use layer normalization
    :param evaluation: (bool) enable evaluation of DDPG training
    :param kwargs: (dict) extra keywords for the training.train function
    """
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Training env, monitored per rank.
    env = gym.make(env_id)
    env = bench.Monitor(
        env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        # NOTE(review): re-wraps the already-monitored training env, as in the
        # upstream script — kept to preserve behavior.
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse the comma-separated noise specification.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for spec in noise_type.split(','):
        spec = spec.strip()
        if spec == 'none':
            pass
        elif 'adaptive-param' in spec:
            _, stddev = spec.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in spec:
            _, stddev = spec.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in spec:
            _, stddev = spec.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(spec))

    # Replay buffer and actor/critic networks.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Per-rank seeding keeps runs reproducible yet decorrelated across workers.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
# NOTE(review): REMPS experiment launcher. Selects env/policy/model by env_id
# (1=CartPole with a discrete policy, 2=TORCS with a Gaussian policy, 3=NChain
# with a one-parameter policy), optionally randomizes the environment
# parameters omega, assembles log/checkpoint paths from the hyperparameters,
# then delegates to remps_runner.train. Kept byte-identical: the flattened
# text makes the nesting of the checkpoint/logdir branches (which statements
# sit inside `if checkpoint_file is None` / `if not restore_variables`)
# ambiguous, so a reformat cannot be guaranteed behavior-preserving —
# confirm against the original file before restructuring.
def runExp( checkpoint_file, logdir, omega, random_init, max_steps, hidden_layer_size, n_trajectories, file_suffix, restore_variables, overwrite_log, env_id, seed, exact, **kwargs, ): set_global_seeds(seed) # setup environments and policy if env_id == 1: env = CartPole(max_steps=max_steps) env_name = "cartPole" policy = Discrete(env.observation_space.shape[0], env.action_space_size, hidden_layer_size) model_approx = CartPoleActionNoise() if not exact: model_approx = NNModel(env.observation_space_size, 1, name=env_name) elif env_id == 2: env = Torcs(visual=False, port=kwargs["initial_port"]) policy = Gaussian( env.observation_space_size, env.action_space_size, hidden_layer_size=hidden_layer_size, ) env_name = "TORCS" model_approx = TorcsModel( env.observation_space_size, env.action_space_size, name=env_name + "2_actions", ) elif env_id == 3: env = NChainEnv(max_steps=max_steps) env_name = "chain" # initialize policy parameter if not random_init: init_theta = 0.2 else: init_theta = np.random.rand() policy = OneParameterPolicy(init_theta=init_theta) model_approx = ChainModel() else: raise ValueError("Wrong environment index") # initialize environment parameters if random_init: omega_bounds = env.get_params_bounds() omega = np.random.uniform(low=omega_bounds[:, 0], high=omega_bounds[:, 1]) env.set_params(omega) algo_name = "REMPS" experiment_name = (algo_name + "/" + env_name + "-omega" + str(omega) + "-traj" + str(n_trajectories) + "-DualReg" + str(kwargs["dual_reg"]) + "PolReg-" + str(kwargs["policy_reg"]) + "TrainingSet" + str(kwargs["training_set_size"])) if exact: experiment_name = experiment_name + "exact" experiment_name += str(seed) if file_suffix is not None: experiment_name = experiment_name + "-" + file_suffix if logdir is None: logdir = ("tf_logs/model_policy_logs/" + experiment_name + "eps-" + str(kwargs["epsilon"])) now = datetime.now() # do not overwrite log files if os.path.isdir(logdir) and (not overwrite_log): logdir = logdir + "-" + 
now.strftime("%Y%m%d-%H%M%S") + "/" if checkpoint_file is None: experiment_name = ("model-policy/" + experiment_name + "eps-" + str(kwargs["epsilon"])) checkpoint_file = "tf_checkpoint/" + experiment_name + "/" if not restore_variables: # do not overwrite checkpoint files if os.path.isdir(checkpoint_file): checkpoint_file = checkpoint_file[:-1] + now.strftime( "%Y%m%d-%H%M%S") + "/" else: os.makedirs(checkpoint_file) checkpoint_file += "model.ckpt" print("Logs will be saved into: " + logdir) print("Checkpoints will be saved into: " + checkpoint_file) remps_runner.train( env=env, policy=policy, model_approximator=model_approx, n_trajectories=n_trajectories, checkpoint_file=checkpoint_file, logdir=logdir, omega=omega, restore_variables=restore_variables, exact=exact, **kwargs, )
def run(seed, parameter_noise, layer_norm, evaluation, flip_state, full, action_repeat,
        fail_reward, exclude_centering_frame, checkpoint_dir, log_dir, session_path,
        last_training_step, integrator_accuracy, experiment_name, **kwargs):
    """Train DDPG on an osim-style environment with a flip-augmented replay buffer.

    A LearningSession records run parameters, checkpoints and logs; training is
    resumable from `last_training_step`.
    """
    # Timesteps are derived from epochs * cycles * rollout; when the caller
    # specifies them explicitly, the two must agree.
    if kwargs['num_timesteps'] is not None:
        assert (kwargs['num_timesteps'] ==
                kwargs['nb_epochs'] * kwargs['nb_epoch_cycles'] * kwargs['nb_rollout_steps'])

    # Fall back to experiment-derived log/checkpoint dirs when none were given.
    tmp_log, tmp_chkpt = get_log_and_checkpoint_dirs(experiment_name)
    if log_dir is None:
        log_dir = tmp_log
    if checkpoint_dir is None:
        checkpoint_dir = tmp_chkpt

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Main env; no separate evaluation env in this script.
    env = create_environment(False, full, action_repeat, fail_reward,
                             exclude_centering_frame, integrator_accuracy)
    env.reset()
    eval_env = None

    # Fixed OU action noise; parameter noise is optional.
    nb_actions = env.action_space.shape[-1]
    param_noise = (AdaptiveParamNoiseSpec(initial_stddev=0.2, desired_action_stddev=0.2)
                   if parameter_noise else None)
    action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=0.2, theta=0.1)

    # Replay buffer (with optional state flipping) and actor/critic networks.
    memory = ReplayBufferFlip(int(5e6), flip_state, env.get_observation_names(),
                              env.action_space.shape, env.observation_space.shape)
    actor = Actor(nb_actions, layer_norm=layer_norm)
    critic = Critic(layer_norm=layer_norm)

    # Per-rank seeding keeps runs reproducible yet decorrelated across workers.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    if rank == 0:
        start_time = time.time()

    # Build the LearningSession from the run parameters (kwargs minus the
    # entries that training.train does not accept).
    del kwargs['func']
    sess_args = pack_run_params(seed, parameter_noise, layer_norm, evaluation,
                                flip_state, full, action_repeat, fail_reward,
                                exclude_centering_frame, **kwargs)
    learning_session = LearningSession(session_path, checkpoint_dir, log_dir,
                                       last_training_step, **sess_args)
    del kwargs['num_timesteps']
    del kwargs['noise_type']
    training.train(env=env, action_noise=action_noise, param_noise=param_noise,
                   actor=actor, critic=critic, memory=memory, visualize=False,
                   full=full, action_repeat=action_repeat, fail_reward=fail_reward,
                   exclude_centering_frame=exclude_centering_frame,
                   learning_session=learning_session,
                   integrator_accuracy=integrator_accuracy, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """Standard DDPG launcher: build envs and noise, then hand off to training.train."""
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Training env, monitored per rank.
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        # NOTE(review): re-wraps the already-monitored training env, as in the
        # upstream script — kept to preserve behavior.
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse the comma-separated noise specification.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for spec in noise_type.split(','):
        spec = spec.strip()
        if spec == 'none':
            pass
        elif 'adaptive-param' in spec:
            _, stddev = spec.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in spec:
            _, stddev = spec.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in spec:
            _, stddev = spec.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(spec))

    # Replay buffer and actor/critic networks.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Per-rank seeding keeps runs reproducible yet decorrelated across workers.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))