def experiment(n_epochs, n_episodes):
    np.random.seed()

    logger = Logger(COPDAC_Q.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + COPDAC_Q.__name__)

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(mdp.info, policy, mu,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high,
                                     phi, phi)
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        logger.epoch_info(i + 1, R_mean=np.sum(J) / n_steps / n_episodes)

    logger.info('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
def experiment(alg, params, n_epochs, fit_per_epoch, ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = ShipSteering()

    # Policy
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles,
                             low=low, high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size,)

    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, features=phi, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_fit)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0, J=np.mean(J))

    for i in range(n_epochs):
        core.learn(n_episodes=fit_per_epoch * ep_per_fit,
                   n_episodes_per_fit=ep_per_fit)
        dataset_eval = core.evaluate(n_episodes=ep_per_fit)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(i + 1, J=np.mean(J))
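# Not part of the original snippet: an illustrative entry point showing how
# the ShipSteering experiment above can be invoked with black-box
# policy-search algorithms from mushroom_rl (REPS, RWR, PGPE). The
# hyperparameter values and epoch counts are placeholders, not tuned settings.
if __name__ == '__main__':
    algs_params = [
        (REPS, {'eps': 1.}),
        (RWR, {'beta': .7}),
        (PGPE, {'optimizer': AdaptiveOptimizer(eps=1.5)}),
    ]

    for alg, params in algs_params:
        experiment(alg, params, n_epochs=25, fit_per_epoch=10, ep_per_fit=20)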
def test_torch_ensemble_logger(tmpdir):
    np.random.seed(1)
    torch.manual_seed(1)

    logger = Logger('ensemble_logger', results_dir=tmpdir, use_timestamp=True)

    approximator = Regressor(TorchApproximator, input_shape=(4,),
                             output_shape=(2,), n_models=3,
                             network=ExampleNet,
                             optimizer={'class': optim.Adam, 'params': {}},
                             loss=F.mse_loss, batch_size=100, quiet=True)

    approximator.set_logger(logger)

    x = np.random.rand(1000, 4)
    y = np.random.rand(1000, 2)

    for i in range(50):
        approximator.fit(x, y)

    loss_file = np.load(logger.path / 'loss.npy')

    assert loss_file.shape == (50, 3)
    assert np.allclose(loss_file[0],
                       np.array([0.29083753, 0.86829887, 1.0505845])) and \
        np.allclose(loss_file[-1],
                    np.array([0.09410495, 0.18786799, 0.15016919]))
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()
    print('============ start experiment ============')

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = GraspEnv()
    print('============ mdp ============')

    # Policy
    n_weights = 6
    mu = np.array([-0.5, 0.0, 0.91, m.pi, 0, 0])
    sigma = np.asarray([0.05, 0.05, 0.05, 0.1, 0.1, 0.1])
    # alternative: np.asarray([0.15, 0.15, 0.15, 0.4, 0.4, 0.4])
    policy = Own_policy()
    dist = GaussianDiagonalDistribution(mu, sigma)
    agent = alg(mdp.info, dist, policy, **params)

    # Train, collecting the dataset to compute the learning curve
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in range(n_epochs):
        print('================ core learn ================')
        core.learn(n_episodes=n_episodes, n_episodes_per_fit=n_ep_per_fit)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        print('J:', J)
        print('============================')
        dataset_callback.clean()

        # Record the learning curve of the distribution parameters
        p = dist.get_parameters()
        print('p:', p)
        mu_0.append(p[:n_weights][0])
        mu_1.append(p[:n_weights][1])
        mu_2.append(p[:n_weights][2])
        mu_3.append(p[:n_weights][3])
        mu_4.append(p[:n_weights][4])
        mu_5.append(p[:n_weights][5])
        current_avg_sigma = (p[n_weights:][0] + p[n_weights:][1]
                             + p[n_weights:][2] + p[n_weights:][3]
                             + p[n_weights:][4] + p[n_weights:][5]) / 6
        avg_sigma.append(current_avg_sigma)

        # Record the learning curve of cumulative rewards
        logger.epoch_info(i + 1, J=np.mean(J), mu=p[:n_weights],
                          sigma=p[n_weights:])
        list_J.append(np.mean(J))
def experiment(alg, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = LQR.generate(dimensions=2, max_action=1., max_pos=1.)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 0.25 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    optimizer = AdaptiveOptimizer(eps=1e-2)
    algorithm_params = dict(optimizer=optimizer)
    agent = alg(mdp.info, policy, **algorithm_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0, J=np.mean(J),
                      policy_weights=policy.get_weights().tolist())

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(i + 1, J=np.mean(J),
                          policy_weights=policy.get_weights().tolist())
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(mdp.info, dist, policy, **params)

    # Train
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit, render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        logger.epoch_info(i + 1, J=np.mean(J), mu=p[:n_weights],
                          sigma=p[n_weights:])

    logger.info('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
def experiment(alg, params, n_epochs, fit_per_epoch, ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.eye(policy.weights_size)
    distribution = GaussianCholeskyDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_fit)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0, J=np.mean(J),
                      distribution_parameters=distribution.get_parameters())

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=fit_per_epoch * ep_per_fit,
                   n_episodes_per_fit=ep_per_fit)
        dataset_eval = core.evaluate(n_episodes=ep_per_fit)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(
            i + 1, J=np.mean(J),
            distribution_parameters=distribution.get_parameters())
def experiment():
    np.random.seed()

    logger = Logger(QLearning.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + QLearning.__name__)

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(mdp.info, pi, **algorithm_params)

    # Core
    core = Core(agent, mdp)

    # Initial policy evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    logger.info(f'J start: {J}')

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)

    # Final policy evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    logger.info(f'J final: {J}')
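# Not in the original snippet: a minimal entry point for the simple-chain
# Q-learning experiment above, following the __main__ pattern used by the
# other examples in this collection.
if __name__ == '__main__':
    experiment()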
def test_dqn_logger(tmpdir):
    logger = Logger('dqn_logger', results_dir=tmpdir, use_timestamp=True)

    params = dict(batch_size=50, initial_replay_size=50,
                  max_replay_size=500, target_update_frequency=50)

    learn(DQN, params, logger)

    loss_file = np.load(logger.path / 'loss_Q.npy')

    assert loss_file.shape == (90,)
    assert loss_file[0] == 0.9765409231185913 and \
        loss_file[-1] == 0.6936992406845093
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size", type=int, default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size", type=int, default=500000,
                         help='Max size of the replay memory.')
    arg_mem.add_argument("--prioritized", action='store_true',
                         help='Whether to use prioritized memory or not.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument("--optimizer",
                         choices=['adadelta', 'adam', 'rmsprop',
                                  'rmspropcentered'],
                         default='adam',
                         help='Name of the optimizer to use.')
    arg_net.add_argument("--learning-rate", type=float, default=.0001,
                         help='Learning rate value of the optimizer.')
    arg_net.add_argument("--decay", type=float, default=.95,
                         help='Discount factor for the history coming from '
                              'the gradient momentum in rmspropcentered and '
                              'rmsprop.')
    arg_net.add_argument("--epsilon", type=float, default=1e-8,
                         help='Epsilon term used in rmspropcentered and '
                              'rmsprop.')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm",
                         choices=['dqn', 'ddqn', 'adqn', 'mmdqn', 'cdqn',
                                  'dueldqn', 'ndqn', 'rainbow'],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard '
                              'DQN, ddqn is for Double DQN and adqn is for '
                              'Averaged DQN.')
    arg_alg.add_argument("--n-approximators", type=int, default=1,
                         help='Number of approximators used in the ensemble '
                              'for AveragedDQN or MaxminDQN.')
    arg_alg.add_argument("--batch-size", type=int, default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length", type=int, default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency", type=int, default=10000,
                         help='Number of collected samples before each '
                              'update of the target network.')
    arg_alg.add_argument("--evaluation-frequency", type=int, default=250000,
                         help='Number of collected samples before each '
                              'evaluation. An epoch ends after this number '
                              'of steps.')
    arg_alg.add_argument("--train-frequency", type=int, default=4,
                         help='Number of collected samples before each fit '
                              'of the neural network.')
    arg_alg.add_argument("--max-steps", type=int, default=5000000,
                         help='Total number of collected samples.')
    arg_alg.add_argument("--final-exploration-frame", type=int,
                         default=10000000,
                         help='Number of collected samples until the '
                              'exploration rate stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate", type=float, default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate", type=float, default=.1,
                         help='Final value of the exploration rate. When it '
                              'reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate", type=float, default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-episodes", type=int, default=5,
                         help='Number of episodes for each evaluation.')
    arg_alg.add_argument("--alpha-coeff", type=float, default=.6,
                         help='Prioritization exponent for prioritized '
                              'experience replay.')
    arg_alg.add_argument("--n-atoms", type=int, default=51,
                         help='Number of atoms for Categorical DQN.')
    arg_alg.add_argument("--v-min", type=int, default=-10,
                         help='Minimum action-value for Categorical DQN.')
    arg_alg.add_argument("--v-max", type=int, default=10,
                         help='Maximum action-value for Categorical DQN.')
    arg_alg.add_argument("--n-steps-return", type=int, default=3,
                         help='Number of steps for n-step return for Rainbow.')
    arg_alg.add_argument("--sigma-coeff", type=float, default=.5,
                         help='Sigma0 coefficient for noise initialization '
                              'in NoisyDQN and Rainbow.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--use-cuda', action='store_true',
                           help='Flag specifying whether to use the GPU.')
    arg_utils.add_argument('--save', action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--load-path', type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--render', action='store_true',
                           help='Flag specifying whether to render the grid.')
    arg_utils.add_argument('--quiet', action='store_true',
                           help='Flag specifying whether to hide the '
                                'progress bar.')
    arg_utils.add_argument('--debug', action='store_true',
                           help='Flag specifying whether the script has to '
                                'be run in debug mode.')
    args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if args.optimizer == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate, alpha=args.decay,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate, alpha=args.decay,
                                   eps=args.epsilon, centered=True)
    else:
        raise ValueError

    # Summary folder
    folder_name = './logs/habitat_nav_' + args.algorithm +\
        '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    pathlib.Path(folder_name).mkdir(parents=True)

    # Settings
    if args.debug:
        initial_replay_size = 50
        max_replay_size = 500
        train_frequency = 5
        target_update_frequency = 10
        test_episodes = 20
        evaluation_frequency = 50
        max_steps = 1000
    else:
        initial_replay_size = args.initial_replay_size
        max_replay_size = args.max_replay_size
        train_frequency = args.train_frequency
        target_update_frequency = args.target_update_frequency
        test_episodes = args.test_episodes
        evaluation_frequency = args.evaluation_frequency
        max_steps = args.max_steps

    # MDP
    config_file = os.path.join(
        pathlib.Path(__file__).parent.resolve(),
        'pointnav_apartment-0.yaml')  # Custom task for Replica scenes
    wrapper = 'HabitatNavigationWrapper'
    mdp = Habitat(wrapper, config_file)

    opt_return = mdp.env.get_optimal_policy_return()

    if args.load_path:
        logger = Logger(DQN.__name__, results_dir=None)
        logger.strong_line()
        logger.info('Optimal Policy Undiscounted Return: ' + str(opt_return))
        logger.info('Experiment Algorithm: ' + DQN.__name__)

        # Agent
        agent = DQN.load(args.load_path)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        agent.policy.set_epsilon(epsilon_test)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        dataset = core_test.evaluate(n_episodes=args.test_episodes,
                                     render=args.render, quiet=args.quiet)
        get_stats(dataset, logger)
    else:
        # Policy
        epsilon = LinearParameter(value=args.initial_exploration_rate,
                                  threshold_value=args.final_exploration_rate,
                                  n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        # Approximator
        approximator_params = dict(
            network=Network if args.algorithm not in ['dueldqn', 'cdqn',
                                                      'ndqn', 'rainbow']
            else FeatureNetwork,
            input_shape=mdp.info.observation_space.shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            n_features=Network.n_features,
            optimizer=optimizer,
            use_cuda=args.use_cuda)
        if args.algorithm not in ['cdqn', 'rainbow']:
            approximator_params['loss'] = F.smooth_l1_loss

        approximator = TorchApproximator

        if args.prioritized:
            replay_memory = PrioritizedReplayMemory(
                initial_replay_size, max_replay_size, alpha=args.alpha_coeff,
                beta=LinearParameter(.4, threshold_value=1,
                                     n=max_steps // train_frequency))
        else:
            replay_memory = None

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            target_update_frequency=target_update_frequency // train_frequency,
            replay_memory=replay_memory,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size)

        if args.algorithm == 'dqn':
            alg = DQN
            agent = alg(mdp.info, pi, approximator,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            alg = DoubleDQN
            agent = alg(mdp.info, pi, approximator,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'adqn':
            alg = AveragedDQN
            agent = alg(mdp.info, pi, approximator,
                        approximator_params=approximator_params,
                        n_approximators=args.n_approximators,
                        **algorithm_params)
        elif args.algorithm == 'mmdqn':
            alg = MaxminDQN
            agent = alg(mdp.info, pi, approximator,
                        approximator_params=approximator_params,
                        n_approximators=args.n_approximators,
                        **algorithm_params)
        elif args.algorithm == 'dueldqn':
            alg = DuelingDQN
            agent = alg(mdp.info, pi,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'cdqn':
            alg = CategoricalDQN
            agent = alg(mdp.info, pi,
                        approximator_params=approximator_params,
                        n_atoms=args.n_atoms, v_min=args.v_min,
                        v_max=args.v_max, **algorithm_params)
        elif args.algorithm == 'ndqn':
            alg = NoisyDQN
            agent = alg(mdp.info, pi,
                        approximator_params=approximator_params,
                        sigma_coeff=args.sigma_coeff, **algorithm_params)
        elif args.algorithm == 'rainbow':
            alg = Rainbow
            beta = LinearParameter(.4, threshold_value=1,
                                   n=max_steps // train_frequency)
            agent = alg(mdp.info, pi,
                        approximator_params=approximator_params,
                        n_atoms=args.n_atoms, v_min=args.v_min,
                        v_max=args.v_max, n_steps_return=args.n_steps_return,
                        alpha_coeff=args.alpha_coeff, beta=beta,
                        sigma_coeff=args.sigma_coeff, **algorithm_params)

        logger = Logger(alg.__name__, results_dir=None)
        logger.strong_line()
        logger.info('Optimal Policy Undiscounted Return: ' + str(opt_return))
        logger.info('Experiment Algorithm: ' + alg.__name__)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0, logger)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size, quiet=args.quiet)

        if args.save:
            agent.save(folder_name + '/agent_0.msh')

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_episodes=test_episodes, render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset, logger))

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch, logger)
            logger.info('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency, quiet=args.quiet)

            if args.save:
                agent.save(folder_name + '/agent_' + str(n_epoch) + '.msh')

            logger.info('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            dataset = core.evaluate(n_episodes=test_episodes,
                                    render=args.render, quiet=args.quiet)
            scores.append(get_stats(dataset, logger))

            np.save(folder_name + '/scores.npy', scores)

    return scores
def experiment(n_epochs, n_steps, n_steps_test):
    np.random.seed()

    logger = Logger(DQN.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + DQN.__name__)

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    epsilon = LinearParameter(value=1., threshold_value=.01, n=5000)
    epsilon_test = Parameter(value=0.)
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    target_update_frequency = 100
    batch_size = 200
    n_features = 80
    train_frequency = 1

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(network=Network,
                               optimizer={'class': optim.Adam,
                                          'params': {'lr': .001}},
                               loss=F.smooth_l1_loss,
                               n_features=n_features,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n)

    # Agent
    agent = DQN(mdp.info, pi, TorchApproximator,
                approximator_params=approximator_params,
                batch_size=batch_size,
                initial_replay_size=initial_replay_size,
                max_replay_size=max_replay_size,
                target_update_frequency=target_update_frequency)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # RUN
    pi.set_epsilon(epsilon_test)
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = compute_J(dataset, gamma_eval)
    logger.epoch_info(0, J=np.mean(J))

    for n in trange(n_epochs):
        pi.set_epsilon(epsilon)
        core.learn(n_steps=n_steps, n_steps_per_fit=train_frequency)
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma_eval)
        logger.epoch_info(n + 1, J=np.mean(J))

    logger.info('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
from mushroom_rl.core import Logger

# Create a logger object, creating a log folder
logger = Logger('tutorial', results_dir='/tmp/logs', log_console=True)

# Create a logger object, without creating the log folder
logger_no_folder = Logger('tutorial_no_folder', results_dir=None)

# Write a line of hashtags, to be used as a separator
logger.strong_line()

# Print a debug message
logger.debug('This is a debug message')

# Print an info message
logger.info('This is an info message')

# Print a warning
logger.warning('This is a warning message')

# Print an error
logger.error('This is an error message')

# Print a critical error message
logger.critical('This is a critical error')

# Print a line of dashes, to be used as a (weak) separator
logger.weak_line()

# Exception logging
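# The tutorial snippet above stops at the exception-logging comment. A minimal
# sketch of what typically follows, assuming the Logger.exception method of
# mushroom_rl, which logs the exception's stack trace without stopping the
# script:
try:
    raise RuntimeError('A runtime exception occurred')
except RuntimeError as e:
    logger.exception(e)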
    agent = LSPI(mdp.info, pi, approximator_params=approximator_params,
                 fit_params=fit_params, features=features)

    # Algorithm
    core = Core(agent, mdp)
    core.evaluate(n_episodes=3, render=True)

    # Train
    core.learn(n_episodes=500, n_episodes_per_fit=500)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)
    dataset = core.evaluate(n_episodes=1, quiet=True)

    core.evaluate(n_steps=100, render=True)

    return np.mean(episodes_length(dataset))


if __name__ == '__main__':
    n_experiment = 1

    logger = Logger(LSPI.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + LSPI.__name__)

    steps = experiment()
    logger.info('Final episode length: %d' % steps)
def experiment(goal, use_muscles, n_epochs, n_steps, n_episodes_test):
    np.random.seed(1)

    logger = Logger('SAC', results_dir=None)
    logger.strong_line()
    logger.info('Humanoid Experiment, Algorithm: SAC')

    # MDP
    gamma = 0.99
    horizon = 2000
    mdp = create_mdp(gamma, horizon, goal, use_muscles=use_muscles)

    # Agent
    agent = create_SAC_agent(mdp)

    # normalization callback
    normalizer = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info)

    # Algorithm (with normalization and plotting)
    core = Core(agent, mdp, callback_step=plotter,
                preprocessors=[normalizer])

    dataset = core.evaluate(n_episodes=n_episodes_test, render=True)
    J = np.mean(compute_J(dataset, gamma))
    L = int(np.round(np.mean(episodes_length(dataset))))

    logger.epoch_info(0, J=J, episode_length=L)

    # training loop
    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=True)
        J = np.mean(compute_J(dataset, gamma))
        L = int(np.round(np.mean(episodes_length(dataset))))

        logger.epoch_info(n + 1, J=J, episode_length=L)

    logger.info('Press a button to visualize humanoid')
    input()
    core.evaluate(n_episodes=10, render=True)
def test_logger(tmpdir):
    logger_1 = Logger('test', seed=1, results_dir=tmpdir)
    logger_2 = Logger('test', seed=2, results_dir=tmpdir)

    for i in range(3):
        logger_1.log_numpy(a=i, b=2 * i + 1)
        logger_2.log_numpy(a=2 * i + 1, b=i)

    a_1 = np.load(str(tmpdir / 'test' / 'a-1.npy'))
    a_2 = np.load(str(tmpdir / 'test' / 'a-2.npy'))
    b_1 = np.load(str(tmpdir / 'test' / 'b-1.npy'))
    b_2 = np.load(str(tmpdir / 'test' / 'b-2.npy'))

    assert np.array_equal(a_1, np.arange(3))
    assert np.array_equal(b_2, np.arange(3))
    assert np.array_equal(a_1, b_2)
    assert np.array_equal(b_1, a_2)

    logger_1_bis = Logger('test', append=True, seed=1, results_dir=tmpdir)
    logger_1_bis.log_numpy(a=3, b=7)

    a_1 = np.load(str(tmpdir / 'test' / 'a-1.npy'))
    b_2 = np.load(str(tmpdir / 'test' / 'b-2.npy'))

    assert np.array_equal(a_1, np.arange(4))
    assert np.array_equal(b_2, np.arange(3))
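# Illustrative only, not part of the test above: one way to aggregate the
# per-seed arrays written by Logger.log_numpy. The file layout
# '<results_dir>/<name>/<key>-<seed>.npy' mirrors the paths checked in the
# test; the helper itself (average_runs) is hypothetical.
import numpy as np

def average_runs(results_dir, name='test', key='a', seeds=(1, 2)):
    # Stack one logged array per seed and average them element-wise
    runs = [np.load(f'{results_dir}/{name}/{key}-{seed}.npy')
            for seed in seeds]
    return np.mean(np.stack(runs), axis=0)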
def experiment(alg, n_epochs, n_steps, n_steps_test):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    use_cuda = torch.cuda.is_available()

    # MDP
    horizon = 200
    gamma = 0.99
    mdp = Gym('Pendulum-v1', horizon, gamma)

    # Policy
    policy_class = OrnsteinUhlenbeckPolicy
    policy_params = dict(sigma=np.ones(1) * .2, theta=.15, dt=1e-2)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    batch_size = 200
    n_features = 80
    tau = .001

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_params = dict(network=ActorNetwork,
                        n_features=n_features,
                        input_shape=actor_input_shape,
                        output_shape=mdp.info.action_space.shape,
                        use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam,
                       'params': {'lr': .001}}

    critic_input_shape = (actor_input_shape[0] +
                          mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': .001}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info, policy_class, policy_params,
                actor_params, actor_optimizer, critic_params,
                batch_size, initial_replay_size, max_replay_size, tau)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # RUN
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = np.mean(compute_J(dataset, gamma))
    R = np.mean(compute_J(dataset))

    logger.epoch_info(0, J=J, R=R)

    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = np.mean(compute_J(dataset, gamma))
        R = np.mean(compute_J(dataset))

        logger.epoch_info(n + 1, J=J, R=R)

    logger.info('Press a button to visualize pendulum')
    input()
    core.evaluate(n_episodes=5, render=True)
    callbacks = [collect_dataset, collect_max_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get()

    return reward, max_Qs


if __name__ == '__main__':
    n_experiment = 10000

    logger = Logger(QLearning.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + QLearning.__name__)

    names = {1: '1', .8: '08', QLearning: 'Q', DoubleQLearning: 'DQ',
             WeightedQLearning: 'WQ', SpeedyQLearning: 'SPQ', SARSA: 'SARSA'}

    for e in [1, .8]:
        logger.info(f'Exp: {e}')
def experiment(n_epochs, n_episodes):
    np.random.seed()

    logger = Logger(StochasticAC_AVG.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + StochasticAC_AVG.__name__)

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 11
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings - 1, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(
        1, [1, 1], mdp.info.observation_space.low,
        mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator, input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    agent = StochasticAC_AVG(mdp.info, policy,
                             alpha_theta, alpha_v, alpha_r,
                             lambda_par=.5,
                             value_function_features=psi,
                             policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    display_callback = Display(agent._V, mu, std,
                               mdp.info.observation_space.low,
                               mdp.info.observation_space.high,
                               phi, psi)
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.)
        dataset_callback.clean()
        display_callback()
        logger.epoch_info(i + 1, R_mean=np.sum(J) / n_steps / n_episodes)

    logger.info('Press a button to visualize the pendulum...')
    input()
    core.evaluate(n_steps=n_steps, render=True)
    algorithm_params = {'learning_rate': learning_rate, 'lambda_coeff': .9}
    agent = TrueOnlineSARSALambda(mdp.info, pi,
                                  approximator_params=approximator_params,
                                  features=features, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=40, n_steps_per_fit=1, render=False)
    dataset = core.evaluate(n_episodes=1, render=True)

    return np.mean(compute_J(dataset, 1.))


if __name__ == '__main__':
    n_experiment = 1

    logger = Logger(TrueOnlineSARSALambda.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + TrueOnlineSARSALambda.__name__)

    alpha = .1
    Js = Parallel(n_jobs=-1)(delayed(experiment)(alpha)
                             for _ in range(n_experiment))

    logger.info('J: %f' % np.mean(Js))
def experiment(alg, env_id, horizon, gamma, n_epochs, n_steps,
               n_steps_per_fit, n_step_test, alg_params, policy_params):
    logger = Logger(A2C.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + A2C.__name__)

    mdp = Gym(env_id, horizon, gamma)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.RMSprop,
                                    'params': {'lr': 7e-4, 'eps': 1e-5}},
                         loss=F.mse_loss,
                         n_features=64,
                         batch_size=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    alg_params['critic_params'] = critic_params

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    dataset = core.evaluate(n_steps=n_step_test, render=False)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy()

    logger.epoch_info(0, J=J, R=R, entropy=E)

    for it in trange(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_steps=n_step_test, render=False)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy()

        logger.epoch_info(it + 1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize')
    input()
    core.evaluate(n_episodes=5, render=True)
def experiment(n_epochs, n_iterations, ep_per_run, save_states_to_disk):
    np.random.seed()

    logger = Logger('plot_and_norm_example', results_dir=None)
    logger.strong_line()
    logger.info('Plotting and normalization example')

    # MDP
    mdp = LQR.generate(dimensions=2, max_pos=10., max_action=5.,
                       episodic=True)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    optimizer = AdaptiveOptimizer(eps=.01)
    algorithm_params = dict(optimizer=optimizer)
    agent = REINFORCE(mdp.info, policy, **algorithm_params)

    # normalization callback
    prepro = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info, obs_normalized=True)

    # Train
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[prepro])

    # training loop
    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset = core.evaluate(n_episodes=ep_per_run, render=False)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        logger.epoch_info(n + 1, J=J)

    if save_states_to_disk:
        # save normalization and plotting state to disk
        logger.info('Saving plotting and normalization data')
        os.makedirs("./logs/plot_and_norm", exist_ok=True)
        prepro.save("./logs/plot_and_norm/preprocessor.msh")
        plotter.save_state("./logs/plot_and_norm/plotting_state")

        # load the states back from disk
        logger.info('Loading preprocessor and plotter')
        prepro = MinMaxPreprocessor.load(
            "./logs/plot_and_norm/preprocessor.msh")
        plotter.load_state("./logs/plot_and_norm/plotting_state")
                        action='store_true',
                        help='Flag to run a reduced version of the benchmark.')

    args = vars(parser.parse_args())

    return args.values()


if __name__ == '__main__':
    env_ids, exec_type, test, demo = get_args()

    cfg_dir = Path(__file__).parent / 'cfg'
    env_cfg_dir = cfg_dir / 'env'
    param_path = 'suite.yaml'
    plots_path = 'plots.yaml'

    logger = Logger(results_dir=None)
    logger.info('Starting benchmarking script')

    if 'all' in env_ids:
        logger.info('Running benchmark on all available environments')
        assert len(env_ids) == 1
        env_ids = list()
        for env_id in env_cfg_dir.iterdir():
            if env_id.suffix == '.yaml':
                env_ids.append(env_id.stem)

    logger.info('Execution type: ' + exec_type)
    logger.info('Running FULL: ' + str(not demo))
    logger.strong_line()
def experiment():
    np.random.seed()

    logger = Logger(DQN.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + DQN.__name__)

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name", type=str,
                          default='BreakoutDeterministic-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width", type=int, default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height", type=int, default=84,
                          help='Height of the game screen.')

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size", type=int, default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size", type=int, default=500000,
                         help='Max size of the replay memory.')
    arg_mem.add_argument("--prioritized", action='store_true',
                         help='Whether to use prioritized memory or not.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument("--optimizer",
                         choices=['adadelta', 'adam', 'rmsprop',
                                  'rmspropcentered'],
                         default='rmsprop',
                         help='Name of the optimizer to use.')
    arg_net.add_argument("--learning-rate", type=float, default=.00025,
                         help='Learning rate value of the optimizer.')
    arg_net.add_argument("--decay", type=float, default=.95,
                         help='Discount factor for the history coming from '
                              'the gradient momentum in rmspropcentered and '
                              'rmsprop.')
    arg_net.add_argument("--epsilon", type=float, default=1e-8,
                         help='Epsilon term used in rmspropcentered and '
                              'rmsprop.')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm",
                         choices=['dqn', 'ddqn', 'adqn', 'mmdqn', 'cdqn',
                                  'dueldqn'],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard '
                              'DQN, ddqn is for Double DQN and adqn is for '
                              'Averaged DQN.')
    arg_alg.add_argument("--n-approximators", type=int, default=1,
                         help='Number of approximators used in the ensemble '
                              'for AveragedDQN or MaxminDQN.')
    arg_alg.add_argument("--batch-size", type=int, default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length", type=int, default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency", type=int, default=10000,
                         help='Number of collected samples before each '
                              'update of the target network.')
    arg_alg.add_argument("--evaluation-frequency", type=int, default=250000,
                         help='Number of collected samples before each '
                              'evaluation. An epoch ends after this number '
                              'of steps.')
    arg_alg.add_argument("--train-frequency", type=int, default=4,
                         help='Number of collected samples before each fit '
                              'of the neural network.')
    arg_alg.add_argument("--max-steps", type=int, default=50000000,
                         help='Total number of collected samples.')
    arg_alg.add_argument("--final-exploration-frame", type=int,
                         default=1000000,
                         help='Number of collected samples until the '
                              'exploration rate stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate", type=float, default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate", type=float, default=.1,
                         help='Final value of the exploration rate. When it '
                              'reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate", type=float, default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples", type=int, default=125000,
                         help='Number of collected samples for each '
                              'evaluation.')
    arg_alg.add_argument("--max-no-op-actions", type=int, default=30,
                         help='Maximum number of no-op actions performed at '
                              'the beginning of the episodes.')
    arg_alg.add_argument("--n-atoms", type=int, default=51,
                         help='Number of atoms for Categorical DQN.')
    arg_alg.add_argument("--v-min", type=int, default=-10,
                         help='Minimum action-value for Categorical DQN.')
    arg_alg.add_argument("--v-max", type=int, default=10,
                         help='Maximum action-value for Categorical DQN.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--use-cuda', action='store_true',
                           help='Flag specifying whether to use the GPU.')
    arg_utils.add_argument('--save', action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--load-path', type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--render', action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet', action='store_true',
                           help='Flag specifying whether to hide the '
                                'progress bar.')
    arg_utils.add_argument('--debug', action='store_true',
                           help='Flag specifying whether the script has to '
                                'be run in debug mode.')
    args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if args.optimizer == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate, alpha=args.decay,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate, alpha=args.decay,
                                   eps=args.epsilon, centered=True)
    else:
        raise ValueError

    # Summary folder
    folder_name = './logs/atari_' + args.algorithm + '_' + args.name +\
        '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    pathlib.Path(folder_name).mkdir(parents=True)

    # Settings
    if args.debug:
        initial_replay_size = 50
        max_replay_size = 500
        train_frequency = 5
        target_update_frequency = 10
        test_samples = 20
        evaluation_frequency = 50
        max_steps = 1000
    else:
        initial_replay_size = args.initial_replay_size
        max_replay_size = args.max_replay_size
        train_frequency = args.train_frequency
        target_update_frequency = args.target_update_frequency
        test_samples = args.test_samples
        evaluation_frequency = args.evaluation_frequency
        max_steps = args.max_steps

    # MDP
    mdp = Atari(args.name, args.screen_width, args.screen_height,
                ends_at_life=True, history_length=args.history_length,
                max_no_op_actions=args.max_no_op_actions)

    if args.load_path:
        # Agent
        agent = DQN.load(args.load_path)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        agent.policy.set_epsilon(epsilon_test)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render, quiet=args.quiet)
        get_stats(dataset)
    else:
        # Policy
        epsilon = LinearParameter(value=args.initial_exploration_rate,
                                  threshold_value=args.final_exploration_rate,
                                  n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        class CategoricalLoss(nn.Module):
            def forward(self, input, target):
                input = input.clamp(1e-5)

                return -torch.sum(target * torch.log(input))

        # Approximator
        approximator_params = dict(
            network=Network if args.algorithm not in ['dueldqn', 'cdqn']
            else FeatureNetwork,
            input_shape=mdp.info.observation_space.shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            n_features=Network.n_features,
            optimizer=optimizer,
            loss=F.smooth_l1_loss if args.algorithm != 'cdqn'
            else CategoricalLoss(),
            use_cuda=args.use_cuda
        )

        approximator = TorchApproximator

        if args.prioritized:
            replay_memory = PrioritizedReplayMemory(
                initial_replay_size, max_replay_size, alpha=.6,
                beta=LinearParameter(.4, threshold_value=1,
                                     n=max_steps // train_frequency)
            )
        else:
            replay_memory = None

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            target_update_frequency=target_update_frequency // train_frequency,
            replay_memory=replay_memory,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size
        )

        if args.algorithm == 'dqn':
            agent = DQN(mdp.info, pi, approximator,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            agent = DoubleDQN(mdp.info, pi, approximator,
                              approximator_params=approximator_params,
                              **algorithm_params)
        elif args.algorithm == 'adqn':
            agent = AveragedDQN(mdp.info, pi, approximator,
                                approximator_params=approximator_params,
                                n_approximators=args.n_approximators,
                                **algorithm_params)
        elif args.algorithm == 'mmdqn':
            agent = MaxminDQN(mdp.info, pi, approximator,
                              approximator_params=approximator_params,
                              n_approximators=args.n_approximators,
                              **algorithm_params)
        elif args.algorithm == 'dueldqn':
            agent = DuelingDQN(mdp.info, pi,
                               approximator_params=approximator_params,
                               **algorithm_params)
        elif args.algorithm == 'cdqn':
            agent = CategoricalDQN(mdp.info, pi,
                                   approximator_params=approximator_params,
                                   n_atoms=args.n_atoms, v_min=args.v_min,
                                   v_max=args.v_max, **algorithm_params)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0, logger)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size, quiet=args.quiet)

        if args.save:
            agent.save(folder_name + '/agent_0.msh')

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        mdp.set_episode_end(False)
        dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset, logger))

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch, logger)
            logger.info('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency, quiet=args.quiet)

            if args.save:
                agent.save(folder_name + '/agent_' + str(n_epoch) + '.msh')

            logger.info('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            mdp.set_episode_end(False)
            dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset, logger))

            np.save(folder_name + '/scores.npy', scores)

    return scores
    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((289, 2))
    cont = 0
    for i in range(-8, 9):
        for j in range(-8, 9):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states)

    # Render
    core.evaluate(n_episodes=3, render=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))


if __name__ == '__main__':
    n_experiment = 1

    logger = Logger(FQI.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + FQI.__name__)

    Js = Parallel(n_jobs=-1)(delayed(experiment)()
                             for _ in range(n_experiment))
    logger.info((np.mean(Js)))
def experiment(n_epochs, n_steps, n_steps_per_fit, n_step_test):
    np.random.seed()

    logger = Logger(A2C.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + A2C.__name__)

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    policy_params = dict(n_features=32, use_cuda=False)

    beta = Parameter(1e0)
    pi = BoltzmannTorchPolicy(Network,
                              mdp.info.observation_space.shape,
                              (mdp.info.action_space.n,),
                              beta=beta,
                              **policy_params)

    # Agent
    critic_params = dict(network=Network,
                         optimizer={'class': optim.RMSprop,
                                    'params': {'lr': 1e-3, 'eps': 1e-5}},
                         loss=F.mse_loss,
                         n_features=32,
                         batch_size=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    alg_params = dict(actor_optimizer={'class': optim.RMSprop,
                                       'params': {'lr': 1e-3, 'eps': 3e-3}},
                      critic_params=critic_params,
                      ent_coeff=0.01)

    agent = A2C(mdp.info, pi, **alg_params)

    # Algorithm
    core = Core(agent, mdp)
    core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)

    # RUN
    dataset = core.evaluate(n_steps=n_step_test, render=False)
    J = compute_J(dataset, gamma_eval)
    logger.epoch_info(0, J=np.mean(J))

    for n in trange(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_steps=n_step_test, render=False)
        J = compute_J(dataset, gamma_eval)
        logger.epoch_info(n + 1, J=np.mean(J))

    logger.info('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
def experiment(alg, n_epochs, n_steps, n_episodes_test):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    gamma = 0.99
    habitat_root_path = Habitat.root_path()
    config_file = os.path.join(
        habitat_root_path,
        'habitat_baselines/config/rearrange/rl_pick.yaml')
    base_config_file = os.path.join(habitat_root_path,
                                    'configs/tasks/rearrange/pick.yaml')
    wrapper = 'HabitatRearrangeWrapper'
    mdp = Habitat(wrapper, config_file, base_config_file, gamma=gamma)

    # Settings
    initial_replay_size = 64
    max_replay_size = 50000
    batch_size = 64
    n_features = 64
    warmup_transitions = 100
    tau = 0.005
    lr_alpha = 3e-4

    use_cuda = torch.cuda.is_available()

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=use_cuda)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam,
                       'params': {'lr': 3e-4}}

    critic_input_shape = actor_input_shape + mdp.info.action_space.shape
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info, actor_mu_params, actor_sigma_params,
                actor_optimizer, critic_params, batch_size,
                initial_replay_size, max_replay_size, warmup_transitions,
                tau, lr_alpha, critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)

    # RUN
    dataset = core.evaluate(n_episodes=n_episodes_test, render=False)
    s, *_ = parse_dataset(dataset)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy(s)

    logger.epoch_info(0, J=J, R=R, entropy=E)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=False)
        s, *_ = parse_dataset(dataset)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy(s)

        logger.epoch_info(n + 1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize the robot')
    input()
    core.evaluate(n_episodes=5, render=True)
def experiment(alg, n_epochs, n_steps, n_steps_test):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    horizon = 200
    gamma = 0.99
    mdp = Gym('Pendulum-v1', horizon, gamma)

    # Settings
    initial_replay_size = 64
    max_replay_size = 50000
    batch_size = 64
    n_features = 64
    warmup_transitions = 100
    tau = 0.005
    lr_alpha = 3e-4

    use_cuda = torch.cuda.is_available()

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=use_cuda)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam,
                       'params': {'lr': 3e-4}}

    critic_input_shape = (actor_input_shape[0] +
                          mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info, actor_mu_params, actor_sigma_params,
                actor_optimizer, critic_params, batch_size,
                initial_replay_size, max_replay_size, warmup_transitions,
                tau, lr_alpha, critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)

    # RUN
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    s, *_ = parse_dataset(dataset)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy(s)

    logger.epoch_info(0, J=J, R=R, entropy=E)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        s, *_ = parse_dataset(dataset)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy(s)

        logger.epoch_info(n + 1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize pendulum')
    input()
    core.evaluate(n_episodes=5, render=True)
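# Illustrative invocation of the experiment above (not in the original
# snippet); SAC here is assumed to be mushroom_rl's SAC actor-critic
# algorithm, and the epoch/step counts are placeholders, not tuned settings.
if __name__ == '__main__':
    experiment(alg=SAC, n_epochs=40, n_steps=1000, n_steps_test=1000)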