def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name", type=str, default='BreakoutDeterministic-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width", type=int, default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height", type=int, default=84,
                          help='Height of the game screen.')

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size", type=int, default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size", type=int, default=500000,
                         help='Max size of the replay memory.')
    arg_mem.add_argument("--prioritized", action='store_true',
                         help='Whether to use prioritized memory or not.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument("--optimizer",
                         choices=['adadelta', 'adam', 'rmsprop', 'rmspropcentered'],
                         default='rmsprop',
                         help='Name of the optimizer to use.')
    arg_net.add_argument("--learning-rate", type=float, default=.00025,
                         help='Learning rate value of the optimizer.')
    arg_net.add_argument("--decay", type=float, default=.95,
                         help='Discount factor for the history coming from the '
                              'gradient momentum in rmspropcentered and rmsprop.')
    arg_net.add_argument("--epsilon", type=float, default=1e-8,
                         help='Epsilon term used in rmspropcentered and rmsprop.')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm",
                         choices=['dqn', 'ddqn', 'adqn', 'mmdqn', 'cdqn'],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard DQN, ddqn is '
                              'for Double DQN and adqn is for Averaged DQN.')
    arg_alg.add_argument("--n-approximators", type=int, default=1,
                         help='Number of approximators used in the ensemble for '
                              'AveragedDQN or MaxminDQN.')
    arg_alg.add_argument("--batch-size", type=int, default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length", type=int, default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency", type=int, default=10000,
                         help='Number of collected samples before each update of the '
                              'target network.')
    arg_alg.add_argument("--evaluation-frequency", type=int, default=250000,
                         help='Number of collected samples before each evaluation. '
                              'An epoch ends after this number of steps.')
    arg_alg.add_argument("--train-frequency", type=int, default=4,
                         help='Number of collected samples before each fit of the '
                              'neural network.')
    arg_alg.add_argument("--max-steps", type=int, default=50000000,
                         help='Total number of collected samples.')
    arg_alg.add_argument("--final-exploration-frame", type=int, default=1000000,
                         help='Number of collected samples until the exploration rate '
                              'stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate", type=float, default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate", type=float, default=.1,
                         help='Final value of the exploration rate. When it reaches '
                              'this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate", type=float, default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples", type=int, default=125000,
                         help='Number of collected samples for each evaluation.')
    arg_alg.add_argument("--max-no-op-actions", type=int, default=30,
                         help='Maximum number of no-op actions performed at the '
                              'beginning of the episodes.')
    arg_alg.add_argument("--n-atoms", type=int, default=51,
                         help='Number of atoms for Categorical DQN.')
    arg_alg.add_argument("--v-min", type=int, default=-10,
                         help='Minimum action-value for Categorical DQN.')
    arg_alg.add_argument("--v-max", type=int, default=10,
                         help='Maximum action-value for Categorical DQN.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--use-cuda', action='store_true',
                           help='Flag specifying whether to use the GPU.')
    arg_utils.add_argument('--save', action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--load-path', type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--render', action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet', action='store_true',
                           help='Flag specifying whether to hide the progress bar.')
    arg_utils.add_argument('--debug', action='store_true',
                           help='Flag specifying whether the script has to be run in '
                                'debug mode.')
    args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if args.optimizer == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate, alpha=args.decay,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate, alpha=args.decay,
                                   eps=args.epsilon, centered=True)
    else:
        raise ValueError('Unknown optimizer: ' + args.optimizer)

    # Summary folder
    folder_name = './logs/atari_' + args.algorithm + '_' + args.name + '_' +\
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    pathlib.Path(folder_name).mkdir(parents=True)

    # Settings
    if args.debug:
        initial_replay_size = 50
        max_replay_size = 500
        train_frequency = 5
        target_update_frequency = 10
        test_samples = 20
        evaluation_frequency = 50
        max_steps = 1000
    else:
        initial_replay_size = args.initial_replay_size
        max_replay_size = args.max_replay_size
        train_frequency = args.train_frequency
        target_update_frequency = args.target_update_frequency
        test_samples = args.test_samples
        evaluation_frequency = args.evaluation_frequency
        max_steps = args.max_steps

    # MDP
    mdp = Atari(args.name, args.screen_width, args.screen_height,
                ends_at_life=True, history_length=args.history_length,
                max_no_op_actions=args.max_no_op_actions)

    if args.load_path:
        # Agent
        agent = DQN.load(args.load_path)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        agent.policy.set_epsilon(epsilon_test)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render, quiet=args.quiet)
        get_stats(dataset)
    else:
        # Policy
        epsilon = LinearParameter(value=args.initial_exploration_rate,
                                  threshold_value=args.final_exploration_rate,
                                  n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        class CategoricalLoss(nn.Module):
            def forward(self, input, target):
                input = input.clamp(1e-5)

                return -torch.sum(target * torch.log(input))

        # Approximator
        approximator_params = dict(
            network=Network if args.algorithm != 'cdqn' else FeatureNetwork,
            input_shape=mdp.info.observation_space.shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            n_features=Network.n_features,
            optimizer=optimizer,
            loss=F.smooth_l1_loss if args.algorithm != 'cdqn' else CategoricalLoss(),
            use_cuda=args.use_cuda)

        approximator = TorchApproximator

        if args.prioritized:
            replay_memory = PrioritizedReplayMemory(
                initial_replay_size, max_replay_size, alpha=.6,
                beta=LinearParameter(.4, threshold_value=1,
                                     n=max_steps // train_frequency))
        else:
            replay_memory = None

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            target_update_frequency=target_update_frequency // train_frequency,
            replay_memory=replay_memory,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size)

        if args.algorithm == 'dqn':
            agent = DQN(mdp.info, pi, approximator,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            agent = DoubleDQN(mdp.info, pi, approximator,
                              approximator_params=approximator_params,
                              **algorithm_params)
        elif args.algorithm == 'adqn':
            agent = AveragedDQN(mdp.info, pi, approximator,
                                approximator_params=approximator_params,
                                n_approximators=args.n_approximators,
                                **algorithm_params)
        elif args.algorithm == 'mmdqn':
            agent = MaxminDQN(mdp.info, pi, approximator,
                              approximator_params=approximator_params,
                              n_approximators=args.n_approximators,
                              **algorithm_params)
        elif args.algorithm == 'cdqn':
            agent = CategoricalDQN(mdp.info, pi,
                                   approximator_params=approximator_params,
                                   n_atoms=args.n_atoms, v_min=args.v_min,
                                   v_max=args.v_max, **algorithm_params)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size, quiet=args.quiet)

        if args.save:
            agent.save(folder_name + '/agent_0.msh')

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        mdp.set_episode_end(False)
        dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset))

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency, quiet=args.quiet)

            if args.save:
                agent.save(folder_name + '/agent_' + str(n_epoch) + '.msh')

            print('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            mdp.set_episode_end(False)
            dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset))

            np.save(folder_name + '/scores.npy', scores)

    return scores
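
# Hedged sketch, not part of the original listing: a minimal entry point for the Atari
# experiment above. It assumes the helpers it references (Network, FeatureNetwork,
# get_stats, print_epoch) and the MushroomRL/PyTorch imports are defined at module level
# in the full script.
if __name__ == '__main__':
    experiment()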
# Opponent
nme = Greedy(battle_format='gen8randombattle', start_timer_on_battle_start=True)

# Training loop

# Fill replay memory with random dataset
print_epoch(0)
mdp.start_battles(nme)
# core.learn(n_steps=initial_replay_size,
#            n_steps_per_fit=initial_replay_size)
core.learn(n_episodes=initial_replay_size,
           n_episodes_per_fit=initial_replay_size)
mdp.end_battles()

# Evaluate initial policy
pi.set_epsilon(epsilon_test)
# mdp.set_episode_end(False)
mdp.start_battles(nme)
dataset = core.evaluate(n_episodes=test_episodes)
mdp.end_battles()
scores.append(get_stats(dataset))

N_STEPS = max_steps // evaluation_frequency + 1
for n_epoch in range(1, N_STEPS):
    if n_epoch % 5 == 0 or n_epoch == N_STEPS - 1:
        torch.save(core.agent.approximator.model.network,
                   f'checkpoints/torch/checkpt_epoch{n_epoch}')
        core.agent.save(f'checkpoints/mushroom/checkpt_epoch{n_epoch}')

    print_epoch(n_epoch)
    print('- Learning:')
    # learning step
    pi.set_epsilon(epsilon)
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size", type=int, default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size", type=int, default=500000,
                         help='Max size of the replay memory.')
    arg_mem.add_argument("--prioritized", action='store_true',
                         help='Whether to use prioritized memory or not.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument("--optimizer",
                         choices=['adadelta', 'adam', 'rmsprop', 'rmspropcentered'],
                         default='adam',
                         help='Name of the optimizer to use.')
    arg_net.add_argument("--learning-rate", type=float, default=.0001,
                         help='Learning rate value of the optimizer.')
    arg_net.add_argument("--decay", type=float, default=.95,
                         help='Discount factor for the history coming from the '
                              'gradient momentum in rmspropcentered and rmsprop.')
    arg_net.add_argument("--epsilon", type=float, default=1e-8,
                         help='Epsilon term used in rmspropcentered and rmsprop.')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm",
                         choices=['dqn', 'ddqn', 'adqn', 'mmdqn', 'cdqn',
                                  'dueldqn', 'ndqn', 'rainbow'],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard DQN, ddqn is '
                              'for Double DQN and adqn is for Averaged DQN.')
    arg_alg.add_argument("--n-approximators", type=int, default=1,
                         help='Number of approximators used in the ensemble for '
                              'AveragedDQN or MaxminDQN.')
    arg_alg.add_argument("--batch-size", type=int, default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length", type=int, default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency", type=int, default=10000,
                         help='Number of collected samples before each update of the '
                              'target network.')
    arg_alg.add_argument("--evaluation-frequency", type=int, default=250000,
                         help='Number of collected samples before each evaluation. '
                              'An epoch ends after this number of steps.')
    arg_alg.add_argument("--train-frequency", type=int, default=4,
                         help='Number of collected samples before each fit of the '
                              'neural network.')
    arg_alg.add_argument("--max-steps", type=int, default=5000000,
                         help='Total number of collected samples.')
    arg_alg.add_argument("--final-exploration-frame", type=int, default=10000000,
                         help='Number of collected samples until the exploration rate '
                              'stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate", type=float, default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate", type=float, default=.1,
                         help='Final value of the exploration rate. When it reaches '
                              'this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate", type=float, default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-episodes", type=int, default=5,
                         help='Number of episodes for each evaluation.')
    arg_alg.add_argument("--alpha-coeff", type=float, default=.6,
                         help='Prioritization exponent for prioritized experience replay.')
    arg_alg.add_argument("--n-atoms", type=int, default=51,
                         help='Number of atoms for Categorical DQN.')
    arg_alg.add_argument("--v-min", type=int, default=-10,
                         help='Minimum action-value for Categorical DQN.')
    arg_alg.add_argument("--v-max", type=int, default=10,
                         help='Maximum action-value for Categorical DQN.')
    arg_alg.add_argument("--n-steps-return", type=int, default=3,
                         help='Number of steps for n-step return for Rainbow.')
    arg_alg.add_argument("--sigma-coeff", type=float, default=.5,
                         help='Sigma0 coefficient for noise initialization in NoisyDQN '
                              'and Rainbow.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--use-cuda', action='store_true',
                           help='Flag specifying whether to use the GPU.')
    arg_utils.add_argument('--save', action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--load-path', type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--render', action='store_true',
                           help='Flag specifying whether to render the grid.')
    arg_utils.add_argument('--quiet', action='store_true',
                           help='Flag specifying whether to hide the progress bar.')
    arg_utils.add_argument('--debug', action='store_true',
                           help='Flag specifying whether the script has to be run in '
                                'debug mode.')
    args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if args.optimizer == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate, alpha=args.decay,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate, alpha=args.decay,
                                   eps=args.epsilon, centered=True)
    else:
        raise ValueError('Unknown optimizer: ' + args.optimizer)

    # Summary folder
    folder_name = './logs/habitat_nav_' + args.algorithm + '_' +\
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    pathlib.Path(folder_name).mkdir(parents=True)

    # Settings
    if args.debug:
        initial_replay_size = 50
        max_replay_size = 500
        train_frequency = 5
        target_update_frequency = 10
        test_episodes = 20
        evaluation_frequency = 50
        max_steps = 1000
    else:
        initial_replay_size = args.initial_replay_size
        max_replay_size = args.max_replay_size
        train_frequency = args.train_frequency
        target_update_frequency = args.target_update_frequency
        test_episodes = args.test_episodes
        evaluation_frequency = args.evaluation_frequency
        max_steps = args.max_steps

    # MDP
    config_file = os.path.join(pathlib.Path(__file__).parent.resolve(),
                               'pointnav_apartment-0.yaml')  # Custom task for Replica scenes
    wrapper = 'HabitatNavigationWrapper'
    mdp = Habitat(wrapper, config_file)

    opt_return = mdp.env.get_optimal_policy_return()

    if args.load_path:
        logger = Logger(DQN.__name__, results_dir=None)
        logger.strong_line()
        logger.info('Optimal Policy Undiscounted Return: ' + str(opt_return))
        logger.info('Experiment Algorithm: ' + DQN.__name__)

        # Agent
        agent = DQN.load(args.load_path)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        agent.policy.set_epsilon(epsilon_test)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        dataset = core_test.evaluate(n_episodes=args.test_episodes,
                                     render=args.render, quiet=args.quiet)
        get_stats(dataset, logger)
    else:
        # Policy
        epsilon = LinearParameter(value=args.initial_exploration_rate,
                                  threshold_value=args.final_exploration_rate,
                                  n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        # Approximator
        approximator_params = dict(
            network=Network if args.algorithm not in ['dueldqn', 'cdqn', 'ndqn', 'rainbow']
                else FeatureNetwork,
            input_shape=mdp.info.observation_space.shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            n_features=Network.n_features,
            optimizer=optimizer,
            use_cuda=args.use_cuda)
        if args.algorithm not in ['cdqn', 'rainbow']:
            approximator_params['loss'] = F.smooth_l1_loss

        approximator = TorchApproximator

        if args.prioritized:
            replay_memory = PrioritizedReplayMemory(
                initial_replay_size, max_replay_size, alpha=args.alpha_coeff,
                beta=LinearParameter(.4, threshold_value=1,
                                     n=max_steps // train_frequency))
        else:
            replay_memory = None

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            target_update_frequency=target_update_frequency // train_frequency,
            replay_memory=replay_memory,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size)

        if args.algorithm == 'dqn':
            alg = DQN
            agent = alg(mdp.info, pi, approximator,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            alg = DoubleDQN
            agent = alg(mdp.info, pi, approximator,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'adqn':
            alg = AveragedDQN
            agent = alg(mdp.info, pi, approximator,
                        approximator_params=approximator_params,
                        n_approximators=args.n_approximators,
                        **algorithm_params)
        elif args.algorithm == 'mmdqn':
            alg = MaxminDQN
            agent = alg(mdp.info, pi, approximator,
                        approximator_params=approximator_params,
                        n_approximators=args.n_approximators,
                        **algorithm_params)
        elif args.algorithm == 'dueldqn':
            alg = DuelingDQN
            agent = alg(mdp.info, pi,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'cdqn':
            alg = CategoricalDQN
            agent = alg(mdp.info, pi,
                        approximator_params=approximator_params,
                        n_atoms=args.n_atoms, v_min=args.v_min,
                        v_max=args.v_max, **algorithm_params)
        elif args.algorithm == 'ndqn':
            alg = NoisyDQN
            agent = alg(mdp.info, pi,
                        approximator_params=approximator_params,
                        sigma_coeff=args.sigma_coeff, **algorithm_params)
        elif args.algorithm == 'rainbow':
            alg = Rainbow
            beta = LinearParameter(.4, threshold_value=1,
                                   n=max_steps // train_frequency)
            agent = alg(mdp.info, pi,
                        approximator_params=approximator_params,
                        n_atoms=args.n_atoms, v_min=args.v_min,
                        v_max=args.v_max, n_steps_return=args.n_steps_return,
                        alpha_coeff=args.alpha_coeff, beta=beta,
                        sigma_coeff=args.sigma_coeff, **algorithm_params)

        logger = Logger(alg.__name__, results_dir=None)
        logger.strong_line()
        logger.info('Optimal Policy Undiscounted Return: ' + str(opt_return))
        logger.info('Experiment Algorithm: ' + alg.__name__)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0, logger)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size, quiet=args.quiet)

        if args.save:
            agent.save(folder_name + '/agent_0.msh')

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_episodes=test_episodes, render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset, logger))

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch, logger)
            logger.info('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency, quiet=args.quiet)

            if args.save:
                agent.save(folder_name + '/agent_' + str(n_epoch) + '.msh')

            logger.info('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            dataset = core.evaluate(n_episodes=test_episodes, render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset, logger))

            np.save(folder_name + '/scores.npy', scores)

    return scores
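
# Hedged sketch, not part of the original listing: how the Habitat navigation experiment
# above might be launched. It assumes the helpers it references (Network, FeatureNetwork,
# get_stats, print_epoch) and the MushroomRL/Habitat imports live in the same module.
if __name__ == '__main__':
    experiment()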
def experiment(n_epochs, n_steps, n_steps_test):
    np.random.seed()

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    epsilon = LinearParameter(value=1., threshold_value=.01, n=5000)
    epsilon_test = Parameter(value=0.)
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    target_update_frequency = 100
    batch_size = 200
    n_features = 80
    train_frequency = 1

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(network=Network,
                               optimizer={'class': optim.Adam,
                                          'params': {'lr': .001}},
                               loss=F.smooth_l1_loss,
                               n_features=n_features,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n)

    # Agent
    agent = DQN(mdp.info, pi, TorchApproximator,
                approximator_params=approximator_params,
                batch_size=batch_size, n_approximators=1,
                initial_replay_size=initial_replay_size,
                max_replay_size=max_replay_size,
                target_update_frequency=target_update_frequency)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size)

    # RUN
    pi.set_epsilon(epsilon_test)
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = compute_J(dataset, gamma_eval)
    print('J: ', np.mean(J))

    for n in trange(n_epochs):
        tqdm.write('Epoch: ' + str(n))
        pi.set_epsilon(epsilon)
        core.learn(n_steps=n_steps, n_steps_per_fit=train_frequency)
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma_eval)
        tqdm.write('J: ' + str(np.mean(J)))

    print('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
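
# Hedged usage sketch: the epoch and step budgets below are illustrative assumptions,
# not values taken from the original listing.
if __name__ == '__main__':
    experiment(n_epochs=20, n_steps=1000, n_steps_test=2000)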
mdp = CarOnHill()

# Policy
epsilon = Parameter(value=1.)
pi = EpsGreedy(epsilon=epsilon)

# Approximator
approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                           n_actions=mdp.info.action_space.n,
                           n_estimators=50,
                           min_samples_split=5,
                           min_samples_leaf=2)
approximator = ExtraTreesRegressor

# Agent
agent = FQI(approximator, pi, mdp.info, n_iterations=20,
            approximator_params=approximator_params)

core = Core(agent, mdp)

core.learn(n_episodes=1000, n_episodes_per_fit=1000)

pi.set_epsilon(Parameter(0.))
initial_state = np.array([[-.5, 0.]])
dataset = core.evaluate(initial_states=initial_state)

print(compute_J(dataset, gamma=mdp.info.gamma))
def experiment(mdp, params, prob=None):
    # Argument parser
    # parser = argparse.ArgumentParser()
    #
    # args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if params['optimizer'] == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=params['learning_rate'])
    elif params['optimizer'] == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=params['learning_rate'])
    elif params['optimizer'] == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=params['learning_rate'], alpha=params['decay'])
    elif params['optimizer'] == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=params['learning_rate'], alpha=params['decay'],
                                   centered=True)
    else:
        raise ValueError

    # DQN learning run

    # Summary folder
    folder_name = os.path.join(PROJECT_DIR, 'logs', params['name'])
    if params['save']:
        pathlib.Path(folder_name).mkdir(parents=True)

    # Policy
    epsilon = ExponentialParameter(value=params['initial_exploration_rate'],
                                   exp=params['exploration_rate'],
                                   min_value=params['final_exploration_rate'],
                                   size=(1,))
    epsilon_random = Parameter(value=1)
    epsilon_test = Parameter(value=0.01)
    pi = EpsGreedy(epsilon=epsilon_random)

    class CategoricalLoss(nn.Module):
        def forward(self, input, target):
            input = input.clamp(1e-5)

            return -torch.sum(target * torch.log(input))

    # Approximator
    input_shape = mdp.observation.shape
    resources = [[
        (mdp.north - mdp.devices[device][0]) / (mdp.north - mdp.south),
        (mdp.east - mdp.devices[device][1]) / (mdp.east - mdp.west)
    ] for device in mdp.device_ordering]
    edges = [[
        (mdp.north - mdp.graph.nodes[e[0]]['y']) / (mdp.north - mdp.south),
        (mdp.east - mdp.graph.nodes[e[0]]['x']) / (mdp.east - mdp.west),
        (mdp.north - mdp.graph.nodes[e[1]]['y']) / (mdp.north - mdp.south),
        (mdp.east - mdp.graph.nodes[e[1]]['x']) / (mdp.east - mdp.west)
    ] for e in mdp.graph.edges]

    N = {
        'SimpleResourceNetwork': SimpleResourceNetwork,
        'GraphConvolutionResourceNetwork': GraphConvolutionResourceNetwork,
    }[params['network']]
    N.n_features = params['hidden']
    approximator_params = dict(
        network=N,
        input_shape=input_shape,
        edges=edges,
        resources=resources,
        graph=mdp.graph,
        allow_wait=params['allow_wait'],
        long_term_q=params['long_term_q'],
        resource_embeddings=params['resource_embeddings'],
        edge_ordering=mdp.edge_ordering,
        device_ordering=mdp.device_ordering,
        resource_edges=mdp.resource_edges,
        output_shape=(mdp.info.action_space.n,),
        n_actions=mdp.info.action_space.n,
        n_features=params['hidden'],
        optimizer=optimizer,
        loss=F.smooth_l1_loss,
        nn_scaling=params['nn_scaling'],
        # quiet=False,
        use_cuda=params['cuda'],
        load_path=params.get('load_path', None))

    approximator = TorchApproximator

    replay_memory = PrioritizedReplayMemory(
        params['initial_replay_size'], params['max_replay_size'], alpha=.6,
        beta=LinearParameter(.4, threshold_value=1,
                             n=params['max_steps'] // params['train_frequency']))

    # Agent
    algorithm_params = dict(
        batch_size=params['batch_size'],
        n_approximators=1,
        target_update_frequency=params['target_update_frequency'] // params['train_frequency'],
        replay_memory=replay_memory,
        initial_replay_size=params['initial_replay_size'],
        max_replay_size=params['max_replay_size'])

    clz = DoubleDQN if mdp.info.gamma >= 1 else SMDPDQN
    agent = clz(mdp.info, pi, approximator,
                approximator_params=approximator_params,
                **algorithm_params)

    lr_scheduler = torch.optim.lr_scheduler.StepLR(
        agent.approximator._impl.model._optimizer, step_size=1,
        gamma=params['lr_decay'], last_epoch=-1)
    # params['max_steps'] // params['train_frequency']

    # Algorithm
    core = Core(agent, mdp)

    if 'weights' in params:
        best_weights = np.load(params['weights'])
        agent.approximator.set_weights(best_weights)
        agent.target_approximator.set_weights(best_weights)
    else:
        best_weights = agent.approximator.get_weights()

    # RUN
    pi.set_epsilon(epsilon_test)
    eval_days = [i for i in range(1, 356) if i % 13 == 1]
    ds = core.evaluate(initial_states=eval_days, quiet=tuning, render=params['save'])
    test_result = np.mean(compute_J(ds))
    test_result_discounted = np.mean(compute_J(ds, params['gamma']))
    print("discounted validation result", test_result_discounted)
    print("validation result", test_result)
    results = [(0, 0, test_result_discounted, test_result)]
    if params['save']:
        mdp.save_rendered(folder_name + "/epoch_init.mp4")

    # Fill replay memory with random dataset
    print_epoch(0)
    start = time()
    core.learn(n_steps=params['initial_replay_size'],
               n_steps_per_fit=params['initial_replay_size'], quiet=tuning)
    runtime = time() - start
    steps = 0

    if params['save']:
        with open(folder_name + "/params.json", "w") as f:
            json.dump(params, f, indent=4)
        if isinstance(agent, DQN):
            np.save(folder_name + '/weights-exp-0-0.npy',
                    agent.approximator.get_weights())

    best_score = -np.inf
    no_improvement = 0
    patience = 6

    if params['save']:
        np.save(folder_name + '/scores.npy', scores)
    for n_epoch in range(1, int(params['max_steps'] // params['evaluation_frequency'] + 1)):
        print_epoch(n_epoch)
        print('- Learning:')
        # learning step
        pi.set_epsilon(epsilon)
        # mdp.set_episode_end(True)
        start = time()
        core.learn(n_steps=params['evaluation_frequency'],
                   n_steps_per_fit=params['train_frequency'], quiet=tuning)
        runtime += time() - start
        steps += params['evaluation_frequency']
        lr_scheduler.step()

        if params['save']:
            if isinstance(agent, DQN):
                np.save(folder_name + '/weights-exp-0-' + str(n_epoch) + '.npy',
                        agent.approximator.get_weights())

        print('- Evaluation:')
        # evaluation step
        pi.set_epsilon(epsilon_test)
        ds = core.evaluate(initial_states=eval_days, render=params['save'], quiet=tuning)
        test_result_discounted = np.mean(compute_J(ds, params['gamma']))
        test_result = np.mean(compute_J(ds))
        print("discounted validation result", test_result_discounted)
        print("validation result", test_result)
        if params['save']:
            mdp.save_rendered(folder_name + ("/epoch%04d.mp4" % n_epoch))
        results.append((runtime, steps, test_result_discounted, test_result))
        if params['save']:
            np.savetxt(folder_name + '/scores.csv', np.asarray(results), delimiter=',')

        if test_result > best_score:
            no_improvement = 0
            best_score = test_result
            best_weights = agent.approximator.get_weights().copy()
            with open(folder_name + "/best_val.txt", "w") as f:
                f.write("%f" % test_result)
        else:
            no_improvement += 1

        if no_improvement >= patience:
            break

    print('---------- FINAL EVALUATION ---------')
    agent.approximator.set_weights(best_weights)
    agent.target_approximator.set_weights(best_weights)
    pi.set_epsilon(epsilon_test)
    eval_days = [i for i in range(1, 356) if i % 13 == 0]
    ds = core.evaluate(initial_states=eval_days, render=params['save'], quiet=tuning)
    test_result_discounted = np.mean(compute_J(ds, params['gamma']))
    test_result = np.mean(compute_J(ds))
    print("discounted test result", test_result_discounted)
    print("test result", test_result)
    with open(folder_name + "/test_result.txt", "w") as f:
        f.write("%f" % test_result)
    if params['save']:
        mdp.save_rendered(folder_name + "/epoch_test.mp4", 10000)

    return scores
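
# Illustrative only: a hedged sketch of the configuration dictionary the experiment above
# expects. The keys are taken from the accesses in the function body; every value below is
# an assumed placeholder, and `my_mdp` stands for whatever resource-scheduling MDP the full
# script constructs. `PROJECT_DIR` and `tuning` are module-level globals in the original,
# and 'weights' / 'load_path' are optional keys.
example_params = dict(
    name='smdp_dqn_run',            # log folder name under PROJECT_DIR/logs
    save=False,                     # write weights, videos and scores to disk
    optimizer='adam',               # 'adam', 'adadelta', 'rmsprop' or 'rmspropcentered'
    learning_rate=1e-4,
    decay=.95,                      # only used by the rmsprop variants
    lr_decay=1.,                    # StepLR gamma applied once per epoch
    network='GraphConvolutionResourceNetwork',
    hidden=64,
    allow_wait=True,
    long_term_q=False,
    resource_embeddings=False,
    nn_scaling=False,
    cuda=False,
    initial_exploration_rate=1.,
    exploration_rate=.5,            # exponent of the ExponentialParameter schedule
    final_exploration_rate=.01,
    initial_replay_size=5000,
    max_replay_size=50000,
    batch_size=32,
    target_update_frequency=10000,
    train_frequency=4,
    evaluation_frequency=250000,
    max_steps=5000000,
    gamma=.99,                      # discount used for the reported discounted return
)
# scores = experiment(my_mdp, example_params)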