def __init__(self, environment, learning_algo, replay_memory_size=1000000, replay_start_size=None,
             batch_size=32, random_state=np.random.RandomState(), exp_priority=0,
             train_policy=None, test_policy=None, only_full_history=True):
    inputDims = environment.inputDimensions()

    if replay_start_size is None:
        replay_start_size = max(inputDims[i][0] for i in range(len(inputDims)))
    elif replay_start_size < max(inputDims[i][0] for i in range(len(inputDims))):
        raise AgentError("replay_start_size should be greater than or equal to the biggest history of a state.")

    self._controllers = []
    self._environment = environment
    self._learning_algo = learning_algo
    self._replay_memory_size = replay_memory_size
    self._replay_start_size = replay_start_size
    self._batch_size = batch_size
    self._random_state = random_state
    self._exp_priority = exp_priority
    self._only_full_history = only_full_history
    self._dataset = DataSet(environment, max_size=replay_memory_size, random_state=random_state,
                            use_priority=self._exp_priority, only_full_history=self._only_full_history)
    self._tmp_dataset = None  # Will be created by startTesting() when necessary
    self._mode = -1
    self._total_mode_reward = 0
    self._training_loss_averages = []
    self._Vs_on_last_episode = []
    self._in_episode = False
    self._selected_action = -1
    self._state = []
    for i in range(len(inputDims)):
        self._state.append(np.zeros(inputDims[i], dtype=float))

    if train_policy is None:
        self._train_policy = EpsilonGreedyPolicy(learning_algo, environment.nActions(), random_state, 0.1)
    else:
        self._train_policy = train_policy
    if test_policy is None:
        self._test_policy = EpsilonGreedyPolicy(learning_algo, environment.nActions(), random_state, 0.)
    else:
        self._test_policy = test_policy

    self.gathering_data = True  # Whether the agent is gathering data or not
    self.sticky_action = 1      # Number of times the agent is forced to take the same action
                                # as part of one actual time step
def __init__(self, environment, q_network, replay_memory_size=1000000, replay_start_size=None,
             batch_size=32, random_state=np.random.RandomState(), exp_priority=0,
             train_policy=None, test_policy=None, only_full_history=True):
    inputDims = environment.inputDimensions()

    if replay_start_size is None:
        replay_start_size = max(inputDims[i][0] for i in range(len(inputDims)))
    elif replay_start_size < max(inputDims[i][0] for i in range(len(inputDims))):
        raise AgentError("replay_start_size should be greater than or equal to the biggest history of a state.")

    self._controllers = []
    self._environment = environment
    self._network = q_network
    self._replay_memory_size = replay_memory_size
    self._replay_start_size = replay_start_size
    self._batch_size = batch_size
    self._random_state = random_state
    self._exp_priority = exp_priority
    self._only_full_history = only_full_history
    self._dataset = DataSet(environment, max_size=replay_memory_size, random_state=random_state,
                            use_priority=self._exp_priority, only_full_history=self._only_full_history)
    self._tmp_dataset = None  # Will be created by startTesting() when necessary
    self._mode = -1
    self._mode_epochs_length = 0
    self._total_mode_reward = 0
    self._training_loss_averages = []
    self._Vs_on_last_episode = []
    self._in_episode = False
    self._selected_action = -1
    self._state = []
    for i in range(len(inputDims)):
        self._state.append(np.zeros(inputDims[i], dtype=config.floatX))

    if train_policy is None:
        self._train_policy = EpsilonGreedyPolicy(q_network, environment.nActions(), random_state, 0.1)
    else:
        self._train_policy = train_policy
    if test_policy is None:
        self._test_policy = EpsilonGreedyPolicy(q_network, environment.nActions(), random_state, 0.)
    else:
        self._test_policy = test_policy
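# [Illustrative usage sketch, not from the source; `MyEnv` is hypothetical and the
# network class is borrowed from the other snippets in this section.] With
# replay_start_size left as None, the constructors above default it to the longest
# history among the input heads, e.g. 4 if inputDimensions() returns [(4, 84, 84)]:
import numpy as np

rng = np.random.RandomState(0)
env = MyEnv()  # hypothetical deer Environment: inputDimensions() -> [(4, 84, 84)]
network = MyQNetwork(environment=env, batch_size=32, random_state=rng)
agent = NeuralAgent(env,
                    network,
                    replay_memory_size=100000,
                    replay_start_size=None,  # -> max(inputDims[i][0]) == 4
                    batch_size=32,
                    random_state=rng)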
# --- Instantiate learning_algo ---
learning_algo = CRAR(
    env,
    parameters.rms_decay,
    parameters.rms_epsilon,
    parameters.momentum,
    parameters.clip_norm,
    parameters.freeze_interval,
    parameters.batch_size,
    parameters.update_rule,
    rng,
    double_Q=True,
    high_int_dim=HIGH_INT_DIM,
    internal_dim=3)

train_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 1.)
test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 0.1)

# --- Instantiate agent ---
agent = NeuralAgent(
    env,
    learning_algo,
    parameters.replay_memory_size,
    max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))),
    parameters.batch_size,
    rng,
    train_policy=train_policy,
    test_policy=test_policy)

# --- Create unique filename for FindBestController ---
h = hash(vars(parameters), hash_name="sha1")
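# [Hedged follow-up, not in the source snippet.] `hash` above appears to be
# joblib.hash (its hash_name="sha1" keyword matches), so `h` is a digest unique to
# this parameter set. The usual next step, assuming deer's controllers module is
# imported as bc as in the other snippets here, turns the digest into a filename
# for FindBestController:
fname = "test_" + h
agent.attach(bc.FindBestController(validationID=0, unique_fname=fname))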
    **vars(parameters))

if parameters.action_type == 'q_argmax':
    test_policy = ep.QArgmaxPolicy(learning_algo, env.nActions(), rng, parameters.epsilon_start)
    train_policy = ep.QArgmaxPolicy(learning_algo, env.nActions(), rng, parameters.epsilon_start)
elif parameters.action_type == 'd_step_q_planning':
    test_policy = ep.MCPolicy(learning_algo, parameters.reward_type, env.nActions(), rng,
                              depth=parameters.depth, epsilon_start=parameters.epsilon_start)
    train_policy = ep.MCPolicy(learning_algo, parameters.reward_type, env.nActions(), rng,
                               depth=parameters.depth, epsilon_start=parameters.epsilon_start)
elif parameters.action_type == 'bootstrap_q':
    test_policy = ep.BootstrapDQNPolicy(learning_algo, env.nActions(), rng, parameters.epsilon_start)
    train_policy = ep.BootstrapDQNPolicy(learning_algo, env.nActions(), rng, parameters.epsilon_start)
elif parameters.action_type == 'd_step_reward_planning':
    test_policy = ep.MCRewardPolicy(learning_algo, parameters.reward_type, env.nActions(), rng,
                                    depth=parameters.depth, epsilon_start=parameters.epsilon_start)
    train_policy = ep.MCRewardPolicy(learning_algo, parameters.reward_type, env.nActions(), rng,
                                     depth=parameters.depth, epsilon_start=parameters.epsilon_start)
else:
    test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, parameters.epsilon_start)
    train_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, parameters.epsilon_start)

# --- Instantiate agent ---
# We might need to change this.
train_q = parameters.action_type != 'd_step_reward_planning'
agent = SEAgent(
    env,
    learning_algo,
    plotter,
    random_state=rng,
    train_policy=train_policy,
    test_policy=test_policy,
    train_q=train_q,
    **vars(parameters))
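# [Optional refactor sketch; it assumes only what the branches above show, namely
# that the MC-style policies take (learning_algo, reward_type, nActions, rng,
# depth, epsilon_start) and the rest take (learning_algo, nActions, rng,
# epsilon_start).] The elif chain can be collapsed into a dispatch table:
def _mc(cls):
    return lambda: cls(learning_algo, parameters.reward_type, env.nActions(), rng,
                       depth=parameters.depth, epsilon_start=parameters.epsilon_start)

def _simple(cls):
    return lambda: cls(learning_algo, env.nActions(), rng, parameters.epsilon_start)

policy_factories = {
    'q_argmax': _simple(ep.QArgmaxPolicy),
    'd_step_q_planning': _mc(ep.MCPolicy),
    'bootstrap_q': _simple(ep.BootstrapDQNPolicy),
    'd_step_reward_planning': _mc(ep.MCRewardPolicy),
}
factory = policy_factories.get(parameters.action_type, _simple(EpsilonGreedyPolicy))
train_policy, test_policy = factory(), factory()  # two independent instances, as above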
"value": rng.randint(9999) }, { "key": "color_averaging", "value": True }, { "key": "repeat_action_probability", "value": 0. }]) # --- Instantiate qnetwork --- qnetwork = MyQNetwork(env, parameters.rms_decay, parameters.rms_epsilon, parameters.momentum, parameters.clip_delta, parameters.freeze_interval, parameters.batch_size, parameters.update_rule, rng) test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.05) # --- Instantiate agent --- agent = NeuralAgent(env, qnetwork, parameters.replay_memory_size, max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), parameters.batch_size, rng, test_policy=test_policy) # --- Create unique filename for FindBestController --- h = hash(vars(parameters), hash_name="sha1") fname = "ALE_" + h print("The parameters hash is: {}".format(h))
from deer.policies import EpsilonGreedyPolicy

rng = np.random.RandomState(123456)

# TODO : best algorithm, hyperparameter tuning
if args.network == 'DQN':
    network = MyQNetwork(environment=env, batch_size=32, double_Q=True, random_state=rng)
elif args.network == 'DDPG':
    network = MyACNetwork(environment=env, batch_size=32, random_state=rng)

agent = NeuralAgent(env,
                    network,
                    train_policy=EpsilonGreedyPolicy(network, env.nActions(), rng, 0.0),
                    replay_memory_size=1000,
                    batch_size=32,
                    random_state=rng)
# agent.attach(bc.VerboseController())

if args.fname == 'baseline':
    agent = EmpiricalTreatmentAgent(env)
else:
    agent.setNetwork(args.fname)

count = 0
length_success = []
avg_rad = []
avg_h_cell_killed = []
avg_percentage = []
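# [Sketch under assumptions, not from the source.] With a train_policy epsilon of
# 0.0 the agent above acts greedily even in training mode, so once setNetwork has
# loaded the saved weights a plain run doubles as an evaluation pass. n_epochs and
# epoch_length are placeholders, and deer's NeuralAgent.run(n_epochs, epoch_length)
# entry point is assumed (the baseline EmpiricalTreatmentAgent may expose a
# different one):
agent.run(n_epochs=1, epoch_length=1000)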
        args.learning_rate[2]))
    agent.attach(
        bc.InterleavedTestEpochController(epoch_length=1000,
                                          controllers_to_disable=[1, 2, 3, 4]))
elif args.network == 'DDPG':
    network = MyACNetwork(environment=env,
                          batch_size=32,
                          double_Q=True,
                          freeze_interval=args.epochs[1],
                          random_state=rng)
    agent = NeuralAgent(
        env,
        network,
        train_policy=GaussianNoiseExplorationPolicy(network, env.nActions(), rng, .5)
        if args.exploration == 'gauss'
        else EpsilonGreedyPolicy(network, env.nActions(), rng, 0.1),
        replay_memory_size=min(args.epochs[0] * args.epochs[1] * 2, 100000),
        batch_size=32,
        random_state=rng)
    agent.setDiscountFactor(0.95)
    agent.attach(bc.FindBestController(validationID=0, unique_fname=args.fname))
    agent.attach(bc.VerboseController())
    agent.attach(bc.TrainerController())
    if args.exploration == 'gauss':
        agent.attach(
            GaussianNoiseController(initial_std_dev=0.5,
                                    n_decays=args.epochs[0] * args.epochs[1],
                                    final_std_dev=0.005))
    else:
        agent.attach(
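# [Hypothetical continuation; the snippet above is truncated mid-call and the real
# argument is not shown. In deer, the epsilon-greedy branch would plausibly attach
# an EpsilonController (keyword names as used later in this section); the values
# here are guesses:]
#     bc.EpsilonController(initial_e=0.1,
#                          e_decays=args.epochs[0] * args.epochs[1],
#                          e_min=0.005,
#                          evaluate_on='action',
#                          periodicity=1))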
def __init__(self, inputDims, q_network, actions, file='',
             replay_memory_size=Parameters.REPLAY_MEMORY_SIZE,
             replay_start_size=Parameters.BATCH_SIZE,
             batch_size=Parameters.BATCH_SIZE,
             random_state=np.random.RandomState(),
             exp_priority=0, batch_type='sequential',
             train_policy=None, test_policy=None,
             only_full_history=True, reward_as_input=False):
    CompleteLearner.__init__(self, actions, file)
    self.polfile = open(self.file + 'policy.txt', "w")

    # if replay_start_size is None:
    #     replay_start_size = max(inputDims[i][0] for i in range(len(inputDims)))
    # elif replay_start_size < max(inputDims[i][0] for i in range(len(inputDims))):
    #     raise AgentError("replay_start_size should be greater than or equal to the biggest history of a state.")

    self._controllers = []

    # --- Bind controllers to the agent ---
    # For comments, please refer to run_toy_env.py
    self.attach(bc.VerboseController(evaluate_on='epoch', periodicity=1))
    self.attach(
        bc.TrainerController(evaluate_on='action',
                             periodicity=Parameters.UPDATE_FREQUENCY,
                             show_episode_avg_V_value=True,
                             show_avg_Bellman_residual=True))
    self.attach(
        bc.LearningRateController(
            initial_learning_rate=Parameters.LEARNING_RATE,
            learning_rate_decay=Parameters.LEARNING_RATE_DECAY,
            periodicity=10000))
    self.attach(
        bc.DiscountFactorController(
            initial_discount_factor=Parameters.DISCOUNT,
            discount_factor_growth=Parameters.DISCOUNT_INC,
            discount_factor_max=Parameters.DISCOUNT_MAX,
            periodicity=10000))
    self.attach(
        bc.EpsilonController(initial_e=Parameters.EPSILON_START,
                             e_decays=Parameters.EPSILON_DECAY,
                             e_min=Parameters.EPSILON_MIN,
                             evaluate_on='action',
                             periodicity=1000,
                             reset_every='none'))
    # self.attach(bc.InterleavedTestEpochController(
    #     id=0,
    #     epoch_length=Parameters.STEPS_PER_TEST,
    #     controllers_to_disable=[0, 1, 2, 3, 4],
    #     periodicity=2,
    #     show_score=True,
    #     summarize_every=-1))

    self.obs = []
    self.reward_as_input = reward_as_input
    self._network = q_network
    self._replay_memory_size = replay_memory_size
    self._replay_start_size = replay_start_size  # make sure this many observations are gathered before learning
    self._batch_size = batch_size
    self._batch_type = batch_type
    self._random_state = random_state
    self._exp_priority = exp_priority
    self._only_full_history = only_full_history

    # DataSet(inputDimensions, n_actions, observation_type=np.float32, random_state=None,
    #         max_size=1000, batch_type='random', only_full_history=True)
    self._dataset = DataSet(inputDimensions=inputDims,
                            n_actions=len(actions),
                            max_size=replay_memory_size,
                            random_state=random_state,
                            batch_type=self._batch_type,
                            only_full_history=self._only_full_history)
    self._tmp_dataset = None  # Will be created by startTesting() when necessary
    self._mode = -1
    self._mode_epochs_length = 0
    self._total_mode_reward = 0
    self._training_loss_averages = []
    self._Vs_on_last_episode = []
    # self._in_episode = False
    self._selected_action = -1
    self._state = []
    for i in range(len(inputDims)):
        self._state.append(np.zeros(inputDims[i], dtype=config.floatX))

    if train_policy is None:
        self._train_policy = EpsilonGreedyPolicy(q_network, len(actions), random_state, 0.05)
    else:
        self._train_policy = train_policy
    if test_policy is None:
        self._test_policy = EpsilonGreedyPolicy(q_network, len(actions), random_state, 0.)
    else:
        self._test_policy = test_policy

    self.initEpisode()
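# [Usage sketch; the class name and all values are hypothetical, since the snippet
# only shows __init__.] inputDims follows deer's convention of one
# (history_length, *shape) tuple per input head -- the same convention behind the
# max(inputDims[i][0]) default seen in the other constructors:
learner = DQNLearner(inputDims=[(4, 84, 84)],   # 4-frame history of 84x84 observations
                     q_network=qnet,            # hypothetical prebuilt Q-network
                     actions=list(range(6)),    # 6 discrete actions
                     file='logs/run1_')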
# --- Instantiate learning_algo ---
learning_algo = CRAR(
    env,
    parameters.rms_decay,
    parameters.rms_epsilon,
    parameters.momentum,
    parameters.clip_norm,
    parameters.freeze_interval,
    parameters.batch_size,
    parameters.update_rule,
    rng,
    double_Q=True,
    high_int_dim=HIGH_INT_DIM,
    internal_dim=3)

train_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 1.)   # always takes random actions
test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 0.1)   # random action 1/10 of the time

# --- Instantiate agent ---
agent = NeuralAgent(
    env,
    learning_algo,
    parameters.replay_memory_size,
    max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))),
    parameters.batch_size,
    rng,
    train_policy=train_policy,
    test_policy=test_policy)

# --- Create unique filename for FindBestController ---
h = hash(vars(parameters), hash_name="sha1")