Code Example #1
    def __init__(self,
                 environment,
                 learning_algo,
                 replay_memory_size=1000000,
                 replay_start_size=None,
                 batch_size=32,
                 random_state=np.random.RandomState(),
                 exp_priority=0,
                 train_policy=None,
                 test_policy=None,
                 only_full_history=True):
        inputDims = environment.inputDimensions()

        if replay_start_size is None:
            replay_start_size = max(inputDims[i][0]
                                    for i in range(len(inputDims)))
        elif replay_start_size < max(inputDims[i][0]
                                     for i in range(len(inputDims))):
            raise AgentError(
                "replay_start_size should be at least as large as the longest history among the state inputs."
            )

        self._controllers = []
        self._environment = environment
        self._learning_algo = learning_algo
        self._replay_memory_size = replay_memory_size
        self._replay_start_size = replay_start_size
        self._batch_size = batch_size
        self._random_state = random_state
        self._exp_priority = exp_priority
        self._only_full_history = only_full_history
        self._dataset = DataSet(environment,
                                max_size=replay_memory_size,
                                random_state=random_state,
                                use_priority=self._exp_priority,
                                only_full_history=self._only_full_history)
        self._tmp_dataset = None  # Will be created by startTesting() when necessary
        self._mode = -1
        self._total_mode_reward = 0
        self._training_loss_averages = []
        self._Vs_on_last_episode = []
        self._in_episode = False
        self._selected_action = -1
        self._state = []
        for i in range(len(inputDims)):
            self._state.append(np.zeros(inputDims[i], dtype=float))
        if train_policy is None:
            self._train_policy = EpsilonGreedyPolicy(learning_algo,
                                                     environment.nActions(),
                                                     random_state, 0.1)
        else:
            self._train_policy = train_policy
        if test_policy is None:
            self._test_policy = EpsilonGreedyPolicy(learning_algo,
                                                    environment.nActions(),
                                                    random_state, 0.)
        else:
            self._test_policy = test_policy
        self.gathering_data = True  # Whether the agent is gathering data or not
        self.sticky_action = 1  # Number of times the agent is forced to take the same action as part of one actual time step
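
The constructor above only wires together the environment, the learning algorithm, the replay-memory DataSet and the exploration policies; in the deer examples, training is then typically driven by attaching controllers and calling run(). A minimal usage sketch (MyEnv and MyQNetwork are placeholders for an environment and a learning algorithm, not names defined in this snippet):

import numpy as np
import deer.experiment.base_controllers as bc
from deer.agent import NeuralAgent

rng = np.random.RandomState(123456)
env = MyEnv(rng)                                    # placeholder environment exposing inputDimensions()/nActions()
learning_algo = MyQNetwork(env, random_state=rng)   # placeholder Q-network / learning algorithm

agent = NeuralAgent(env,
                    learning_algo,
                    replay_memory_size=100000,
                    batch_size=32,
                    random_state=rng)
agent.attach(bc.VerboseController())       # log progress once per epoch
agent.attach(bc.TrainerController())       # one training step per agent action
agent.run(n_epochs=10, epoch_length=1000)  # gather transitions and train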
Code Example #2
    def __init__(self,
                 environment,
                 q_network,
                 replay_memory_size=1000000,
                 replay_start_size=None,
                 batch_size=32,
                 random_state=np.random.RandomState(),
                 exp_priority=0,
                 train_policy=None,
                 test_policy=None,
                 only_full_history=True):
        inputDims = environment.inputDimensions()

        if replay_start_size is None:
            replay_start_size = max(inputDims[i][0]
                                    for i in range(len(inputDims)))
        elif replay_start_size < max(inputDims[i][0]
                                     for i in range(len(inputDims))):
            raise AgentError(
                "replay_start_size should be at least as large as the longest history among the state inputs."
            )

        self._controllers = []
        self._environment = environment
        self._network = q_network
        self._replay_memory_size = replay_memory_size
        self._replay_start_size = replay_start_size
        self._batch_size = batch_size
        self._random_state = random_state
        self._exp_priority = exp_priority
        self._only_full_history = only_full_history
        self._dataset = DataSet(environment,
                                max_size=replay_memory_size,
                                random_state=random_state,
                                use_priority=self._exp_priority,
                                only_full_history=self._only_full_history)
        self._tmp_dataset = None  # Will be created by startTesting() when necessary
        self._mode = -1
        self._mode_epochs_length = 0
        self._total_mode_reward = 0
        self._training_loss_averages = []
        self._Vs_on_last_episode = []
        self._in_episode = False
        self._selected_action = -1
        self._state = []
        for i in range(len(inputDims)):
            self._state.append(np.zeros(inputDims[i], dtype=config.floatX))
        if train_policy is None:
            self._train_policy = EpsilonGreedyPolicy(q_network,
                                                     environment.nActions(),
                                                     random_state, 0.1)
        else:
            self._train_policy = train_policy
        if test_policy is None:
            self._test_policy = EpsilonGreedyPolicy(q_network,
                                                    environment.nActions(),
                                                    random_state, 0.)
        else:
            self._test_policy = test_policy
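
Example #2 is essentially the Theano-era variant of the constructor in Example #1: the learning algorithm is passed as q_network, and the per-input state buffers are allocated with config.floatX instead of plain float, so the snippet additionally assumes something like `from theano import config` in scope.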
Code Example #3
    # --- Instantiate learning_algo ---
    learning_algo = CRAR(env,
                         parameters.rms_decay,
                         parameters.rms_epsilon,
                         parameters.momentum,
                         parameters.clip_norm,
                         parameters.freeze_interval,
                         parameters.batch_size,
                         parameters.update_rule,
                         rng,
                         double_Q=True,
                         high_int_dim=HIGH_INT_DIM,
                         internal_dim=3)

    train_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 1.)
    test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 0.1)

    # --- Instantiate agent ---
    agent = NeuralAgent(env,
                        learning_algo,
                        parameters.replay_memory_size,
                        max(env.inputDimensions()[i][0]
                            for i in range(len(env.inputDimensions()))),
                        parameters.batch_size,
                        rng,
                        train_policy=train_policy,
                        test_policy=test_policy)

    # --- Create unique filename for FindBestController ---
    h = hash(vars(parameters), hash_name="sha1")
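
Note that `hash` here cannot be the Python builtin, which takes no `hash_name` argument; in the deer example scripts it is typically `joblib.hash`, used to turn the parameter dictionary into a reproducible identifier for FindBestController filenames.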
Code Example #4
        **vars(parameters))

    if parameters.action_type == 'q_argmax':
        test_policy = ep.QArgmaxPolicy(learning_algo, env.nActions(), rng, parameters.epsilon_start)
        train_policy = ep.QArgmaxPolicy(learning_algo, env.nActions(), rng, parameters.epsilon_start)
    elif parameters.action_type == 'd_step_q_planning':
        test_policy = ep.MCPolicy(learning_algo, parameters.reward_type, env.nActions(), rng, depth=parameters.depth, epsilon_start=parameters.epsilon_start)
        train_policy = ep.MCPolicy(learning_algo, parameters.reward_type, env.nActions(), rng, depth=parameters.depth, epsilon_start=parameters.epsilon_start)
    elif parameters.action_type == 'bootstrap_q':
        test_policy = ep.BootstrapDQNPolicy(learning_algo, env.nActions(), rng, parameters.epsilon_start)
        train_policy = ep.BootstrapDQNPolicy(learning_algo, env.nActions(), rng, parameters.epsilon_start)
    elif parameters.action_type == 'd_step_reward_planning':
        test_policy = ep.MCRewardPolicy(learning_algo, parameters.reward_type, env.nActions(), rng, depth=parameters.depth, epsilon_start=parameters.epsilon_start)
        train_policy = ep.MCRewardPolicy(learning_algo, parameters.reward_type, env.nActions(), rng, depth=parameters.depth, epsilon_start=parameters.epsilon_start)
    else:
        test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, parameters.epsilon_start)
        train_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, parameters.epsilon_start)


    # --- Instantiate agent ---
    # We might need to change this.
    train_q = parameters.action_type != 'd_step_reward_planning'
    agent = SEAgent(
        env,
        learning_algo,
        plotter,
        random_state=rng,
        train_policy=train_policy,
        test_policy=test_policy,
        train_q=train_q,
        **vars(parameters))
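
Each branch above builds the train and test policies twice with identical arguments; a small helper is one way to remove that duplication (a sketch only, reusing the `ep` policy module, `learning_algo`, `env`, `rng` and `parameters` names from the fragment):

def make_policy():
    # Build one policy instance for the configured action-selection scheme.
    if parameters.action_type == 'q_argmax':
        return ep.QArgmaxPolicy(learning_algo, env.nActions(), rng, parameters.epsilon_start)
    if parameters.action_type in ('d_step_q_planning', 'd_step_reward_planning'):
        cls = ep.MCPolicy if parameters.action_type == 'd_step_q_planning' else ep.MCRewardPolicy
        return cls(learning_algo, parameters.reward_type, env.nActions(), rng,
                   depth=parameters.depth, epsilon_start=parameters.epsilon_start)
    if parameters.action_type == 'bootstrap_q':
        return ep.BootstrapDQNPolicy(learning_algo, env.nActions(), rng, parameters.epsilon_start)
    return EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, parameters.epsilon_start)

train_policy = make_policy()
test_policy = make_policy()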
Code Example #5
                      "value": rng.randint(9999)
                  }, {
                      "key": "color_averaging",
                      "value": True
                  }, {
                      "key": "repeat_action_probability",
                      "value": 0.
                  }])

    # --- Instantiate qnetwork ---
    qnetwork = MyQNetwork(env, parameters.rms_decay, parameters.rms_epsilon,
                          parameters.momentum, parameters.clip_delta,
                          parameters.freeze_interval, parameters.batch_size,
                          parameters.update_rule, rng)

    test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.05)

    # --- Instantiate agent ---
    agent = NeuralAgent(env,
                        qnetwork,
                        parameters.replay_memory_size,
                        max(env.inputDimensions()[i][0]
                            for i in range(len(env.inputDimensions()))),
                        parameters.batch_size,
                        rng,
                        test_policy=test_policy)

    # --- Create unique filename for FindBestController ---
    h = hash(vars(parameters), hash_name="sha1")
    fname = "ALE_" + h
    print("The parameters hash is: {}".format(h))
Code Example #6
from deer.policies import EpsilonGreedyPolicy

rng = np.random.RandomState(123456)

# TODO : best algorithm, hyperparameter tuning
if args.network == 'DQN':
    network = MyQNetwork(environment=env,
                         batch_size=32,
                         double_Q=True,
                         random_state=rng)
elif args.network == 'DDPG':
    network = MyACNetwork(environment=env, batch_size=32, random_state=rng)
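else:
    # The surrounding script assumes args.network is either 'DQN' or 'DDPG'; any other
    # value would leave `network` unbound and crash the NeuralAgent(...) call below.
    raise ValueError("unsupported network type: {}".format(args.network))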

agent = NeuralAgent(env,
                    network,
                    train_policy=EpsilonGreedyPolicy(network, env.nActions(),
                                                     rng, 0.0),
                    replay_memory_size=1000,
                    batch_size=32,
                    random_state=rng)

#agent.attach(bc.VerboseController())
if args.fname == 'baseline':
    agent = EmpiricalTreatmentAgent(env)
else:
    agent.setNetwork(args.fname)

count = 0
length_success = []
avg_rad = []
avg_h_cell_killed = []
avg_percentage = []
Code Example #7
                                  args.learning_rate[2]))
    agent.attach(
        bc.InterleavedTestEpochController(epoch_length=1000,
                                          controllers_to_disable=[1, 2, 3, 4]))
elif args.network == 'DDPG':
    network = MyACNetwork(environment=env,
                          batch_size=32,
                          double_Q=True,
                          freeze_interval=args.epochs[1],
                          random_state=rng)
    agent = NeuralAgent(
        env,
        network,
        train_policy=GaussianNoiseExplorationPolicy(
            network, env.nActions(), rng, .5) if args.exploration == 'gauss'
        else EpsilonGreedyPolicy(network, env.nActions(), rng, 0.1),
        replay_memory_size=min(args.epochs[0] * args.epochs[1] * 2, 100000),
        batch_size=32,
        random_state=rng)
    agent.setDiscountFactor(0.95)
    agent.attach(bc.FindBestController(validationID=0,
                                       unique_fname=args.fname))
    agent.attach(bc.VerboseController())
    agent.attach(bc.TrainerController())
    if args.exploration == 'gauss':
        agent.attach(
            GaussianNoiseController(initial_std_dev=0.5,
                                    n_decays=args.epochs[0] * args.epochs[1],
                                    final_std_dev=0.005))
    else:
        agent.attach(
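
GaussianNoiseExplorationPolicy and GaussianNoiseController do not appear to be part of the core deer distribution; presumably they are project-specific additions that inject Gaussian action noise for the continuous-control (DDPG) case, with the controller decaying the noise standard deviation from 0.5 to 0.005.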
Code Example #8
    def __init__(self,
                 inputDims,
                 q_network,
                 actions,
                 file='',
                 replay_memory_size=Parameters.REPLAY_MEMORY_SIZE,
                 replay_start_size=Parameters.BATCH_SIZE,
                 batch_size=Parameters.BATCH_SIZE,
                 random_state=np.random.RandomState(),
                 exp_priority=0,
                 batch_type='sequential',
                 train_policy=None,
                 test_policy=None,
                 only_full_history=True,
                 reward_as_input=False):

        CompleteLearner.__init__(self, actions, file)
        self.polfile = open(self.file + 'policy.txt', "w")
        # if replay_start_size == None:
        #     replay_start_size = max(inputDims[i][0] for i in range(len(inputDims)))
        # elif replay_start_size < max(inputDims[i][0] for i in range(len(inputDims))):
        #     raise AgentError("Replay_start_size should be greater than the biggest history of a state.")

        self._controllers = []

        # --- Bind controllers to the agent ---
        # For comments, please refer to run_toy_env.py
        self.attach(bc.VerboseController(evaluate_on='epoch', periodicity=1))

        self.attach(
            bc.TrainerController(evaluate_on='action',
                                 periodicity=Parameters.UPDATE_FREQUENCY,
                                 show_episode_avg_V_value=True,
                                 show_avg_Bellman_residual=True))

        self.attach(
            bc.LearningRateController(
                initial_learning_rate=Parameters.LEARNING_RATE,
                learning_rate_decay=Parameters.LEARNING_RATE_DECAY,
                periodicity=10000))

        self.attach(
            bc.DiscountFactorController(
                initial_discount_factor=Parameters.DISCOUNT,
                discount_factor_growth=Parameters.DISCOUNT_INC,
                discount_factor_max=Parameters.DISCOUNT_MAX,
                periodicity=10000))

        self.attach(
            bc.EpsilonController(initial_e=Parameters.EPSILON_START,
                                 e_decays=Parameters.EPSILON_DECAY,
                                 e_min=Parameters.EPSILON_MIN,
                                 evaluate_on='action',
                                 periodicity=1000,
                                 reset_every='none'))

        # self.attach(bc.InterleavedTestEpochController(
        #     id=0,
        #     epoch_length=Parameters.STEPS_PER_TEST,
        #     controllers_to_disable=[0, 1, 2, 3, 4],
        #     periodicity=2,
        #     show_score=True,
        #     summarize_every=-1))

        self.obs = []
        self.reward_as_input = reward_as_input
        self._network = q_network
        self._replay_memory_size = replay_memory_size
        self._replay_start_size = replay_start_size  # make sure you gather this many observations before learning
        self._batch_size = batch_size
        self._batch_type = batch_type
        self._random_state = random_state
        self._exp_priority = exp_priority
        self._only_full_history = only_full_history
        #inputDimensions, n_actions, observation_type = np.float32, random_state = None, max_size = 1000, batch_type = 'random', only_full_history = True
        self._dataset = DataSet(inputDimensions=inputDims,
                                n_actions=len(actions),
                                max_size=replay_memory_size,
                                random_state=random_state,
                                batch_type=self._batch_type,
                                only_full_history=self._only_full_history)
        self._tmp_dataset = None  # Will be created by startTesting() when necessary
        self._mode = -1
        self._mode_epochs_length = 0
        self._total_mode_reward = 0
        self._training_loss_averages = []
        self._Vs_on_last_episode = []
        #self._in_episode = False

        self._selected_action = -1
        self._state = []
        for i in range(len(inputDims)):
            self._state.append(np.zeros(inputDims[i], dtype=config.floatX))
        if train_policy is None:
            self._train_policy = EpsilonGreedyPolicy(q_network, len(actions),
                                                     random_state, 0.05)
        else:
            self._train_policy = train_policy
        if test_policy is None:
            self._test_policy = EpsilonGreedyPolicy(q_network, len(actions),
                                                    random_state, 0.)
        else:
            self._test_policy = test_policy

        self.initEpisode()
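
Unlike the deer NeuralAgent constructors in Examples #1 and #2, this custom learner attaches its controllers (verbose output, trainer, learning-rate, discount-factor and epsilon schedules) directly in its constructor and reads hyperparameters from a Parameters module. Its replay DataSet is built from explicit inputDimensions and n_actions with a configurable batch_type, and, as in Example #2, the state buffers use config.floatX, so a Theano-style config import is assumed.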
Code Example #9
    # --- Instantiate learning_algo ---
    learning_algo = CRAR(
        env,
        parameters.rms_decay,
        parameters.rms_epsilon,
        parameters.momentum,
        parameters.clip_norm,
        parameters.freeze_interval,
        parameters.batch_size,
        parameters.update_rule,
        rng,
        double_Q=True,
        high_int_dim=HIGH_INT_DIM,
        internal_dim=3)
    
    train_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 1.) # always takes random actions
    test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 0.1) # random 1/10 times

    # --- Instantiate agent ---
    agent = NeuralAgent(
        env,
        learning_algo,
        parameters.replay_memory_size,
        max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))),
        parameters.batch_size,
        rng,
        train_policy=train_policy,
        test_policy=test_policy)

    # --- Create unique filename for FindBestController ---
    h = hash(vars(parameters), hash_name="sha1")
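
This repeats the CRAR instantiation of Example #3: as the inline comments note, the ε = 1.0 training policy gathers data purely at random while the ε = 0.1 test policy is used for evaluation, and the positional max(...) argument again sets replay_start_size to the longest input history.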