Example #1
    def __init__(self, state_size, action_size, seed, device, params):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            device (torch.device): device on which the Q-networks are placed
            params (dict): hyperparameters ('BUFFER_SIZE', 'BATCH_SIZE', 'LR', ...)
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed
        random.seed(seed)
        self.device = device
        self.params = params

        # Q-Network
        self.qnetwork_local = qn.QNetwork(state_size, action_size,
                                          seed).to(device)
        self.qnetwork_target = qn.QNetwork(state_size, action_size,
                                           seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=params['LR'])

        # Replay memory
        self.memory = rp.ReplayBuffer(action_size, params['BUFFER_SIZE'],
                                      params['BATCH_SIZE'], seed, device)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
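
A minimal usage sketch for the constructor above (not part of the original listing): the enclosing class name `Agent`, the module aliases `qn`/`rp`, and the concrete hyperparameter values are assumptions; only the dict keys 'LR', 'BUFFER_SIZE' and 'BATCH_SIZE' (plus the UPDATE_EVERY value mentioned in the comment) come from the code itself.

import torch

params = {
    'LR': 5e-4,             # Adam learning rate for the local Q-network (assumed value)
    'BUFFER_SIZE': 100000,  # replay buffer capacity (assumed value)
    'BATCH_SIZE': 64,       # minibatch size sampled from the buffer (assumed value)
    'UPDATE_EVERY': 4,      # learn every N environment steps (see the t_step comment)
}
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
agent = Agent(state_size=8, action_size=4, seed=0, device=device, params=params)
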
    def test_loss_with_nonzero_reward_same_next_state_is_nonzero(self):
        input_shape = 2
        batch_size = 1
        num_actions = 4
        num_hidden = 10
        discount = 1
        learning_rate = 1e-2
        update_rule = 'sgd'
        freeze_interval = 1000
        regularization = 0
        rng = None
        num_hidden_layers = 1
        network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers,
                                    num_actions, num_hidden, discount,
                                    learning_rate, regularization, update_rule,
                                    freeze_interval, rng)

        values = np.array(
            lasagne.layers.helper.get_all_param_values(network.l_out)) * 0
        lasagne.layers.helper.set_all_param_values(network.l_out, values)
        lasagne.layers.helper.set_all_param_values(network.next_l_out, values)

        states = np.ones((1, 2), dtype=float)
        actions = np.zeros((1, 1), dtype='int32')
        rewards = np.ones((1, 1), dtype='int32')
        next_states = np.ones((1, 2), dtype=float)
        terminals = np.zeros((1, 1), dtype='int32')

        loss = network.train(states, actions, rewards, next_states, terminals)
        actual = loss
        expected = 0.5
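        # Sketch of why 0.5 is expected (assumption: a standard 0.5 * (target - Q)^2
        # squared-error loss, which the expected value suggests): with all parameters
        # zeroed, Q(s, a) = 0 for every action, so the target is
        # r + discount * max_a Q(s', a) = 1 + 0 = 1 and the loss is 0.5 * (1 - 0)^2 = 0.5.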
        self.assertEqual(actual, expected)
    def test_loss_with_zero_reward_same_next_state_is_zero(self):
        # despite the name, the loss need not be exactly zero: action 0 is not
        # necessarily the greedy (maximum-value) action under the random
        # initialization, so the test only bounds the loss from above
        input_shape = 2
        batch_size = 1
        num_actions = 4
        num_hidden = 10
        discount = 1
        learning_rate = 1e-2
        update_rule = 'sgd'
        freeze_interval = 1000
        regularization = 0
        rng = None
        num_hidden_layers = 1
        network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers,
                                    num_actions, num_hidden, discount,
                                    learning_rate, regularization, update_rule,
                                    freeze_interval, rng)

        states = np.zeros((1, 2))
        actions = np.zeros((1, 1), dtype='int32')
        rewards = np.zeros((1, 1))
        next_states = np.zeros((1, 2))
        terminals = np.zeros((1, 1), dtype='int32')

        loss = network.train(states, actions, rewards, next_states, terminals)
        actual = loss
        expected = 2
        self.assertTrue(actual < expected)
    def test_overfit_simple_artificial_dataset(self):
        input_shape = 1
        batch_size = 10
        num_actions = 2
        num_hidden = 2
        discount = 1
        learning_rate = 1
        update_rule = 'adam'
        freeze_interval = 100
        regularization = 0
        rng = None
        num_hidden_layers = 1
        network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers,
                                    num_actions, num_hidden, discount,
                                    learning_rate, regularization, update_rule,
                                    freeze_interval, rng)

        rm = replay_memory.ReplayMemory(batch_size)
        # state 0 to state 1 reward +1
        for idx in range(20):
            state = np.array([0])
            next_state = np.array([1])
            action = 1
            reward = 1
            terminal = 1
            rm.store((state, action, reward, next_state, terminal))

        # state 0 to state 0 reward -1
        for idx in range(20):
            switch = random.randint(0, 1)
            state = np.array([0])
            next_state = np.array([0])
            action = 0
            reward = -1
            terminal = 0
            rm.store((state, action, reward, next_state, terminal))

        print(rm.terminal_count)
        print_data = False
        l = logger.Logger('test')
        counter = 0
        while True:
            counter += 1
            states, actions, rewards, next_states, terminals = rm.sample_batch()
            loss = network.train(states, actions, rewards, next_states,
                                 terminals)
            l.log_loss(loss)

            if counter % 100 == 0:
                l.log_epoch(counter)
                Q = {}
                s0 = network.get_q_values(np.array([0]))
                Q['s0_a0'] = s0[0]
                Q['s0_a1'] = s0[1]
                s1 = network.get_q_values(np.array([1]))
                Q['s1_a0'] = s1[0]
                Q['s1_a1'] = s1[1]
                # the assertions that close this test are cut off in the listing;
                # stop after a fixed budget so the loop cannot run forever
                if counter >= 2000:
                    break
Example #5
def simulate():

    draw = False
    print('building network...')
    if draw:
        pltAcas = plot_acas_xu.Plot_ACAS_XU(state_generator.RMAX, ICON_FILE, 1)
    sg = state_generator.StateGenerator(state_generator.RMAX,
                                        state_generator.RMIN,
                                        state_generator.VMIN,
                                        state_generator.VMAX, K_SIZE)
    q = qnetwork.QNetwork(state_generator.NUM_INPUTS, replay_memory.BATCH_SIZE,
                          state_generator.NUM_ACTIONS, GAMMA, SOLVER)
    repMem = replay_memory.ReplayMemory()
    count = 0

    dt = state_generator.DT
    dti = state_generator.DTI
    state = sg.randomStateGenerator()
    i = 0
    print('starting training...')
    while True:

        for j in range(TRAIN_FREQ):
            i += 1
            action = q.getAction(state)
            nextStates, rewards = sg.getNextState(state, action, dt, dti)
            stateNorm, nextStateNorm = sg.normState(state, nextStates)
            repMem.store((stateNorm, action, rewards, nextStateNorm))
            state = nextStates[0]
            count += 1
            if draw:
                pltAcas.updateState(state, action)
                pltAcas.draw()
                time.sleep(0.3)

            if sg.checkRange(state) or i > 100:
                i = 0
                state = sg.randomStateGenerator()

        if count % PRINT_FREQ == 0 and count >= replay_memory.INIT_SIZE:
            print "Samples: %d, Trainings: %d" % (
                count, (count - replay_memory.INIT_SIZE) /
                TRAIN_FREQ), "Loss: %.3e" % q.test(repMem.sample_batch())
            sys.stdout.flush()

        elif count % 10000 == 0:
            print("Samples: %d" % count)
            sys.stdout.flush()

        q.train(repMem.sample_batch())
    def test_qnetwork_constructor_adam(self):
        input_shape = 2
        batch_size = 100
        num_actions = 4
        num_hidden = 10
        discount = 1
        learning_rate = 1e-2
        update_rule = 'adam'
        freeze_interval = 1000
        regularization = 0
        rng = None
        num_hidden_layers = 1
        network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers,
                                    num_actions, num_hidden, discount,
                                    learning_rate, regularization, update_rule,
                                    freeze_interval, rng)
    def test_params_retrievable(self):
        input_shape = 2
        batch_size = 100
        num_actions = 4
        num_hidden = 10
        discount = 1
        learning_rate = 1e-2
        update_rule = 'sgd'
        freeze_interval = 1000
        regularization = 0
        rng = None
        num_hidden_layers = 1
        network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers,
                                    num_actions, num_hidden, discount,
                                    learning_rate, regularization, update_rule,
                                    freeze_interval, rng)

        params = network.get_params()
        self.assertTrue(params is not None)
Example #8
    def test_agent(self):
        room_size = 5
        mdp = mdps.MazeMDP(room_size, 1)
        mdp.compute_states()
        mdp.EXIT_REWARD = 1
        mdp.MOVE_REWARD = -0.1
        discount = mdp.get_discount()
        num_actions = len(mdp.get_actions(None))
        network = qnetwork.QNetwork(input_shape=2 * room_size,
                                    batch_size=1,
                                    num_actions=4,
                                    num_hidden=10,
                                    discount=discount,
                                    learning_rate=1e-3,
                                    update_rule='sgd',
                                    freeze_interval=10000,
                                    rng=None)
        p = policy.EpsilonGreedy(num_actions, 0.5, 0.05, 10000)
        rm = replay_memory.ReplayMemory(1)
        log = logger.NeuralLogger(agent_name='QNetwork')
        adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(
            room_size=room_size)
        a = agent.NeuralAgent(network=network,
                              policy=p,
                              replay_memory=rm,
                              logger=log,
                              state_adapter=adapter)
        num_epochs = 2
        epoch_length = 10
        test_epoch_length = 0
        max_steps = 10
        run_tests = False
        e = experiment.Experiment(mdp,
                                  a,
                                  num_epochs,
                                  epoch_length,
                                  test_epoch_length,
                                  max_steps,
                                  run_tests,
                                  value_logging=False)
        e.run()
    def test_that_initial_values_are_all_similar(self):
        input_shape = 2
        batch_size = 100
        num_actions = 4
        num_hidden = 10
        discount = 1
        learning_rate = 1e-2
        update_rule = 'sgd'
        freeze_interval = 1000
        regularization = 0
        rng = None
        num_hidden_layers = 1
        network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers,
                                    num_actions, num_hidden, discount,
                                    learning_rate, regularization, update_rule,
                                    freeze_interval, rng)

        states = [[1, 1], [-1, -1], [-1, 1], [1, -1]]
        for state in states:
            q_values = network.get_q_values(state)
            self.assertTrue(max(abs(q_values)) < 2)
    def test_that_q_values_are_retrievable(self):
        input_shape = 2
        batch_size = 100
        num_actions = 4
        num_hidden = 10
        discount = 1
        learning_rate = 1e-2
        update_rule = 'sgd'
        freeze_interval = 1000
        regularization = 0
        rng = None
        num_hidden_layers = 1
        network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers,
                                    num_actions, num_hidden, discount,
                                    learning_rate, regularization, update_rule,
                                    freeze_interval, rng)

        state = np.array([1, 1])
        q_values = network.get_q_values(state)
        actual = np.shape(q_values)
        expected = (num_actions, )
        self.assertEqual(actual, expected)
Example #11
mines = 5

# Training parameters
batch_size = 100  # How many experiences to use for each training step.
update_freq = 5  # How often to perform a training step.
y = .9  # Discount factor on the target Q-values
num_episodes = 100000  # How many episodes of game environment to train network with.
pre_train_steps = 20000  # How many steps of random actions before training begins.
max_epLength = 50  # The max allowed length of our episode.
load_model = False  # Whether to load a saved model.
path = "../dqn"  # The path to save our model to.
tau = 0.0001  # Rate to update target network toward primary network

# Start training
tf.reset_default_graph()
mainQN = qnetwork.QNetwork(field_size, num_actions)
targetQN = qnetwork.QNetwork(field_size, num_actions)

init = tf.global_variables_initializer()

saver = tf.train.Saver()

trainables = tf.trainable_variables()
targetOps = qnetwork.update_target_graph(trainables, tau)

myBuffer = qnetwork.ExperienceBuffer(20000)

# Set the rate of random actions
e = 0.005

# create lists to contain total rewards and steps per episode
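
The snippet above builds a mainQN/targetQN pair and calls qnetwork.update_target_graph(trainables, tau); the helper below is a hedged sketch of what such a TF1-style soft target-network update with rate tau typically looks like, not the project's actual implementation.

def update_target_graph_sketch(tf_vars, tau):
    """Build ops that move each target-network variable a fraction tau toward its
    counterpart in the primary network. Assumes the first half of tf_vars belongs to
    the primary network and the second half to the target network, which is what
    constructing mainQN before targetQN produces."""
    total_vars = len(tf_vars)
    op_holder = []
    for idx, var in enumerate(tf_vars[:total_vars // 2]):
        target_var = tf_vars[idx + total_vars // 2]
        op_holder.append(
            target_var.assign(tau * var.value() + (1.0 - tau) * target_var.value()))
    return op_holder

def update_target(op_holder, sess):
    """Run the soft-update ops inside a session."""
    for op in op_holder:
        sess.run(op)

# usage mirroring the snippet: targetOps = update_target_graph_sketch(trainables, tau)
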
action_size = 16                # Action set size
state_size = 32                 # State Size (a_t-1, o_t-1 ,......, a_t-M,o_t-M)
learning_rate = 1e-2            # Learning rate
gamma = 0.9                     # Discount Factor
hidden_size = 50                # Hidden Size (Put 200 for perfectly correlated)
pretrain_length = 16            # Pretrain Set to be known
n_episodes = 4                 # Number of episodes (equivalent to epochs)


#tf.reset_default_graph()
tf.compat.v1.reset_default_graph()


env_model = qnetwork.channel_env(NUM_CHANNELS)      # Initialize environment, network and batch memory

q_network = qnetwork.QNetwork(learning_rate=learning_rate, state_size=state_size,
                              action_size=NUM_CHANNELS, hidden_size=hidden_size,
                              name="ChannelQ_Network")

exp_memory = qnetwork.ExpMemory(in_size=memory_size)


history_input = deque(maxlen=state_size)            # (action, observation) history used as the state input


# Initialise the history with pretrain_length randomly chosen actions and their observations

for i in range(pretrain_length):
    action = np.random.choice(action_size)
    obs = data_in["channel" + str(action)][i]
    history_input.append(action)
    history_input.append(obs)
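
The loop above fills `history_input` with alternating actions and observations; the helper below is a sketch (an assumption, not taken from the original project) of how such a history can be flattened into the fixed-length state vector that a Q-network with input size `state_size` would consume.

import numpy as np

def history_to_state(history, state_size):
    """Flatten the most recent (action, observation) entries into a float32 vector,
    zero-padding on the left while the history is still shorter than state_size."""
    recent = list(history)[-state_size:]
    state = np.zeros(state_size, dtype=np.float32)
    state[state_size - len(recent):] = np.asarray(recent, dtype=np.float32)
    return state

# e.g. state_vec = history_to_state(history_input, state_size)
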
        def run(learning_rate, freeze_interval, num_hidden, reg):
            room_size = 5
            num_rooms = 2
            mdp = mdps.MazeMDP(room_size, num_rooms)
            mdp.compute_states()
            mdp.EXIT_REWARD = 1
            mdp.MOVE_REWARD = -0.01
            discount = 1
            num_actions = len(mdp.get_actions(None))
            batch_size = 100
            print('building network...')
            network = qnetwork.QNetwork(input_shape=2 * room_size +
                                        num_rooms**2,
                                        batch_size=batch_size,
                                        num_hidden_layers=2,
                                        num_actions=4,
                                        num_hidden=num_hidden,
                                        discount=discount,
                                        learning_rate=learning_rate,
                                        regularization=reg,
                                        update_rule='adam',
                                        freeze_interval=freeze_interval,
                                        rng=None)
            num_epochs = 50
            epoch_length = 2
            test_epoch_length = 0
            max_steps = 4 * (room_size * num_rooms)**2
            epsilon_decay = (num_epochs * epoch_length * max_steps) / 1.5
            print('building policy...')
            p = policy.EpsilonGreedy(num_actions, 0.5, 0.05, epsilon_decay)
            print('building memory...')
            rm = replay_memory.ReplayMemory(batch_size, capacity=50000)
            print('building logger...')
            log = logger.NeuralLogger(agent_name='QNetwork')
            print('building state adapter...')
            adapter = state_adapters.CoordinatesToRowColRoomAdapter(
                room_size=room_size, num_rooms=num_rooms)
            # adapter = state_adapters.CoordinatesToRowColAdapter(room_size=room_size, num_rooms=num_rooms)
            # adapter = state_adapters.CoordinatesToFlattenedGridAdapter(room_size=room_size, num_rooms=num_rooms)
            # adapter = state_adapters.IdentityAdapter(room_size=room_size, num_rooms=num_rooms)
            # adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=room_size)
            print('building agent...')
            a = agent.NeuralAgent(network=network,
                                  policy=p,
                                  replay_memory=rm,
                                  log=log,
                                  state_adapter=adapter)
            run_tests = False
            e = experiment.Experiment(mdp,
                                      a,
                                      num_epochs,
                                      epoch_length,
                                      test_epoch_length,
                                      max_steps,
                                      run_tests,
                                      value_logging=True)
            e.run()

            ak = file_utils.load_key('../access_key.key')
            sk = file_utils.load_key('../secret_key.key')
            bucket = 'hierarchical'
            try:
                aws_util = aws_s3_utility.S3Utility(ak, sk, bucket)
                aws_util.upload_directory(e.agent.logger.log_dir)
            except Exception as ex:  # avoid shadowing the Experiment bound to `e`
                print('error uploading to s3: {}'.format(ex))
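
Since run() takes (learning_rate, freeze_interval, num_hidden, reg), the enclosing script presumably sweeps over these hyperparameters; the nested loop below is only an illustrative usage sketch with made-up grid values, not the original experiment configuration.

for lr in (1e-3, 1e-4):
    for freeze_interval in (1000, 10000):
        for num_hidden in (32, 64):
            for reg in (0.0, 1e-4):
                run(lr, freeze_interval, num_hidden, reg)
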