def __init__(self, state_size, action_size, seed, device, params):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        device (torch.device): device on which to place the Q-networks
        params (dict): hyperparameters ('LR', 'BUFFER_SIZE', 'BATCH_SIZE', ...)
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.device = device
    self.params = params

    # Q-Network
    self.qnetwork_local = qn.QNetwork(state_size, action_size, seed).to(device)
    self.qnetwork_target = qn.QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=params['LR'])

    # Replay memory
    self.memory = rp.ReplayBuffer(action_size, params['BUFFER_SIZE'],
                                  params['BATCH_SIZE'], seed, device)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
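# Illustrative sketch only: an example of the hyperparameter dict the constructor
# above expects. The key names 'LR', 'BUFFER_SIZE', and 'BATCH_SIZE' are the ones
# the constructor reads, and 'UPDATE_EVERY' is referenced by the t_step comment;
# the numeric values here are assumptions for illustration, not tuned settings.
example_params = {
    'LR': 5e-4,               # Adam learning rate for the local Q-network
    'BUFFER_SIZE': int(1e5),  # replay buffer capacity
    'BATCH_SIZE': 64,         # minibatch size sampled from the replay buffer
    'UPDATE_EVERY': 4,        # learn every UPDATE_EVERY environment steps
}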
def test_loss_with_nonzero_reward_same_next_state_is_nonzero(self):
    input_shape = 2
    batch_size = 1
    num_actions = 4
    num_hidden = 10
    discount = 1
    learning_rate = 1e-2
    update_rule = 'sgd'
    freeze_interval = 1000
    regularization = 0
    rng = None
    num_hidden_layers = 1
    network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers,
                                num_actions, num_hidden, discount, learning_rate,
                                regularization, update_rule, freeze_interval, rng)

    # Zero out all weights so every Q-value is exactly 0.
    values = np.array(
        lasagne.layers.helper.get_all_param_values(network.l_out)) * 0
    lasagne.layers.helper.set_all_param_values(network.l_out, values)
    lasagne.layers.helper.set_all_param_values(network.next_l_out, values)

    states = np.ones((1, 2), dtype=float)
    actions = np.zeros((1, 1), dtype='int32')
    rewards = np.ones((1, 1), dtype='int32')
    next_states = np.ones((1, 2), dtype=float)
    terminals = np.zeros((1, 1), dtype='int32')

    loss = network.train(states, actions, rewards, next_states, terminals)

    # With all-zero weights the prediction is 0 and the target is
    # r + discount * max_a Q(s', a) = 1 + 0 = 1, so the loss is 0.5 * (1 - 0)^2 = 0.5.
    actual = loss
    expected = 0.5
    self.assertEqual(actual, expected)
def test_loss_with_zero_reward_same_next_state_is_zero(self):
    # The loss is still not exactly zero because the selected action might not be
    # the maximum-value action, so the target max_a Q(s', a) can exceed the
    # predicted Q(s, a_selected); the test therefore only bounds the loss.
    input_shape = 2
    batch_size = 1
    num_actions = 4
    num_hidden = 10
    discount = 1
    learning_rate = 1e-2
    update_rule = 'sgd'
    freeze_interval = 1000
    regularization = 0
    rng = None
    num_hidden_layers = 1
    network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers,
                                num_actions, num_hidden, discount, learning_rate,
                                regularization, update_rule, freeze_interval, rng)

    states = np.zeros((1, 2))
    actions = np.zeros((1, 1), dtype='int32')
    rewards = np.zeros((1, 1))
    next_states = np.zeros((1, 2))
    terminals = np.zeros((1, 1), dtype='int32')

    loss = network.train(states, actions, rewards, next_states, terminals)

    actual = loss
    expected = 2
    self.assertTrue(actual < expected)
def test_overfit_simple_artificial_dataset(self):
    input_shape = 1
    batch_size = 10
    num_actions = 2
    num_hidden = 2
    discount = 1
    learning_rate = 1
    update_rule = 'adam'
    freeze_interval = 100
    regularization = 0
    rng = None
    num_hidden_layers = 1
    network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers,
                                num_actions, num_hidden, discount, learning_rate,
                                regularization, update_rule, freeze_interval, rng)
    rm = replay_memory.ReplayMemory(batch_size)

    # state 0 to state 1, action 1, reward +1, terminal
    for idx in range(20):
        state = np.array([0])
        next_state = np.array([1])
        action = 1
        reward = 1
        terminal = 1
        rm.store((state, action, reward, next_state, terminal))

    # state 0 to state 0, action 0, reward -1, non-terminal
    for idx in range(20):
        state = np.array([0])
        next_state = np.array([0])
        action = 0
        reward = -1
        terminal = 0
        rm.store((state, action, reward, next_state, terminal))

    print(rm.terminal_count)
    print_data = False
    l = logger.Logger('test')

    # Train for a fixed number of updates (the bound is an arbitrary choice so
    # that the test terminates).
    for counter in range(1, 2001):
        states, actions, rewards, next_states, terminals = rm.sample_batch()
        loss = network.train(states, actions, rewards, next_states, terminals)
        l.log_loss(loss)
        if counter % 100 == 0:
            l.log_epoch(counter)
            Q = {}
            s0 = network.get_q_values(np.array([0]))
            Q['s0_a0'] = s0[0]
            Q['s0_a1'] = s0[1]
            s1 = network.get_q_values(np.array([1]))
            Q['s1_a0'] = s1[0]
            Q['s1_a1'] = s1[1]
            # For this dataset the Bellman fixed point (discount = 1) is
            # Q(s0, a1) = 1 (terminal transition, reward +1) and
            # Q(s0, a0) = -1 + max_a Q(s0, a) = 0, so Q['s0_a1'] should come to
            # dominate Q['s0_a0'] once the network has overfit the data.
def simulate():
    draw = False
    print('building network...')
    if draw:
        pltAcas = plot_acas_xu.Plot_ACAS_XU(state_generator.RMAX, ICON_FILE, 1)
    sg = state_generator.StateGenerator(state_generator.RMAX, state_generator.RMIN,
                                        state_generator.VMIN, state_generator.VMAX,
                                        K_SIZE)
    q = qnetwork.QNetwork(state_generator.NUM_INPUTS, replay_memory.BATCH_SIZE,
                          state_generator.NUM_ACTIONS, GAMMA, SOLVER)
    repMem = replay_memory.ReplayMemory()
    count = 0
    dt = state_generator.DT
    dti = state_generator.DTI
    state = sg.randomStateGenerator()
    i = 0
    print('starting training...')
    while True:
        # Collect TRAIN_FREQ samples, then run one training step on a batch.
        for j in range(TRAIN_FREQ):
            i += 1
            action = q.getAction(state)
            nextStates, rewards = sg.getNextState(state, action, dt, dti)
            stateNorm, nextStateNorm = sg.normState(state, nextStates)
            repMem.store((stateNorm, action, rewards, nextStateNorm))
            state = nextStates[0]
            count += 1
            if draw:
                pltAcas.updateState(state, action)
                pltAcas.draw()
                time.sleep(0.3)
            if sg.checkRange(state) or i > 100:
                i = 0
                state = sg.randomStateGenerator()
            if count % PRINT_FREQ == 0 and count >= replay_memory.INIT_SIZE:
                print("Samples: %d, Trainings: %d" % (
                    count, (count - replay_memory.INIT_SIZE) // TRAIN_FREQ),
                    "Loss: %.3e" % q.test(repMem.sample_batch()))
                sys.stdout.flush()
            elif count % 10000 == 0:
                print("Samples: %d" % count)
                sys.stdout.flush()
        q.train(repMem.sample_batch())
def test_qnetwork_constructor_adam(self):
    input_shape = 2
    batch_size = 100
    num_actions = 4
    num_hidden = 10
    discount = 1
    learning_rate = 1e-2
    update_rule = 'adam'
    freeze_interval = 1000
    regularization = 0
    rng = None
    num_hidden_layers = 1
    # Construction should complete without raising for the 'adam' update rule.
    network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers,
                                num_actions, num_hidden, discount, learning_rate,
                                regularization, update_rule, freeze_interval, rng)
def test_params_retrievable(self):
    input_shape = 2
    batch_size = 100
    num_actions = 4
    num_hidden = 10
    discount = 1
    learning_rate = 1e-2
    update_rule = 'sgd'
    freeze_interval = 1000
    regularization = 0
    rng = None
    num_hidden_layers = 1
    network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers,
                                num_actions, num_hidden, discount, learning_rate,
                                regularization, update_rule, freeze_interval, rng)

    params = network.get_params()
    self.assertTrue(params is not None)
def test_agent(self):
    room_size = 5
    mdp = mdps.MazeMDP(room_size, 1)
    mdp.compute_states()
    mdp.EXIT_REWARD = 1
    mdp.MOVE_REWARD = -0.1
    discount = mdp.get_discount()
    num_actions = len(mdp.get_actions(None))
    network = qnetwork.QNetwork(input_shape=2 * room_size, batch_size=1,
                                num_actions=4, num_hidden=10, discount=discount,
                                learning_rate=1e-3, update_rule='sgd',
                                freeze_interval=10000, rng=None)
    p = policy.EpsilonGreedy(num_actions, 0.5, 0.05, 10000)
    rm = replay_memory.ReplayMemory(1)
    log = logger.NeuralLogger(agent_name='QNetwork')
    adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(
        room_size=room_size)
    a = agent.NeuralAgent(network=network, policy=p, replay_memory=rm,
                          logger=log, state_adapter=adapter)

    num_epochs = 2
    epoch_length = 10
    test_epoch_length = 0
    max_steps = 10
    run_tests = False
    e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length,
                              max_steps, run_tests, value_logging=False)
    e.run()
def test_that_initial_values_are_all_similar(self):
    input_shape = 2
    batch_size = 100
    num_actions = 4
    num_hidden = 10
    discount = 1
    learning_rate = 1e-2
    update_rule = 'sgd'
    freeze_interval = 1000
    regularization = 0
    rng = None
    num_hidden_layers = 1
    network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers,
                                num_actions, num_hidden, discount, learning_rate,
                                regularization, update_rule, freeze_interval, rng)

    states = [[1, 1], [-1, -1], [-1, 1], [1, -1]]
    for state in states:
        q_values = network.get_q_values(state)
        self.assertTrue(max(abs(q_values)) < 2)
def test_that_q_values_are_retrievable(self):
    input_shape = 2
    batch_size = 100
    num_actions = 4
    num_hidden = 10
    discount = 1
    learning_rate = 1e-2
    update_rule = 'sgd'
    freeze_interval = 1000
    regularization = 0
    rng = None
    num_hidden_layers = 1
    network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers,
                                num_actions, num_hidden, discount, learning_rate,
                                regularization, update_rule, freeze_interval, rng)

    state = np.array([1, 1])
    q_values = network.get_q_values(state)
    actual = np.shape(q_values)
    expected = (num_actions,)
    self.assertEqual(actual, expected)
mines = 5

# Training parameters
batch_size = 100         # How many experiences to use for each training step.
update_freq = 5          # How often to perform a training step.
y = .9                   # Discount factor on the target Q-values.
num_episodes = 100000    # How many episodes of the game environment to train the network on.
pre_train_steps = 20000  # How many steps of random actions before training begins.
max_epLength = 50        # The max allowed length of an episode.
load_model = False       # Whether to load a saved model.
path = "../dqn"          # The path to save the model to.
tau = 0.0001             # Rate at which the target network is updated toward the primary network.

# Start training
tf.reset_default_graph()
mainQN = qnetwork.QNetwork(field_size, num_actions)
targetQN = qnetwork.QNetwork(field_size, num_actions)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
trainables = tf.trainable_variables()
targetOps = qnetwork.update_target_graph(trainables, tau)
myBuffer = qnetwork.ExperienceBuffer(20000)

# Set the rate of random actions
e = 0.005

# Create lists to contain total rewards and steps per episode
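# Illustrative sketch (not code from this project's qnetwork module) of the soft
# target-network update that the tau parameter above controls: each target weight
# is moved a small fraction tau toward its primary-network counterpart,
# theta_target <- tau * theta_primary + (1 - tau) * theta_target.
# Plain NumPy arrays stand in for the TensorFlow variables here.
import numpy as np

def soft_update(primary_weights, target_weights, tau):
    """Blend each target weight toward its primary counterpart by a factor tau."""
    return [tau * p + (1.0 - tau) * t
            for p, t in zip(primary_weights, target_weights)]

# Example: with tau = 0.0001 the target network changes very slowly,
# which keeps the Q-learning targets stable.
primary = [np.ones((2, 2)), np.zeros(3)]
target = [np.zeros((2, 2)), np.ones(3)]
target = soft_update(primary, target, tau=0.0001)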
action_size = 16      # Action set size
state_size = 32       # State size: (a_t-1, o_t-1, ..., a_t-M, o_t-M)
learning_rate = 1e-2  # Learning rate
gamma = 0.9           # Discount factor
hidden_size = 50      # Hidden size (put 200 for the perfectly correlated case)
pretrain_length = 16  # Length of the known pretraining sequence
n_episodes = 4        # Number of episodes (equivalent to epochs)

# tf.reset_default_graph()
tf.compat.v1.reset_default_graph()

# Initialize environment, network, and batch memory
env_model = qnetwork.channel_env(NUM_CHANNELS)
q_network = qnetwork.QNetwork(learning_rate=learning_rate, state_size=state_size,
                              action_size=NUM_CHANNELS, hidden_size=hidden_size,
                              name="ChannelQ_Network")
exp_memory = qnetwork.ExpMemory(in_size=memory_size)
history_input = deque(maxlen=state_size)  # Input as states

# Initialize the state with 16 randomly chosen (action, observation) pairs
for i in range(pretrain_length):
    action = np.random.choice(action_size)
    obs = data_in["channel" + str(action)][i]
    history_input.append(action)
    history_input.append(obs)
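# Illustrative sketch (an assumption, not code from this project) of how the
# history_input deque of alternating actions and observations can be flattened
# into the fixed-length state vector the Q-network consumes: with maxlen=state_size,
# the deque always holds the most recent state_size / 2 (action, observation)
# pairs, which are stacked into a (1, state_size) array.
import numpy as np
from collections import deque

state_size = 32
history = deque(maxlen=state_size)
for t in range(state_size // 2):
    history.append(t % 16)  # action taken at step t (stand-in value)
    history.append(1.0)     # observation received at step t (stand-in value)

state = np.array(history, dtype=np.float32).reshape(1, state_size)
assert state.shape == (1, state_size)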
def run(learning_rate, freeze_interval, num_hidden, reg):
    room_size = 5
    num_rooms = 2
    mdp = mdps.MazeMDP(room_size, num_rooms)
    mdp.compute_states()
    mdp.EXIT_REWARD = 1
    mdp.MOVE_REWARD = -0.01
    discount = 1
    num_actions = len(mdp.get_actions(None))
    batch_size = 100

    print('building network...')
    network = qnetwork.QNetwork(input_shape=2 * room_size + num_rooms**2,
                                batch_size=batch_size, num_hidden_layers=2,
                                num_actions=4, num_hidden=num_hidden,
                                discount=discount, learning_rate=learning_rate,
                                regularization=reg, update_rule='adam',
                                freeze_interval=freeze_interval, rng=None)

    num_epochs = 50
    epoch_length = 2
    test_epoch_length = 0
    max_steps = 4 * (room_size * num_rooms)**2
    epsilon_decay = (num_epochs * epoch_length * max_steps) / 1.5

    print('building policy...')
    p = policy.EpsilonGreedy(num_actions, 0.5, 0.05, epsilon_decay)
    print('building memory...')
    rm = replay_memory.ReplayMemory(batch_size, capacity=50000)
    print('building logger...')
    log = logger.NeuralLogger(agent_name='QNetwork')
    print('building state adapter...')
    adapter = state_adapters.CoordinatesToRowColRoomAdapter(
        room_size=room_size, num_rooms=num_rooms)
    # adapter = state_adapters.CoordinatesToRowColAdapter(room_size=room_size, num_rooms=num_rooms)
    # adapter = state_adapters.CoordinatesToFlattenedGridAdapter(room_size=room_size, num_rooms=num_rooms)
    # adapter = state_adapters.IdentityAdapter(room_size=room_size, num_rooms=num_rooms)
    # adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=room_size)

    print('building agent...')
    a = agent.NeuralAgent(network=network, policy=p, replay_memory=rm,
                          log=log, state_adapter=adapter)
    run_tests = False
    e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length,
                              max_steps, run_tests, value_logging=True)
    e.run()

    # Upload the experiment logs to S3.
    ak = file_utils.load_key('../access_key.key')
    sk = file_utils.load_key('../secret_key.key')
    bucket = 'hierarchical'
    try:
        aws_util = aws_s3_utility.S3Utility(ak, sk, bucket)
        aws_util.upload_directory(e.agent.logger.log_dir)
    except Exception as ex:
        print('error uploading to s3: {}'.format(ex))
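# Illustrative sketch of the exploration schedule implied by
# policy.EpsilonGreedy(num_actions, 0.5, 0.05, epsilon_decay) above, under the
# assumption (not verified against the policy module) that epsilon anneals
# linearly from 0.5 down to 0.05 over epsilon_decay steps and then stays at 0.05.
def linear_epsilon(step, start=0.5, end=0.05, decay_steps=1000):
    """Linearly anneal epsilon from `start` to `end` over `decay_steps` steps."""
    if step >= decay_steps:
        return end
    return start + (end - start) * (step / float(decay_steps))

# Example: halfway through the schedule epsilon is 0.275.
print(linear_epsilon(500, decay_steps=1000))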