def __init__(self, map_env):
    # 6 actions: (left, right, stay_turn, speed_up, speed_down, stay_speed); each turn is 10 degrees
    self.num_action = 6
    self.map_env = map_env
    self.game = self.map_env.parent

    # Building the AI
    self.cnn = CNN(self.num_action)
    self.softmaxBody = SoftmaxBody(T=1.0)
    self.ai = AI(brain=self.cnn, body=self.softmaxBody)

    # Setting up Experience Replay
    self.n_steps = experience_replay.NStepProgress(env=self.game, ai=self.ai, n_step=10)
    self.mem = experience_replay.ReplayMemory(n_steps=self.n_steps, capacity=10000)

    # Moving-average recorder over the last 100 rewards
    self.ma = MA(100)

    # Training the AI
    self.epoch = 1
    self.loss = nn.MSELoss()
    self.ai.brain.load()
    self.pause = True
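For context, here is a minimal sketch of what the SoftmaxBody/AI pair wired up above might look like in PyTorch. The real classes may differ; this only illustrates the softmax-with-temperature action selection the constructor assumes.

import torch
import torch.nn as nn
import torch.nn.functional as F

class SoftmaxBody(nn.Module):
    def __init__(self, T=1.0):
        super().__init__()
        self.T = T  # temperature: higher T flattens the distribution (more exploration)

    def forward(self, outputs):
        probs = F.softmax(outputs / self.T, dim=1)  # scale Q-values, then normalize
        return probs.multinomial(num_samples=1)     # sample one action per row

class AI:
    def __init__(self, brain, body):
        self.brain = brain  # the CNN mapping images to Q-values
        self.body = body    # the softmax sampler turning Q-values into actions

    def __call__(self, inputs):
        with torch.no_grad():
            q_values = self.brain(torch.as_tensor(inputs, dtype=torch.float32))
            actions = self.body(q_values)
        return actions.numpy()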
def init_model(self, config: Config, env, checkpoint: AlienGymCheckpoint = None) -> AlienGymAI:
    image_size: ImageSize = ImageSize.from_str(config.image_size)

    cnn = CNN(env.action_space.n, image_w=image_size.w, image_h=image_size.h)
    cnn.to(self.device)
    if checkpoint is not None:
        cnn.load_state_dict(checkpoint.model_state_dict)
        cnn.eval()

    body = SoftmaxBody(config.softmax_temp)
    body.to(self.device)

    optimizer = optim.Adam(cnn.parameters(), lr=config.optimizer_lr)
    if checkpoint is not None:
        optimizer.load_state_dict(checkpoint.optimizer_state_dict)

    ai = AI(brain=cnn, body=body, device=self.device)
    n_steps = experience_replay.NStepProgress(env=env, ai=ai, n_step=config.n_step)
    memory = experience_replay.ReplayMemory(n_steps=n_steps, capacity=config.memory_capacity)

    return AlienGymAI(cnn=cnn, ai=ai, loss=nn.MSELoss(), optimizer=optimizer,
                      n_step=n_steps, replay_memory=memory)
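A hypothetical call site for init_model, resuming from a saved checkpoint. The Config fields follow the attributes accessed above, but the constructor signature, the checkpoint file layout, and the names trainer and "alien_gym.pt" are all assumptions for illustration, not the project's confirmed API.

import torch

config = Config(image_size="80x80", softmax_temp=1.0, optimizer_lr=1e-3,
                n_step=10, memory_capacity=10000)

checkpoint = None
try:
    state = torch.load("alien_gym.pt")  # hypothetical checkpoint file
    checkpoint = AlienGymCheckpoint(model_state_dict=state["model_state_dict"],
                                    optimizer_state_dict=state["optimizer_state_dict"])
except FileNotFoundError:
    pass  # no checkpoint yet: start training from scratch

alien_ai = trainer.init_model(config, env, checkpoint)  # trainer: hypothetical owner of init_model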
# Getting the number of actions from the Doom environment
number_actions = doom_env.action_space.n

# Building an AI
# Creating an object of our CNN class
cnn = CNN(number_actions)
# Creating an object of our SoftmaxBody class and setting the temperature
softmax_body = SoftmaxBody(T=1.0)
# Creating an object of our AI class, passing in the brain and body
ai = AI(cnn, softmax_body)

# Setting up Experience Replay
# 10-step learning with a capacity of 10,000
n_steps = experience_replay.NStepProgress(env=doom_env, ai=ai, n_step=10)
# Replay memory: draws mini-batches of 10-step series from the 10,000-capacity buffer
memory = experience_replay.ReplayMemory(n_steps=n_steps, capacity=10000)

# Implementing the Eligibility Trace
# n-step Q-learning (not asynchronous, because we only have one agent);
# since the target accumulates rewards along the trajectory actually taken,
# it is closely related to n-step SARSA.
# Training on batches
def eligibility_trace(batch):
    gamma = 0.99
    # Network inputs (predictions)
    inputs = []
    # Targets
    targets = []
    # Going through the batch, one n-step series at a time
    for series in batch:
        # Q-values for the first and last states of the series
        state_pair = torch.from_numpy(np.array([series[0].state, series[-1].state], dtype=np.float32))
        output = cnn(state_pair)
        # Bootstrap from the last state unless the series ended the episode
        cumul_reward = 0.0 if series[-1].done else output[1].data.max()
        # Accumulate discounted rewards backwards through the series
        for step in reversed(series[:-1]):
            cumul_reward = step.reward + gamma * cumul_reward
        state = series[0].state
        target = output[0].data
        target[series[0].action] = cumul_reward
        inputs.append(state)
        targets.append(target)
    return torch.from_numpy(np.array(inputs, dtype=np.float32)), torch.stack(targets)
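A minimal training loop built on the pieces above, as a sketch. It assumes the course-style experience_replay API where memory.run_steps() fills the buffer and memory.sample_batch() iterates over mini-batches of n-step series; those method names and the hyperparameters are assumptions.

import torch.nn as nn
import torch.optim as optim

loss = nn.MSELoss()
optimizer = optim.Adam(cnn.parameters(), lr=0.001)

nb_epochs = 100
for epoch in range(1, nb_epochs + 1):
    memory.run_steps(200)                   # assumed API: play 200 n-step series into memory
    for batch in memory.sample_batch(128):  # assumed API: mini-batches of 10-step series
        inputs, targets = eligibility_trace(batch)
        predictions = cnn(inputs)
        loss_error = loss(predictions, targets)
        optimizer.zero_grad()
        loss_error.backward()
        optimizer.step()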
"LEARN_FREQ" : 5, "TRAIN_EPISODE" : 2000 # 训练的总episode数 } if __name__ == "__main__": writer = SummaryWriter() env_name = "CartPole-v0" env = env.ContinuousCartPoleEnv() # env_name = "Pendulum-v0" # env = NormalizedActions(gym.make("Pendulum-v0")) # env_name = "Quadrotor" # env = make_env(env_name, task="hovering_control") logging.basicConfig(filename="{}.log".format(env_name)) # print("DQN trained on {}".format(env_name)) # logging.warning("DQN trained on {}".format(env_name)) # print(opt) # logging.warning(opt) act_dim = env.action_space.shape[0] obs_dim = env.observation_space.shape[0] rpm = experience_replay.ReplayMemory(opt["MEMORY_SIZE"]) agent = agent.DDPG_agent(obs_dim = obs_dim, act_dim = act_dim, actor_lr = opt["ACTOR_LR"], critic_lr = opt["CRITIC_LR"], tau = opt["TAU"], gamma = opt["GAMMA"]) # train(env, env_name, agent, opt["TRAIN_EPISODE"], rpm) agent.load("CartPole-v0.pth") evaluate(10, env, agent, render=True)
def __init__(self, available_actions_count, learning_rate=0.00025,
             discount_factor=0.99, epochs=20, hidden_nodes=4608,
             conv1_filters=32, conv2_filters=64, learning_steps_per_epoch=2000,
             replay_memory_size=10000, batch_size=64, test_episodes_per_epoch=2,
             frame_repeat=12, update_every=4, p_decay=0.95, e_start=1,
             reward_exploration=False, reward_shooting=False, resolution=(30, 45),
             sequence_length=10, observation_history=4, death_match=False,
             model_loadfile="/tmp/model.ckpt", model_savefile="/tmp/model.ckpt",
             start_from=0, save_model=True, load_model=False):
    self.learning_rate = learning_rate
    self.discount_factor = discount_factor
    self.epochs = epochs
    self.learning_steps_per_epoch = learning_steps_per_epoch
    self.replay_memory_size = replay_memory_size
    self.batch_size = batch_size
    self.test_episodes_per_epoch = test_episodes_per_epoch
    self.frame_repeat = frame_repeat
    self.p_decay = p_decay
    self.e_start = e_start
    self.resolution = resolution
    self.available_actions_count = available_actions_count
    self.model_savefile = model_savefile
    self.save_model = save_model
    self.load_model = load_model
    self.death_match = death_match
    self.reward_exploration = reward_exploration
    self.sequence_length = sequence_length
    self.observation_history = observation_history
    self.update_every = update_every
    self.start_from = start_from
    self.model_loadfile = model_loadfile
    self.reward_shooting = reward_shooting

    # Positions traversed during an episode
    self.positions = []

    # Create replay memory which will store the transitions
    print("Creating replay memory")
    self.memory = er.ReplayMemory(capacity=replay_memory_size, resolution=resolution)

    # Start TF session
    print("Starting session")
    self.session = tf.Session()

    print("Creating model")
    # Create the input variables
    s1_ = tf.placeholder(tf.float32, [None] + list(self.resolution) + [1], name="State")
    a_ = tf.placeholder(tf.int32, [None], name="Action")
    target_q_ = tf.placeholder(tf.float32, [None, available_actions_count], name="TargetQ")

    # Add 2 convolutional layers with ReLU activation
    conv1 = tf.contrib.layers.convolution2d(
        s1_, num_outputs=conv1_filters, kernel_size=[6, 6], stride=[3, 3],
        activation_fn=tf.nn.relu,
        weights_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
        biases_initializer=tf.constant_initializer(0.1))
    conv2 = tf.contrib.layers.convolution2d(
        conv1, num_outputs=conv2_filters, kernel_size=[3, 3], stride=[2, 2],
        activation_fn=tf.nn.relu,
        weights_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
        biases_initializer=tf.constant_initializer(0.1))
    conv2_flat = tf.contrib.layers.flatten(conv2)
    # conv2_flat = tf.contrib.layers.DropoutLayer(conv2_flat, keep=0.5, name='dropout')

    fc1 = tf.contrib.layers.fully_connected(
        conv2_flat, num_outputs=hidden_nodes, activation_fn=tf.nn.relu,
        weights_initializer=tf.contrib.layers.xavier_initializer(),
        biases_initializer=tf.constant_initializer(0.1))
    # fc1 = tf.contrib.layers.DropoutLayer(fc1, keep=0.5, name='dropout')
    # gru = tf.tensorlayer.RNNLayer(fc1, cell_fn=tf.nn.rnn_cell.GRUCell, n_hidden=128, n_steps=1, return_seq_2d=False)
    # gru = tf.contrib.layers.DropoutLayer(gru, keep=0.5, name='dropout')

    q = tf.contrib.layers.fully_connected(
        fc1, num_outputs=self.available_actions_count, activation_fn=None,
        weights_initializer=tf.contrib.layers.xavier_initializer(),
        biases_initializer=tf.constant_initializer(0.1))
    best_a = tf.argmax(q, 1)

    loss = tf.contrib.losses.mean_squared_error(q, target_q_)

    optimizer = tf.train.RMSPropOptimizer(self.learning_rate)
    # Update the parameters according to the computed gradient using RMSProp.
    train_step = optimizer.minimize(loss)

    def function_learn(s1, target_q):
        feed_dict = {s1_: s1, target_q_: target_q}
        l, _ = self.session.run([loss, train_step], feed_dict=feed_dict)
        return l

    def function_get_q_values(state):
        return self.session.run(q, feed_dict={s1_: state})

    def function_get_best_action(state):
        return self.session.run(best_a, feed_dict={s1_: state})

    def function_simple_get_best_action(state):
        return function_get_best_action(
            state.reshape([1, self.resolution[0], self.resolution[1], 1]))[0]

    self.fn_learn = function_learn
    self.fn_get_q_values = function_get_q_values
    self.fn_get_best_action = function_simple_get_best_action

    print("Model created")
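For reference, here is how these closures might combine into a single Q-learning update, following the standard VizDoom example pattern. It assumes memory.get_sample() returns batched s1, a, s2, isterminal, r arrays; that method name and the helper learn_from_memory are assumptions.

import numpy as np

def learn_from_memory(self):
    # Skip updates until the replay buffer has enough transitions
    if self.memory.size > self.batch_size:
        s1, a, s2, isterminal, r = self.memory.get_sample(self.batch_size)
        # max_a' Q(s2, a') from the current network
        q2 = np.max(self.fn_get_q_values(s2), axis=1)
        target_q = self.fn_get_q_values(s1)
        # TD target: r + gamma * max Q(s2) for non-terminal transitions only
        target_q[np.arange(target_q.shape[0]), a] = \
            r + self.discount_factor * (1 - isterminal) * q2
        self.fn_learn(s1, target_q)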