def __init__(self, gamma, epsilon, lr, n_actions, input_dims, memory_size,
             batch_size, algo, env_name, checkpoint_dir, epsilon_min=0.01,
             epsilon_decay=5e-7, replace_target_count=1000):
    self.gamma = gamma
    self.epsilon = epsilon
    self.lr = lr
    self.n_actions = n_actions
    self.input_dims = input_dims
    self.memory_size = memory_size
    self.batch_size = batch_size
    self.algo = algo
    self.env_name = env_name
    self.epsilon_min = epsilon_min
    self.epsilon_decay = epsilon_decay
    self.replace_target_count = replace_target_count
    self.checkpoint_dir = checkpoint_dir
    self.action_space = [i for i in range(self.n_actions)]
    self.learn_step_counter = 0
    self.memory = ReplayMemory(memory_size, input_dims, n_actions)
    self.q_net = DeepQNetwork(self.lr, self.n_actions,
                              name=self.env_name + '_' + self.algo + '_q_net',
                              input_dims=self.input_dims,
                              checkpoint_dir=self.checkpoint_dir)
    self.target_net = DeepQNetwork(self.lr, self.n_actions,
                                   name=self.env_name + '_' + self.algo + '_target_net',
                                   input_dims=self.input_dims,
                                   checkpoint_dir=self.checkpoint_dir)
def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
             batch_size, eps_min=0.01, eps_dec=5e-7, replace=1000):
    self.gamma = gamma
    self.epsilon = epsilon
    self.lr = lr
    self.n_actions = n_actions
    self.input_dims = input_dims
    self.batch_size = batch_size
    self.eps_min = eps_min
    self.eps_dec = eps_dec
    self.replace_target_cnt = replace
    self.action_space = [i for i in range(n_actions)]
    self.learn_step_counter = 0
    self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
    self.q_eval = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims)
    self.q_next = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims)
def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
             batch_size, eps_min=0.01, eps_dec=5e-7, replace=1000,
             algo=None, env_name=None, chkpt_dir='tmp/dqn'):
    self.gamma = gamma
    self.epsilon = epsilon
    self.lr = lr
    self.n_actions = n_actions
    self.input_dims = input_dims
    self.batch_size = batch_size
    self.eps_min = eps_min
    self.eps_dec = eps_dec
    self.replace_target_cnt = replace
    self.algo = algo
    self.env_name = env_name
    self.chkpt_dir = chkpt_dir
    self.action_space = [i for i in range(n_actions)]
    self.learn_step_counter = 0
    self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
    self.q_eval = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims,
                               name=self.env_name + '_' + self.algo + '_q_eval',
                               chkpt_dir=self.chkpt_dir)
    self.q_next = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims,
                               name=self.env_name + '_' + self.algo + '_q_next',
                               chkpt_dir=self.chkpt_dir)
def __init__(self, *args, **kwargs):
    super(DQNAgent, self).__init__(*args, **kwargs)
    self.q_eval = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims,
                               name=self.env_name + '_' + self.algo + '_q_eval',
                               chkpt_dir=self.chkpt_dir)
    self.q_next = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims,
                               name=self.env_name + '_' + self.algo + '_q_next',
                               chkpt_dir=self.chkpt_dir)
def crossover(self, crossover_mode, selection_mode):
    print("Crossover")
    # 2. Fitness
    sum_fitnesses = np.sum(self.old_fitnesses)
    probs = [self.old_fitnesses[i] / sum_fitnesses for i in range(self.size)]

    # Sorting descending NNs according to their fitnesses
    # 3. Parents selection
    sort_indices = np.argsort(probs)[::-1]
    for i in range(self.size):
        if i < self.size * elitism_pct:
            # Keep the top performing models unchanged (elitism)
            model_c = self.old_models[sort_indices[i]]
        else:
            # ranking selection
            if selection_mode == "ranking":
                a = sort_indices[0]
                b = sort_indices[1]
            # sum_parent = self.old_fitnesses[a] + self.old_fitnesses[b]
            model_a, model_b = self.old_models[a], self.old_models[b]
            model_c = DeepQNetwork()

            conv_a = [model_a.conv1, model_a.conv2, model_a.conv3]
            conv_b = [model_b.conv1, model_b.conv2, model_b.conv3]
            conv_c = [model_c.conv1, model_c.conv2, model_c.conv3]

            c_i = 0
            for c in conv_c:
                # 4. Crossover
                for j in range(c[0].weight.size()[1]):
                    if crossover_mode == "mean":
                        # Child weights are the element-wise mean of both parents
                        for k in range(c[0].weight.size()[0]):
                            c[0].weight.data[k][j] = (conv_a[c_i][0].weight.data[k][j] +
                                                      conv_b[c_i][0].weight.data[k][j]) / 2
                    if crossover_mode == "two_point":
                        # Draw two integer cut points along the output-channel dimension
                        point_one = np.random.randint(0, c[0].weight.size()[0])
                        point_two = np.random.randint(0, c[0].weight.size()[0])
                        if point_one > point_two:
                            point_one, point_two = point_two, point_one
                        # Outer segments come from parent B, middle segment from parent A
                        c[0].weight.data[0:point_one, j] = conv_b[c_i][0].weight.data[0:point_one, j]
                        c[0].weight.data[point_one:point_two, j] = conv_a[c_i][0].weight.data[point_one:point_two, j]
                        c[0].weight.data[point_two:, j] = conv_b[c_i][0].weight.data[point_two:, j]
                c_i += 1
        self.models.append(model_c)
def run_dqn(config, gym_wrapper, summaries_collector_traj, summaries_collector):
    q_network = DeepQNetwork(config, gym_wrapper, trajectory=1)
    initial_time = round(time(), 3)
    q_network.train(summaries_collector)
    reward = q_network.test(summaries_collector, episodes=10, render=True)
    summaries_collector.read_summaries('test')
    total_time_traj = round(time(), 3) - initial_time
    print("tested avg reward: {0} in: {1}".format(reward, total_time_traj))
def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
             batch_size, chkpt_dir, eps_min=0.01, eps_dec=5e-7, replace=1000,
             algo=None, env_name=None):
    self.gamma = gamma                  # 0.99
    self.epsilon = epsilon              # 1.0
    self.lr = lr                        # 0.0001
    self.n_actions = n_actions          # 6
    self.input_dims = input_dims        # (4, 84, 84)
    self.batch_size = batch_size        # 32
    self.eps_min = eps_min              # 0.1
    self.eps_dec = eps_dec              # 1e-05
    self.replace_target_cnt = replace   # 1000
    self.algo = algo                    # 'DQNAgent'
    self.env_name = env_name            # 'PongNoFrameskip-v4'
    self.chkpt_dir = chkpt_dir          # .\\models\\
    self.action_space = [i for i in range(self.n_actions)]  # [0, 1, 2, 3, 4, 5]
    self.learn_step_counter = 0
    self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
    self.q_eval = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims,
                               name=self.env_name + '_' + self.algo + '_q_eval',
                               chkpt_dir=self.chkpt_dir)
    self.q_next = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims,
                               name=self.env_name + '_' + self.algo + '_q_next',
                               chkpt_dir=self.chkpt_dir)
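# Note: the agent constructors above only set up state; they are normally paired with
# an epsilon-greedy action selector and a linear epsilon decay. A minimal sketch of
# what such methods usually look like follows. It assumes PyTorch, the method names
# choose_action/decrement_epsilon, and that self.q_eval exposes a .device attribute;
# none of these details appear in the snippets above, so treat this as illustrative.
import numpy as np
import torch as T

def choose_action(self, observation):
    if np.random.random() > self.epsilon:
        # Exploit: act greedily with respect to the online network
        state = T.tensor(np.array([observation]), dtype=T.float).to(self.q_eval.device)
        actions = self.q_eval.forward(state)
        action = T.argmax(actions).item()
    else:
        # Explore: sample uniformly from the action space
        action = np.random.choice(self.action_space)
    return action

def decrement_epsilon(self):
    # Linear decay of epsilon down to eps_min, typically called once per learning step
    self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)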
def __init__(self, size=50, old_population=None, crossover_mode="mean",
             selection_mode="ranking"):
    self.size = size
    if old_population is None:
        self.models = [DeepQNetwork() for i in range(size)]
    else:
        # 1. Population
        self.old_models = old_population.models
        self.old_fitnesses = old_population.fitnesses
        self.models = []
        self.crossover_mode = crossover_mode
        self.selection_mode = selection_mode
        self.crossover(crossover_mode, selection_mode)
        self.mutate()
    self.fitnesses = np.zeros(self.size)
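# Note: the population constructor above calls self.mutate(), which is not shown in
# this section. A plausible, minimal sketch of weight-perturbation mutation is given
# below; the mutation_rate and noise_std arguments and the Gaussian-noise scheme are
# assumptions for illustration, not taken from the original implementation.
import torch

def mutate(self, mutation_rate=0.05, noise_std=0.1):
    # Perturb a random subset of each child network's weights with Gaussian noise
    for model in self.models:
        with torch.no_grad():
            for param in model.parameters():
                mask = (torch.rand_like(param) < mutation_rate).float()
                param.add_(mask * torch.randn_like(param) * noise_std)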
gamma = 0.999          # Discount factor used in the Bellman equation
eps_start = SaveLoadModule.get_epsilon_start_point()  # Starting value of epsilon
eps_end = 0.2          # Ending value of epsilon
eps_decay = 0.0001     # Decay rate we'll use to decay epsilon over time
target_update = 10     # How frequently, in episodes, we'll copy the policy network weights into the target network
memory_size = 300      # Capacity of the replay memory
lr = 0.001             # Learning rate
num_episodes = 500     # Number of episodes we want to play
last_training_episode = SaveLoadModule.get_most_advanced_episode()

environment_manager = EnvManager('SpaceInvaders-v0')
strategy = EpsilonGreedyStrategy(eps_start, eps_end, eps_decay)
agent = Agent(strategy, environment_manager.num_actions_available())
memory = ReplayMemory(memory_size)
policy_net = DeepQNetwork(
    input_shape=(environment_manager.get_input_shape(), ),
    action_space=environment_manager.num_actions_available(),
    batch_size=batch_size)
target_net = DeepQNetwork(
    input_shape=(environment_manager.get_input_shape(), ),
    action_space=environment_manager.num_actions_available(),
    batch_size=batch_size)

max_reward = 0

# Episode loop
for episode in range(last_training_episode, num_episodes):
    max_episode_reward = 0
    environment_manager.reset()
    state = environment_manager.get_state()
    environment_manager.done = False
    # Steps loop
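    # Note: the snippet above ends at the steps loop. A hedged sketch of what the inner
    # loop typically contains with these objects is shown below; the method names on
    # agent, memory and environment_manager (select_action, take_action, push) are
    # assumptions for illustration and may not match the original code.
    while not environment_manager.done:
        action = agent.select_action(state, policy_net)    # epsilon-greedy action
        reward = environment_manager.take_action(action)   # apply it to the emulator
        next_state = environment_manager.get_state()
        memory.push((state, action, reward, next_state, environment_manager.done))
        state = next_state
        max_episode_reward += reward
        # Once enough transitions are stored, a batch would be sampled and the policy
        # network fit against targets computed from target_net (omitted here).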
        # swap observation
        observation = observation_

        # break while loop when end of this episode
        if done:
            break
        step += 1

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = Maze(arg.mazeSize, arg.mazeSize)
    RL = DeepQNetwork(n_actions=len(env.action_space),
                      n_features=len(env.position),  # env.n_actions, env.n_features
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      hidden_layers=[10, 10],
                      replace_target_iter=200,
                      memory_size=2000,
                      # output_graph=True
                      )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
def runner(node_num):
    # Load checkpoint
    load_path = "weights/weights.ckpt"
    save_path = "weights/weights.ckpt"

    # set seed
    seed = 42
    np.random.seed(seed)
    random.seed(seed)

    # Generate graph for training...
    resources = 1
    # G, reward_save, num_nodes = generate_graph(nodes=node_num, type='gnp_adversarial')
    # G, reward_save, num_nodes = generate_graph(load_dir='../gml/ibm.gml', type='gml')
    G, reward_save, num_nodes = generate_graph(nodes=node_num, type='random_graph', seed=42)

    # Pick an arbitrary node to be the root
    root = 0

    # Try plotting. If on ssh, don't bother since there are some necessary plt.draw() commands
    # to plot a networkx graph.
    try:
        plot_graph(G, root, 'rl_graph.png')
    except:
        print('No display')

    # We may want to include the graph laplacian in the observation space
    # Graph laplacian is D - A
    # laplacian_matrix = nx.laplacian_matrix(G).toarray()
    # flat_laplacian = laplacian_matrix.flatten()

    # Build the learning environment
    env = environment(G, [root], resources)
    print('num_edges:', G.number_of_edges())
    print("Ratio Heuristic", ratio_heuristic(G, [root], resources), '\n')

    # Our observation space
    n_y = len(env.actions_permutations)

    # Initialize DQN
    DQN = DeepQNetwork(
        n_y=n_y,
        n_x=num_nodes,
        resources=resources,
        env=env,
        learning_rate=0.01,
        replace_target_iter=20,
        memory_size=20000,
        batch_size=256,
        reward_decay=0.6,
        epsilon_min=0.1,
        epsilon_greedy_decrement=5e-5,
        # load_path=load_path,
        # save_path=save_path,
        # laplacian=flat_laplacian,
        inner_act_func='leaky_relu',
        output_act_func='leaky_relu')

    episodes = 600
    rewards = []
    total_steps_counter = 0
    episodes_since_max = 0
    optimal_action_sequences = []
    overall_start = time.time()
    # DQN.epsilon = 0.5

    for episode in range(episodes):
        observation, done = env.reset()
        episode_reward = 0
        action_sequence = []
        start = time.time()
        train_time = 0

        while not done:
            # 1. Choose an action based on observation
            action = DQN.choose_action(observation)

            # check for random action
            if action == -1:
                # action = env.random_action()
                # now choose between truly random action and a ratio action
                r = random.random()
                if r < 0.6:
                    action = env.random_action()
                else:
                    action = env.ratio_action()

            # save the taken action
            action_sequence.append(action)
            # print('Chosen action', action)

            # 2. Take the chosen action in the environment
            observation_, reward, done = env.step(action, neg=False)
            # print(observation_, reward, done)

            # 3. Store transition
            DQN.store_transition(observation, action, reward, observation_)
            episode_reward += reward

            if total_steps_counter > 2000:
                # 4. Train
                s = time.time()
                DQN.learn()
                e = time.time()
                train_time += (e - s)

            if done:
                rewards.append(episode_reward)
                max_reward_so_far = np.amax(rewards)

                # if maximum reward so far, save the action sequence
                if episode_reward == max_reward_so_far:
                    optimal_action_sequences.append((action_sequence, episode_reward))
                    episodes_since_max = 0
                    # DQN.epsilon = 1

                print("==========================================")
                print("Episode: ", episode)
                print("Reward: ", round(episode_reward, 2))
                print("Epsilon: ", round(DQN.epsilon, 2))
                print("Max reward so far: ", max_reward_so_far)
                end = time.time()
                print('Episode time:', end - start)
                start = time.time()
                break

            # Save observation
            observation = observation_

            # Increase total steps
            total_steps_counter += 1

        # if episode == 700:
        #     DQN.epsilon_min = .1
        #     DQN.epsilon = 0.5

        episodes_since_max += 1
        print('train time across episode', train_time)

    overall_end = time.time()

    # TEST Q-Learning
    DQN.epsilon = 0
    DQN.epsilon_min = 0
    observation, done = env.reset()
    final_reward = 0
    action_sequence = []
    while not done:
        action = DQN.choose_action(observation)
        action_sequence.append(action)
        observation_, reward, done = env.step(action, neg=False)
        final_reward += reward
        if done:
            rewards.append(final_reward)
            max_reward_so_far = np.amax(rewards)
            # if maximum reward so far, save the action sequence
            if final_reward == max_reward_so_far:
                optimal_action_sequences.append((action_sequence, final_reward))
                episodes_since_max = 0
            break
        # Save observation
        observation = observation_

    print('final epsilon=0 reward', final_reward, '\n')

    # TESTING
    # convert our 'best' optimal action sequence to the vector representation, test it for correctness
    opt = optimal_action_sequences[len(optimal_action_sequences) - 1][0]
    reward = optimal_action_sequences[len(optimal_action_sequences) - 1][1]
    print()
    # print('RL action sequence:')
    env.reset()
    true_r = 0
    for action in opt:
        # print('action index', action)
        # debug will print the action at each step as a vector
        _, r, d = env.step(action, debug=True)
        true_r += r

    results = []
    # if we have a reasonable number of nodes (< 24), we can compute optimal using DP
    if num_nodes < 24:
        dp_time = time.time()
        results.append(DP_optimal(G, [root], resources))
        print('DP Opt: ', results[0])
        dp_time_end = time.time()
        results.append(dp_time_end - dp_time)
        print('DP time: ', results[1])
    else:
        results.append('n/a')
        results.append('n/a')

    print('\n Random Heuristic', random_heuristic(G, [root], resources), '\n')
    results.append(random_heuristic(G, [root], resources))

    # Only works on trees
    # print('\n Tree Heuristic:', simulate_tree_recovery(G, resources, root, clean=False), '\n')

    ratio_time_start = time.time()
    print('\n Ratio Heuristic', ratio_heuristic(G, [root], resources))
    ratio_time_end = time.time()
    print('Ratio time:', ratio_time_end - ratio_time_start)
    results.append(ratio_heuristic(G, [root], resources))
    results.append(ratio_time_end - ratio_time_start)

    print('\n reward during training:', reward)
    results.append(reward)
    print('RL method time (s): ', overall_end - overall_start, '\n')
    results.append(overall_end - overall_start)

    plot_bar_x(rewards, 'episode', 'reward_graph.png')
    with open(reward_save, 'w') as f:
        for item in rewards:
            f.write('%s\n' % item)

    return results
if __name__ == '__main__':
    LR = 2.5e-4
    HEIGHT = 84
    WIDTH = 84
    NUM_FRAMES = 4
    TEST_EPISODE = 5

    par_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    MODEL_DIR = os.path.join(par_dir, 'model')
    model_file = os.listdir(MODEL_DIR)[-1]  # most recently saved model

    env = make_atari('PongNoFrameskip-v4')
    env = wrap_deepmind(env, scale=False, frame_stack=True)
    num_actions = env.action_space.n

    dqn = DeepQNetwork(input_shape=(WIDTH, HEIGHT, NUM_FRAMES),
                       num_actions=num_actions,
                       name='dqn',
                       learning_rate=LR)
    dqn.load(MODEL_DIR, model_file)

    ep_reward = []
    for _ in range(TEST_EPISODE):
        frame = env.reset()       # LazyFrames
        state = np.array(frame)   # ndarray of shape (84, 84, 4)
        done = False
        cur_episode_reward = 0
        while not done:           # end the episode once done
            action = dqn.get_action(state / 255.0)
            env.render()
            next_frame, reward, done, _ = env.step(action)
            state = np.array(next_frame)
            cur_episode_reward += reward
            time.sleep(0.005)
test_dataset = MNIST('./data', train=False, transform=img_transform, download=True)
# print(len(test_dataset))
test_1, test_2 = random_split(test_dataset, [2000, 8000])
test_dataloader = DataLoader(
    test_1,
    # batch_size=BATCH_SIZE,
    shuffle=True)

# ===========================================================
cnn = CNN().cuda()
cnn.load_state_dict(
    torch.load("/home/user/liuhongxing/Mnist_RL/cnn_mnist.pth"))

Q = DeepQNetwork()
Q.load_state_dict(
    torch.load('/home/user/liuhongxing/Mnist_RL/Q_network_exploitation_600_act5.pth'))

optimizer = torch.optim.Adam(Q.parameters(), lr=1e-3)
criterion = nn.MSELoss()
threshold = 0.1
iterations = 70
gamma = 0.9

# writer = SummaryWriter('./log/Q_network_600_act5')
# # training and validation
# print('start Q_network_600_act5 training----------------------')
# for st, data in enumerate(train_dataloader):
#     imgs, labels = data
#     D, pre_label, _ = genenrate_D(imgs, labels)  # distances D between images and cluster centers
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)

    env = Tetris(width=opt.width, height=opt.height, block_size=opt.block_size)
    model = DeepQNetwork()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    criterion = nn.MSELoss()

    state = env.reset()
    if torch.cuda.is_available():
        model.cuda()
        state = state.cuda()

    replay_memory = deque(maxlen=opt.replay_memory_size)
    epoch = 0
    while epoch < opt.num_epochs:
        next_steps = env.get_next_states()

        # Exploration or exploitation
        epsilon = opt.final_epsilon + (max(opt.num_decay_epochs - epoch, 0) *
                                       (opt.initial_epsilon - opt.final_epsilon) / opt.num_decay_epochs)
        u = random()
        random_action = u <= epsilon

        next_actions, next_states = zip(*next_steps.items())
        next_states = torch.stack(next_states)
        if torch.cuda.is_available():
            next_states = next_states.cuda()

        model.eval()
        with torch.no_grad():
            predictions = model(next_states)[:, 0]
        model.train()

        if random_action:
            index = randint(0, len(next_steps) - 1)
        else:
            index = torch.argmax(predictions).item()

        next_state = next_states[index, :]
        action = next_actions[index]

        reward, done = env.step(action, render=True)
        if torch.cuda.is_available():
            next_state = next_state.cuda()
        replay_memory.append([state, reward, next_state, done])

        if done:
            final_score = env.score
            final_tetrominoes = env.tetrominoes
            final_cleared_lines = env.cleared_lines
            state = env.reset()
            if torch.cuda.is_available():
                state = state.cuda()
        else:
            state = next_state
            continue

        if len(replay_memory) < opt.replay_memory_size / 10:
            continue

        epoch += 1
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, reward_batch, next_state_batch, done_batch = zip(*batch)
        state_batch = torch.stack(tuple(state for state in state_batch))
        reward_batch = torch.from_numpy(np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.stack(tuple(state for state in next_state_batch))

        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()

        q_values = model(state_batch)
        model.eval()
        with torch.no_grad():
            next_prediction_batch = model(next_state_batch)
        model.train()

        y_batch = torch.cat(
            tuple(reward if done else reward + opt.gamma * prediction
                  for reward, done, prediction in
                  zip(reward_batch, done_batch, next_prediction_batch)))[:, None]

        optimizer.zero_grad()
        loss = criterion(q_values, y_batch)
        loss.backward()
        optimizer.step()

        print("Epoch: {}/{}, Action: {}, Score: {}, Tetrominoes {}, Cleared lines: {}".format(
            epoch, opt.num_epochs, action, final_score, final_tetrominoes, final_cleared_lines))
        writer.add_scalar('Train/Score', final_score, epoch - 1)
        writer.add_scalar('Train/Tetrominoes', final_tetrominoes, epoch - 1)
        writer.add_scalar('Train/Cleared lines', final_cleared_lines, epoch - 1)

        if epoch > 0 and epoch % opt.save_interval == 0:
            torch.save(model, "{}/tetris_{}".format(opt.saved_path, epoch))

    torch.save(model, "{}/tetris2".format(opt.saved_path))
from config_utils import read_main_config
from deep_q_network import DeepQNetwork
from gym_wrapper import GymWrapper
from tensorflow.python.framework.ops import disable_eager_execution

disable_eager_execution()

config = read_main_config()
gym_wrapper = GymWrapper(config['general']['scenario'])
deep_q_network = DeepQNetwork(config, gym_wrapper)
deep_q_network.train()
deep_q_network.test(episodes=3)
def train(opt):
    '''Training loop for the Tetris Deep Q-Network.'''
    if torch.cuda.is_available():
        # Seed CUDA for reproducible results
        torch.cuda.manual_seed(125)
    else:
        # Seed PyTorch's CPU random number generator
        torch.manual_seed(125)

    if os.path.isdir(opt.log_path):
        # Delete all content from the log_path directory
        shutil.rmtree(opt.log_path)
    # Create a fresh log directory
    os.makedirs(opt.log_path)
    # Summary writer logging to log_path
    new_writer2 = SummaryWriter(opt.log_path)

    # Tetris environment with the width, height and block size from the parser
    environment = Tetris(width=opt.width, height=opt.height, block_size=opt.block_size)
    # The Deep Q-Network model defined earlier
    deepQ_model = DeepQNetwork()
    # Adam optimizer over the model parameters with the parser's learning rate
    my_optim = torch.optim.Adam(deepQ_model.parameters(), lr=opt.lr)
    # Mean squared error loss, i.e. ((input - target) ** 2).mean()
    cn = nn.MSELoss()

    # Initial state from a freshly reset environment
    state = environment.reset()
    if torch.cuda.is_available():
        # Move the model and the state to the GPU
        deepQ_model.cuda()
        state = state.cuda()

    # Replay memory with the capacity given by the parser
    r_memory = deque(maxlen=opt.mem_size)
    epoch = 0

    # Video writer used to record the training run
    output_training_video = cv2.VideoWriter(
        opt.result, cv2.VideoWriter_fourcc(*'FMP4'), opt.fps,
        (int(1.5 * opt.width * opt.block_size), opt.height * opt.block_size))

    while epoch < opt.num_epochs:
        # All placements reachable from the current piece
        next_steps = environment.get_next_states()
        # Linearly annealed epsilon for exploration
        epsilon = opt.finalEpsilon + (max(opt.decay_epochs - epoch, 0) *
                                      (opt.initialEpsilon - opt.finalEpsilon) / opt.decay_epochs)
        pp = random()
        rand_action = pp <= epsilon  # explore if the random draw falls below epsilon

        # Unzip the (action, state) pairs of the possible next steps
        nextActions, next_states = zip(*next_steps.items())
        # Stack the candidate next states into a single tensor
        next_states = torch.stack(next_states)
        if torch.cuda.is_available():
            next_states = next_states.cuda()

        # Evaluation mode while scoring candidate states;
        # no_grad() disables autograd to save memory and time
        deepQ_model.eval()
        with torch.no_grad():
            dqm_p = deepQ_model(next_states)[:, 0]  # predicted value of each candidate state
        deepQ_model.train()

        if rand_action:
            # Explore: pick a random candidate placement
            idx = randint(0, len(next_steps) - 1)
        else:
            # Exploit: pick the candidate with the highest predicted value
            idx = torch.argmax(dqm_p).item()

        next_state = next_states[idx, :]
        action = nextActions[idx]

        # Apply the action; cv2_rend renders the environment for visualisation
        reward, done = environment.make_step(action, cv2_rend=True)
        if torch.cuda.is_available():
            next_state = next_state.cuda()

        # Store the transition in the replay memory
        r_memory.append([state, reward, next_state, done])

        if done:
            output_training_video.release()
            episode_durations.append(epoch + 1)
            # plot_durations()
            final_total_score = environment.player_score
            tot_reward.append(final_total_score)
            plot_reward()
            final_total_blocks = environment.tetris_blocks
            final_total_completed_lines = environment.completed_lines
            # Start a new episode
            state = environment.reset()
            if torch.cuda.is_available():
                state = state.cuda()
        else:
            state = next_state
            continue

        # Wait until the replay memory holds at least a tenth of its capacity
        if len(r_memory) < opt.mem_size / 10:
            continue

        epoch += 1
        # Sample a mini-batch from the replay memory
        batch = sample(r_memory, min(len(r_memory), opt.mini_batch_size))
        stateBatch, batchReward, nextB_state, completed_batch = zip(*batch)
        stateBatch = torch.stack(tuple(state for state in stateBatch))
        batchReward = torch.from_numpy(np.array(batchReward, dtype=np.float32)[:, None])
        nextB_state = torch.stack(tuple(state for state in nextB_state))
        if torch.cuda.is_available():
            stateBatch = stateBatch.cuda()
            batchReward = batchReward.cuda()
            nextB_state = nextB_state.cuda()

        # Q-values predicted for the sampled states
        q_values = deepQ_model(stateBatch)

        # Bootstrap targets from the next states, without tracking gradients
        deepQ_model.eval()
        with torch.no_grad():
            nextPred_batch = deepQ_model(nextB_state)
        deepQ_model.train()

        # Bellman targets: the reward alone for terminal transitions,
        # otherwise reward + gamma * predicted value of the next state
        batch_Y = torch.cat(
            tuple(reward if done else reward + opt.gamma * prediction
                  for reward, done, prediction in
                  zip(batchReward, completed_batch, nextPred_batch)))[:, None]

        # Gradient step on the MSE between predicted Q-values and targets
        my_optim.zero_grad()
        loss = cn(q_values, batch_Y)
        loss.backward()
        my_optim.step()

        print("Epoch Num: {}/{}, Action: {}, Score: {}, TPieces {}, Cleared lines: {}".format(
            epoch, opt.num_epochs, action, final_total_score,
            final_total_blocks, final_total_completed_lines))
        # TensorBoard scalars for score, pieces placed and cleared lines
        new_writer2.add_scalar('Train/Score', final_total_score, epoch - 1)
        new_writer2.add_scalar('Train/TPieces', final_total_blocks, epoch - 1)
        new_writer2.add_scalar('Train/Cleared lines', final_total_completed_lines, epoch - 1)

        if epoch > 0 and epoch % opt.store_interval == 0:
            # Periodically save the trained model to the saved_path folder
            torch.save(deepQ_model, "{}/tetris_{}".format(opt.saved_path, epoch))

    # Save the final trained model
    torch.save(deepQ_model, "{}/tetris".format(opt.saved_path))
            RL.save_experience(observation, action, reward, observation_)

            if (step > 200) and (step % 5 == 0):
                RL.learn()

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = Maze()
    RL = DeepQNetwork(
        env.n_actions,
        env.n_features,
        learning_rate=0.01,
        gamma=0.9,
    )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_loss()
def __init__(self, *args, **kwargs):
    super(DQNAgent, self).__init__(*args, **kwargs)
    self.q_eval = DeepQNetwork(self.n_actions, input_dims=self.input_dims,
                               name=Config.get_name('_q_eval'))
    self.q_next = DeepQNetwork(self.n_actions, input_dims=self.input_dims,
                               name=Config.get_name('_q_next'))
import gym
import numpy as np

env = gym.make('LunarLander-v2')
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

# Initialize DQN
DQN = DeepQNetwork(
    n_y=env.action_space.n,
    n_x=env.observation_space.shape[0],
    learning_rate=0.01,
    replace_target_iter=100,
    memory_size=500,
    batch_size=32,
    epsilon_max=0.9,
    epsilon_greedy_increment=0.001
)

RENDER_ENV = False
EPISODES = 500
rewards = []
RENDER_REWARD_MIN = 0
total_steps_counter = 0

for episode in range(EPISODES):
    observation = env.reset()
Tensorflow: 1.0
gym: 0.8.0
"""
import gym
from deep_q_network import DeepQNetwork

env = gym.make('MountainCar-v0')
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = DeepQNetwork(n_actions=3, n_features=2, learning_rate=0.001, gamma=0.9)

total_steps = 0

for i_episode in range(10):
    observation = env.reset()
    ep_r = 0
    while True:
        env.render()
        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        position, velocity = observation_
        # swap observation
        observation = observation_

        # break while loop when end of this episode
        if done:
            break
        step += 1

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = Maze(arg.mazeSize, arg.mazeSize)
    RL = DeepQNetwork(n_actions=len(env.action_space),
                      n_features=1,  # len(env.position) ==> when state = (x, y) representation
                      # env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      e_greedy_increment=0.01,
                      hidden_layers=[10, 10],
                      replace_target_iter=200,
                      memory_size=2000,
                      # output_graph=True
                      )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
# Reset environment
env.reset()

# Get screen size so that we can initialize layers correctly based on shape
# returned from AI gym. Typical dimensions at this point are close to 3x40x90
# which is the result of a clamped and down-scaled render buffer in get_screen()
init_screen = InputExtractor.get_screen(env=env, device=device)
_, _, screen_height, screen_width = init_screen.shape

# Get number of actions from gym action space
n_actions = env.action_space.n

# Only use defined parameters if there is no previous output being loaded
if RUN_TO_LOAD is not None:
    # Initialize and load policy net
    policy_net = DeepQNetwork(screen_height, screen_width, n_actions)
    policy_net.load_state_dict(NET_STATE_DICT)
    policy_net.to(device)
    policy_net.eval()
else:
    # Initialize policy net
    policy_net = DeepQNetwork(screen_height, screen_width, n_actions)
    policy_net.to(device)

# Copy target net from policy net
target_net = DeepQNetwork(screen_height, screen_width, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Only use defined parameters if there is no previous output being loaded
if RUN_TO_LOAD is not None: