def train(self):
    self.reset()
    score_logger = ScoreLogger(self.env_name)
    for run in range(self.max_iter):
        state = self.env.reset(self.num)
        if self.log_state:
            print("STATE")
            print(state)
        state = np.reshape(state, [1, self.observation_space])
        step = 0
        while self.num < self.max_examples:
            step += 1
            # self.env.render()
            action = self.act(state)
            state_next, reward, terminal, info = self.env.step(action, self.num)
            if self.log_state:
                print(action, reward, terminal)
            # reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, self.observation_space])
            self.remember(state, action, reward, state_next, terminal)
            if self.num >= self.max_examples:
                terminal = True
            state = state_next
            if terminal:
                msg = "Run: {:d}, exploration: {:.5f}, score: {:d}, memory: {:d}, balance: {:.2f}"
                print(msg.format(run + 1, self.exploration_rate, step, len(self.memory), self.env.getBalance()))
                score_logger.add_score(step, run + 1)
                break
            self.experience_replay()
        if self.num >= self.max_examples:
            break

def cartpole():
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])  # reshape to row array
        step = 0
        while True:  # run the Deep-Q Net
            step += 1
            env.render()
            action = dqn_solver.act(state)  # make action according to e-greedy
            state_next, reward, terminal, info = env.step(action)  # observe SARS from env
            reward = reward if not terminal else -reward  # penalize the terminal transition
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)  # store transition
            state = state_next  # advance to next state
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()
        if run % SAVE_FREQUENCY == 0:
            save_model(dqn_solver.model, "saved_model")  # save the model parameters every once in a while

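# The loop above calls a save_model helper that is not shown in this section.
# A minimal sketch, assuming a compiled Keras model and that the second
# argument is a file prefix; the original project's format may differ:

def save_model(model, path):
    # Persist the Keras network to disk under the given prefix.
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    model.save(path + ".h5")
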
def cartpole(iteration=0, params=None):
    if params is None:
        params = DefaultParams()
    env = gym.make(params.ENV_NAME)
    score_logger = ScoreLogger(params.ENV_NAME, iteration, params.EXP_NAME, params.FIXED_NB_RUNS)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space, params)
    run = 0
    done = False
    while not done:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            # env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                done = score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()

def cartpole():
    env = gym.make(ENV_NAME)
    env.spec.reward_threshold = 1000
    env.spec.max_episode_steps = 1000
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    if LOAD and os.path.exists(LOAD_MODULE):
        dqn_solver.load_model()
    run = 0
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                if len(score_logger.scores) > 0 and step > max(score_logger.scores) > 450:
                    dqn_solver.save_model(step)
                score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()

def cartpole():
    env = Env('localhost:32822')
    env.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.box.shape[0]
    action_space = env.action_space.discrete.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    while True:
        run += 1
        state = env.reset()
        # print(state)
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            # env.render()
            print("acting on state: ", state)
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                plt.plot(dqn_solver.loss)
                plt.title('Model loss')
                plt.ylabel('Loss')
                plt.xlabel('Episode')
                plt.savefig("loss.png")
                break
            dqn_solver.experience_replay()

def cartpole():
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(action_space, is_partial_fit=False)
    run = 0
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            # comment next line for faster learning, without stopping to show the GUI
            env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()

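# The DQNSolver used above takes only the action space plus an is_partial_fit
# flag, which suggests an incremental scikit-learn regressor rather than a
# Keras network. A minimal sketch of what such a constructor might look like;
# the class name, MLPRegressor choice, and attribute names are assumptions,
# not the original implementation:

from collections import deque
from sklearn.neural_network import MLPRegressor

class DQNSolverSketch:
    def __init__(self, action_space, is_partial_fit=False):
        self.action_space = action_space
        # True: update incrementally with partial_fit on each replay batch;
        # False: refit the regressor from scratch on the sampled batch.
        self.is_partial_fit = is_partial_fit
        self.exploration_rate = EXPLORATION_MAX
        self.memory = deque(maxlen=MEMORY_SIZE)
        self.model = MLPRegressor(hidden_layer_sizes=(24, 24))
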
def cartpole():
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    # Restrict TensorFlow to a single GPU device on the DGX (use {'GPU': 0} for CPU-only)
    config = tf.ConfigProto(device_count={'GPU': 1})
    sess = tf.Session(config=config)
    set_session(sess)
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            # env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()

def cartpole():
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    # dqn_solver.load("./save/cartpole-dqn.h5")
    run = 0
    for e in range(EPISODES):
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            # env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()
        if step % 5 == 0:
            dqn_solver.save("./save/cartpole-dqn_step_{}.h5".format(step))

def cartpole():
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space, False)
    run = 0
    # while True:
    for i in range(200):
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            print(state_next)
            state_next = np.reshape(state_next, [1, observation_space])
            print(state_next)
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()
    dqn_solver.save_model_and_table()

def cartpole():
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            # env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
            if step % TRAINING_STEP == 1:
                dqn_solver.experience_replay()

def mountain_car():
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(action_space, is_partial_fit=True)
    run = 0
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        max_position = -1.2
        while True:
            step += 1
            # comment next line for faster learning, without stopping to show the GUI
            # env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            # if max_position < state_next[0]:
            #     reward = 0
            # if terminal and reward > 0:
            #     reward = 100
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            max_position = max(max_position, state[0][0])
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step) + ", max: " + str(max_position))
                score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()

def cartpole2():
    env = gym.make(ENV_NAME)
    env._max_episode_steps = 700
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    score = [[0, 0, 0]]
    agents_weights = []
    check = [50, 100, 200, 400, 500, 1000, 2000, 3000]
    while mean_score_check(np.mean(score, axis=1)) < 500:
        run += 1
        score_times = []
        for i in range(n_times):
            step = 0
            state = env.reset()
            state = np.reshape(state, [1, observation_space])
            done = False
            if i == 0:
                while not done:
                    step += 1
                    # env.render()
                    action = dqn_solver.act(state)
                    agents_weights.append(dqn_solver.model_q.get_weights())
                    state_next, reward, done, info = env.step(action)
                    reward = reward if not done else -reward
                    state_next = np.reshape(state_next, [1, observation_space])
                    dqn_solver.remember(state, action, reward, state_next, done)
                    state = state_next
                    dqn_solver.experience_replay()
            else:
                while not done:
                    step += 1
                    # env.render()
                    action = dqn_solver.act_eval(state)
                    # agents_weights.append(dqn_solver.model_q.get_weights())
                    state_next, reward, done, info = env.step(action)
                    reward = reward if not done else -reward
                    state_next = np.reshape(state_next, [1, observation_space])
                    state = state_next
            if done:
                score_times.append(step)
        print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(np.mean(score_times)))
        score_logger.add_score(int(np.mean(score_times)), run)
        score.append(score_times)
        # print(score_times)
        if run in check:
            np.save('weights', np.array(agents_weights))
            np.save('scores', np.array(score))
    np.save('weights', np.array(agents_weights))
    np.save('scores', np.array(score))

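# cartpole2() above relies on a mean_score_check helper and a module-level
# n_times that are not shown in this section. A minimal sketch, assuming the
# helper simply reports the rolling average of the most recent evaluation
# means; EVAL_WINDOW and its value are assumptions, not the original code:

EVAL_WINDOW = 10  # hypothetical window length

def mean_score_check(mean_scores):
    # Average the last few per-run means; the training loop above stops
    # once this rolling average reaches the 500-step target.
    recent = mean_scores[-EVAL_WINDOW:]
    return float(np.mean(recent))
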
def cartpole():
    # Creating the gym environment
    env = gym.make(ENV_NAME)
    env = gym.wrappers.Monitor(env, "./vid", video_callable=lambda episode_id: True, force=True)
    # Initializing the score logger to visualize the performance of the AI later on
    score_logger = ScoreLogger(ENV_NAME)
    """
    Observation:
        Type: Box(4)
        Num   Observation             Min      Max
        0     Cart Position           -4.8     4.8
        1     Cart Velocity           -Inf     Inf
        2     Pole Angle              -24 deg  24 deg
        3     Pole Velocity At Tip    -Inf     Inf
    Actions:
        Type: Discrete(2)
        Num   Action
        0     Push cart to the left
        1     Push cart to the right
    """
    # There are 4 observations in this environment, so the observation space is 4
    observation_space = env.observation_space.shape[0]
    # There are 2 possible actions, moving to the left and moving to the right, as seen above
    action_space = env.action_space.n
    # DQNSolver is the "AI": the agent that, given an observation, tries to pick the best action
    # Initializing the dqn_solver
    dqn_solver = DQNSolver(observation_space, action_space)
    # Run is a variable tracking how many runs there have been
    run = 0
    while True:
        run += 1
        # env.reset() returns the initial state as an np.ndarray of shape (4,) since there are 4 observations
        state = env.reset()
        # Reshaping the state into a 2d array of shape (1, 4)
        state = state.reshape(1, observation_space)
        step = 0
        while True:
            # Each step is a new action undertaken by the agent
            step += 1
            env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()
    env.close()

def cartpole(GAMMA, LEARNING_RATE, EXPLORATION_DECAY):
    # environment - cartpole
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    # observation space - possible state values
    observation_space = env.observation_space.shape[0]
    # action space - possible actions that can be performed
    action_space = env.action_space.n
    # agent - object of the DQNSolver class, see below
    dqn_solver = DQNSolver(observation_space, action_space, GAMMA, LEARNING_RATE, EXPLORATION_DECAY)
    run = 0
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            # visualize environment
            # env.render()
            # determine action
            action = dqn_solver.act(state)
            # determine new state and corresponding reward
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            # remember the transition - used in experience replay
            dqn_solver.remember(state, action, reward, state_next, terminal)
            # set the future state as the current state
            state = state_next
            if terminal:
                # add the score to the score logger once the pendulum falls
                score_logger.add_score(step, run)
                break
            # call experience replay to update the Q values
            dqn_solver.experience_replay(GAMMA, EXPLORATION_DECAY)
        # return the number of runs needed to solve
        if score_logger.solve_score != 0:
            return score_logger.solve_score

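# The experience_replay call above takes GAMMA and EXPLORATION_DECAY as
# arguments, but its body is not shown in this section. A minimal sketch of
# what such a method typically does, assuming a Keras model, a memory of
# (state, action, reward, state_next, terminal) tuples, and BATCH_SIZE /
# EXPLORATION_MIN constants like those defined later in this file; this is an
# illustration of the standard update, not the original implementation:

def experience_replay(self, gamma, exploration_decay):
    # Wait until enough transitions are stored to form a batch.
    if len(self.memory) < BATCH_SIZE:
        return
    idx = np.random.choice(len(self.memory), BATCH_SIZE, replace=False)
    batch = [self.memory[i] for i in idx]
    for state, action, reward, state_next, terminal in batch:
        # Bellman target: reward plus the discounted value of the best next action.
        q_update = reward
        if not terminal:
            q_update = reward + gamma * np.amax(self.model.predict(state_next)[0])
        q_values = self.model.predict(state)
        q_values[0][action] = q_update
        self.model.fit(state, q_values, verbose=0)
    # Decay epsilon so the agent gradually shifts from exploring to exploiting.
    self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate * exploration_decay)
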
def cartpole():
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    port = 12345
    # connection to hostname on the port
    s.connect(('192.168.43.110', port))
    print("Connected to environment")
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = 4
    action_space = 4
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    for _ in range(10):
        run += 1
        func = dict()
        func["function"] = "render"
        s.send(pickle.dumps(func))
        state = pickle.loads(s.recv(1024))["state"]
        step = 0
        while True:
            startTime = time.time()
            step += 1
            state = np.reshape(state, [1, observation_space])
            action = dqn_solver.act(state)
            # action = 0
            func["function"] = "step"
            func["action"] = action
            s.send(pickle.dumps(func))
            received = pickle.loads(s.recv(1024))
            state_next, reward, terminal = received["state"], received["reward"], received["terminal"]
            print(state, reward, terminal)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            print(state_next.shape)
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal or time.time() - startTime > 30:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()
    dqn_solver.save_model_and_table()

def cartpole():
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(action_space, is_partial_fit=True)
    run = 0
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        # run = 0
        while True:
            step += 1
            env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            # Gives a new shape to an array without changing its data.
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration_rate: " + str(dqn_solver.exploration_rate) + ", episodes: " + str(step))
                # score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()

def mountaincar():
    # initialize game and score logger
    env = gym.make(ENV_NAME)
    # tool created to display the 'score'
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    score = 0
    numTries = 200
    # keep training until the run limit is reached, refining the model each run
    while run < numTries:
        run += 1
        # a new frame
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        # keep stepping until you fall out of frame
        while True:
            # more steps = more successful (even if it is moving)
            env.render()
            # choose what to do using DQN
            action = dqn_solver.act(state)
            # analyze what happened
            state_next, reward, dead, info = env.step(action)
            # reinforce a positive outcome, penalize a bad outcome
            reward = get_reward(state_next)
            step += reward
            state_next = np.reshape(state_next, [1, observation_space])
            # memorize this iteration to shape the following ones
            dqn_solver.memorize(state, action, reward, state_next, dead)
            state = state_next
            if dead:
                # score = # of steps taken in a particular run (too many steps is bad)
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
            dqn_solver.replay()

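# mountaincar() above calls a get_reward(state_next) helper that is not shown
# here. A minimal reward-shaping sketch, assuming a MountainCar-style state
# whose first component is the cart position with the goal at position >= 0.5;
# the thresholds and values are assumptions, not the original author's shaping:

def get_reward(state_next):
    position = state_next[0]
    if position >= 0.5:
        # reached the flag
        return 10.0
    # reward progress up the right-hand hill instead of the environment's constant -1
    return position + 0.5
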
def cartpole():
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    # Size of observations
    observation_space = env.observation_space.shape[0]
    # Size of actions
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    game_nb = 0
    while True:
        game_nb += 1
        # Game creation/reset
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        finished = False
        # While the game has not ended
        while not finished:
            step += 1
            env.render()
            # Decide which move to make
            action = dqn_solver.act(state)
            # Perform the move
            state_next, reward, finished, info = env.step(action)
            state_next = np.reshape(state_next, [1, observation_space])
            # If the game has finished (we failed), invert the reward
            reward = reward if not finished else -reward
            # Save the move and its reward
            dqn_solver.remember(state, action, reward, state_next, finished)
            state = state_next
            # Learn from experience
            dqn_solver.experience_replay()
            if finished:
                # Save the result
                print("Game number: " + str(game_nb) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, game_nb)

def training():
    env = gym.make(ENV_NAME)
    # If the user chooses an environment with a non-discrete action space, raise an error,
    # because DQN only works with discrete action spaces
    if type(env.action_space) != gym.spaces.discrete.Discrete:
        raise ActionSpaceError(
            'This environment uses an action space that is not discrete. DQN can only be trained '
            'using discrete action spaces. Please select an environment with a discrete action space.'
        )
    act_space = env.action_space.n
    score_logger = ScoreLogger(ENV_NAME)
    observation_input = find_input_shape(env)
    dims = reshape_dims(env.observation_space)
    dqn_solver = DQNSolver(observation_input, act_space)
    for i in range(NUM_EPISODES):
        state = env.reset()
        # reshape the state array if it has more than one dimension
        if len(dims) > 1:
            state = state.reshape(dims)
        step = 0
        while True:
            step += 1
            if WATCH_TRAINING:
                env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            # reshape the next state array if it has more than one dimension
            if len(dims) > 1:
                state_next = state_next.reshape(dims)
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(i + 1) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, i + 1)
                break
            dqn_solver.experience_replay()
    return dqn_solver

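# training() above uses find_input_shape(env) and reshape_dims(...) helpers
# that are not shown in this section. A minimal sketch of what they might look
# like, assuming the intent is to flatten the observation and prepend a batch
# dimension of 1; the names' behaviour here is an assumption, not the original
# implementation:

def find_input_shape(env):
    # Total number of observation components, regardless of the space's shape.
    return int(np.prod(env.observation_space.shape))

def reshape_dims(observation_space):
    # Target shape used for both state and state_next: a single-row batch.
    return (1, int(np.prod(observation_space.shape)))
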
def main():
    # Import the environment
    env = gym.make(ENV_NAME)
    # Score logging
    score_logger = ScoreLogger(ENV_NAME)
    # Define the observation space and action space
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    # RL model
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.add_to_memory(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                result = "Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step)
                print(result)
                score_logger.add_score(step, run)
                break
            dqn_solver.experience()

def cartpole():
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(action_space, is_partial_fit=IS_PARTIAL_FIT)
    run = 0
    current_max = 0
    while run < NO_ITERATIONS:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        # run = 0
        while True:
            step += 1
            if RENDER:
                env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            if MY_LOGS:
                print(state_next, reward, terminal)
            reward = reward if not terminal else -reward
            # Gives a new shape to an array without changing its data.
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            current_max = max(current_max, step)
            if terminal:
                print("Run: " + str(run) + ", exploration_rate: " + str(dqn_solver.exploration_rate) +
                      ", episodes: " + str(step) + ", max episodes reached: " + str(current_max))
                if SCORE_LOGGER:
                    score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()

def main():
    n_episode = 50
    for method in ['DoubleDQN']:
        env = gym.make(ENV_NAME)
        agent = None
        if method == 'DDPG':
            agent = DDPG(env)
        elif method == 'QLearning':
            agent = QLearningTabular(env)
        elif method == 'DQN':
            agent = DQN(env)
        elif method == 'DoubleDQN':
            agent = DoubleDQN(env)
        else:
            raise NotImplementedError
        score_logger = ScoreLogger(ENV_NAME, method)
        print('Algorithm:', method)
        i_episode = 0
        while i_episode < n_episode:
            i_episode += 1
            agent.run_begin()
            state = env.reset()
            i_step = 0
            while True:
                i_step += 1
                # env.render()
                action = agent.act(state)
                state_next, reward, terminal, info = env.step(action)
                agent.remember(state, action, reward, state_next, terminal)
                agent.experience_replay()
                state = state_next
                if terminal:
                    agent.run_finish()
                    print(f"Run {i_episode} finished after {i_step + 1} steps")
                    score_logger.add_score(i_step, i_episode)
                    break
        env.close()

def cartpole():
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space
    action_space = env.action_space
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space.shape[0]])
        step = 0
        over2 = False
        over0 = False
        over3 = False
        while True:
            step += 1
            action = dqn_solver.act(state)
            env.render()
            state_next, reward, terminal, info = env.step(action)
            # reward = reward if not terminal else -20
            if state_next[0] > -0.2 and not over2:
                over2 = True
                reward = 1
            if state_next[0] > 0 and not over0:
                over0 = True
                reward = 1
            if state_next[0] > 0.2 and not over3:
                over3 = True
                reward = 1
            if state_next[0] >= 0.5:
                reward = 20
            # reward = -reward
            state_next = np.reshape(state_next, [1, observation_space.shape[0]])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            dqn_solver.experience_replay()
            dqn_solver.target_train()
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(reward))
                print("state: " + str(state_next[0, 0]))
                # score_logger.add_score(reward, run)
                break

def cartpole():
    env = gym.make("CartPole-v1")
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    dqn_solver.build_fresh()
    # dqn_solver.load_model()
    run = 0
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            if dqn_solver.train_mode:
                dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                is_solved = score_logger.add_score(step, run)
                if is_solved and dqn_solver.train_mode:
                    dqn_solver.save_model()
                    exit()
                break
            if dqn_solver.train_mode:
                dqn_solver.experience_replay()

def traffic():
    env = gym.make(ENV_NAME)
    # env = gym.wrappers.Monitor(env, "dqn")
    env.seed(1)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    while run < 1:
        run += 1
        step = 0
        state = env._reset()
        obs0, reward_previous, don, signal = state
        reward_current = 0
        total_reward = reward_previous - reward_current
        print("STATUS")
        print(signal)
        if signal == 0:
            status = 0
        elif signal == 1:
            status = 1
        next_state = obs0
        while step < 1000:
            # env.render()
            step += 1
            action = dqn_solver.act(state)
            # print(next_state)
            # action = env.action_space.sample()
            # obs1, reward_previous, done, _ = env.step(action)
            if status == 0 and action == 0:
                print("Status is: 0. Action is 0.")
                status = 0
                next_state, reward_current, done, _, t_step = phase(env, 0, 15)
                step += t_step
            elif status == 0 and action == 1:
                print("Status is 0. Action is now 1. Switching to Status 1.")
                phase(env, 2, 25)
                # print("Action is 1. Status is 0. Lights are H-Y, V-R -> H-R, V-G")
                status = 1
                next_state, reward_current, done, _, t_step = phase(env, 1, 45)
                step += t_step
            elif status == 1 and action == 1:
                print("Status is 1. Action is 1.")
                status = 1
                next_state, reward_current, done, _, t_step = phase(env, 1, 15)
                step += t_step
            elif status == 1 and action == 0:
                print("Status is 1. Action is now 0. Switching to Status 0.")
                phase(env, 4, 25)
                status = 0
                next_state, reward_current, done, _, t_step = phase(env, 0, 45)
                step += t_step
            total_reward = reward_previous - reward_current
            state_next = np.reshape(next_state, [1, OBS_SPACE])
            dqn_solver.remember(state, action, total_reward, state_next, done)
            state = state_next
            score_logger.add_score(step, run)
            print(step)
            if done:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                # score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()
        print("Episode done in %d steps, total reward %.2f" % (step, total_reward))
    env.close()

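# traffic() above drives the simulation through a phase(env, phase_id, duration)
# helper that is not included in this section. A minimal sketch under the
# assumption that the helper holds one traffic-light phase (passed to env.step)
# for a fixed number of simulator steps and reports how many steps elapsed;
# the real helper may behave differently:

def phase(env, phase_id, duration):
    obs, reward, done, info = None, 0, False, {}
    t_step = 0
    for _ in range(duration):
        obs, reward, done, info = env.step(phase_id)
        t_step += 1
        if done:
            break
    return obs, reward, done, info, t_step
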
def cartpole():
    # Creating the gym environment
    env = gym.make(ENV_NAME)
    # Saving recorded episodes to the ./every_100 folder in video format
    recorder = gym.wrappers.monitoring.video_recorder.VideoRecorder(env, base_path="./every_100")
    # Initializing the score logger to visualize the performance of the AI later on
    score_logger = ScoreLogger(ENV_NAME)
    """
    Observation:
        Type: Box(4)
        Num   Observation             Min      Max
        0     Cart Position           -4.8     4.8
        1     Cart Velocity           -Inf     Inf
        2     Pole Angle              -24 deg  24 deg
        3     Pole Velocity At Tip    -Inf     Inf
    Actions:
        Type: Discrete(2)
        Num   Action
        0     Push cart to the left
        1     Push cart to the right
    """
    # There are 4 observations in this environment, so the observation space is 4. These are game-specific inputs
    observation_space = env.observation_space.shape[0]
    # There are 2 possible actions, moving to the left and moving to the right, as seen above
    action_space = env.action_space.n
    # DQNSolver is the "AI": the agent that, given an observation, tries to determine the best action
    # Initializing the dqn_solver
    dqn_solver = DQNSolver(observation_space, action_space)
    # Run is a variable tracking how many runs there have been
    run = 0
    while True:
        run += 1
        # env.reset() returns the initial state as an np.ndarray of shape (4,) since there are 4 observations
        state = env.reset()
        # Reshaping the state into a 2d array of shape (1, 4)
        state = state.reshape(1, observation_space)
        step = 0
        while True:
            # Each step is a new action undertaken by the agent
            step += 1
            # Comment the block below if you don't want frames captured; it makes the computations quicker
            if run % RECORD_EVERY == 0:
                # env.render()
                recorder.capture_frame()
            # Return an action based on the state
            action = dqn_solver.act(state)
            # The environment returns information based on the action chosen by the dqn_solver
            state_next, reward, terminal, info = env.step(action)
            # Positive or negative reward
            reward = reward if not terminal else -reward
            # The next state
            state_next = state_next.reshape(1, observation_space)
            # This is how our AI has "memory"
            dqn_solver.remember(state, action, reward, state_next, terminal)
            # Update the state from past to present
            state = state_next
            # Terminal means the episode/run has finished
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()
    env.close()

def runCat3D():
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    '''
    method = 1: use DQNSolver
    method = 2: use cyclic_action, which makes the cat land on its feet
                (under the current initial conditions and parameters)
    method = 3: user inputs the actions
    '''
    method = 1
    if method == 1:
        print("Using DQNSolver.")
        dqn_solver = DQNSolver(observation_space, action_space)
        fileWeights = "weights3D_simpler.h5"
        # uncomment to start off with saved weights
        # dqn_solver.load_weights(fileWeights)
    elif method == 2:
        print("Using cyclic method.")
    else:
        print("User inputs.")
    run = 0
    average_score = 0
    while run < 100:  # True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            env.render()
            if method == 1:
                action = dqn_solver.act(state)
            elif method == 2:
                action = cyclic_action(step, env)
            else:
                action = input("Enter action")
                action = int(action)
            state_next, reward, terminal, info = env.step(action)
            reward = -reward
            # reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            if method == 1:
                dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                average_score += reward
                if method == 1:
                    print("Run: " + str(run) + ", exploration: " + str(round(dqn_solver.exploration_rate, 4)) + ", score: " + str(round(reward, 2)))
                else:
                    print("Run: " + str(run) + ", score: " + str(round(reward, 2)))
                # score_logger.add_score(int(reward), run)
                break
            if method == 1:
                dqn_solver.experience_replay()
                if run % 50 == 0:
                    dqn_solver.save_weights(fileWeights)
            step += 1
    print("Total runs: " + str(run) + ", average score: " + str(round(average_score / run, 2)))
    input("End. Press any key")
    env.close()

def connect4dqn():
    env = Connect4()
    score_logger = ScoreLogger('Connect4')
    player1won = 0
    player2won = 0
    observation_space = env.reset().shape
    action_space = env.validMoves().size
    # Assign GPUs on the DGX
    config = tf.ConfigProto(device_count={'GPU': 2})
    sess = tf.Session(config=config)
    set_session(sess)
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    # Reset moved one loop up; otherwise player two won't be able to start if player one wins
    state = env.reset()
    while True:
        run += 1
        if run % 50 == 0:
            print('Saving weights and starting evaluation...')
            dqn_solver.save()
            score, ties = evaluate_dqn(env, dqn_solver, 1000)
            score_logger.add_score(score + ties, run)  # logging ties as success
        step = 0
        while True:
            step += 1
            player = env.getNextPlayer()
            if player == 1:
                action_player1 = dqn_solver.act(state, env)
                state_next, reward_player1, terminal, info = env.makeMove(player, action_player1)
                state_copy = np.copy(state)
                state_next_copy = np.copy(state_next)
                if terminal:
                    # if player 1 wins, pop player 2's last move and give it a negative reward
                    dqn_solver.pop()
                    dqn_solver.remember(normalized_state, action_player2, reward_player1 * -1, normalized_state_next, terminal)
                dqn_solver.remember(state, action_player1, reward_player1, state_next, terminal)
                state = state_next
            else:
                normalized_state = np.roll(state, 1, axis=-1)
                action_player2 = dqn_solver.act(normalized_state, env)
                # userInput = int(input("Which row silly Human? "))
                # action_player2 = userInput
                state_next, reward_player2, terminal, info = env.makeMove(player, action_player2)
                normalized_state_next = np.roll(state_next, 1, axis=-1)
                if terminal:
                    # if player 2 wins, pop player 1's last move and give it a negative reward
                    dqn_solver.pop()
                    dqn_solver.remember(state_copy, action_player1, reward_player2 * -1, state_next_copy, terminal)
                dqn_solver.remember(normalized_state, action_player2, reward_player2, normalized_state_next, terminal)
                state = state_next
            if terminal:
                if player == 1:
                    player1won += 1
                else:
                    player2won += 1
                try:
                    winRatio = player1won / player2won
                except ZeroDivisionError:
                    winRatio = 0
                print('Win ratio: {}'.format(winRatio))
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", moves: " + str(step))
                break
        for i in range(20):
            dqn_solver.experience_replay()

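# connect4dqn() above evaluates the network every 50 runs through an
# evaluate_dqn(env, dqn_solver, n_games) helper that is not shown here.
# A minimal sketch under the assumption that evaluation pits the DQN
# (player 1) against a uniformly random opponent and returns the numbers
# of wins and ties; the original evaluation procedure may differ:

def evaluate_dqn(env, dqn_solver, n_games):
    wins, ties = 0, 0
    for _ in range(n_games):
        state = env.reset()
        while True:
            player = env.getNextPlayer()
            if player == 1:
                action = dqn_solver.act(state, env)  # assumed to act greedily at evaluation time
            else:
                action = int(np.random.choice(env.validMoves()))
            state, reward, terminal, info = env.makeMove(player, action)
            if terminal:
                if reward == 0:
                    ties += 1
                elif player == 1:
                    wins += 1
                break
    return wins, ties
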
def cartpole():
    env = UnityEnv(environment_filename=ENV_NAME, worker_id=2, use_visual=False, multiagent=True)
    score_logger = ScoreLogger(ENV_NAME)
    agents_brain = []
    agents_action = []
    index_list = []
    agents_alive = []
    count = 0
    count1 = 0
    num_agents = env.number_agents
    print("___________Number of agents in cartpole __")
    print(num_agents)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    print("__dqn solver______")
    print(dqn_solver)
    # model = tf.keras.models.load_model("")
    for x in range(env.number_agents):
        agents_brain.append(dqn_solver)
    print("______agent brains____")
    print(agents_brain)
    print("_Agent actions___")
    print(agents_action)
    learning_brain = copy.deepcopy(agents_brain)
    run = 0
    state = env.reset()
    initialstate = copy.deepcopy(state)
    while True:
        run += 1
        env.reset()
        print("____________STATE____________")
        print(state[0])
        state = copy.deepcopy(initialstate)
        agents_brain = []
        agents_action = []
        index_list = []
        agents_alive = []
        count = 0
        count1 = 0
        num_agents = int(state[0][-5])
        agents_brain = copy.deepcopy(learning_brain)
        print(learning_brain)
        print(agents_brain)
        print(state)
        # for x in range((env.number_agents - 1)):
        step = 0
        while True:
            step += 1
            env.render()
            print("___________State length_______")
            print(len(state))
            print("______selffish___")
            print(state[0])
            agents_action = [1] * len(state)
            copied_agents_alive = copy.deepcopy(agents_alive)
            print("__________num agents_____")
            for x in range(num_agents - 1):
                state[x] = np.reshape(state[x], [1, observation_space])
                agents_action[x] = agents_brain[x].act(state[x])
            print(agents_action)
            state_next, reward, terminal, info = env.step(agents_action, num_agents)
            print("_______Reward________")
            print(reward)
            print("_____________NEXT STATE LENGTH____________")
            print(len(state_next))
            if len(state_next) == 0:
                break
            agents_alive = state_next[0][-13:-5]
            num_agents = int(state_next[0][-5])
            print("_______num agents in cartpole________")
            print(num_agents)
            print("_____index list")
            print(index_list)
            print(agents_alive)
            agents_alive1 = np.delete(agents_alive, index_list)
            print("_______Alive agent list_______")
            print(agents_alive1)
            flag = False
            # del agents_alive[index_list[x]]
            for x in range(len(agents_alive)):
                if agents_alive[x] == float(1):
                    for y in range(len(index_list)):
                        if index_list[y] == x:
                            flag = True
                    if flag == False:
                        index_list.append(x)
                    flag = False
            index_to_remove = []
            for x in range(len(agents_alive1)):
                if agents_alive1[x] == float(1):
                    learning_brain[index_list[count]] = agents_brain[x]
                    index_to_remove.append(x)
                    count = count + 1
            agents_brain = [i for j, i in enumerate(agents_brain) if j not in index_to_remove]
            print("____________AGENTS_BRAIN_________")
            print(len(agents_brain))
            print("_______________Terminal_____________")
            print(terminal)
            if terminal[0] == True:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                for x in range(len(copied_agents_alive)):
                    learning_brain[x] = agents_brain[count1]
                    count1 = count1 + 1
                for x in range(len(learning_brain)):
                    learning_brain[x].save(str(run) + "brain" + str(x) + ".h5")
                break
            for x in range(num_agents - 1):
                state[x] = np.reshape(state[x], [1, observation_space])
                state_next[x] = np.reshape(state_next[x], [1, observation_space])
                agents_brain[x].remember(state[x], agents_action[x], reward[x], state_next[x], terminal[x])
                agents_brain[x].experience_replay()
            state = state_next

MEMORY_SIZE = 1000000
BATCH_SIZE = 32
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01  # 1% of the time the agent will explore
EXPLORATION_DECAY = 0.995
N_EPISODES = 1001

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1, 2"

# Set parameters
env = gym.make(ENV_NAME)
score_logger = ScoreLogger(ENV_NAME)
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
output_dir = 'model_output/cartpole'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)


class DQNNN(nn.Module):
    def __init__(self, state_size, action_size):
        super(DQNNN, self).__init__()
        self.dense1 = nn.Linear(state_size, 24)
        self.dense2 = nn.Linear(24, 24)
        self.output = nn.Linear(24, action_size)