# Assumes the surrounding module provides GameEnv, DQNAgent and prog_bar
# (a tqdm-style progress bar, given the ascii/unit keyword arguments).
def main():
    env = GameEnv(show=False)
    agent = DQNAgent(env=env)
    no_episodes = 500

    for episode in prog_bar(range(no_episodes), ascii=False, unit="episodes"):
        # the state is a pair of values, reshaped into a (1, 2) batch
        state = env.reset().reshape(1, 2)
        while True:
            action = agent.predict_action(state)
            new_state, reward, done = env.action(action)
            new_state = new_state.reshape(1, 2)

            agent.remember(state, action, reward, new_state, done)
            agent.model_train()
            agent.target_train()

            state = new_state  # already reshaped above
            if done:
                if reward == 500:
                    # a reward of 500 marks a successfully completed episode
                    print(f"Completed in episode {episode}")
                    agent.save_model(f"final-{episode}.model")
                break
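
# Hypothetical entry point (not shown in the original file): start training when
# the script is executed directly.
if __name__ == "__main__":
    main()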
# Assumes the surrounding module provides GameEnv, DeepQNetwork, show_game,
# the TEST_Episodes / WRITE_VIDEO / SHOW_GAME settings, and the usual
# numpy / tensorflow / matplotlib imports (np, tf, plt, matplotlib.animation).
def run(arguments) -> None:
    # Create the env
    env = GameEnv()
    agent1 = DeepQNetwork.restore(arguments["TRAINED_MODEL_1"])
    agent2 = DeepQNetwork.restore(arguments["TRAINED_MODEL_2"])

    # Test the agents that were trained
    for e_test in range(TEST_Episodes):
        state = env.reset()
        state = np.reshape(state, [1, agent1.nS])
        tot_reward1 = 0
        tot_reward2 = 0

        if WRITE_VIDEO and e_test == 0:
            fig = plt.figure()
            frames = []

        for t_test in range(1000):
            if SHOW_GAME:
                show_game(env.render_env(), t_test, tot_reward1, tot_reward2)
            if WRITE_VIDEO and e_test == 0:
                temp = env.render_env()
                frames.append([
                    plt.text(0, -1, "Time: " + str(t_test), fontsize=8),
                    plt.text(7, -1, "Total reward - player 1: " + str(tot_reward1)
                             + ", player 2: " + str(tot_reward2), fontsize=8),
                    plt.imshow(temp, animated=True)
                ])

            agent1_action = agent1.test_action(state)
            agent2_action = agent2.test_action(state)
            reward1, reward2 = env.move(agent1_action, agent2_action)

            nstate = tf.reshape(env.contribute_metrix(), [-1])
            nstate = np.reshape(nstate, [1, agent1.nS])
            tot_reward1 += reward1
            tot_reward2 += reward2
            # DON'T STORE ANYTHING DURING TESTING
            state = nstate

            if t_test == 999:
                print("episode: {}/{}, scores: {}, {}".format(
                    e_test, TEST_Episodes, tot_reward1, tot_reward2))
                break

        if WRITE_VIDEO and e_test == 0:
            Writer = matplotlib.animation.writers['ffmpeg']
            writer = Writer(fps=15, metadata=dict(artist='Me'), bitrate=1800)
            ani = matplotlib.animation.ArtistAnimation(fig, frames, interval=20, blit=True)
            ani.save('movies/' + arguments["TRAINED_MODEL_1"].split('/')[-1]
                     + '_test_2players.mp4', writer=writer)
            print('Video saved.')
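
# Hypothetical entry point (the original argument parsing is not shown): build the
# `arguments` mapping that run() expects from two command-line paths.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Test two trained agents against each other.')
    parser.add_argument('TRAINED_MODEL_1', help='path to the first trained model')
    parser.add_argument('TRAINED_MODEL_2', help='path to the second trained model')
    run(vars(parser.parse_args()))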
from stable_baselines import DQN
import stable_baselines
from env import GameEnv
from stable_baselines.a2c.utils import conv, linear, conv_to_fc
from stable_baselines.deepq.policies import FeedForwardPolicy
import tensorflow as tf
import numpy as np


def modified_cnn(scaled_images, **kwargs):
    # three small-filter conv layers followed by a fully connected layer
    activ = tf.nn.relu
    layer_1 = activ(conv(scaled_images, 'c1', n_filters=64, filter_size=2, stride=1, **kwargs))
    layer_2 = activ(conv(layer_1, 'c2', n_filters=128, filter_size=2, stride=1, **kwargs))
    layer_3 = activ(conv(layer_2, 'c3', n_filters=256, filter_size=2, stride=1, **kwargs))
    layer_3 = conv_to_fc(layer_3)
    return activ(linear(layer_3, 'fc1', n_hidden=512, init_scale=np.sqrt(2)))


class CustomPolicy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomPolicy, self).__init__(*args, **kwargs,
                                           cnn_extractor=modified_cnn,
                                           feature_extraction="cnn")


# env = stable_baselines.common.vec_env.DummyVecEnv([lambda: GameEnv()])
env = GameEnv()
# .learn() returns the trained model, so it can be saved directly
model = DQN(CustomPolicy, env, verbose=1).learn(total_timesteps=int(6e7))
model.save('model')
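
# A minimal evaluation sketch (not part of the original script): reload the saved
# model and roll out one greedy episode. This assumes GameEnv follows the gym API
# (reset()/step()), which stable-baselines already requires for .learn() above.
loaded_model = DQN.load('model')
obs = env.reset()
for _ in range(1000):
    action, _states = loaded_model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    if done:
        break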
#!/usr/bin/env python3
# encoding=utf-8
import matplotlib.pyplot as plt
import matplotlib.animation
import numpy as np

from env import GameEnv

env = GameEnv()
env.reset()
temp = env.render_env()

i = 0
fig = plt.figure()
frames = []
while i < 100:
    temp = env.render_env()
    frames.append([plt.text(0, -1, "Time: " + str(i), fontsize=8),
                   plt.imshow(temp, animated=True)])
    # plt.imshow(temp)
    # plt.text(3, -1, f'reward 1: {r1}, reward 2: {r2}', fontsize=8)
    # plt.show(block=False)
    # plt.pause(0.01)
    action1 = np.random.randint(8)
    action2 = np.random.randint(8)
    r1, r2 = env.move(action1, action2)
    i += 1
    if r1 or r2:
        print(i, 'r1: ', r1, 'r2', r2)
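
# Sketch (not in the original script): assemble the collected frames into an
# animation once the random rollout finishes, mirroring the ffmpeg writer used
# in the two-player test script; the output path below is only an example.
ani = matplotlib.animation.ArtistAnimation(fig, frames, interval=20, blit=True)
plt.show()
# To write it to disk instead (requires ffmpeg):
# writer = matplotlib.animation.writers['ffmpeg'](fps=15, bitrate=1800)
# ani.save('movies/random_rollout.mp4', writer=writer)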
import numpy as np
import torch
import torch.optim as optim
import matplotlib
import matplotlib.pyplot as plt
import torch.nn.functional as F

from env import GameEnv
from rewards import Reward1
from trainers.dqn import DQNTrainer
from networks import DQN

# set up matplotlib for interactive / notebook display
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

env = GameEnv(reward_class=Reward1)


# define transformation function
# 4*4 matrix
def get_state(self: DQNTrainer):
    # log2-transform the board; empty cells (0) stay 0
    with np.errstate(divide='ignore'):
        board = np.where(self.env.board != 0, np.log2(self.env.board), 0)
    return torch.from_numpy(
        np.ascontiguousarray(board)).unsqueeze(0).float().to(self.device)


# 4*16 matrix
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.cem import CEMAgent
from rl.memory import EpisodeParameterMemory

from env import GameEnv

ENV_NAME = 'CartPole-v0'

# Get the environment and extract the number of actions.
env = GameEnv(shape=(5, 5))
nb_actions = env.action_space.n

# Option 2: deep network
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(256))
model.add(Activation('relu'))
model.add(Dense(256))
model.add(Activation('relu'))
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))
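
# Hedged completion (the original file stops after the model definition): wire the
# network into keras-rl's CEM agent following the standard keras-rl CEM example.
# The hyperparameters and the weights path are example values, not taken from this
# project, and GameEnv is assumed to expose the gym interface keras-rl expects.
memory = EpisodeParameterMemory(limit=1000, window_length=1)
cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory,
               batch_size=50, nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
cem.compile()

cem.fit(env, nb_steps=100000, visualize=False, verbose=2)
cem.save_weights('cem_gameenv_params.h5f', overwrite=True)
cem.test(env, nb_episodes=5, visualize=False)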
from env import GameEnv
import numpy as np

env = GameEnv(type="q_table")
done = False

no_episodes = 500000    # number of rounds to train the AI
learning_rate = 0.2     # between 0 and 1; higher values learn faster (the agent freezes here if set very high)
discount = 0.95         # higher values weight future rewards more than immediate rewards
epsilon = 0.9           # how much we want to explore the environment
epsilon_decay = 0.9998  # decrease exploration each step (epsilon is multiplied by this)
show_every = 100        # how often we want to see the progress visually

state = env.reset()

# initialise the Q-table with a random value for every (state, action) pair
q_table = {}
for a in range(10):
    for b in range(10):
        q_table[(a, b)] = [np.random.uniform(-5, 0) for i in range(4)]  # 4 actions per state

for episode in range(no_episodes):
    while True:
        if np.random.random() > epsilon:
            # exploit: rarely true at the start, increasingly common as epsilon decays
            action = np.argmax(q_table[state])
        else:
            # explore: pick one of the 4 actions at random
            action = np.random.randint(0, 4)
        new_state, reward, done = env.action(action)
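        # Hedged completion (the original file is truncated here): the standard
        # tabular Q-learning update, using the hyperparameters defined above.
        # It assumes env.action() returns the new state as an (x, y) tuple,
        # like env.reset() does, so it can index the Q-table.
        current_q = q_table[state][action]
        max_future_q = np.max(q_table[new_state])
        q_table[state][action] = (
            (1 - learning_rate) * current_q
            + learning_rate * (reward + discount * max_future_q)
        )
        state = new_state
        if done:
            break
    # decay exploration and start a new episode
    epsilon *= epsilon_decay
    state = env.reset()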