def run_model(params):
    # https://stackoverflow.com/questions/11526975/set-random-seed-programwide-in-python
    # https://stackoverflow.com/questions/30517513/global-seed-for-multiple-numpy-imports
    random.seed(params.seed)
    np.random.seed(params.seed)
    # Must be called before Session
    # https://stackoverflow.com/questions/38469632/tensorflow-non-repeatable-results/40247201#40247201
    tf.set_random_seed(params.seed)

    qagent = QAgent(params)
    if params.is_train:
        qagent.fit()
    elif params.eval_mode == 0:
        qagent.evaluate_mine()
    elif params.eval_mode == 1:
        qagent.test_mine()
    elif params.eval_mode == 2:
        qagent.play_mine()
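A minimal sketch of how run_model above might be driven, assuming params is a plain namespace; the fields set here (seed, is_train, eval_mode) mirror the attributes the function reads, and the concrete values are illustrative only.

# Illustrative driver for run_model; the concrete values are assumptions.
from argparse import Namespace

if __name__ == '__main__':
    params = Namespace(seed=42, is_train=False, eval_mode=1)  # fields read by run_model
    run_model(params)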
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf

from agent import QAgent
from configs import object_seaquest_config
from util import get_log_dir

if __name__ == '__main__':
    config = object_seaquest_config
    log_dir = get_log_dir('log', config['game'] + '_' + str(config['double_q']))  # Name of logging directory
    agent = QAgent(config=config, log_dir=log_dir)

    saver = tf.train.Saver(max_to_keep=None)
    saver.restore(agent.session,
                  '%s/episode_%d.ckpt' % ("log/log/2017-12-09_23-40-34_SeaquestDeterministic-v4_True", 800))

    print('Validate....\n==============')
    scores = agent.validate_episode(epsilon=0, visualise=True)
import matplotlib.pyplot as plt
from util import get_log_dir
import tensorflow as tf
import numpy as np
from agent import QAgent
from configs import object_pong_config

config = object_pong_config
load_episode = 1000
epsilon = 0.05  # The epsilon for the strategy

# Build the graph on CPU to keep the GPU free for training....
log_dir = get_log_dir('log', config['game'] + '_' + str(config['double_q']))  # Name of logging directory
agent = QAgent(config=config, log_dir=log_dir)

# Restore the values....
tf.train.Saver().restore(agent.session, 'saves/trained_Q/1000/episode_%d.ckpt' % load_episode)

var = tf.trainable_variables()
model_vars = [v for v in var if v.name.startswith('Model')]
agent.session.run(tf.initialize_variables(model_vars))

'''
# Reset the model functions
agent.model_pro = config['model'](
    batch_size=config['batch_size'],
    state_shape=config['model_state_shape'] + [config['state_time']],
    output_state_shape=3,
from plot_util import init_figure, update_figure
import tensorflow as tf
import numpy as np
from agent import QAgent
from configs import pong_config, breakout_config

if __name__ == '__main__':
    config = pong_config
    config['state_memory'] = 1  # prevent allocating a huge chunk of memory

    load_episode = 3650
    epsilon = 0.05  # The epsilon for the strategy

    # Build the graph on CPU to keep the GPU free for training....
    with tf.device('/cpu:0'):
        agent = QAgent(config=config, log_dir=None)

    # Restore the values....
    tf.train.Saver().restore(
        agent.session, 'saves/%s/episode_%d.ckpt' % (config['game'], load_episode))

    mean = 0
    total = 0
    episode = 0
    while True:
        print("\n")
        # Initialise the episode
        state = agent.reset_to_zero_state()
        done = False
        total_reward = 0.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf

from agent import QAgent
from configs import pong_config, object_pong_config, breakout_config
from util import get_log_dir

if __name__ == '__main__':
    config = object_pong_config
    log_dir = get_log_dir('log', config['game'] + '_' + str(config['double_q']))  # Name of logging directory
    agent = QAgent(config=config, log_dir=log_dir)
    saver = tf.train.Saver(max_to_keep=None)

    reward_list = []
    for episode in range(config['episodes']):
        print('episode: %d, step: %d, eps: %.4f' % (episode, agent.steps, agent.epsilon))

        # Store the rewards...
        cur_trng_reward = agent.train_episode()
        agent._update_training_reward(cur_trng_reward)
        reward_list.append(cur_trng_reward)
        if episode > 10:
            del reward_list[0]
        avg_trng_reward = np.mean(reward_list)
import gym

from agent import QAgent

env = gym.make('CartPole-v0')
agent = QAgent(env)
agent.train()
t = agent.run()
print("Time", t)
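The CartPole script above relies on only three calls: QAgent(env), agent.train() and agent.run(). The stub below is a purely hypothetical sketch of that interface (the real QAgent is imported from agent.py and is not shown here); it documents the contract the script assumes, with run() returning how long one rollout lasted.

# Hypothetical stub documenting the interface the CartPole script assumes;
# it is NOT the real QAgent from agent.py.
import gym

class QAgentStub:
    def __init__(self, env):
        self.env = env  # Gym environment the agent acts in

    def train(self):
        # Learn a policy here (e.g. Q-learning on a discretised state space).
        pass

    def run(self):
        # Roll out one episode (random actions stand in for a learned policy)
        # and return its length in steps.
        obs = self.env.reset()
        done, steps = False, 0
        while not done:
            obs, reward, done, info = self.env.step(self.env.action_space.sample())
            steps += 1
        return steps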
DISCOUNT_FACTOR = 0.6
SAVE_MODEL_EVERY = 0

if __name__ == '__main__':
    # create env
    env = gym.make(ENV_NAME)
    print(env.unwrapped.spec.id)

    # create agent
    model = QTable(nostates=env.observation_space.n,
                   noactions=env.action_space.n,
                   learning_rate=LEARNING_RATE,
                   discount_factor=DISCOUNT_FACTOR)
    agent = QAgent(actions=env.action_space.n,
                   expl_max=EXPLORATION_MAX,
                   expl_min=EXPLORATION_MIN,
                   expl_decay=EXPLORATION_DECAY,
                   model=model)

    # get and parse user args
    args = Parser.parseargs(defaultTrainIterations=10000, defaultEvalIterations=10)
    if args.load:
        agent.load(env, args.loadversion)
    if args.train != 0:
        agent.train(env, iterations=args.train, train_s=1, save_i=SAVE_MODEL_EVERY)
    if args.eval != 0:
        print("Evaluation results (lower scores are better):")
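The QTable internals are not part of this excerpt, so the sketch below only shows the tabular Q-learning update such a table conventionally implements with the same constructor arguments; the class and method names here are assumptions, not the actual QTable used above.

# Rough sketch of a tabular Q-learning backend; names and methods are assumptions.
import numpy as np

class SimpleQTable:
    def __init__(self, nostates, noactions, learning_rate, discount_factor):
        self.q = np.zeros((nostates, noactions))
        self.lr = learning_rate
        self.gamma = discount_factor

    def update(self, state, action, reward, next_state, done):
        # Bellman target: r + gamma * max_a' Q(s', a'); no bootstrap on terminal states
        target = reward if done else reward + self.gamma * np.max(self.q[next_state])
        self.q[state, action] += self.lr * (target - self.q[state, action])

    def best_action(self, state):
        return int(np.argmax(self.q[state]))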
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf

from agent import QAgent
from configs import object_seaquest_config
from util import get_log_dir

if __name__ == '__main__':
    config = object_seaquest_config
    log_dir = get_log_dir('log', config['game'] + '_' + str(config['double_q']))  # Name of logging directory
    agent = QAgent(config=config, log_dir=log_dir)
    saver = tf.train.Saver(max_to_keep=None)

    reward_list = []
    for episode in range(config['episodes']):
        print('\nepisode: %d, step: %d, eps: %.4f\n\n---------------------'
              % (episode, agent.steps, agent.epsilon))

        # Store the rewards...
        cur_trng_reward = agent.train_episode()
        agent._update_training_reward(cur_trng_reward)
        reward_list.append(cur_trng_reward)
        if episode > 10:
            del reward_list[0]
        avg_trng_reward = np.mean(reward_list)
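The reward_list bookkeeping above (and in the near-identical Pong training loop earlier) keeps a moving window by deleting the oldest entry by hand. A small alternative sketch, assuming a strict 10-episode window is the intent, uses a bounded deque instead:

# Alternative rolling-average sketch; the 10-episode window size is an assumption.
from collections import deque
import numpy as np

reward_window = deque(maxlen=10)

def record_reward(r):
    reward_window.append(r)
    return float(np.mean(reward_window))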
                    action='store_true',
                    help='Use ICM module')
args = parser.parse_args()

env = gym.make(args.gym_env)

if type(env.action_space) == Discrete:
    if args.use_DQN:
        a = QAgent(epsilon_start=args.epsilon_start,
                   epsilon_end=args.epsilon_end,
                   epsilon_anneal=args.epsilon_anneal,
                   nb_actions=env.action_space.n,
                   learning_rate=args.learning_rate,
                   gamma=args.gamma,
                   batch_size=args.batch_size,
                   replay_memory_size=args.replay_memory_size,
                   hidden_size=args.hidden_size,
                   model_input_size=env.observation_space.shape[0],
                   use_PER=args.use_PER,
                   use_ICM=args.use_ICM)
        trainQ(a, env, args.MAX_NUMBER_OF_STEPS, args.EPISODES_TO_TRAIN,
               args.START_RENDERING, args.update_frequency)
    else:
        if not args.use_ICM:
            a = ActorCriticAgent(
                continuous=False,
                nb_actions=env.action_space.n,
                learning_rate=args.learning_rate,
                gamma=args.gamma,
                hidden_size=args.hidden_size,
class GameManager:
    def __init__(self):
        # display attributes
        self.clock = pygame.time.Clock()
        pygame.init()
        self.font = pygame.font.Font(None, 30)
        self.size = (1024, 768)
        self.size_vec = Vector2(1024, 768)
        self.screen = pygame.display.set_mode(self.size)
        self.colors = {'WHITE': (255, 255, 255), 'red': (255, 0, 0),
                       'blue': (0, 0, 255), 'black': (0, 0, 0)}

        # world attributes
        self.g = 980.0  # cm/sec^2

        # peg attributes
        self.peg = Vector2(512.0, 100.0)

        # ball attributes
        self.ball_length = 100.0

        # initial state: configuration
        self.ball_theta = m.pi / 2  # [0, 2*pi]
        self.ball_omega = 0.0
        self.ball_alpha = self.g / self.ball_length * m.sin(self.ball_theta)
        self.ball_theta_min = 10000
        self.ball_theta_max = -10000
        self.ball_omega_min = 10000
        self.ball_omega_max = -10000
        self.ball = Vector2(self.polar_cart())  # self.ball(x, y)

        self.player = QAgent(self.get_ranges())

    def polar_cart(self):
        x = int(self.peg.x - self.ball_length * m.sin(self.ball_theta))
        y = int(self.peg.y - self.ball_length * m.cos(self.ball_theta))
        return x, y

    def calculate_min_max(self):
        self.ball_theta_min = min(self.ball_theta_min, self.ball_theta)
        self.ball_theta_max = max(self.ball_theta_max, self.ball_theta)
        self.ball_omega_min = min(self.ball_omega_min, self.ball_omega)
        self.ball_omega_max = max(self.ball_omega_max, self.ball_omega)

    def draw_texts(self):
        scoretext = self.font.render(
            "Theta: %f[%f-%f], Omega: %f[%f-%f], Alpha: %f"
            % (self.ball_theta, self.ball_theta_min, self.ball_theta_max,
               self.ball_omega, self.ball_omega_min, self.ball_omega_max,
               self.ball_alpha),
            1, (255, 255, 255))
        self.screen.blit(scoretext, (0, 457))

    def draw(self):
        self.screen.fill(self.colors['black'])
        pygame.draw.circle(self.screen, self.colors['blue'],
                           (int(self.peg.x), int(self.peg.y)), 10)
        pygame.draw.circle(self.screen, self.colors['blue'], self.ball, 5)
        pygame.draw.line(self.screen, self.colors['blue'], self.peg, self.ball)
        self.draw_texts()

    def get_state(self):
        return [self.ball_theta, self.ball_omega]

    def get_ranges(self):
        # Returns the discretisation ranges for theta, omega and, last, the action
        return [np.arange(0, m.pi * 2, m.pi / 64),
                np.arange(-100, 100, 10),
                np.arange(-10 * self.g, 10 * self.g, 1)]

    def tangential_force(self, f):
        self.ball_alpha = f / self.ball_length
        return self.ball_alpha

    # All the physics code is handled here
    def update(self, action):
        # higher-order terms removed
        dt = 0.01
        x = np.array([[self.ball_theta], [self.ball_omega], [self.ball_alpha]])
        F = np.array([[1.0, dt, dt * dt / 2.0],
                      [0.0, 1.0, dt],
                      [0.0, 0.0, 1.0]])
        # print x.shape, F.shape
        y = np.dot(F, x)
        self.ball_theta = y[0][0] % (2 * m.pi)
        # Clamp |omega| below 10 while keeping its sign; copysign avoids the
        # zero-division the old y/abs(y) sign trick hit when omega == 0
        self.ball_omega = m.copysign(abs(y[1][0]) % 10, y[1][0])

        # Apply the tangential force, e.g. self.tangential_force(100)
        # action = 0
        self.tangential_force(self.g * m.sin(self.ball_theta) + action)

        self.ball = Vector2(self.polar_cart())
        self.calculate_min_max()

    def run(self):
        # Core algorithm, everything happens here
        prev_state = self.get_state()
        action = self.player.choose_action(prev_state)  # Decide best action according to the agent
        self.update(action)                             # Execute that action
        next_state = self.get_state()                   # Get next state
        reward = self.player.get_reward(prev_state, next_state, action)
        self.player.update_Qvalue(prev_state, action, next_state, reward)
        self.draw()
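To make the pendulum kinematics easy to check in isolation, here is a standalone sketch of the same constant-acceleration step that GameManager.update() applies; the starting values copy the class's initial state, and the printed result is what one step of the transition matrix produces.

# Standalone one-step check of the constant-acceleration update in GameManager.update().
import math as m
import numpy as np

def pendulum_step(theta, omega, alpha, dt=0.01):
    x = np.array([[theta], [omega], [alpha]])
    F = np.array([[1.0, dt, dt * dt / 2.0],
                  [0.0, 1.0, dt],
                  [0.0, 0.0, 1.0]])
    y = F.dot(x)
    return y[0][0] % (2 * m.pi), y[1][0], y[2][0]

# Initial state from GameManager: theta = pi/2, omega = 0, alpha = (g/L) * sin(theta)
g, L = 980.0, 100.0
theta, omega, alpha = m.pi / 2, 0.0, (g / L) * m.sin(m.pi / 2)
print(pendulum_step(theta, omega, alpha))  # theta ~ pi/2 + 0.00049, omega ~ 0.098, alpha unchanged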
from plot_util import init_figure, update_figure
import tensorflow as tf
import numpy as np
from agent import QAgent
from configs import pong_config, breakout_config

if __name__ == '__main__':
    config = pong_config
    config['state_memory'] = 1  # prevent allocating a huge chunk of memory

    # Build the graph on CPU to keep the GPU free for training....
    with tf.device('/cpu:0'):
        agent = QAgent(config=config, log_dir=None)

    rewards = []
    for idx in range(4, 9):
        load_episode = idx * config['episodes_save_interval']
        epsilon = 0.05  # The epsilon for the strategy

        # Restore the values....
        tf.train.Saver().restore(
            agent.session,
            'log/2017-10-28_20-06-09_PongDeterministic-v4_True/episode_%d.ckpt' % (load_episode))

        # Save validation reward to textfile
        cur_reward = agent.session.run(agent.validation_reward)
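The excerpt stops at computing cur_reward for each restored checkpoint; one possible way to persist and plot those per-checkpoint numbers is sketched below. The list of (load_episode, reward) pairs, the file names, and the use of bare matplotlib rather than the project's plot_util helpers are all assumptions.

# Possible persistence/plotting of per-checkpoint validation rewards; file names,
# data layout and use of bare matplotlib (rather than plot_util) are assumptions.
import numpy as np
import matplotlib.pyplot as plt

def save_and_plot(checkpoint_rewards, path='validation_rewards.txt'):
    data = np.array(checkpoint_rewards, dtype=float)  # rows of (load_episode, reward)
    np.savetxt(path, data, header='episode reward')
    plt.plot(data[:, 0], data[:, 1], marker='o')
    plt.xlabel('training episode checkpoint')
    plt.ylabel('validation reward')
    plt.savefig('validation_rewards.png')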