Example #1
File: main.py Project: magnord/dqn
def main(_):
    pp.pprint(flags.FLAGS.__flags)

    game = Game()

    with tf.Session() as sess:
        with tf.device('/cpu:0'):
            dqn = DQN(sess, game, flags.FLAGS)
            dqn.train()
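# Note: main() above assumes a module-level pretty-printer (pp) and TensorFlow flag
# definitions.  A minimal sketch of that setup; the flag names, defaults, and import
# paths below are illustrative assumptions, not the project's actual code.
import pprint
import tensorflow as tf

from game import Game   # assumed location of the Game class
from dqn import DQN     # assumed location of the DQN class

pp = pprint.PrettyPrinter()
flags = tf.app.flags
flags.DEFINE_float("learning_rate", 0.001, "Optimizer learning rate")  # hypothetical flag
flags.DEFINE_integer("batch_size", 32, "Mini-batch size")               # hypothetical flag

if __name__ == "__main__":
    tf.app.run(main)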
def eval_game(game: Game, dqn: DQN, action, q_vals, queue, root_index, root=True):
    """
    Called by look_ahead function. Used to evaluate a state, update Q value,
    enumerate and enqueue possible child actions.
    By default, the actions passed in are treated as root actions.
    Args:
        game, A Game object to be evaluated
        dqn, A deep Q learning network object to evaluate state
        action, A tuple representing an action and its optional target
        q_vals, A shared-memory array for the global Q values
        queue, A Queue to store child actions
        root_index, The index of the root action in q_vals
        root(=True), Whether or not these are the root actions
    Returns:
        None
    """

    #   (local) copy game object, perform action, get state feature vector, evaluate
    perform_action(action, game.current_player, game)
    state = get_state(game)
    
    # Pass to TensorFlow to evaluate
    s_val = dqn.get_q_value(state, "dqn")

    print("Action:", action)
    print("Q value: %f" % s_val)

    """
def tf_worker(dqn: DQN, s_queue):
    """
    A single process that removes evaluation tasks from a queue.
    It constructs the NN in TensorFlow and uses it to evaluate board states sent to it.
    Args:
        dqn, an uninitialized TensorFlow object representing the DQN
        s_queue, the queue of (index, state) evaluation tasks.
    Assumes:
        q_vals, a shared-memory array of global Q values, is accessible in the enclosing scope.
    Returns:
        None
    """

    # Perform TensorFlow initialization
    with tf.Graph().as_default() as dqn.tf_graph:
        dqn.build_model() 
        with tf.Session() as dqn.tf_session:
            dqn._init_tf()

    try:
        index, state = s_queue.get(True, 5)
        while index != -1:
            # Reshape to align with the network input
            state = state.reshape(1, 263)
            # Pass to TensorFlow to evaluate
            s_val = dqn.get_q_value(state, "dqn")

            #   (global) Update root action with evaluation; average child values
            if not isclose(q_vals[index], 0.0, rel_tol=1e-6):
                q_vals[index] = (q_vals[index] + s_val)/2
            else:
                q_vals[index] = s_val
            index, state = s_queue.get(True, 5)
            
    except Empty as e:
        raise GameTreeFailure
    except:
        raise
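
# A hypothetical sketch of how tf_worker above could be wired up with a task queue
# and a shared Q-value array.  It assumes a fork-based multiprocessing start method
# so the module-level q_vals array is inherited by the child process; every name
# other than tf_worker and DQN is an illustrative assumption.
import multiprocessing as mp

q_vals = mp.Array('d', 64)              # shared-memory Q values, one slot per root action

def look_ahead_sketch(dqn, states):
    s_queue = mp.Queue()
    worker = mp.Process(target=tf_worker, args=(dqn, s_queue))
    worker.start()
    for i, state in enumerate(states):
        s_queue.put((i, state))         # enqueue (index, state) evaluation tasks
    s_queue.put((-1, None))             # sentinel: index -1 tells the worker to stop
    worker.join()
    return list(q_vals)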
Example #4
	def __init__(self):
		self.last_action = Action()
		self.time_step = 0
		self.total_time_step = 0
		self.episode_step = 0
		self.populating_phase = False

		self.model_save_interval = 30

		# Switch learning phase / evaluation phase
		self.policy_frozen = False

		self.dqn = DQN()
		self.state = np.zeros((config.rl_agent_history_length, config.ale_screen_channels, config.ale_scaled_screen_size[1], config.ale_scaled_screen_size[0]), dtype=np.float32)
		self.exploration_rate = self.dqn.exploration_rate
		self.exploration_rate_for_evaluation = 0.05
		self.last_observed_screen = None
Example #5
def run_dqn():
    # get command line arguments, defaults set in utils.py
    agent_params, dqn_params, cnn_params, prog_params = parse_args()

    env = gym.make(agent_params['environment'])
    episodes = agent_params['episodes']
    steps = agent_params['steps']
    steps_to_update = agent_params['steps_to_update']
    skipping = agent_params['skipping']
    num_actions = env.action_space.n
    observation_shape = env.observation_space.shape
    display = prog_params['display']
    monitor = prog_params['monitor']
    verbose = prog_params['verbose']

    if verbose > 0:
        print("num actions: ", num_actions)
        print("observation_shape: ", observation_shape)

    # initialize dqn learning
    dqn = DQN(num_actions, observation_shape, dqn_params, cnn_params, prog_params)

    if monitor:
        env.monitor.start('./outputs/experiment-' + agent_params['run_id'])
    last_100 = deque(maxlen=100)

    total_steps = 0
    for i_episode in range(episodes):
        observation = env.reset()
        reward_sum = 0

        for t in range(steps):
            if display:
                env.render()

            # Use the previous action if in a skipping frame
            if total_steps % skipping == 0:
                # select action based on the model
                action = dqn.select_action(observation)

            # execute action in emulator
            new_observation, reward, done, _ = env.step(action)
            new_observation = new_observation.ravel()

            # Only update the network if not in a skipping frame
            if total_steps % skipping == 0:
                # update the state
                dqn.update_state(action, new_observation, reward, done)

                # train the model
                dqn.train_step()

            observation = new_observation

            reward_sum += reward

            if done:
                if verbose > 0:
                    print("Episode ", i_episode)
                if verbose > 1:
                    print("Finished after {} timesteps".format(t+1))
                    print("Reward for this episode: ", reward_sum)
                if verbose > 0:
                    last_100.append(reward_sum)
                    print("Average reward for last 100 episodes: ", np.mean(last_100))
                break

            if total_steps % steps_to_update == 0:
                if verbose > 0:
                    print("Total steps : ", total_steps)
                    print("Updating target network...")
                dqn.update_target()

            total_steps += 1
    if monitor:
        env.monitor.close()
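
# run_dqn() above expects parse_args() (from the project's utils.py) to return four
# parameter dicts.  A minimal stub with only the keys the function actually reads;
# the default values here are illustrative assumptions.
def parse_args_stub():
    agent_params = {
        'environment': 'CartPole-v0',   # any Gym environment id
        'episodes': 1000,
        'steps': 200,
        'steps_to_update': 100,
        'skipping': 1,
        'run_id': 'test',               # only used when monitoring is enabled
    }
    dqn_params = {}                     # passed straight through to DQN
    cnn_params = {}                     # passed straight through to DQN
    prog_params = {'display': False, 'monitor': False, 'verbose': 1}
    return agent_params, dqn_params, cnn_params, prog_params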
Example #6
class Agent(RLGlueAgent):
	def __init__(self):
		self.last_action = Action()
		self.time_step = 0
		self.total_time_step = 0
		self.episode_step = 0
		self.populating_phase = False

		self.model_save_interval = 30

		# Switch learning phase / evaluation phase
		self.policy_frozen = False

		self.dqn = DQN()
		self.state = np.zeros((config.rl_agent_history_length, config.ale_screen_channels, config.ale_scaled_screen_size[1], config.ale_scaled_screen_size[0]), dtype=np.float32)
		self.exploration_rate = self.dqn.exploration_rate
		self.exploration_rate_for_evaluation = 0.05
		self.last_observed_screen = None

	def preprocess_screen(self, observation):
		screen_width = config.ale_screen_size[0]
		screen_height = config.ale_screen_size[1]
		new_width = config.ale_scaled_screen_size[0]
		new_height = config.ale_scaled_screen_size[1]
		if len(observation.intArray) == 100928: 
			observation = np.asarray(observation.intArray[128:], dtype=np.uint8).reshape((screen_width, screen_height, 3))
			observation = spm.imresize(observation, (new_height, new_width))
			# Scale pixel values into the range [0, 1]
			if config.ale_screen_channels == 1:
				# Convert RGB to Luminance
				observation = np.dot(observation[:,:,:], [0.299, 0.587, 0.114])
				observation = observation.reshape((new_height, new_width, 1))
			observation = observation.transpose(2, 0, 1) / 255.0
			observation /= (np.max(observation) + 1e-5)
		else:
			# Greyscale
			if config.ale_screen_channels == 3:
				raise Exception("You forgot to add --send_rgb option when you run ALE.")
			observation = np.asarray(observation.intArray[128:]).reshape((screen_width, screen_height))
			observation = spm.imresize(observation, (new_height, new_width))
			# Scale pixel values into the range [0, 1]
			observation = observation.reshape((1, new_height, new_width)) / 255.0
			observation /= (np.max(observation) + 1e-5)

		observed_screen = observation
		if self.last_observed_screen is not None:
			observed_screen = np.maximum(observation, self.last_observed_screen)

		self.last_observed_screen = observation
		return observed_screen

	def agent_init(self, taskSpecString):
		pass

	def reshape_state_to_conv_input(self, state):
		return state.reshape((1, config.rl_agent_history_length * config.ale_screen_channels, config.ale_scaled_screen_size[1], config.ale_scaled_screen_size[0]))

	def dump_result(self, reward, q_max=None, q_min=None):
		if self.time_step % 50 == 0:
			if self.policy_frozen is False:
				print "time_step:", self.time_step,
				
			print "reward:", reward,
			print "eps:", self.exploration_rate,
			if q_min is None:
				print ""
			else:
				print "Q ::",
				print "max:", q_max,
				print "min:", q_min

	def dump_state(self, state=None, prefix=""):
		if state is None:
			state = self.state
		state = self.reshape_state_to_conv_input(state)
		for h in xrange(config.rl_agent_history_length):
			start = h * config.ale_screen_channels
			end = start + config.ale_screen_channels
			image = state[0,start:end,:,:]
			if config.ale_screen_channels == 1:
				image = image.reshape((image.shape[1], image.shape[2]))
			elif config.ale_screen_channels == 3:
				image = image.transpose(1, 2, 0)
			image = np.uint8(image * 255.0)
			image = Image.fromarray(image)
			image.save(("%sstate-%d.png" % (prefix, h)))

	def learn(self, reward, episode_ends=False):
		if self.policy_frozen is False:

			self.dqn.store_transition_in_replay_memory(self.reshape_state_to_conv_input(self.last_state), self.last_action.intArray[0], reward, self.reshape_state_to_conv_input(self.state), episode_ends)
			if self.total_time_step <= config.rl_replay_start_size:
				# A uniform random policy is run for 'replay_start_size' frames before learning starts
				# Apparently the agent moves around randomly to accumulate experience.
				print "Initial exploration before learning starts:", "%d/%d" % (self.total_time_step, config.rl_replay_start_size)
				self.populating_phase = True
				self.exploration_rate = config.rl_initial_exploration
			else:
				self.populating_phase = False
				self.dqn.decrease_exploration_rate()
				self.exploration_rate = self.dqn.exploration_rate

				if self.total_time_step % (config.rl_action_repeat * config.rl_update_frequency) == 0 and self.total_time_step != 0:
					self.dqn.replay_experience()

				if self.total_time_step % config.rl_target_network_update_frequency == 0 and self.total_time_step != 0:
					print "Target has been updated."
					self.dqn.update_target()

	def agent_start(self, observation):
		print "Episode", self.episode_step, "::", "total_time_step:",
		if self.total_time_step > 1000:
			print int(self.total_time_step / 1000), "K"
		else:
			print self.total_time_step
		observed_screen = self.preprocess_screen(observation)
		self.state[0] = observed_screen

		return_action = Action()
		action, q_max, q_min = self.dqn.eps_greedy(self.reshape_state_to_conv_input(self.state), self.exploration_rate)
		return_action.intArray = [action]

		self.last_action = copy.deepcopy(return_action)
		self.last_state = self.state

		return return_action

	def agent_step(self, reward, observation):
		observed_screen = self.preprocess_screen(observation)
		self.state = np.roll(self.state, 1, axis=0)
		self.state[0] = observed_screen

		########################### DEBUG ###############################
		# if self.total_time_step % 500 == 0 and self.total_time_step != 0:
		# 	self.dump_state()

		self.learn(reward)
		
		return_action = Action()
		q_max = None
		q_min = None
		if self.time_step % config.rl_action_repeat == 0:
			action, q_max, q_min = self.dqn.eps_greedy(self.reshape_state_to_conv_input(self.state), self.exploration_rate)
		else:
			action = self.last_action.intArray[0]
		return_action.intArray = [action]

		self.dump_result(reward, q_max, q_min)

		if self.policy_frozen is False:
			self.last_action = copy.deepcopy(return_action)
			self.last_state = self.state
			self.time_step += 1
			self.total_time_step += 1

		return return_action

	def agent_end(self, reward):
		self.learn(reward, episode_ends=True)

		# [Optional]
		## Visualizing the results
		self.dump_result(reward)

		if self.policy_frozen is False:
			self.time_step = 0
			self.total_time_step += 1
			self.episode_step += 1

	def agent_cleanup(self):
		pass

	def agent_message(self, inMessage):
		if inMessage.startswith("freeze_policy"):
			self.policy_frozen = True
			self.exploration_rate = self.exploration_rate_for_evaluation
			return "The policy was freezed."

		if inMessage.startswith("unfreeze_policy"):
			self.policy_frozen = False
			self.exploration_rate = self.dqn.exploration_rate
			return "The policy was unfreezed."

		if inMessage.startswith("save_model"):
			if self.populating_phase is False:
				self.dqn.save()
			return "The model was saved."
Example #7
testarg = parser.add_argument_group('Test')
testarg.add_argument("--display", dest="display", help="Display screen during testing.")
testarg.set_defaults(display=False)
testarg.add_argument("--random_starts", type=int, default=30, help="Perform max this number of no-op actions to be performed by the agent at the start of an episode.")
testarg.add_argument("--ckpt_dir", default='model', help="Tensorflow checkpoint directory.")
testarg.add_argument("--out", help="Output directory for gym.")
testarg.add_argument("--episodes", type=int, default=100, help="Number of episodes.")
testarg.add_argument("--seed", type=int, help="Random seed.")

args = parser.parse_args()

if args.seed:
    rand.seed(args.seed)

if not os.path.exists(args.ckpt_dir):
    os.makedirs(args.ckpt_dir)

# initialize gym environment and dqn
env = Environment(args)
agent = DQN(env, args)

# train agent
Trainer(agent).run()

# play the game
env.gym.monitor.start(args.out, force=True)
agent.play()
env.gym.monitor.close()

import multiprocessing as mp
import sys

import numpy as np
import tensorflow as tf

from dqn import DQN

def q_par(dqn, g1):
    print(dqn.get_q_value(g1, "dqn"))

def par(a):
    print(a)

features = 20
h1 = 10
h2 = 5

g1 = np.arange(40).reshape(2, 20)

dqn = DQN(features, h1, h2, "models/tf_multi_1")
with dqn.tf_graph.as_default():
    dqn.build_model()
    with tf.Session() as dqn.tf_session:
        dqn._init_tf()
        print(dqn.tf_session.run(dqn.model, feed_dict={dqn.s_: g1}))

processes = []

for p in range(4):
    processes.append(mp.Process(target=q_par, args=(dqn,g1)))
    processes[p].start()

for p in range(4):
    processes[p].join()
Example #9
#Basic script to test the dqn.py object and functions
import tensorflow as tf
import numpy as np
from dqn import DQN

features = 20
h1 = 10
h2 = 5

g1 = np.arange(40).reshape(2, 20)
g2 = np.random.randint(-5,5,40).reshape(2,20)

dqn = DQN(features, h1, h2, "models/dqn_1")
with tf.Graph().as_default():
    dqn.build_model()
    with tf.Session() as dqn.tf_session:
        dqn._init_tf()
        print(dqn.tf_session.run(dqn.model, feed_dict={dqn.s_: g1}))
    
    

print(dqn.get_q_value(g1, "dqn"))
print(dqn.get_q_value(g1, "dqn"))
print(dqn.get_q_value(g1, "dqn"))
#!/usr/bin/env python

from dqn import DQN

actl = DQN()
actl.train()


Example #11
class Agent():
    def __init__(self, env, gamma, lr, n_actions, input_dim, ann_layer, 
                mem_size, batch_size, epsilon, 
                eps_min=0.01, eps_dec=5e-7, replace=500, path='tmp'):
        self.env = env
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dim = input_dim
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.path = path
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayMemory(mem_size)

        self.q_eval = DQN(input_dim, ann_layer, n_actions, self.batch_size)
        self.q_next = DQN(input_dim, ann_layer, n_actions, self.batch_size)

        self.optimizer = torch.optim.RMSprop(self.q_eval.parameters(), lr=self.lr)
        self.loss = nn.SmoothL1Loss()
        self.last_loss = 0

    def choose_action(self, state):
        if np.random.random() > self.epsilon:
            state = self.env.transform_state(state)
            self.q_eval.eval()
            with torch.no_grad():
                actions = self.q_eval(state.reshape(1, -1))
            self.q_eval.train()
            return actions.argmax().item()
        else:
            return np.random.choice(self.action_space)

    def store_transition(self, state, action, reward, done, next_state):
        self.memory.push(state.reshape(1, -1), action, reward, done, next_state.reshape(1, -1))

    def sample_memory(self):
        state, action, reward, done, next_state = self.memory.sample(self.batch_size)
        device = self.q_eval.device
        state, action, reward, done, next_state = \
                             state.to(device), action.to(device), reward.to(device), done.to(device), next_state.to(device)
        return state, action, reward, done, next_state
    
    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
                        if self.epsilon - self.eps_dec > self.eps_min \
                            else self.eps_min

    def learn(self):
        if len(self.memory) < self.batch_size:
            return
        
        self.optimizer.zero_grad()

        self.replace_target_network()

        state, action, reward, done, next_state = self.sample_memory()
        state = self.env.transform_state(state)
        reward = self.env.transform_reward(reward)
        next_state = self.env.transform_state(next_state)
        q_pred = self.q_eval(state)
        q_pred = q_pred[torch.arange(self.batch_size), action.long()]
        q_next = self.q_next(next_state).max(1)[0]

        q_next[done] = 0.0
        q_target = reward + self.gamma * q_next
        
        loss = self.loss(q_pred, q_target.detach()).to(self.q_eval.device)
        self.last_loss = loss.item()
        loss.backward()
        
        self.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()

    def save(self, path):
        self.q_eval.save_model(self.path)

    def load(self, path):
        self.q_eval.load_model(self.path)
        self.q_next.load_model(self.path)
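
# A minimal sketch of driving the Agent above in a Gym-style loop.  The class expects
# an environment object exposing transform_state()/transform_reward(); the wrapper
# below and all hyperparameters are illustrative assumptions, not the project's code.
import gym
import torch

class EnvWrapper:
    def __init__(self, name):
        self.gym_env = gym.make(name)
        self.action_space = self.gym_env.action_space
        self.observation_space = self.gym_env.observation_space

    def transform_state(self, state):
        # accepts raw numpy states or batches already converted to tensors
        return torch.as_tensor(state, dtype=torch.float32)

    def transform_reward(self, reward):
        return torch.as_tensor(reward, dtype=torch.float32)

env = EnvWrapper('CartPole-v1')
agent = Agent(env, gamma=0.99, lr=1e-4,
              n_actions=env.action_space.n,
              input_dim=env.observation_space.shape[0],
              ann_layer=[128, 128], mem_size=50000,
              batch_size=64, epsilon=1.0)

for episode in range(500):
    state, done = env.gym_env.reset(), False
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.gym_env.step(action)
        agent.store_transition(state, action, reward, done, next_state)
        agent.learn()                   # trains once a full batch has been stored
        state = next_state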