Exemple #1
0
def run():
    """Play ten evaluation episodes with the tabular Q policy and print each result.

    The Q table comes from ``Qmatrix(2, env)``; per the file's own note the
    modes are 0 - zeros, 1 - random, 2 - textfile. The table is only read
    here, never updated. Actions are chosen epsilon-greedily with a very
    small epsilon so the run is almost entirely greedy.
    """
    show_on_screen = True

    # Environment configuration used for the evaluation run.
    env = Environment(
        wrap=False,
        grid_size=GRID_SIZE,
        rate=80,
        max_time=100,
        tail=False,
        food_count=1,
        obstacle_count=0,
        multiplier_count=0,
        map_path=None,
        action_space=5,
    )

    if show_on_screen:
        env.prerender()

    q_table = Qmatrix(2, env)

    # Small exploration probability so the evaluation is not fully greedy.
    explore_prob = 0.005

    for episode in range(10):
        state, info = env.reset()

        while True:
            if show_on_screen:
                env.render()

            if np.random.rand() <= explore_prob:
                # Exploration: let the environment pick the action.
                action = env.sample_action()
            else:
                # Exploitation: best known action for the current state.
                action = np.argmax(q_table[env.state_index(state)])

            state, _reward, finished, info = env.step(action)
            if finished:
                break

        # Report the outcome of every episode.
        print("Episode:", episode, "\tScore:", info["score"], "\tTime:",
              info["time"])
Exemple #2
0
def run2():
    """Run the linear-function Q-learning model, optionally from a checkpoint.

    Builds the model via ``createModel(x)``, then plays ``total_episodes``
    episodes with an epsilon-greedy policy. Every ``print_episode`` episodes
    the accumulated time, score and error are printed (divided by
    ``print_episode``) and reset.

    When ``USE_SAVED_MODEL_FILE`` is true the variables are restored from
    ``MODEL_PATH_LOAD``. The initializer is run *before* the restore: running
    it afterwards (as the code previously did) re-initialises every variable
    and discards the checkpoint that was just loaded.
    """
    print("Running the Linear Function Q-Learning Model from tf.Saver()")

    # Decide whether or not to render to the screen
    RENDER_TO_SCREEN = True

    # True - load model from MODEL_PATH_LOAD; False - keep the initial weights
    USE_SAVED_MODEL_FILE = True

    env = Environment(wrap=WRAP,
                      grid_size=GRID_SIZE,
                      rate=100,
                      max_time=20,
                      tail=TAIL,
                      action_space=4)

    if RENDER_TO_SCREEN:
        env.prerender()

    # Hyper-parameters
    alpha = 0.01  # Learning rate handed to the gradient-descent optimizer
    epsilon = 0.1  # Probability to choose a random action instead of the best one

    # Create NN model
    with tf.name_scope('Model'):
        Q_values = createModel(x)

    # Error / loss: largest squared difference between prediction and target
    with tf.name_scope('Error'):
        error = tf.reduce_max(tf.square(Q_values - y), axis=1)

    # Gradient descent optimizer - minimizes the error/loss function
    with tf.name_scope('Optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(alpha).minimize(error)

    # Index of the max value in the action-value tensor fed through y
    with tf.name_scope('Max_action'):
        action_t = tf.argmax(y, axis=1)

    avg_time = 0
    avg_score = 0
    avg_error = 0

    print_episode = 100
    total_episodes = 10000

    # Saving model capabilities
    saver = tf.train.Saver()

    # Op that initialises all variables (weights and biases)
    init = tf.global_variables_initializer()

    with tf.Session() as sess:

        # Initialise first so that a subsequent restore is not overwritten.
        sess.run(init)

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_LOAD)
            print("Model restored.")

        for episode in range(total_episodes):
            state, info = env.reset()
            done = False

            while not done:
                if RENDER_TO_SCREEN:
                    env.render()

                # Network input for the current state
                state_vector = env.state_vector()

                # Retrieve the Q values from the NN in vector form
                Q_vector = sess.run(Q_values, feed_dict={x: state_vector})

                # Epsilon-greedy action selection
                if np.random.rand() <= epsilon:
                    action = env.sample_action()
                else:
                    # Index of the largest Q value (output vector of the NN)
                    action = sess.run(action_t, feed_dict={y: Q_vector})

                # Update the environment by performing the action
                new_state, reward, done, info = env.step(action)

                state = new_state

                if reward == 100:
                    print("reached food")

                # NOTE(review): the target fed for y is the network's own
                # output for the same input, so the error should be ~0 and
                # the optimizer step should leave the weights unchanged. The
                # call is kept so the reported error matches earlier runs.
                _, e = sess.run([optimizer, error],
                                feed_dict={x: state_vector, y: Q_vector})

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_error += e

            if episode % print_episode == 0 and episode != 0:
                print("Episode:", episode,
                      "\ttime:", avg_time/print_episode,
                      "\tscore:", avg_score/print_episode,
                      "\tError", avg_error/print_episode)
                avg_time = 0
                avg_score = 0
                avg_error = 0
Exemple #3
0
def run():
    """Play 100 episodes with the linear-function Q network and print averages.

    The network is built by ``recreateModel(x)`` and only queried, never
    trained. Actions are epsilon-greedy. Every ``report_every`` episodes the
    accumulated time and score (divided by ``report_every``) are printed and
    reset; the food counter is cumulative over the whole run.
    """
    print("\n ----- Running the Linear Function Q-Learning Model ----- \n")

    # Whether to draw the game while it is being played
    show_on_screen = True

    env = Environment(
        wrap=WRAP,
        grid_size=GRID_SIZE,
        rate=100,
        max_time=100,
        tail=TAIL,
        action_space=4,
    )

    if show_on_screen:
        env.prerender()

    # Probability of taking a random action instead of the best one
    explore_prob = 0.01

    # Only the Q-value output is needed here; the layer handles are unused.
    q_output, _output_layer, _hidden_1_layer = recreateModel(x)

    # Index of the largest entry of whatever is fed through y
    best_action = tf.argmax(y, axis=1)

    time_total = 0
    score_total = 0
    food_hits = 0

    report_every = 10
    episode_count = 100

    # Op that initialises all variables (weights and biases)
    init_op = tf.global_variables_initializer()

    with tf.Session() as sess:

        sess.run(init_op)

        for episode in range(episode_count):
            state, info = env.reset()

            while True:
                if show_on_screen:
                    env.render()

                # Network input for the current state
                observation = env.state_vector()

                # Q values predicted by the network for this state
                q_row = sess.run(q_output, feed_dict={x: observation})

                if np.random.rand() <= explore_prob:
                    action = env.sample_action()
                else:
                    action = sess.run(best_action, feed_dict={y: q_row})

                # Advance the environment by one step
                state, reward, finished, info = env.step(action)

                if reward == 100:
                    food_hits += 1

                if finished:
                    time_total += info["time"]
                    score_total += info["score"]
                    break

            if episode != 0 and episode % report_every == 0:
                print("Episode:", episode, "   time:", time_total/report_every, "   score:", score_total/report_every, "    Got food", food_hits, "times")
                time_total = 0
                score_total = 0
Exemple #4
0
def trainDeepModel(load=False):
    """Train the deep Q network with a one-step Q-learning update per move.

    Args:
        load: Forwarded to ``createDeepModel`` as ``load_variables``.

    Side effects:
        * Every ``print_episode`` episodes (and on the last episode) prints
          running averages, writes the six weight/bias arrays to the
          ``*_textfile_path_save`` files and adds a summary to ``LOGDIR``.
        * Saves a checkpoint to ``MODEL_PATH_SAVE`` when training ends.

    Fixes relative to the previous version:
        * The linear epsilon/alpha schedules now start at their ``*_start``
          value (they used to start at ``*_start + *_end``).
        * Variables are initialised before an optional checkpoint restore, so
          the restore is no longer overwritten.
        * ``np.float`` (removed in NumPy 1.24) is replaced by ``float``.
        * The summary writer is closed so buffered events reach disk.
    """
    print("\n ---- Training the Deep Neural Network ----- \n")

    # Decide whether or not to render to the screen
    RENDER_TO_SCREEN = False

    # True - load model from MODEL_PATH_LOAD; False - keep the initial weights
    USE_SAVED_MODEL_FILE = False

    env = Environment(wrap=WRAP,
                      grid_size=GRID_SIZE,
                      rate=80,
                      max_time=100,
                      tail=TAIL,
                      action_space=4)

    if RENDER_TO_SCREEN:
        env.prerender()

    # Hyper-parameters
    alpha = 0.01  # Learning rate handed to the gradient-descent optimizer
    gamma = 0.99  # Discount factor for future rewards
    epsilon = 0.1  # Probability to choose a random action instead of the best one

    epsilon_function = True
    epsilon_start = 0.5
    epsilon_end = 0.05
    epsilon_percentage = 0.5  # fraction of training over which epsilon decays

    alpha_function = False
    alpha_start = 0.01
    alpha_end = 0.003
    alpha_percentage = 0.9  # fraction of training over which alpha decays

    # Create NN model
    with tf.name_scope('Model'):
        Q_values, hidden_1_layer, hidden_2_layer, output_layer = createDeepModel(
            x, load_variables=load)

    # Error / loss function
    with tf.name_scope('Error'):
        error = tf.losses.mean_squared_error(labels=Q_values, predictions=y)

    tf.summary.scalar('error', tf.squeeze(error))

    # Gradient descent optimizer - minimizes the error/loss function
    with tf.name_scope('Optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(alpha).minimize(error)

    # Max over the action-value tensor fed through y
    with tf.name_scope('Max_y_prime'):
        y_prime_max = tf.reduce_max(y, axis=1)

    # Index of the max value in the action-value tensor fed through y
    with tf.name_scope('Max_action'):
        action_t = tf.argmax(y, axis=1)

    avg_time = 0
    avg_score = 0
    avg_error = 0

    print_episode = 1000
    total_episodes = 100000

    # (destination file, variable) pairs dumped as text at every report
    text_dumps = (
        (W1_textfile_path_save, hidden_1_layer['weights']),
        (B1_textfile_path_save, hidden_1_layer['biases']),
        (W2_textfile_path_save, hidden_2_layer['weights']),
        (B2_textfile_path_save, hidden_2_layer['biases']),
        (W3_textfile_path_save, output_layer['weights']),
        (B3_textfile_path_save, output_layer['biases']),
    )

    # Saving model capabilities
    saver = tf.train.Saver()

    # Op that initialises all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # Tensorboard capabilities
    writer = tf.summary.FileWriter(LOGDIR)

    try:
        with tf.Session() as sess:

            # Initialise first so that a subsequent restore is not overwritten.
            sess.run(init)

            # Restore the model, to keep training
            if USE_SAVED_MODEL_FILE:
                saver.restore(sess, MODEL_PATH_LOAD)
                print("Model restored.")

            # Tensorboard graph
            writer.add_graph(sess.graph)

            for episode in range(total_episodes):
                state, info = env.reset()
                done = False

                # Linear decay from alpha_start to alpha_end, then clamp.
                # NOTE(review): the optimizer above was built with the initial
                # alpha, so reassigning this Python variable does not appear to
                # change the learning rate actually used - confirm.
                if alpha_function:
                    alpha = alpha_start - (alpha_start - alpha_end) * episode / (
                        alpha_percentage * total_episodes)
                    alpha = max(alpha, alpha_end)

                # Linear decay from epsilon_start to epsilon_end, then clamp.
                if epsilon_function:
                    epsilon = epsilon_start - (epsilon_start - epsilon_end) * episode / (
                        epsilon_percentage * total_episodes)
                    epsilon = max(epsilon, epsilon_end)

                while not done:
                    if RENDER_TO_SCREEN:
                        env.render()

                    # Network input for the current state
                    state_vector = env.state_vector()

                    # Retrieve the Q values from the NN in vector form
                    Q_vector = sess.run(Q_values, feed_dict={x: state_vector})

                    # Epsilon-greedy action selection
                    if np.random.rand() <= epsilon:
                        action = env.sample_action()
                    else:
                        # Index of the largest Q value (output vector of the NN)
                        action = sess.run(action_t, feed_dict={y: Q_vector})

                    # Update the environment by performing the action
                    new_state, reward, done, info = env.step(action)

                    state = new_state

                    if done:
                        # Terminal transition: no future value to bootstrap from.
                        Q_vector[:, action] = reward
                    else:
                        # Q values of the state reached after the action
                        new_state_vector = env.state_vector()
                        y_prime = sess.run(Q_values, feed_dict={x: new_state_vector})

                        # Best attainable value from the new state
                        maxq = sess.run(y_prime_max, feed_dict={y: y_prime})

                        # Q-learning target for the action that was taken
                        Q_vector[:, action] = reward + (gamma * maxq)

                    _, e = sess.run([optimizer, error],
                                    feed_dict={x: state_vector, y: Q_vector})

                    if done:
                        avg_time += info["time"]
                        avg_score += info["score"]
                        avg_error += e

                if (episode % print_episode == 0 and episode != 0) or (episode == total_episodes-1):

                    print("Ep:", episode,
                          "\tavg t:", avg_time/print_episode,
                          "\tavg score:", avg_score/print_episode,
                          "\tErr", round(avg_error/print_episode,3),
                          "\tepsilon", round(epsilon,2))
                    avg_time = 0
                    avg_score = 0
                    avg_error = 0

                    # Save the model's weights and biases to text files
                    dumped = sess.run([variable for _, variable in text_dumps])
                    for (path, _), values in zip(text_dumps, dumped):
                        np.savetxt(path, np.asarray(values).astype(float),
                                   fmt='%f', delimiter=" ")

                    s = sess.run(merged_summary,
                                 feed_dict={x: state_vector, y: Q_vector})
                    writer.add_summary(s, episode)

            save_path = saver.save(sess, MODEL_PATH_SAVE)
            print("Model saved in path: %s" % save_path)
    finally:
        # Flush and release the event file even if training is interrupted.
        writer.close()
Exemple #5
0
def runDeepModel():
    """Play 100 evaluation episodes with the deep Q network and print averages.

    The network is built with ``createDeepModel(x, load_variables=True)`` and
    is only queried here; no training step is run, so the loss and optimizer
    ops that used to be built (and never executed) have been dropped.

    When ``USE_SAVED_MODEL_FILE`` is true the variables are restored from
    ``MODEL_PATH_LOAD``. The initializer is run *before* the restore: running
    it afterwards (as the code previously did) re-initialises every variable
    and discards the checkpoint that was just loaded.
    """
    print("\n ---- Running the Deep Neural Network ----- \n")

    # Decide whether or not to render to the screen
    RENDER_TO_SCREEN = True

    # True - load model from MODEL_PATH_LOAD; False - keep the initial weights
    USE_SAVED_MODEL_FILE = False

    env = Environment(wrap=WRAP,
                      grid_size=GRID_SIZE,
                      rate=50,
                      max_time=100,
                      tail=TAIL,
                      action_space=4)

    if RENDER_TO_SCREEN:
        env.prerender()

    epsilon = 0.01  # Probability to choose a random action instead of the best one

    # Create NN model
    with tf.name_scope('Model'):
        Q_values, hidden_1_layer, hidden_2_layer, output_layer = createDeepModel(
            x, load_variables=True)

    # Index of the max value in the action-value tensor fed through y
    with tf.name_scope('Max_action'):
        action_t = tf.argmax(y, axis=1)

    avg_time = 0
    avg_score = 0

    print_episode = 10
    total_episodes = 100

    # Saving model capabilities
    saver = tf.train.Saver()

    # Op that initialises all variables (weights and biases)
    init = tf.global_variables_initializer()

    with tf.Session() as sess:

        # Initialise first so that a subsequent restore is not overwritten.
        sess.run(init)

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_LOAD)
            print("Model restored.")

        for episode in range(total_episodes):
            state, info = env.reset()
            done = False

            while not done:
                if RENDER_TO_SCREEN:
                    env.render()

                # Network input for the current state
                state_vector = env.state_vector()

                # Retrieve the Q values from the NN in vector form
                Q_vector = sess.run(Q_values, feed_dict={x: state_vector})

                # Epsilon-greedy action selection
                if np.random.rand() <= epsilon:
                    action = env.sample_action()
                else:
                    # Index of the largest Q value (output vector of the NN)
                    action = sess.run(action_t, feed_dict={y: Q_vector})

                # Update the environment by performing the action
                new_state, reward, done, info = env.step(action)

                state = new_state

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]

            if episode % print_episode == 0 and episode != 0:
                print("Ep:", episode, "   avg t:", avg_time/print_episode, "   avg score:", avg_score/print_episode)
                avg_time = 0
                avg_score = 0
Exemple #6
0
def train():
    """Train the tabular Q-learning agent and periodically save the Q matrix.

    Runs ``total_episodes`` episodes with an epsilon-greedy policy whose
    epsilon decays linearly from ``epsilon_start`` to ``epsilon_end`` over the
    first ``epsilon_percentage`` of training. Every ``print_episode`` episodes
    (and on the last one) the running averages are printed and the Q matrix
    is written to ``Q_textfile_path_save``.

    Raises:
        KeyboardInterrupt: re-raised after the Q matrix has been saved, so a
            cancelled run still leaves a usable table on disk.

    Fixes relative to the previous version:
        * ``np.float`` (removed in NumPy 1.24) is replaced by ``float``.
        * On a terminal transition the update target is the reward alone;
          it no longer bootstraps from the Q values of the state returned
          after the episode has ended.
    """
    RENDER_TO_SCREEN = False

    # Setting up the environment
    env = Environment(wrap=False,
                      grid_size=GRID_SIZE,
                      rate=80,
                      max_time=100,
                      tail=False,
                      food_count=1,
                      obstacle_count=0,
                      multiplier_count=0,
                      map_path=None,
                      action_space=5)

    if RENDER_TO_SCREEN:
        env.prerender()

    Q = Qmatrix(1, env)  # 0 - zeros, 1 - random, 2 - textfile

    alpha = 0.15  # Learning rate, i.e. which fraction of the Q values should be updated
    gamma = 0.99  # Discount factor, i.e. to which extent future rewards are considered
    epsilon = 0.1  # Probability to choose a random action instead of the best action

    epsilon_function = True
    epsilon_start = 0.8
    epsilon_end = 0.05
    epsilon_percentage = 0.6  # fraction of training over which epsilon decays

    avg_time = 0
    avg_score = 0

    print_episode = 1000
    total_episodes = 10000

    def save_q_matrix():
        # Write the current Q matrix to the configured text file.
        np.savetxt(Q_textfile_path_save,
                   Q.astype(float),
                   fmt='%f',
                   delimiter=" ")

    # If cancelled, the Q lookup table is still saved
    try:
        for episode in range(total_episodes):
            # Reset the environment
            state, info = env.reset()
            done = False

            # Linear decay from epsilon_start to epsilon_end, then clamp.
            if epsilon_function:
                epsilon = (-(epsilon_start - epsilon_end) /
                           (epsilon_percentage * total_episodes)
                           ) * episode + epsilon_start
                if epsilon < epsilon_end:
                    epsilon = epsilon_end

            while not done:
                if RENDER_TO_SCREEN:
                    env.render()

                # Row of the Q matrix for the state the action is taken from.
                state_idx = env.state_index(state)

                # Epsilon-greedy action selection
                if np.random.rand() <= epsilon:
                    action = env.sample_action()
                else:
                    action = np.argmax(Q[state_idx])

                new_state, reward, done, info = env.step(action)

                # Q-learning target: a terminal transition has no future value.
                if done:
                    target = reward
                else:
                    target = reward + gamma * np.max(
                        Q[env.state_index(new_state)])

                Q[state_idx, action] += alpha * (target - Q[state_idx, action])

                state = new_state

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]

            if (episode % print_episode == 0 and episode != 0) or (
                    episode == total_episodes - 1):
                print("Episode:", episode,
                      "\tavg t: {0:.3f}".format(avg_time / print_episode),
                      "\tavg score: {0:.3f}".format(avg_score / print_episode),
                      "\tepsilon {0:.3f}".format(epsilon))
                save_q_matrix()
                avg_time = 0
                avg_score = 0
    except KeyboardInterrupt:
        save_q_matrix()
        print("Saved Q matrix to text file")
        raise

    print("Simulation finished. \nSaved Q matrix to text file at:",
          Q_textfile_path_save)
Exemple #7
0
def trainDeepModel(load=False):

    # Used to see how long model takes to train - model needs to be optimized!
    start_time = time.time()

    print("\n ---- Training the Deep Neural Network ----- \n")

    # Decide whether or not to render to the screen or not
    RENDER_TO_SCREEN = True

    # True - Load model from modelpath_load; False - Initialise random weights
    USE_SAVED_MODEL_FILE = False

    # First we need our environment form Environment_for_DQN.py
    # has to have a grid_size of 10 for this current NN
    env = Environment(wrap=WRAP,
                      grid_size=GRID_SIZE,
                      rate=0,
                      max_time=300,
                      tail=TAIL,
                      food_count=FOOD_COUNT,
                      obstacle_count=OBSTACLE_COUNT,
                      action_space=3)

    if RENDER_TO_SCREEN:
        env.prerender()

    # Hyper-parameters
    alpha = 0.001  # Learning rate, i.e. which fraction of the Q values should be updated
    gamma = 0.99  # Discount factor, i.e. to which extent the algorithm considers possible future rewards
    epsilon = 0.1  # Probability to choose random action instead of best action

    epsilon_function = True
    epsilon_start = 0.1
    epsilon_end = 0.05
    epsilon_percentage = 0.5  # in decimal

    alpha_function = False
    alpha_start = 0.01
    alpha_end = 0.003
    alpha_percentage = 0.9  # in decimal

    # Trajectory
    tau = []

    # Create NN model
    with tf.name_scope('Model'):
        Q_values, weights, biases = createDeepModel(x, load_variables=load)

    # Error / Loss function
    # reduce_max -> it reduces the [1,4] tensor to a scalar of the max value
    with tf.name_scope('Error'):

        # test
        error = tf.losses.mean_squared_error(labels=Q_values, predictions=y)

        # error = tf.reduce_max(tf.square(tf.subtract(Q_values, y)), axis=1)
        # error = tf.reduce_max(tf.square(Q_values - y), axis=1)

    tf.summary.scalar('error', tf.squeeze(error))

    # Gradient descent optimizer - minimizes error/loss function
    with tf.name_scope('Optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(alpha).minimize(error)
        # optimizer = tf.train.AdamOptimizer(alpha).minimize(error)

    # The next states action-value [1,4] tensor, reduced to a scalar of the max value
    with tf.name_scope('Max_y_prime'):
        y_prime_max = tf.reduce_max(y, axis=1)

    # Action at time t, the index of the max value in the action-value tensor (Made a global variable)
    with tf.name_scope('Max_action'):
        action_t = tf.argmax(y, axis=1)

    avg_time = 0
    avg_score = 0
    avg_error = 0

    # error plot
    # errors = []

    print_episode = 1000
    total_episodes = 100000

    # Saving model capabilities
    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # Tensorboard capabilties
    writer = tf.summary.FileWriter(LOGDIR)

    # Session can start running
    with tf.Session() as sess:

        # Restore the model, to keep training
        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_LOAD)
            print("Model restored.")

        # Initialize global variables
        sess.run(init)

        # Tensorboard graph
        writer.add_graph(sess.graph)

        print("\nProgram took {0:.4f} seconds to initialise\n".format(
            time.time() - start_time))
        start_time = time.time()

        # Testing my DQN model with random values
        for episode in range(total_episodes):
            state, info = env.reset()
            done = False

            # Linear function for alpha
            if alpha_function:
                alpha = (-alpha_start /
                         (alpha_percentage * total_episodes)) * episode + (
                             alpha_start + alpha_end)
                if alpha < alpha_end:
                    alpha = alpha_end

            # Linear function for epsilon
            if epsilon_function:
                epsilon = (-(epsilon_start - epsilon_end) /
                           (epsilon_percentage * total_episodes)) * episode + (
                               epsilon_start)
                if epsilon < epsilon_end:
                    epsilon = epsilon_end

            while not done:
                if RENDER_TO_SCREEN:
                    env.render()

                # One Hot representation of the current state
                state_vector = env.pixels()

                # Retrieve the Q values from the NN in vector form
                Q_vector = sess.run(Q_values, feed_dict={x: state_vector})
                # print("Qvector", Q_vector) # DEBUGGING

                # Deciding one which action to take
                if np.random.rand() <= epsilon:
                    action = env.sample_action()
                else:
                    # "action" is the max value of the Q values (output vector of NN)
                    action = sess.run(action_t, feed_dict={y: Q_vector})

                # Update environment with by performing action
                new_state, reward, done, info = env.step(action)

                # Update trajectory (Update replay memory)
                if len(tau) < REPLAY_MEMORY:
                    tau.append(
                        Traj(state_vector, action, reward, env.pixels(), done))
                    # print(tau[i].new_state)
                    # i=i+1
                else:
                    tau.pop(0)
                    tau.append(
                        Traj(state_vector, action, reward, env.pixels(), done))

                state = new_state

                # Choose a random step from the replay memory
                random_tau = random.randint(0, len(tau) - 1)

                # Get the Q vector of the training step
                Q_vector = sess.run(Q_values,
                                    feed_dict={x: tau[random_tau].state})
                '''
				Training using replay memory
				'''
                # if terminating state of episode
                if tau[random_tau].done:
                    # Set the chosen action's current value to the reward value
                    Q_vector[:,
                             tau[random_tau].action] = tau[random_tau].reward
                else:
                    # Gets the Q vector of the new state
                    y_prime = sess.run(
                        Q_values, feed_dict={x: tau[random_tau].new_state})

                    # Getting the best action value
                    maxq = sess.run(y_prime_max, feed_dict={y: y_prime})

                    # RL DQN Training Equation
                    Q_vector[:, tau[random_tau].
                             action] = tau[random_tau].reward + (gamma * maxq)

                _, e = sess.run([optimizer, error],
                                feed_dict={
                                    x: tau[random_tau].state,
                                    y: Q_vector
                                })
                '''
				Standard training with learning after every step

				# if final state of the episode
				if done:
					Q_vector[:,action] = reward
					# print("Reward:", reward)
				else:
					# Gathering the now current state's action-value vector
					new_state_vector = env.local_state_vector_3D()
					y_prime = sess.run(Q_values, feed_dict={x: new_state_vector})

					# Equation for training
					maxq = sess.run(y_prime_max, feed_dict={y: y_prime})

					# RL Equation
					Q_vector[:,action] = reward + (gamma * maxq)


				_, e = sess.run([optimizer, error], feed_dict={x: state_vector, y: Q_vector})
				# _ = sess.run(optimizer, feed_dict={x: state_vector, y: Q_vector})
				# e = sess.run(error,feed_dict={x:state_vector, y:Q_vector})
				# sess.run(optimizer)
				
				'''

                # DEBUGGING
                # print("action:", action)
                # print("y_prime:", y_prime)
                # print("max q value:", maxq)
                # print("new Q_vector:", Q_vector)
                # print("error tensor:", e)

                # Collecting every error for an end-of-training plot exhausts RAM on long runs, so the append below stays disabled.
                # errors.append(e)

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_error += e

            if (episode % print_episode == 0
                    and episode != 0) or (episode == total_episodes - 1):
                current_time = time.time() - start_time
                print(
                    "Ep:",
                    episode,
                    "\tavg t: {0:.3f}".format(avg_time / print_episode),
                    "\tavg score: {0:.3f}".format(avg_score / print_episode),
                    "\tErr {0:.3f}".format(avg_error / print_episode),
                    "\tepsilon {0:.3f}".format(epsilon),
                    #"\ttime {0:.0f}:{1:.0f}".format(current_time/60, current_time%60),
                    end="")
                if current_time % 60 < 10:
                    if math.floor((current_time / 60) % 60) < 10:
                        print("\ttime {0:.0f}:0{1:.0f}:0{2:.0f}".format(
                            math.floor((current_time / 60) / 60),
                            math.floor((current_time / 60) % 60),
                            current_time % 60))
                    else:
                        print("\ttime {0:.0f}:{1:.0f}:0{2:.0f}".format(
                            math.floor((current_time / 60) / 60),
                            math.floor((current_time / 60) % 60),
                            current_time % 60))
                else:
                    if math.floor((current_time / 60) % 60) < 10:
                        print("\ttime {0:.0f}:0{1:.0f}:{2:.0f}".format(
                            math.floor((current_time / 60) / 60),
                            math.floor((current_time / 60) % 60),
                            current_time % 60))
                    else:
                        print("\ttime {0:.0f}:{1:.0f}:{2:.0f}".format(
                            math.floor((current_time / 60) / 60),
                            math.floor((current_time / 60) % 60),
                            current_time % 60))
                avg_time = 0
                avg_score = 0
                avg_error = 0

                # Save the model's weights and biases to .npy files (can't save 4D array to text file)
                W_conv1 = np.array(sess.run(weights['W_conv1']))
                W_conv2 = np.array(sess.run(weights['W_conv2']))
                W_fc = np.array(sess.run(weights['W_fc']))
                W_out = np.array(sess.run(weights['W_out']))

                b_conv1 = np.array(sess.run(biases['b_conv1']))
                b_conv2 = np.array(sess.run(biases['b_conv2']))
                b_fc = np.array(sess.run(biases['b_fc']))
                b_out = np.array(sess.run(biases['b_out']))

                np.save(W_conv1_textfile_path_save, W_conv1.astype(np.float32))
                np.save(W_conv2_textfile_path_save, W_conv2.astype(np.float32))
                np.save(W_fc_textfile_path_save, W_fc.astype(np.float32))
                np.save(W_out_textfile_path_save, W_out.astype(np.float32))

                np.save(b_conv1_textfile_path_save, b_conv1.astype(np.float32))
                np.save(b_conv2_textfile_path_save, b_conv2.astype(np.float32))
                np.save(b_fc_textfile_path_save, b_fc.astype(np.float32))
                np.save(b_out_textfile_path_save, b_out.astype(np.float32))

                s = sess.run(merged_summary,
                             feed_dict={
                                 x: state_vector,
                                 y: Q_vector
                             })
                writer.add_summary(s, episode)

        save_path = saver.save(sess, MODEL_PATH_SAVE)
        print("Model saved in path: %s" % save_path)
Exemple #8
0
		lastAction = 0

		while not done:
			action = brain.chooseAction(observation)

			observation_, reward, done, info = env.step(action)
			
			observation_ = env.state_vector_3D()
			score += reward

			brain.storeTransition(observation, action, reward, observation_)

			observation = observation_
			if TRAIN: loss = brain.learn(batch_size)
			lastAction = action
			if RENDER: env.render()

		avg_score += info["score"]
		if TRAIN: avg_loss += loss.item()


		if i%100 == 0 and not i==0 or i == numGames-1:
			print("Game", i, 
				"\tepsilon: %.4f" %brain.EPSILON,
				"\tavg score", avg_score/100,
				"avg loss:", avg_loss/100)
			brain.save_model("./Models/Torch2/my_model{}.pth".format(i))
			avg_loss = 0
			avg_score = 0

		scores.append(score)