Code Example #1
def Main():
    print("Starting")

    env = gameEnv(partial=False, size=9)
    env = gameEnv(partial=True, size=9)

    Train(env)
    Test(env)

    print("Finished")
Code Example #2
def show_game(ckpt, game_len):
    """
    loads a model up from ckpt and plays a single game to show.
    """
    scope = cur_scope()
    env = gridworld.gameEnv(partial=False, size=5)
    tf.reset_default_graph()

    qnet = get_dummy_net(scope, env.state.shape, env.actions)
    plt.ion()
    plt.show()
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init)
        restore_from_ckpt(sess, scope, ckpt, env.state.shape, env.actions)
        state = env.reset()
        for _ in range(game_len):
            plt.imshow(state)
            plt.draw()
            plt.pause(.1)
            action = qnet.predict(sess, np.array([state]))[0]
            state, _, _ = env.step(action)

    plt.close()
    plt.ioff()
Code Example #3
    def __init__(self):
        self.batch_size = 64  # How many experiences to use for each training step
        self.train_frequency = 5  # How often you update the network
        self.num_epochs = 20  # How many epochs to train when updating the network
        self.y = 0.99  # Discount factor
        self.prob_random_start = 0.6  # Starting chance of random action
        self.prob_random_end = 0.1  # Ending chance of random action
        self.annealing_steps = 1000.  # Steps of training to reduce from start_e -> end_e
        self.max_num_episodes = 10000  # Max number of episodes allowed for training
        self.min_pre_train_episodes = 100  # Number of episodes played with random actions before training starts
        self.max_num_step = 50  # Maximum allowed episode length
        self.goal = 15  # Total reward we aim to achieve in a single game

        # Set env
        self.env = gameEnv(partial=False, size=5)

        # Reset everything from keras session
        K.clear_session()

        # Setup our Q-networks
        self.main_qn = Qnetwork()
        self.target_qn = Qnetwork()

        # Setup our experience replay
        self.experience_replay = ExperienceReplay()
Code Example #4
def test_model(ckpt, num_games, game_len):
    """
    Loads up a model and plays a number of games with it. Doesn't do
    any training, but reports the net reward at the end.
    """
    scope = cur_scope()
    env = gridworld.gameEnv(partial=False, size=5)
    tf.reset_default_graph()

    qnet = get_dummy_net(scope, env.state.shape, env.actions)
    net_reward = 0
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init)
        restore_from_ckpt(sess, scope, ckpt, env.state.shape,
                           env.actions)
        for _ in range(num_games):
            state = env.reset()
            for i in range(game_len):
                action = qnet.predict(sess, np.array([state]))[0]
                state, reward, _ = env.step(action)
                net_reward += reward

    print('num_games =', num_games,
          '    game_len =', game_len,
          '    score =', net_reward,
          '    ckpt =', ckpt)
Code Example #5
def evaluate(env_name, online_net):
    # env = gym.make(env_name)
    env = gameEnv(partial=False, size=32, object_size=1)
    avg_reward = 0.
    for ep in range(20):
        state = env.reset()
        # screen = env.render(mode='rgb_array')

        frame_state = np.zeros((4 * 3, 32, 32), dtype=np.float32)
        frame_state[9:, :, :] = np.transpose(state, (2, 0, 1))

        total_reward = 0

        step = 0.0
        while (step < maxSteps):
            q_values = online_net(
                Variable(
                    torch.from_numpy(frame_state).unsqueeze(0).cuda().type(
                        torch.cuda.FloatTensor)))

            action = get_action(q_values, 0.05, env)
            next_state, reward, done = env.step(action)

            total_reward += reward

            # screen = env.render(mode='rgb_array')

            next_frame_state = np.copy(frame_state)
            for i in range(3):
                # Shift each stacked frame (3 channels per frame) one slot toward the front;
                # the newest frame is written into the last slot below.
                next_frame_state[i * 3:(i + 1) * 3, :, :] = \
                    next_frame_state[(i + 1) * 3:(i + 2) * 3, :, :]

            # next_frame_state[9:, :, :] = np.transpose(process_frame(screen), (2, 0, 1))
            next_frame_state[9:, :, :] = np.transpose(next_state, (2, 0, 1))
            frame_state = next_frame_state
            step += 1

            if done:
                break

        print('Evaluation done of episode:', ep + 1)

        avg_reward += total_reward

    avg_reward /= 20.

    return avg_reward
Code Example #6
File: part7_drqn.py Project: JudyMRSD/DRL_learn
    def __init__(self):
        # Define your network architecture here. It is also a good idea to define any training operations
        # and optimizers here, initialize your variables, or, alternatively, compile your model here.
        self.gymEnv = gameEnv(partial=False, size=5)
        #self.stateSize = self.gymEnv.observation_space.shape[0]
        self.actionSize = self.gymEnv.actions
        self.maxSteps = 50  # limit the maximum number of steps per episode
        self.numEpisodes = 10000
        self.learningRate = 0.0001

        self.epsilon_start = 1
        self.epsilon = self.epsilon_start
        self.epsilon_end = 0.1

        self.annealingSteps = 10000
        self.epsilon_decay = (self.epsilon_start -
                              self.epsilon_end) / self.annealingSteps

        self.max_epLength = 50

        # Discount factor: 1.0 for MountainCar, 0.99 for CartPole and Space Invaders.
        self.gamma = 0.99
        self.network = "dueling_double"
        self.batch_size = 32

        self.skip = 4

        # create a main model and target model
        self.model = self._createModel()
        self.target_model = self._createModel()
        # initialize the target model so that the parameters in the two models are the same
        self.update_target_model()

        # summary for loss
        self.loss = tf.placeholder(tf.float32)
        # print("loss shape",tf.shape(self.loss)) #shape=(?,)
        tf.summary.scalar('loss-81ffb76', tf.reduce_mean(self.loss))

        self.reward = tf.placeholder(tf.float32)
        tf.summary.scalar('reward', self.reward)

        self.currentEpsilon = tf.placeholder(tf.float32)
        tf.summary.scalar('epsilon', self.currentEpsilon)

        self.Q = tf.placeholder(tf.float32)
        tf.summary.scalar('Q', self.Q)
        self.merged = tf.summary.merge_all()
Code Example #7
def combine_nets(combine_dir, submodels_dir, episodes=0):
    """
    Combines a set of Neural Networks by taking the avg value for each
    neuron, weighted by the number of experiences.
    :param combine_dir: directory to save combined model to
    :param submodels_dir: directory to load models from
    :param episodes: base number of episodes to add to the model total; ends up being the total count
    :return: path to combined model
    """
    avg_scope = cur_scope()
    env = gridworld.gameEnv(partial=False, size=5)
    tf.reset_default_graph()

    # create the network that will be used to combine the others.
    avg_qnet = get_dummy_net(avg_scope, env.state.shape, env.actions)
    avg_vars = tf.trainable_variables(avg_scope)

    init = tf.global_variables_initializer()
    avg_saver = tf.train.Saver(
        tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=avg_scope))
    models = get_models_to_combine(submodels_dir)
    with tf.Session() as sess:
        sess.run(init)

        for i, m in enumerate(models):
            _, m_scope, eps = m.split(DELIM)
            eps = int(eps)
            m_qnet = get_dummy_net(m_scope, env.state.shape, env.actions)
            restore_from_ckpt(sess, m_scope,
                              os.path.join(submodels_dir, m + '.ckpt'),
                              env.state.shape, env.actions)
            m_vars = tf.trainable_variables(m_scope)
            for avg, loaded in zip(avg_vars, m_vars):
                sess.run(
                    avg.assign(
                        avg.value() * episodes / (episodes + eps)
                        + loaded.value() * eps / (episodes + eps)))
            episodes += eps

        # Save the combined NN
        ckpt_path = make_ckpt_path(combine_dir, avg_scope, episodes)
        avg_saver.save(sess, ckpt_path)
    return ckpt_path
Code Example #8
File: DRQN_runner.py Project: JudyMRSD/DRL_learn
    def __init__(self):
        self.path = '../model/'
        self.network = 'DRQN_runner'
        self.numEpisodes = 10000
        self.learningRate = 0.0001
        self.epsilon_start = 1
        self.epsilon = self.epsilon_start
        self.epsilon_end = 0.1
        self.annealingSteps = 10000
        self.epsilon_decay = (self.epsilon_start - self.epsilon_end)/self.annealingSteps
        self.gamma = 0.99
        self.gymEnv = gameEnv(partial=False, size=5)
        self.actionSize = self.gymEnv.actions
        self.duelDQN = duelDQN(self.learningRate, self.actionSize)
        self.model = self.duelDQN.model
        self.target_model = self.duelDQN.target_model
        self.batch_size = 32
        self.skip = 4
        self.max_epLength = 50
        self.update_Q_steps = 1000
Code Example #9
# This code deals with the "partially observable Markov decision process" problem using a deep recurrent Q-network with convolutional layers

import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import scipy.misc
import os
import csv
import itertools
import tensorflow.contrib.slim as slim
# %matplotlib inline
from helper import *
from gridworld import gameEnv

env = gameEnv(partial=False, size=9)
env = gameEnv(partial=True, size=9)


# Build the Q-network
class Qnetwork():
    """
    This class is for q network object\n
    Args:
        1.h_size:
        1.rnn_cell:
        1.myScope:
    """
    def __init__(self, h_size, rnn_cell, myScope):
        # The network takes in a frame from the game as a flattened array
Code Example #10
File: dqn.py Project: psFournier/deep
    def __init__(self, size, max_steps, total_steps):
        self.env = gameEnv(partial=False, size=size)
        self.max_steps = max_steps
        self.total_steps = total_steps
Code Example #11
def part_4():
    import gym
    import numpy as np
    import random
    import tensorflow as tf
    import tensorflow.contrib.slim as slim
    import matplotlib.pyplot as plt
    import scipy.misc
    import os

    from gridworld import gameEnv

    env = gameEnv(partial=False, size=5)
    # print(env)
    # print(env.renderEnv())


    class Qnetwork():
        def __init__(self, h_size):
            # The network receives a frame from the game, flattened into an array.
            # It then resizes it and processes it through four convolutional layers.
            self.scalarInput = tf.placeholder(shape=[None, 21168], dtype=tf.float32)
            self.imageIn = tf.reshape(self.scalarInput, shape=[-1, 84, 84, 3])
            self.conv1 = slim.conv2d(inputs=self.imageIn, num_outputs=32, kernel_size=[8, 8], stride=[4, 4], padding='VALID',
                biases_initializer=None)
            self.conv2 = slim.conv2d(inputs=self.conv1, num_outputs=64, kernel_size=[4, 4], stride=[2, 2], padding='VALID',
                biases_initializer=None)
            self.conv3 = slim.conv2d(inputs=self.conv2, num_outputs=64, kernel_size=[3, 3], stride=[1, 1], padding='VALID',
                biases_initializer=None)
            self.conv4 = slim.conv2d(inputs=self.conv3, num_outputs=h_size, kernel_size=[7, 7], stride=[1, 1], padding='VALID',
                biases_initializer=None)

            # We take the output from the final convolutional layer and split it into separate advantage and value streams.
            self.streamAC, self.streamVC = tf.split(self.conv4, 2, 3)
            self.streamA = slim.flatten(self.streamAC)
            self.streamV = slim.flatten(self.streamVC)
            xavier_init = tf.contrib.layers.xavier_initializer()
            self.AW = tf.Variable(xavier_init([h_size // 2, env.actions]))
            self.VW = tf.Variable(xavier_init([h_size // 2, 1]))
            self.Advantage = tf.matmul(self.streamA, self.AW)
            self.Value = tf.matmul(self.streamV, self.VW)

            # Then combine them together to get our final Q-values.
            self.Qout = self.Value + tf.subtract(self.Advantage, tf.reduce_mean(self.Advantage, axis=1, keep_dims=True))
            self.predict = tf.argmax(self.Qout, 1)

            # Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.
            self.targetQ = tf.placeholder(shape=[None], dtype=tf.float32)
            self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
            self.actions_onehot = tf.one_hot(self.actions, env.actions, dtype=tf.float32)

            self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis=1)

            self.td_error = tf.square(self.targetQ - self.Q)
            self.loss = tf.reduce_mean(self.td_error)
            self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001)
            self.updateModel = self.trainer.minimize(self.loss)


    class experience_buffer():
        def __init__(self, buffer_size=50000):
            self.buffer = []
            self.buffer_size = buffer_size

        def add(self, experience):
            if len(self.buffer) + len(experience) >= self.buffer_size:
                self.buffer[0:(len(experience) + len(self.buffer)) - self.buffer_size] = []
            self.buffer.extend(experience)

        def sample(self, size):
            return np.reshape(np.array(random.sample(self.buffer, size)), [size, 5])


    def processState(states):
        return np.reshape(states, [21168])

    def updateTargetGraph(tfVars, tau):
        total_vars = len(tfVars)
        op_holder = []
        for idx, var in enumerate(tfVars[0:total_vars // 2]):
            op_holder.append(tfVars[idx + total_vars // 2].assign(
                (var.value() * tau) + ((1 - tau) * tfVars[idx + total_vars // 2].value())))
        return op_holder

    def updateTarget(op_holder, sess):
        for op in op_holder:
            sess.run(op)


    batch_size = 32  # How many experiences to use for each training step.
    update_freq = 4  # How often to perform a training step.
    y = .99  # Discount factor on the target Q-values
    startE = 1  # Starting chance of random action
    endE = 0.1  # Final chance of random action
    annealing_steps = 10000.  # How many steps of training to reduce startE to endE.
    num_episodes = 220    # How many episodes of game environment to train network with.    ##### default = 10000
    pre_train_steps = 10000  # How many steps of random actions before training begins.
    max_epLength = 50  # The max allowed length of our episode.
    load_model = False  # Whether to load a saved model.
    path = "./dqn"  # The path to save our model to.
    h_size = 512  # The size of the final convolutional layer before splitting it into Advantage and Value streams.
    tau = 0.001  # Rate to update target network toward primary network


    tf.reset_default_graph()
    mainQN = Qnetwork(h_size)
    targetQN = Qnetwork(h_size)

    init = tf.global_variables_initializer()

    saver = tf.train.Saver()

    trainables = tf.trainable_variables()

    targetOps = updateTargetGraph(trainables, tau)

    myBuffer = experience_buffer()

    # Set the rate of random action decrease.
    e = startE
    stepDrop = (startE - endE) / annealing_steps

    # create lists to contain total rewards and steps per episode
    jList = []
    rList = []
    total_steps = 0

    # Make a path for our model to be saved in.
    if not os.path.exists(path):
        os.makedirs(path)

    with tf.Session() as sess:
        sess.run(init)
        if load_model == True:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(path)
            saver.restore(sess, ckpt.model_checkpoint_path)
        for i in range(num_episodes):
            print('episode:', i)

            episodeBuffer = experience_buffer()
            # Reset environment and get first new observation
            s = env.reset()
            s = processState(s)
            d = False
            rAll = 0
            j = 0
            # The Q-Network
            while j < max_epLength:  # If the agent takes longer than max_epLength (50) moves, end the episode.
                j += 1
                # Choose an action by greedily (with e chance of random action) from the Q-network
                if np.random.rand(1) < e or total_steps < pre_train_steps:
                    a = np.random.randint(0, 4)
                else:
                    a = sess.run(mainQN.predict, feed_dict={mainQN.scalarInput: [s]})[0]
                s1, r, d = env.step(a)
                s1 = processState(s1)
                total_steps += 1
                episodeBuffer.add(
                    np.reshape(np.array([s, a, r, s1, d]), [1, 5]))  # Save the experience to our episode buffer.

                if total_steps > pre_train_steps:
                    if e > endE:
                        e -= stepDrop

                    if total_steps % (update_freq) == 0:
                        trainBatch = myBuffer.sample(batch_size)  # Get a random batch of experiences.
                        # Below we perform the Double-DQN update to the target Q-values
                        Q1 = sess.run(mainQN.predict, feed_dict={mainQN.scalarInput: np.vstack(trainBatch[:, 3])})
                        Q2 = sess.run(targetQN.Qout, feed_dict={targetQN.scalarInput: np.vstack(trainBatch[:, 3])})
                        end_multiplier = -(trainBatch[:, 4] - 1)
                        doubleQ = Q2[range(batch_size), Q1]
                        targetQ = trainBatch[:, 2] + (y * doubleQ * end_multiplier)
                        # Update the network with our target values.
                        _ = sess.run(mainQN.updateModel, feed_dict={mainQN.scalarInput: np.vstack(trainBatch[:, 0]),
                                                mainQN.targetQ: targetQ, mainQN.actions: trainBatch[:, 1]})

                        updateTarget(targetOps, sess)  # Update the target network toward the primary network.
                rAll += r
                s = s1

                if d == True:
                    break

            myBuffer.add(episodeBuffer.buffer)
            jList.append(j)
            rList.append(rAll)
            # Periodically save the model.
            if i % 1000 == 0:
                saver.save(sess, path + '/model-' + str(i) + '.ckpt')
                print("Saved Model")
            if len(rList) % 10 == 0:
                print(total_steps, np.mean(rList[-10:]), e)
        saver.save(sess, path + '/model-' + str(i) + '.ckpt')
    print("Percent of successful episodes: " + str(sum(rList) / num_episodes) + "%")


    rMat = np.resize(np.array(rList), [len(rList) // 100, 100])
    rMean = np.average(rMat, 1)
    plt.plot(rMean)
    plt.show()
Code Example #12
File: DRQN.py Project: huyuxiang/tensorflow_practice
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import scipy.misc
import os
import csv
import itertools
import tensorflow.contrib.slim as slim

from helper import updateTargetGraph,updateTarget,processState,saveToCenter
from gridworld import gameEnv


# With partial=True the environment is partially observable; with partial=False it is fully observable
env = gameEnv(partial=True, size=9)

class Qnetwork():
    def __init__(self,h_size,rnn_cell,myScope):
        self.scalarInput = tf.placeholder(shape=[None,21168],dtype=tf.float32)
        self.imageIn = tf.reshape(self.scalarInput,shape=[-1,84,84,3])
        self.conv1 = slim.convolution2d(inputs = self.imageIn,num_outputs=32,kernel_size = [8,8],
                                        stride=[4,4],padding='VALID',biases_initializer=None,
                                        scope=myScope + "_conv1") #[-1,20,20,32]
        self.conv2 = slim.convolution2d(inputs = self.conv1, num_outputs = 64,kernel_size=[4,4],
                                        stride=[2,2],padding='VALID',
                                        biases_initializer= None,scope=myScope+"_conv2") # [-1,9,9,64]
        self.conv3 = slim.convolution2d(inputs = self.conv2,num_outputs = 64,kernel_size=[3,3],stride=[1,1],
                                        padding='VALID',biases_initializer=None,scope=myScope+"_conv3") #[-1,7,7,64]
        self.conv4 = slim.convolution2d(inputs = self.conv3,num_outputs=h_size,kernel_size=[7,7],stride=[1,1],
                                        padding='VALID',biases_initializer=None,scope=myScope+"_conv4") #[-1,1,1,h_size]
Code Example #13
def create_env():
    return gameEnv(False, 5, True)
Code Example #14
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import random
import tensorflow as tf
import os

from gridworld import gameEnv
env = gameEnv(partial=False, size=5)


class Qnetwork():
    def __init__(self, h_size):
        #The network receives a frame from the game, flattened into an array.
        #It then resizes it and processes it through four convolutional layers.
        self.scalarInput = tf.placeholder(shape=[None, 21168],
                                          dtype=tf.float32)
        self.imageIn = tf.reshape(self.scalarInput, shape=[-1, 84, 84, 3])
        self.conv1 = tf.contrib.layers.convolution2d( \
            inputs=self.imageIn,num_outputs=32,kernel_size=[8,8],stride=[4,4],padding='VALID', biases_initializer=None)
        self.conv2 = tf.contrib.layers.convolution2d( \
            inputs=self.conv1,num_outputs=64,kernel_size=[4,4],stride=[2,2],padding='VALID', biases_initializer=None)
        self.conv3 = tf.contrib.layers.convolution2d( \
            inputs=self.conv2,num_outputs=64,kernel_size=[3,3],stride=[1,1],padding='VALID', biases_initializer=None)
Code Example #15
import numpy as np
import random
import matplotlib.pyplot as plt

#https://github.com/awjuliani/DeepRL-Agents/blob/master/gridworld.py
from gridworld import gameEnv

from keras.models import Model
from keras.layers import Input, Dense, Lambda
from keras.layers.convolutional import Conv2D
from keras import backend as K
from keras.engine.topology import Layer
from keras.layers.merge import _Merge, Multiply

env_size = 5
env = gameEnv(partial=False, size=env_size)

class Experience():

    def __init__(self, buffer_size):
        
        self.replay_buffer = []
        self.buffer_size = buffer_size

    def storeExperience(self, exp):

        if len(exp) + len(self.replay_buffer) >= self.buffer_size:
            del self.replay_buffer[:(len(exp) + len(self.replay_buffer) - self.buffer_size)]

        self.replay_buffer.extend(exp)
Code Example #16
import numpy as np
import tensorflow as tf

from gridworld import gameEnv
slim = tf.contrib.layers
np.random.seed(2)
tf.set_random_seed(2)  # reproducible

# Hyperparameters
OUTPUT_GRAPH = True
MAX_EPISODE = 30000
DISPLAY_REWARD_THRESHOLD = 200  # renders environment if total episode reward is greater than this threshold
MAX_EP_STEPS = 50   # maximum time step in one episode
RENDER = False  # rendering wastes time
GAMMA = 0.9     # reward discount in TD error
LR_A = 0.0001    # learning rate for actor
LR_C = 0.001     # learning rate for critic

env = gameEnv(partial=False, size=8)
# env.seed(1)  # reproducible
# env = env.unwrapped

N_F = 21168
N_A = 4

def processState(states):
    return np.reshape(states, [21168])


class Actor(object):
    def __init__(self, sess, n_features, n_actions, lr=0.001, h_size =512):
        self.sess = sess
        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.a = tf.placeholder(tf.int32, None, "act")
Code Example #17
File: Example07.py Project: shsym/RL_self
# Example of Deep Recurrent Q-Network for partially observable environment
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import scipy.misc
import os
import csv
import itertools
import tensorflow.contrib.slim as slim

from helper import *
from gridworld import gameEnv

env = gameEnv(partial=True, size=9)


# class for Q-network
class Qnetwork():
    def __init__(self, h_size, rnn_cell, myScope):
        self.scalarInput = tf.placeholder(shape=[None, 21168],
                                          dtype=tf.float32)
        self.imageIn = tf.reshape(self.scalarInput, shape=[-1, 84, 84, 3])
        self.conv1 = slim.conv2d(inputs=self.imageIn,
                                 num_outputs=32,
                                 kernel_size=[8, 8],
                                 stride=[4, 4],
                                 padding='VALID',
                                 biases_initializer=None,
                                 scope=myScope + '_conv1')
        self.conv2 = slim.conv2d(inputs=self.conv1,
Code Example #18
File: mediumDQN.py Project: thbeucher/DQN
For more reinforcement learning tutorials, as well as the required gridworld.py file, see: https://github.com/awjuliani/DeepRL-Agents
In [ ]:
import gym
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import scipy.misc
import os
%matplotlib inline
Load the game environment
Feel free to adjust the size of the gridworld. Making it smaller provides an easier task for our DQN agent, while making the world larger increases the challenge.
In [ ]:
from gridworld import gameEnv

env = gameEnv(partial=False,size=5)
Above is an example of a starting environment in our simple game. The agent controls the blue square, and can move up, down, left, or right. The goal is to move to the green square (for +1 reward) and avoid the red square (for -1 reward). The position of the three blocks is randomized every episode.
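To make that interface concrete, here is a minimal sketch (not part of the original notebook) that runs a random agent for one 50-step episode; it assumes the gridworld.py linked above, where env.step(a) returns (state, reward, done) and actions 0-3 correspond to the four movement directions.

import numpy as np
from gridworld import gameEnv

env = gameEnv(partial=False, size=5)
state = env.reset()          # an 84x84x3 RGB rendering of the grid
total_reward = 0
for _ in range(50):          # the examples below also cap episodes at 50 steps
    action = np.random.randint(0, 4)   # pick one of the four moves at random
    state, reward, done = env.step(action)
    total_reward += reward
print('random-agent reward over 50 steps:', total_reward)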
Implementing the network itself
In [ ]:
class Qnetwork():
    def __init__(self,h_size):
        #The network receives a frame from the game, flattened into an array.
        #It then resizes it and processes it through four convolutional layers.
        self.scalarInput =  tf.placeholder(shape=[None,21168],dtype=tf.float32)
        self.imageIn = tf.reshape(self.scalarInput,shape=[-1,84,84,3])
        self.conv1 = tf.contrib.layers.convolution2d( \
            inputs=self.imageIn,num_outputs=32,kernel_size=[8,8],stride=[4,4],padding='VALID', biases_initializer=None)
        self.conv2 = tf.contrib.layers.convolution2d( \
            inputs=self.conv1,num_outputs=64,kernel_size=[4,4],stride=[2,2],padding='VALID', biases_initializer=None)
        self.conv3 = tf.contrib.layers.convolution2d( \
            inputs=self.conv2,num_outputs=64,kernel_size=[3,3],stride=[1,1],padding='VALID', biases_initializer=None)
Code Example #19
def main():
    env = gridworld.gameEnv(partial=False, size=5)
    input_shape = env.state.shape
    tf.reset_default_graph()
    e = .5

    # Describe the NN used to select actions
    input_layer = tf.placeholder(shape=input_shape, dtype=tf.float32)
    actions = make_nn(input_layer)

    # Create learning parameters (loss)
    next_actions = tf.placeholder(shape=(1, env.actions), dtype=tf.float32)
    loss = tf.reduce_sum(tf.square(next_actions - actions))
    train_op = tf.train.GradientDescentOptimizer(
        learning_rate=LEARN_RATE).minimize(loss)

    rewards = []

    plt.ion()
    plt.show()
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init)
        try:
            saver.restore(sess, model_dir)
        except:
            print('New Model')

        for i in range(EPISODES):
            s = env.reset()  # specific state
            net_reward = 0
            done = False
            j = 0
            while not done and j < 200:
                # Choose the action greedily but with chance e of random action
                # an action function (action) becomes a specific action (a)
                # a set of action functions (actions) becomes a set of Q values (Q)
                Q = sess.run(actions, feed_dict={input_layer: s})
                a = np.argmax(Q, 1)[0]
                if np.random.rand(1) < e:
                    a = np.random.randint(0, env.actions)
                # Perform action and get the next state and reward
                next_s, reward, done = env.step(a)
                maybe_show(i, s)
                # Obtain the next Q values by feeding the new state to the network
                # and use it to train.
                next_Q = sess.run(actions, feed_dict={input_layer: next_s})
                Q[0, a] = reward + DISCOUNT * np.max(next_Q)
                sess.run(train_op, feed_dict={input_layer: s, next_actions: Q})
                net_reward += reward
                s = next_s
                j += 1

            e *= .99
            rewards.append(net_reward)
            if i % 1000 == 0:
                plt.close()
                if i >= 1000:
                    print('Round', i, '- Net Reward last 1000:',
                          sum(rewards[-1000:]))
        save_path = saver.save(sess, model_dir)
        print(save_path)

    plt.close()
    plt.ioff()
    plt.plot(rewards)
    plt.show()
Code Example #20
Q(s,a) = V(s) + A(s,a)

Dueling DQN has resulted in improved performance, stability, and faster learning.

A Deep Q-Network using both Double DQN and Dueling DQN.

The agent learns to solve a navigation task in a basic grid world.
Goal:
Move the blue block to the green one while avoiding red. The agent controls the blue square
and can move up, down, left, and right.
Moving onto the green square gives reward = +1; hitting the red square gives reward = -1.
The position of the 3 blocks is randomized every episode.
'''
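In practice the two streams are recombined with the mean advantage subtracted, so that V and A stay identifiable: Q(s,a) = V(s) + (A(s,a) - mean_a A(s,a)). This is what the dueling examples above do with tf.reduce_mean. A minimal NumPy sketch of that aggregation (illustrative only; the array values are made up):

import numpy as np

value = np.array([[0.5]])                     # V(s), shape [batch, 1]
advantage = np.array([[1.0, 2.0, 3.0, 4.0]])  # A(s,a), shape [batch, n_actions]
q = value + (advantage - advantage.mean(axis=1, keepdims=True))
print(q)  # [[-1.  0.  1.  2.]]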
# load environment
# partial = False: the whole state of the environment is visible to the network.
env = gameEnv(partial=False, size=5)  # 5*5 grid


# defining conv layer
def conv2d(x, W, b, strides=1, padding='SAME', activation=tf.nn.relu):
    x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding=padding)
    x = tf.nn.bias_add(x, b)
    return activation(x)


# size of final conv layer before splitting into Advantage and Value streams(Dueling)
h_size = 512

# we want 4 conv layers in Q network and target network
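A hedged sketch (not part of the original file) of how the conv2d helper above could be chained into that four-layer stack, reusing the h_size defined above and assuming tensorflow is imported as tf; it follows the 8x8/4, 4x4/2, 3x3/1, 7x7/1 geometry the other examples use on an 84x84x3 input, and the weight/bias variables here are illustrative:

def conv_weight(shape):
    # Xavier-initialized filter of the given [height, width, in_ch, out_ch] shape
    return tf.Variable(tf.contrib.layers.xavier_initializer()(shape))

x = tf.placeholder(tf.float32, [None, 84, 84, 3])
c1 = conv2d(x, conv_weight([8, 8, 3, 32]), tf.Variable(tf.zeros([32])), strides=4, padding='VALID')          # -> 20x20x32
c2 = conv2d(c1, conv_weight([4, 4, 32, 64]), tf.Variable(tf.zeros([64])), strides=2, padding='VALID')        # -> 9x9x64
c3 = conv2d(c2, conv_weight([3, 3, 64, 64]), tf.Variable(tf.zeros([64])), strides=1, padding='VALID')        # -> 7x7x64
c4 = conv2d(c3, conv_weight([7, 7, 64, h_size]), tf.Variable(tf.zeros([h_size])), strides=1, padding='VALID') # -> 1x1xh_size
# c4 can now be flattened and split into Value and Advantage streams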

Code Example #21
def train(args):
    """
    Create a GridworldQnet and have it play games and train.
    """
    scope = cur_scope()
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)
    e = args.random_action
    env = gridworld.gameEnv(partial=False, size=5)
    tf.reset_default_graph()

    # Convert update_target_net_rate from number of games between copies to
    # the number of 'updates' between.
    update_target_net_rate = \
        args.update_target_net_rate * args.game_len * args.replays
    qnet = get_net(
        scope, env.state.shape, env.actions, args.discount,
        args.learning_rate, args.experiences_size, args.batch_size,
        update_target_net_rate)

    rewards = []

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init)
        restore_from_ckpt(sess, scope, args.restore_model, env.state.shape,
                           env.actions)

        for ep in range(args.episodes):
            state = normalize_img(env.reset())
            done = False
            turn = 0
            net_reward = 0
            n_updates = 0
            while not done and turn < args.game_len:
                action = qnet.predict(sess, np.array([state]))[0]
                if np.random.rand(1) < e:
                    action = qnet.rand_action()

                next_state, reward, done = env.step(action)
                next_state = normalize_img(next_state)
                qnet.add_experience(state, action, reward, next_state, done)
                for i in range(abs(int(reward))*10):
                    # Add extra copies of the experience when there is a reward, because rewards are sparse
                    qnet.add_experience(state, action, reward, next_state, done)

                turn += 1
                state = next_state
                net_reward += reward

                while qnet.experiences_full and n_updates < (turn * args.replays):
                    # This is where the actual training happens. We look back
                    # and sample experiences we have had and learn by
                    # replaying them.
                    n_updates += args.batch_size
                    qnet.update(sess)

            e *= args.random_decay
            rewards.append(net_reward)

            if (ep + 1) % args.ckpt_every == 0 and ep > 0:
                ckpt_path = make_ckpt_path(args.model_dir, scope, ep + 1)
                print(datetime.now().strftime("%Y-%m-%d %H:%M:%S "),
                      os.path.basename(ckpt_path), ' ',
                      resource.getrusage(resource.RUSAGE_SELF).ru_maxrss//2**10,
                      'kB random_action={:.5f}'.format(e), ' game_len=', args.game_len,
                      ' Net reward last ', args.ckpt_every, ' games: ',
                      sum(rewards[-args.ckpt_every:]),
                      sep='')
                saver.save(sess, ckpt_path)
Code Example #22
import random
import sys
import psutil
import tensorflow as tf
import gridworld as gWorld
if "../" not in sys.path:
  sys.path.append("../")

from lib import plotting
from collections import deque, namedtuple


# In[ ]:

#env = gym.envs.make("Breakout-v0")
env = gWorld.gameEnv(True, 5)

# In[ ]:

# Atari Actions: 0 (noop), 1 (fire), 2 (left) and 3 (right) are valid actions
VALID_ACTIONS = [0, 1, 2, 3]


# In[ ]:

class StateProcessor():
    """
    Processes a raw Atari image. Resizes it and converts it to grayscale.
    """
    def __init__(self):
        # Build the Tensorflow graph
Code Example #23
from __future__ import division

import gym
import numpy as np
import random
import tensorflow as tf
import tensorflow.contrib.slim as slim
import matplotlib.pyplot as plt
import scipy.misc
import os

from gridworld import gameEnv

env = gameEnv(partial=False, size=5)


class Qnetwork():
    def __init__(self, h_size):
        # The network receives a frame from the game, flattened into an array.
        # It then resizes it and processes it through four convolutional layers.
        self.scalarInput = tf.placeholder(shape=[None, 21168],
                                          dtype=tf.float32)
        self.imageIn = tf.reshape(self.scalarInput, shape=[-1, 84, 84, 3])
        self.conv1 = slim.conv2d( \
            inputs=self.imageIn, num_outputs=32, kernel_size=[8, 8], stride=[4, 4], padding='VALID',
            biases_initializer=None)
        self.conv2 = slim.conv2d( \
            inputs=self.conv1, num_outputs=64, kernel_size=[4, 4], stride=[2, 2], padding='VALID',
            biases_initializer=None)
        self.conv3 = slim.conv2d( \
            inputs=self.conv2, num_outputs=64, kernel_size=[3, 3], stride=[1, 1], padding='VALID',
Code Example #24
        avg_reward += total_reward

    avg_reward /= 20.

    return avg_reward


args = parse_arguments()
num_episodes = 100000
maxSteps = 300
# num_s_iterations = 1000000
# environment_name = args.env
gamma = 1.0
C = args.C
tau = 1e-2
env = gameEnv(partial=False, size=32, object_size=1)
# env = gym.make(environment_name)
# env = gym.wrappers.Monitor(env, 'space_invaders_dqn_color_curr', video_callable=lambda episode_id: episode_id%(num_episodes//100)==0, force=True)

memory = Replay_Memory(burn_in=0)
# nets = torch.load('space_invaders_dqn_target_new.pth', map_location=lambda storage, loc: storage)
# target_net = nets['target_net']
# online_net = nets['online_net']
target_net = QNetworkConv(env)
online_net = QNetworkConv(env)
online_net.cuda()
target_net.cuda()

optimizer = optim.Adam(online_net.parameters(), lr=1e-4)

s_iter = 0