Example no. 1
    def __init__(self):
        # Our Main Model: POLICY NETWORK
        self.dqn = DeepQNetwork(model_name=MODEL_NAME,
                                input_dim=INPUT_DIM,
                                n_actions=N_ACTIONS,
                                layer1_units=LAYER1_UNITS,
                                layer2_units=LAYER2_UNITS,
                                lr=LEARNING_RATE)

        self.model = self.dqn.create_model()

        # TARGET NETWORK
        self.target_model = self.dqn.create_model()

        self.target_model.set_weights(self.model.get_weights())

        # An array with last n steps for training
        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

        # Tensorboard for logging results
        self.tensorboard = ModifiedTensorBoard(
            log_dir=f'../logs/{MODEL_NAME}-{int(time.time())}')

        # target update counter
        self.target_update_counter = 0
Example no. 2
def replay_train(mainDQN: DeepQNetwork, targetDQN: DeepQNetwork,
                 train_batch: list) -> float:
    """Trains `mainDQN` with target Q values given by `targetDQN`
    Args:
        mainDQN (DeepQNetwork): Main DQN that will be trained
        targetDQN (DeepQNetwork): Target DQN that will predict Q_target
        train_batch (list): Minibatch of replay memory
            Each element is (s, a, r, s', done)
            [(state, action, reward, next_state, done), ...]
    Returns:
        float: After updating `mainDQN`, it returns a `loss`
    """
    states = np.vstack([x[0] for x in train_batch])
    actions = np.array([x[1] for x in train_batch])
    rewards = np.array([x[2] for x in train_batch])
    next_states = np.vstack([x[3] for x in train_batch])
    done = np.array([x[4] for x in train_batch])

    predict_result = targetDQN.predict(next_states)
    Q_target = rewards + FLAGS.discount_rate * np.max(predict_result,
                                                      axis=1) * (1 - done)

    X = states
    y = mainDQN.predict(states)
    y[np.arange(len(X)), actions] = Q_target

    # Train our network using target and predicted Q values on each episode
    return mainDQN.update(X, y)
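
For context, a hedged sketch of a call site for replay_train, assuming a deque-based replay memory and the same FLAGS object; the buffer and network names here are illustrative, not taken from the original repository:

import random
from collections import deque

replay_memory = deque(maxlen=50000)  # hypothetical buffer of (s, a, r, s', done) tuples

# ... interact with the environment and append transitions to replay_memory ...

if len(replay_memory) > FLAGS.batch_size:
    minibatch = random.sample(replay_memory, FLAGS.batch_size)
    loss = replay_train(mainDQN, targetDQN, minibatch)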
Example no. 3
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                    input_dims=self.input_dims,
                                    name=self.env_name+'_'+self.algo+'_q_eval',
                                    chkpt_dir=self.chkpt_dir)
        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                    input_dims=self.input_dims,
                                    name=self.env_name+'_'+self.algo+'_q_next',
                                    chkpt_dir=self.chkpt_dir)
Example no. 4
    def __init__(self):
        # from the origin base.agent
        self.reward = 0
        self.episodes = 0
        self.steps = 0
        self.obs_spec = None
        self.action_spec = None

        self.dqn = DeepQNetwork(
            len(smart_actions),
            10,  # one of the most important values; it needs to be updated manually
            learning_rate=0.001,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=200,
            memory_size=5000,
            batch_size=32,
            e_greedy_increment=None,
            output_graph=True)

        # self defined vars
        self.fighting = False
        self.player_hp = []
        self.enemy_hp = []
        self.previous_enemy_hp = []
        self.previous_player_hp = []
        self.leftover_enemy_hp = []
        self.win = 0
        self.count = 0

        self.previous_action = None
        self.previous_state = None
Example no. 5
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 input_dims,
                 batch_size,
                 n_actions,
                 max_mem_size=100000,
                 eps_end=0.01,
                 eps_dec=5e-5):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        self.action_space = [i for i in range(n_actions)]
        self.mem_size = max_mem_size
        self.batch_size = batch_size
        self.mem_cntr = 0

        self.Q_eval = DeepQNetwork(self.lr,
                                   n_actions=n_actions,
                                   input_dims=input_dims,
                                   fc1_dims=256,
                                   fc2_dims=256)
        self.state_memory = np.zeros((self.mem_size, *input_dims),
                                     dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims),
                                         dtype=np.float32)

        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool_)
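
The arrays above implement a circular replay buffer; the following store_transition method is a hedged sketch consistent with them (the original implementation is not shown in this snippet):

    def store_transition(self, state, action, reward, state_, done):
        # Wrap around and overwrite the oldest entry once the buffer is full.
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = done
        self.mem_cntr += 1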
Example no. 6
def run(arguments) -> None:
    #Create the env
    env = GameEnv()
    agent1 = DeepQNetwork.restore(arguments["TRAINED_MODEL_1"])
    agent2 = DeepQNetwork.restore(arguments["TRAINED_MODEL_2"])

    # Test the agent that was trained
    for e_test in range(TEST_Episodes):
        state = env.reset()
        state = np.reshape(state, [1, agent1.nS])
        tot_reward1 = 0
        tot_reward2 = 0

        if WRITE_VIDEO and e_test == 0:
            fig = plt.figure()
            frames = []

        for t_test in range(1000):
            if SHOW_GAME:
                show_game(env.render_env(), t_test, tot_reward1, tot_reward2)
            if WRITE_VIDEO and e_test == 0:
                temp = env.render_env()
                frames.append([
                    plt.text(0, -1, "Time: " + str(t_test), fontsize=8),
                    plt.text(7,
                             -1,
                             "Total reward - player 1: " + str(tot_reward1) +
                             ",  player 2: " + str(tot_reward2),
                             fontsize=8),
                    plt.imshow(temp, animated=True)
                ])

            agent1_action = agent1.test_action(state)
            agent2_action = agent2.test_action(state)
            reward1, reward2 = env.move(agent1_action, agent2_action)
            nstate = tf.reshape(env.contribute_metrix(), [-1])
            nstate = np.reshape(nstate, [1, agent1.nS])
            tot_reward1 += reward1
            tot_reward2 += reward2

            #DON'T STORE ANYTHING DURING TESTING
            state = nstate
            if t_test == 999:
                print("episode: {}/{}, scores: {}, {}".format(
                    e_test, TEST_Episodes, tot_reward1, tot_reward2))
                break

        if WRITE_VIDEO and e_test == 0:
            Writer = matplotlib.animation.writers['ffmpeg']
            writer = Writer(fps=15, metadata=dict(artist='Me'), bitrate=1800)
            ani = matplotlib.animation.ArtistAnimation(fig,
                                                       frames,
                                                       interval=20,
                                                       blit=True)
            ani.save('movies/' + arguments["TRAINED_MODEL_1"].split('/')[-1] +
                     '_test_2players.mp4',
                     writer=writer)
            print('Video saved.')
Example no. 7
    def __init__(self, inputs, n_actions):
        self.brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
        self.target_brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
        self.target_brain.load_state_dict(self.brain.state_dict())
        self.target_brain.eval()

        self.set_params()
        self.optimizer = torch.optim.Adam(self.brain.parameters())
        self.memory = ReplayMemory(50000)
        self.action_space = [0, 1]
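
The target network above is synced only once, at construction; a hedged helper for re-syncing it periodically during training (the method name is an assumption, not part of the original class):

    def update_target(self):
        # Copy the online network's weights into the frozen target network.
        self.target_brain.load_state_dict(self.brain.state_dict())
        self.target_brain.eval()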
Example no. 8
class DQNAgent(Agent):
    def __init__(self, *args, **kwargs):
        super(DQNAgent, self).__init__(*args, **kwargs)

        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_eval',
                                   chkpt_dir=self.chkpt_dir)
        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_next',
                                   chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation],
                             dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()

        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        q_pred = self.q_eval.forward(states)[indices, actions]

        q_next = self.q_next.forward(states_).max(dim=1)[0]
        q_next[dones] = 0.0

        q_target = rewards + self.gamma * q_next

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()
Example no. 9
    def __init__(self, *args, **kwargs):
        super(DQNAgent, self).__init__(*args, **kwargs)

        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_eval',
                                   chkpt_dir=self.chkpt_dir)
        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_next',
                                   chkpt_dir=self.chkpt_dir)
Example no. 10
    def __init__(self, *args, **kwargs):
        super(DQNAgent, self).__init__(*args, **kwargs)

        # define Q-evaluation network and target Q-network for the agent
        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_eval',
                                   chkpt_dir=self.chkpt_dir)

        # we will never perform gradient descent or backpropagation with Q next network
        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_next',
                                   chkpt_dir=self.chkpt_dir)
Example no. 11
    def __init__(self):
        self.dqn = DeepQNetwork(model_name=MODEL_NAME,
                                input_dim=INPUT_DIM,
                                n_actions=N_ACTIONS,
                                layer1_units=LAYER1_UNITS,
                                layer2_units=LAYER2_UNITS,
                                lr=LEARNING_RATE)
        # Our Main Model: POLICY NETWORK
        if LOAD_MODEL is not None:
            print(f"Loading {LOAD_MODEL}")
            self.model = load_model(LOAD_MODEL)
            print(f"Loaded Model: {LOAD_MODEL}")
        else:
            self.model = self.dqn.create_model()

        # TARGET NETWORK
        self.target_model = self.dqn.create_model()
        self.target_model.set_weights(self.model.get_weights())

        # An array with last n steps for training
        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

        # target update counter
        self.target_update_counter = 0
Example no. 12
    def __init__(self):
        self.dqn = DeepQNetwork(n_actions=524, n_features=13)

        self.previous_action = None
        self.previous_state = None

        self.episodes = 0
        self.steps = 0
        self.reward = 0

        self.reward_weights = np.array([
            .2,  ##blizz_score
            .2,
            .2,  ##total_unit_value, total_structure_value
            .2,
            .3,  ##killed_unit_value, killed_building_value
            .2,
            .2,  ##mineral_rate, mineral_spent
            .2,
            .1,  ##supply_used, supply_limit
            .3,
            .3,  ##army_supply,worker_supply
            .3  #army_count
        ])
Example no. 13
def main():
    env = gym.make('Catcher-v0')

    model = make_model()
    model.load_weights('data/weights520000.dat')

    net = DeepQNetwork(env, model, 10000)

    # net.train()
    net.play()
    net.play()
    net.play()
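
make_model is not shown in this snippet; a minimal Keras sketch under the assumption of a flat observation vector and a small fully connected Q-network (layer sizes and dimensions are illustrative only):

import tensorflow as tf

def make_model(n_inputs=4, n_actions=3):
    # Hypothetical Q-network: observation vector in, one Q-value per action out.
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(n_inputs,)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(n_actions, activation='linear'),
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss='mse')
    return model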
Example no. 14
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "hn:", ["help", "network="])
        except getopt.error as msg:
            raise Usage(msg)
    except Usage as err:
        print(err.msg, file=sys.stderr)
        print("for help use --help", file=sys.stderr)
        return 2

    if len(opts) == 0:
        print("Please specify parameters!")
        return 1
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(__doc__)
            return 0
        elif opt in ("-n", "--network"):
            if arg == 'dqn':
                from dqn import DeepQNetwork
                score_graph_path = './saved_model/'
                network = DeepQNetwork(
                    e_greedy=0.1,
                    output_graph=True,
                    save_path=score_graph_path,
                )
            elif arg == 'doubledqn':
                from double_dqn import DoubleDQN
                score_graph_path = './saved_model_doubledqn/'
                network = DoubleDQN(
                    e_greedy=0.1,
                    output_graph=True,
                    save_path=score_graph_path,
                )

            else:
                print("You can choose 'dqn' or 'doubledqn' as the network parameter")
                return 1
            train(network, score_graph_path)
            return 0
Example no. 15
def bot_play(mainDQN: DeepQNetwork, env: gym.Env) -> None:
    """Test runs with rendering and logger.infos the total score
    Args:
        mainDQN (DeepQNetwork): DQN agent to run a test
        env (gym.Env): Gym Environment
    """
    state = env.reset()
    reward_sum = 0

    while True:
        env.render()
        action = np.argmax(mainDQN.predict(state))
        state, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            logger.info("Total score: {}".format(reward_sum))
            break
Example no. 16
def CartPoleDQN():
    env = gym.make('CartPole-v0')
    env = env.unwrapped
    env.reset()

    rl0 = DeepQNetwork(env.action_space.n,
                       env.state.shape[0],
                       learning_rate=0.01,
                       reward_decay=0.9,
                       e_greedy=0.9,
                       replace_target_iter=200,
                       memory_size=100000,
                       batch_size=32,
                       e_greedy_increment=None,
                       output_graph=None)

    game = Game(env, rl0)
    game.run()
Example no. 17
def run(render):
    net = DeepQNetwork(sess,
                       N_A, N_S,
                       learning_rate=0.01,
                       reward_decay=0.9,
                       e_greedy=0.9,
                       replace_target_iter=200,
                       memory_size=2000,
                       scope='dqn_{0}'.format(0),
                       # output_graph=True
                       )

    sess.run(tf.global_variables_initializer())


    step = 0
    for episode in range(300):
        # initial observation
        s = env.reset()

        while True:
            # RL choose action based on observation
            a, q = net.choose_action(s)


            # RL take action and get next observation and reward
            s_, r, d, _ = env.step(a)
            if render: env.render()

            #print('rewards: {0}'.format(r))

            net.store_transition(s, a, r, s_)


            if (step > 200) and (step % 5 == 0):
                net.learn()

            # swap observation
            s = s_

            # break while loop when end of this episode
            if d:
                break
            step += 1
Example no. 18
import gym
from dqn import DeepQNetwork

env = gym.make('CartPole-v0')
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = DeepQNetwork(
    n_actions = env.action_space.n,
    n_features = env.observation_space.shape[0],
    lr = 0.01,
    e_greedy = 0.9,
    replace_target_iter = 100,
    memory_size = 2000,
    e_greedy_increment = 0.001
    )

total_steps = 0

for i_episode in range(100):

    observation = env.reset()
    ep_r = 0
    while True:
        env.render()

        action = RL.choose_action(observation)
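
        # --- The original snippet is cut off above this point. ---
        # The continuation below is a hedged sketch, assuming the
        # store_transition/learn interface used by the other DeepQNetwork
        # variants in this collection; it is not part of the original code.
        observation_, reward, done, info = env.step(action)

        RL.store_transition(observation, action, reward, observation_)
        ep_r += reward

        # Warm up the replay memory before learning (threshold is illustrative).
        if total_steps > 200:
            RL.learn()

        if done:
            print('episode:', i_episode, 'ep_r:', round(ep_r, 2))
            break

        observation = observation_
        total_steps += 1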
Example no. 19
class DDQNAgent(object):
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                    input_dims=self.input_dims,
                                    name=self.env_name+'_'+self.algo+'_q_eval',
                                    chkpt_dir=self.chkpt_dir)
        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                    input_dims=self.input_dims,
                                    name=self.env_name+'_'+self.algo+'_q_next',
                                    chkpt_dir=self.chkpt_dir)

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = \
                                self.memory.sample_buffer(self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation],dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def replace_target_network(self):
        if self.replace_target_cnt is not None and \
           self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
                           if self.epsilon > self.eps_min else self.eps_min

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()

        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()

        indices = np.arange(self.batch_size)

        q_pred = self.q_eval.forward(states)[indices, actions]
        q_next = self.q_next.forward(states_)
        q_eval = self.q_eval.forward(states_)

        max_actions = T.argmax(q_eval, dim=1)
        q_next[dones] = 0.0

        q_target = rewards + self.gamma*q_next[indices, max_actions]
        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()

        self.q_eval.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()
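
A hedged usage sketch for the agent above, assuming a Gym environment with a flat observation vector; the environment name and hyperparameters are illustrative, not taken from the original project:

import gym

env = gym.make('CartPole-v1')
agent = DDQNAgent(gamma=0.99, epsilon=1.0, lr=1e-4,
                  n_actions=env.action_space.n,
                  input_dims=env.observation_space.shape,
                  mem_size=50000, batch_size=32,
                  algo='DDQNAgent', env_name='CartPole-v1')

for episode in range(10):
    observation = env.reset()
    done = False
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        agent.store_transition(observation, action, reward, observation_, done)
        agent.learn()
        observation = observation_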
Example no. 20
def playGame(train_indicator=1):  #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.0001  # Learning rate for Actor
    LRC = 0.001  # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 16384  # number of sensor inputs

    np.random.seed(61502)

    vision = True

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 40000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0
    esar2 = []
    esar4 = []

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    #from keras import backend as K
    #K.set_session(sess)

    # 1. CREATE DQN NETWORK.
    num_actions_steering = 13  # before it was 13
    num_actions_acceleration = 9  # before it was 3
    num_actions_break = 9  # before it was 3
    num_dqn_actions = num_actions_steering * num_actions_acceleration * num_actions_break
    base_dir = "/home/sergio/Projects/apclypsr/DDPG-Keras-Torcs/"
    args = {
        "save_model_freq": 1000,
        "target_model_update_freq": 1000,
        "normalize_weights": True,
        #"learning_rate": 0.00025,
        'learning_rate': 0.00025,
        "model": None
    }
    dqn = DeepQNetwork(sess, num_dqn_actions, base_dir, args)

    # Tensorflow saver
    saver = tf.train.Saver()

    #actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    #critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    #Now load the weight
    # print("Now we load the weight")
    # try:
    #     # actor.model.load_weights("actormodel2.h5")
    #     # critic.model.load_weights("criticmodel2.h5")
    #     # actor.target_model.load_weights("actormodel2.h5")
    #     # critic.target_model.load_weights("criticmodel2.h5")
    #     # print("Weight load successfully")
    #      saver.restore(sess, base_dir + "dqn.ckpt")
    #      print("model restored")
    # except:
    #    print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 500) == 0:
            # relaunch TORCS every 500 episodes because of a memory leak
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        # 0. BUILD THE 4 images.
        s_t = np.hstack((ob.img))
        s_t_four_images_list = []
        for j in range(4):
            s_t_four_images_list.append(np.zeros((128, 128), dtype=np.float64))
        s_t_phi = get_phi_from_four_images(s_t_four_images_list)

        total_reward = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])

            noise_t = np.zeros([1, action_dim])

            # 2 EVALUATE the first image
            a_t_original_dqn_discrete = dqn.inference(s_t_phi)
            #a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))

            # 2.5 TRANSFORM from discrete to continuous.
            a_t_original_dqn = from_discrete_actions_to_continuous_actions(
                a_t_original_dqn_discrete, num_actions_steering,
                num_actions_acceleration, num_actions_break)
            print("actions: ", a_t_original_dqn)

            # a_t_original[0][0] steering: [-1, 1]
            # noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0],  0.0 , 0.60, 0.30)
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original_dqn[0], 0.0, 0.60, 0.30)
            # a_t_original[0][1] acceleration: [0, 1]. discretize in 6.
            # noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1],  0.5 , 1.00, 0.10)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original_dqn[1], 0.5, 1.00, 0.10)
            # a_t_original[0][2] break: [0, 1]
            # noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1 , 1.00, 0.05)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original_dqn[2], -0.1, 1.00, 0.05)

            # The following code applies a stochastic brake
            if random.random() <= 0.05:
                print("********Now we apply the brake***********")
                #noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)
                noise_t[0][2] = train_indicator * max(
                    epsilon, 0) * OU.function(a_t_original_dqn[2], 0.2, 1.00,
                                              0.10)

            # a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            # a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            # a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            a_t[0][0] = a_t_original_dqn[0] + noise_t[0][0]
            a_t[0][1] = a_t_original_dqn[1] + noise_t[0][1]
            a_t[0][2] = a_t_original_dqn[2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            # 0. UPDATE THE LAST FOUR IMAGES
            s_t1 = np.hstack((ob.img))
            if len(s_t_four_images_list) >= 4:
                s_t_four_images_list.pop(0)
                image = np.reshape(ob.img, (128, 128))
                s_t_four_images_list.append(image)

                # print greyscale image
                # plt.imshow(image, origin='lower')
                # plt.draw()
                # plt.pause(0.001)
            #get phi for the new observed state
            s_t1_phi = get_phi_from_four_images(s_t_four_images_list)

            # Add replay buffer
            #buff.add(s_t, a_t[0], r_t, s_t1, done)
            buff.add(s_t_phi,
                     from_continuous_actions_to_discrete_actions(
                         a_t[0], num_actions_steering,
                         num_actions_acceleration, num_actions_break), r_t,
                     s_t1_phi, done)  # Add replay buffer

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            # states = np.asarray([e[0] for e in batch])
            # actions = np.asarray([e[1] for e in batch])
            # rewards = np.asarray([e[2] for e in batch])
            # new_states = np.asarray([e[3] for e in batch])
            # dones = np.asarray([e[4] for e in batch])
            # y_t = np.asarray([e[1] for e in batch])
            states = [e[0] for e in batch]
            states = np.concatenate(states, axis=0)
            actions = [e[1] for e in batch]
            rewards = [e[2] for e in batch]
            new_states = [e[3] for e in batch]
            new_states = np.concatenate(new_states, axis=0)
            dones = [e[4] for e in batch]
            y_t = [e[1] for e in batch]

            # 3. TRAINING
            #target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])
            # for k in range(len(batch)):
            #     if dones[k]:
            #         y_t[k] = rewards[k]
            #     else:
            #         y_t[k] = rewards[k] + GAMMA*target_q_values[k]

            if (train_indicator):
                # 4 TRAIN
                loss = dqn.train(s_t=states,
                                 s_t1=new_states,
                                 rewards=rewards,
                                 actions=actions,
                                 terminals=dones,
                                 stepNumber=step)

                # loss += critic.model.train_on_batch([states,actions], y_t)
                # a_for_grad = actor.model.predict(states)
                # grads = critic.gradients(states, a_for_grad)
                # actor.train(states, grads)
                # actor.target_train()
                # critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)
            esar = (i, step, a_t, r_t, loss)
            esar2.append(esar)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                # print("Now we save model")
                # actor.model.save_weights("actormodelIMG.h5", overwrite=True)
                # with open("actormodel.json", "w") as outfile:
                #     json.dump(actor.model.to_json(), outfile)
                #
                # critic.model.save_weights("criticmodelIMG.h5", overwrite=True)
                # with open("criticmodel.json", "w") as outfile:
                #     json.dump(critic.model.to_json(), outfile)
                save_path = saver.save(sess, base_dir + "dqn.ckpt")
                print("Model saved in file: %s" % save_path)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

        esar3 = (i, step, total_reward)
        esar4.append(esar3)

        def save_object(obj, filename):
            with open(filename, 'w+b') as output:
                pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

        save_object(esar2, 'IntraEpisode.pkl')
        save_object(esar4, 'InterEpisode.pkl')

    env.end()  # This is for shutting down TORCS
    print("Finish.")
    print("Saving esars.")
Example no. 21
File: main.py Project: fdhcg/rl

class My_Env(Env):
    def __init__(self, p):
        super().__init__(p)
        self.step_value = ACTIONSTEP
        self.maxlen = MAXLEN


if __name__ == "__main__":
    env = My_Env(P)
    RL = DeepQNetwork(
        env.action_space,
        env.observation_space,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.95,
        replace_target_iter=500,
        memory_size=1000,
        # output_graph=True,
        e_greedy_increment=0.01)

    step = 0
    for i in range(EPISODE):
        accu_reward = 0
        env.reset()
        env.step(np.random.randint(0, env.action_space))
        observation = env.observation()
        while 1:

            action = RL.choose_action(observation)
Example no. 22
            # The first 200 steps are pure random exploration; learning starts afterwards
            if (step > 200) and (step % 5 == 0):
                RL.learn()

            # Move to the next state
            state = state_

            # End this episode
            if done:
                break
            step += 1

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":

    # maze game
    env = Maze()

    RL = DeepQNetwork(env.n_actions)

    env.after(100, update)

    env.mainloop()

    RL.plot_cost()
Example no. 23
    n_action = 11
    n_width = 8
    n_height = 3
    n_channel = 1

    n_episode = 1000
    e_greedy_increment = 0.001
    learning_rate = 0.005
    memory_size = 3000
    dueling = True
    prioritized = True
    double_q = True

    dqn = DeepQNetwork(n_action, n_width, n_height, n_channel,
                       e_greedy_increment=e_greedy_increment,
                       memory_size=memory_size,
                       learning_rate=learning_rate,
                       dueling=dueling,
                       prioritized=prioritized,
                       double_q=double_q)
    # dqn.load(372)

    counter = 0
    state = deque([], maxlen=n_width)
    state_ = deque([], maxlen=n_width)

    for i in range(n_episode):

        observation = env.reset()
        state_.append(observation)
        # observation = np.identity(16)[observation:observation + 1]
        # observation = np.expand_dims(observation, axis=2)
        # observation = np.expand_dims(observation, axis=3)
Example no. 24
def playGame(train_indicator=1):  #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.0001  # Learning rate for Actor
    LRC = 0.001  # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake

    state_dim = 512  # number of sensor inputs

    np.random.seed(61502)

    vision = True

    EXPLORE = 100000.
    episode_count = 600000
    max_steps = 1800
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0
    esar2 = []
    esar4 = []

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    #We insert the Deep Q Image Processing Module
    args = {
        'save_model_freq': 10000,
        'target_model_update_freq': 10000,
        'normalize_weights': True,
        'learning_rate': .00025,
        'model': None
    }

    # print(args["save_model_freq"])

    C = DeepQNetwork(state_dim, sess, '/home/lou/DDPG-Keras-Torcs', args=args)
    # print(C)

    x, h_fc1 = C.buildNetwork('test', trainable=True, numActions=1)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    #Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodelIMG.h5")
        critic.model.load_weights("criticmodelIMG.h5")
        actor.target_model.load_weights("actormodelIMG.h5")
        critic.target_model.load_weights("criticmodelIMG.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 500) == 0:
            # relaunch TORCS every 500 episodes because of a memory leak
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        imgfinal = np.zeros((1, 480, 640, 12), dtype=np.int32)
        s_t = C.getFC7(imgfinal)

        # print('ST FIRST', s_t)
        # print('STSHAPE FIRST', np.shape(s_t))

        total_reward = 0.

        imglst = []
        speed = 0
        stepreset = 1

        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])

            noise_t = np.zeros([1, action_dim])

            # a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))

            a_t_original = actor.model.predict(C.getFC7(imgfinal))
            # print('ATORIGINAL', a_t_original)
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code applies a stochastic brake
            if random.random() <= 0.05:
                print("********Now we apply the brake***********")
                noise_t[0][2] = train_indicator * max(
                    epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00,
                                              0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            # print('GTD1SUM:', np.sum(generate_training_data(config=MyConfig)))

            # print('GTD SHAPE', np.shape(generate_training_data(config=MyConfig)))
            imglst.append(generate_training_data(config=MyConfig))

            if len(imglst) == 4:
                imgcopy = imglst[:]
                imgfinal = np.stack(imgcopy)
                #print("Original stacked matrix", imgfinal)

                imgfinal = np.reshape(imgfinal, (4, 480, 640, 3))
                #print("Reshaped stacked matrix", imgfinal)

                #Switch 3 and 0 if you want to switch RGB or Batch
                imgfinal = np.transpose(imgfinal, (1, 2, 3, 0))
                #print("Transposed stacked matrix", imgfinal)

                imgfinal = np.reshape(imgfinal, (1, 480, 640, 12))
                #print("Shape of imgfinal", imgfinal.shape)

            s_t1 = C.getFC7(imgfinal)

            #print('STL', s_t1)
            #print('STLSHAPE', np.shape(s_t1))
            #print('IMGFINAL', imgfinal)

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])

            # print('STATESSHAPE1', states)
            # print('SUMARRAY', np.sum(states))

            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            # print('NEW STATES', new_states)

            # target_q_values = critic.target_model.predict([C.getFC7(imgfinal), actor.target_model.predict(C.getFC7(imgfinal))])

            # print('ACTOR TARGET MODEL PREDICT', C.getFC7(imgfinal))
            new_states = np.reshape(new_states, (-1, state_dim))

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])
            # print('TARGET Q VALUES', target_q_values)
            # print('NEW STATES', new_states)
            # print('ACTOR MODEL PREDICT NEW STATES', actor.target_model.predict(new_states))
            # print('REWARDS', rewards)

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                states = np.reshape(states, (-1, state_dim))
                # print('STATESSHAPE2', np.shape(states))
                # print('ACTIONSSHAPE', np.shape(actions))
                # print('YT', np.shape(y_t))

                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1
            speed += ob.speedX * 300
            speedavg = speed / stepreset
            #print("SPEED X", ob.speedX)

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss, "Average Speed", speedavg)
            esar = (i, step, a_t, r_t, loss, speedavg)
            esar2.append(esar)

            step += 1
            stepreset += 1

            if len(imglst) >= 4:
                del imglst[0]

            # print("Length of imglist", len(imglst))
            # print("List itself", imgfinal)

            if done:
                break

        if np.mod(i, 50) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodelIMG.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodelIMG.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

        esar3 = (i, step, total_reward, speedavg)
        esar4.append(esar3)

        if np.mod(i, 50) == 0:
            save_object(esar2, 'IntraEpisode.pkl')
            save_object(esar4, 'InterEpisode.pkl')

    env.end()  # This is for shutting down TORCS
    print("Finish.")
    print("Saving esars.")
Example no. 25
n_actions = 24
# one state consists of 18 dimensions
# 0-7:   motor signal
# 8:     action
# 9-14:  imu data
# 15-17: position
# n_features = [s_t, s_t-1, s_t-2, s_t-3, s_t-4]
n_features = 90
EPISODE = 1000
TIMESTEP = 1000

if __name__ == "__main__":

    env = SpyndraEnv()
    print "Start model initialization..."
    RL = DeepQNetwork(
        n_actions,
        n_features,
        learning_rate=0.1,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,
        memory_size=4000,
        # output_graph=True
    )
    print "Model initialization complete"
    #env._render()
    run(EPISODE, TIMESTEP)

    #RL.plot_cost()
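
run() is not defined in this snippet; a hedged sketch of what it might look like, built on the env and RL objects above and the store_transition/learn interface common to these DeepQNetwork variants:

def run(n_episode, n_timestep):
    step = 0
    for episode in range(n_episode):
        observation = env.reset()
        for t in range(n_timestep):
            action = RL.choose_action(observation)
            observation_, reward, done, _ = env.step(action)
            RL.store_transition(observation, action, reward, observation_)
            # Warm up the replay memory first, then learn every few steps.
            if step > 200 and step % 5 == 0:
                RL.learn()
            observation = observation_
            step += 1
            if done:
                break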
Example no. 26
class DQNAgent(Agent):
    '''
    Agent based on Deep Q-Network Agent (DQN)
    '''
    def __init__(self, *args, **kwargs):
        super(DQNAgent, self).__init__(*args, **kwargs)

        # define Q-evaluation network and target Q-network for the agent
        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_eval',
                                   chkpt_dir=self.chkpt_dir)

        # we will never perform gradient descent or backpropagation with Q next network
        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_next',
                                   chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation],
                             dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        # zero out previous gradient calculations
        self.q_eval.optimizer.zero_grad()

        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        # calculate Q-predicted and Q-target values (gives action values for the batch of states)
        '''
          dims --> batch_size x n_actions
          The target network estimates the values of the new states that result
          from the agent's actions. We want the values of the maximal actions
          for that particular set of states, which we get by taking the max
          along the action dimension.
        '''
        q_pred = self.q_eval.forward(states)[indices, actions]
        q_next = self.q_next.forward(states_).max(
            dim=1)[0]  # [0] is the max value, [1] would be its index

        # done flag as a type of mask
        q_next[dones] = 0.0
        q_target = rewards + self.gamma * q_next

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()  # backpropagate the loss
        self.q_eval.optimizer.step()  # step the optimizer to update the weights
        self.learn_step_counter += 1  # track steps so the target network is replaced at the right frequency
Example no. 27
class DDQNAgent(Agent):
    '''
    Agent based on Double Deep Q-Nework (Double-DQN)
    '''
    def __init__(self, *args, **kwargs):
        super(DDQNAgent, self).__init__(*args, **kwargs)

        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_eval',
                                   chkpt_dir=self.chkpt_dir)

        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_next',
                                   chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation],
                             dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        # zero out previous gradient calculations
        self.q_eval.optimizer.zero_grad()

        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()

        indices = np.arange(self.batch_size)

        q_pred = self.q_eval.forward(states)[indices, actions]
        q_next = self.q_next.forward(states_)
        q_eval = self.q_eval.forward(states_)

        max_actions = T.argmax(q_eval, dim=1)
        # done flag as a type of mask
        q_next[dones] = 0.0

        q_target = rewards + self.gamma * q_next[indices, max_actions]
        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()  # backpropagate the loss

        self.q_eval.optimizer.step()  # step the optimizer to update the weights
        self.learn_step_counter += 1  # track steps so the target network is replaced at the right frequency

        self.decrement_epsilon()
Example no. 28
class DQNAgent:
    def __init__(self, inputs, n_actions):
        self.brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
        self.target_brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
        self.target_brain.load_state_dict(self.brain.state_dict())
        self.target_brain.eval()

        self.set_params()
        self.optimizer = torch.optim.Adam(self.brain.parameters())
        self.memory = ReplayMemory(50000)
        self.action_space = [0, 1]

    def set_params(self):
        self.batch_size = 64

        self.max_exploration_rate = 1
        self.min_exploration_rate = 0.05
        self.exploration_decay_rate = 0.0005

        self.steps_done = 0

    def select_action(self, state):
        sample = np.random.random()
        exploration_rate = self.min_exploration_rate + (
            self.max_exploration_rate - self.min_exploration_rate) * np.exp(
                -self.steps_done * self.exploration_decay_rate)

        self.steps_done += 1
        if sample > exploration_rate:
            with torch.no_grad():
                actions = self.brain(state)
                return torch.argmax(actions).item()
        else:
            return np.random.choice(self.action_space)

    def learn(self):
        if len(self.memory) < self.batch_size:
            return

        self.optimizer.zero_grad()

        # Sample a random minibatch of transitions from replay memory.
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        non_final_mask = torch.tensor(
            tuple(map(lambda s: s is not None, batch.next_state)),
            dtype=torch.bool,
        )
        non_final_next_states = torch.tensor(
            [s for s in batch.next_state if s is not None])

        state_batch = torch.tensor(batch.state)
        action_batch = torch.tensor(batch.action)
        reward_batch = torch.tensor(batch.reward, dtype=torch.float)

        state_action_values = self.brain(state_batch).gather(
            1, action_batch.unsqueeze(-1))

        next_state_values = torch.zeros(self.batch_size)
        next_state_values[non_final_mask] = self.target_brain(
            non_final_next_states).max(1)[0]

        gamma = 0.99
        expected_state_action_values = (gamma * next_state_values +
                                        reward_batch / reward_batch.max())

        self.loss = torch.nn.MSELoss()(
            expected_state_action_values.unsqueeze(-1), state_action_values)

        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
Example no. 29
class DQNAgent:
    def __init__(self):
        # Our Main Model: POLICY NETWORK
        self.dqn = DeepQNetwork(model_name=MODEL_NAME,
                                input_dim=INPUT_DIM,
                                n_actions=N_ACTIONS,
                                layer1_units=LAYER1_UNITS,
                                layer2_units=LAYER2_UNITS,
                                lr=LEARNING_RATE)

        self.model = self.dqn.create_model()

        # TARGET NETWORK
        self.target_model = self.dqn.create_model()

        self.target_model.set_weights(self.model.get_weights())

        # An array with last n steps for training
        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

        # Tensorboard for logging results
        self.tensorboard = ModifiedTensorBoard(
            log_dir=f'../logs/{MODEL_NAME}-{int(time.time())}')

        # target update counter
        self.target_update_counter = 0

    def update_replay_memory(self, transition):
        ''' Append a step's experience (transition) to the replay memory '''
        self.replay_memory.append(transition)

    def get_q_values(self, state):
        ''' Get the Q values learned so far for the given state '''
        return self.model.predict(np.array(state).reshape(-1, *state.shape))[0]

    def train(self, terminal_state, step):
        ''' This is where we actually train the Agent '''

        # Start training only if certain number of samples is already saved in REPLAY MEMORY
        # Else it keeps making steps which are added to the REPLAY MEM
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return

        # Get a minibatch of random samples from replay memory
        minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

        # Get current states from minibatch, then query NN model for Q values
        current_states = np.array([transition[0] for transition in minibatch])
        current_qs_list = self.model.predict(current_states)

        # Get future states from minibatch, then query Target NN model for Q values
        # Computing the max term in Bellman Equation
        new_current_states = np.array(
            [transition[3] for transition in minibatch])
        future_qs_list = self.target_model.predict(new_current_states)

        X = []
        y = []

        # Now we need to enumerate our batches
        for index, (current_state, action, reward, new_current_state,
                    done) in enumerate(minibatch):

            if not done:
                max_future_q = np.max(future_qs_list[index])
                new_q = reward + DISCOUNT * max_future_q
            else:
                new_q = reward

            # Update Q value for given state
            current_qs = current_qs_list[index]
            current_qs[action] = new_q

            # And append to our training data
            X.append(current_state)
            y.append(current_qs)

        # Fit on all samples as one batch, log only on terminal state
        self.model.fit(
            np.array(X),
            np.array(y),
            batch_size=MINIBATCH_SIZE,
            verbose=0,
            shuffle=False,
            callbacks=[self.tensorboard] if terminal_state else None)

        # Update target network counter every episode
        if terminal_state: self.target_update_counter += 1

        # If counter reaches set value, update target network with weights of main network
        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0
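
The class above exposes get_q_values but no action-selection method; a hedged epsilon-greedy sketch built on it (the epsilon argument and the use of the N_ACTIONS constant are assumptions, not part of the original class):

    def choose_action(self, state, epsilon):
        ''' Illustrative epsilon-greedy selection over the policy network's Q values '''
        if np.random.random() > epsilon:
            return int(np.argmax(self.get_q_values(state)))
        return np.random.randint(0, N_ACTIONS)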
Example no. 30
HIDDENS = [128]

network = Header(inpt_shape=env.observation_space.shape,
                 hiddens=HIDDENS,
                 opt_size=env.action_space.n,
                 network=MLP,
                 dueling=args.dueling)
target_network = Header(inpt_shape=env.observation_space.shape,
                        hiddens=HIDDENS,
                        opt_size=env.action_space.n,
                        network=MLP,
                        dueling=args.dueling) if args.double else None

dqn = DeepQNetwork(network=network,
                   memory_size=MEMORY_SIZE,
                   use_double_dqn=args.double,
                   target_network=target_network,
                   dueling=args.dueling,
                   prioritized=args.prioritized)

# ==== train & test ====
# choose one of the two pipelines
if env_id == 0:
    train_pipeline_conservative(env,
                                dqn,
                                score_threshold,
                                n_epoch=500,
                                n_rollout=100,
                                n_train=1000,
                                batch_size=256)
if env_id == 1 or env_id == 2:
    train_pipeline_progressive(env,