Example #1
            n_completed_episodes += 1
        epi_step = 0
        nepisodes += 1
        # reset the game
        state = sim.newGame(opt.tgt_y, opt.tgt_x)

        # and reset the history
        state_with_history[:] = 0
        append_to_hist(state_with_history,
                       rgb2gray(state.pob).reshape(opt.state_siz))
        next_state_with_history = np.copy(state_with_history)

    action = np.argmax(
        agent.predict(
            state_with_history.reshape(-1, img_rows, img_cols, opt.hist_len)))
    action_onehot = trans.one_hot_action(action)
    next_state = sim.step(action)
    # append to history
    append_to_hist(next_state_with_history,
                   rgb2gray(next_state.pob).reshape(opt.state_siz))

    # mark next state as current state
    state_with_history = np.copy(next_state_with_history)
    state = next_state

    # accumulate reward
    accumulated_reward += state.reward
    accumulated_reward_list.append(accumulated_reward)

    if opt.disp_on:
        if win_all is None:
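All of the snippets on this page call two small helpers that are not shown here. A minimal sketch, assuming a luminance-weighted grayscale conversion and a FIFO history of the last opt.hist_len observations:

import numpy as np

def rgb2gray(rgb):
    # weighted sum over the RGB channels; returns a 2-D grayscale image
    return np.dot(rgb[..., :3], [0.299, 0.587, 0.114])

def append_to_hist(state_hist, obs):
    # shift the stored observations one slot towards the past and put the
    # newest (flattened) observation into the last slot
    for i in range(state_hist.shape[0] - 1):
        state_hist[i, :] = state_hist[i + 1, :]
    state_hist[-1, :] = obs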
Example #2
    def start_eval(self, speicherort, display):
        # 0. initialization
        self.opt = Options()
        sim = SimulatorDeterministicStart(self.opt.map_ind, self.opt.cub_siz,
                                          self.opt.pob_siz, self.opt.act_num)
        imported_meta = tf.train.import_meta_graph(speicherort)

        win_all = None
        win_pob = None

        def_graph = tf.get_default_graph()

        with tf.Session() as sess:
            with tf.variable_scope("new_testing_scope", reuse=tf.AUTO_REUSE):

                x = sess.graph.get_tensor_by_name('x:0')
                Q = tf.get_collection("Q")[0]

                imported_meta.restore(sess,
                                      tf.train.latest_checkpoint('./weights/'))

                maxlen = 100000

                # initialize the environment
                state = sim.newGame(self.opt.tgt_y, self.opt.tgt_x, 0)
                state_with_history = np.zeros(
                    (self.opt.hist_len, self.opt.state_siz))
                self.append_to_hist(
                    state_with_history,
                    rgb2gray(state.pob).reshape(self.opt.state_siz))
                next_state_with_history = np.copy(state_with_history)
                trans = TransitionTable(self.opt.state_siz, self.opt.act_num,
                                        self.opt.hist_len,
                                        self.opt.minibatch_size, maxlen)
                epi_step = 0

                episodes = 0

                solved_episodes = 0

                step_sum = 0
                # evaluate episode by episode until the simulator runs out of start states
                while True:

                    # goal check
                    if state.terminal or epi_step >= self.opt.early_stop:
                        if state.terminal:
                            solved_episodes += 1
                        episodes += 1
                        step_sum = step_sum + epi_step
                        epi_step = 0

                        # reset the game
                        try:
                            state = sim.newGame(self.opt.tgt_y, self.opt.tgt_x,
                                                episodes)
                        except Exception:
                            # the simulator has no further start states: stop
                            # and report the evaluation result
                            return (step_sum, solved_episodes)

                        # and reset the history
                        state_with_history[:] = 0
                        self.append_to_hist(
                            state_with_history,
                            rgb2gray(state.pob).reshape(self.opt.state_siz))
                        next_state_with_history = np.copy(state_with_history)

                        if display:
                            if win_all is None:
                                plt.subplot(121)
                                win_all = plt.imshow(state.screen)
                                plt.subplot(122)
                                win_pob = plt.imshow(state.pob)
                            else:
                                win_all.set_data(state.screen)
                                win_pob.set_data(state.pob)
                            plt.pause(self.opt.disp_interval)
                            plt.draw()

                    epi_step += 1

                    # format state for network input
                    input_reshaped = self.reshapeInputData(
                        state_with_history, 1)
                    # create batch of input state
                    input_batched = np.tile(input_reshaped,
                                            (self.opt.minibatch_size, 1, 1, 1))

                    ### take one action per step
                    qvalues = sess.run(Q, feed_dict={
                        x: input_batched
                    })[0]  # take the first batch entry
                    action = np.argmax(qvalues)
                    action_onehot = trans.one_hot_action(action)
                    # apply action
                    next_state = sim.step(action)
                    # append to history
                    self.append_to_hist(
                        next_state_with_history,
                        rgb2gray(next_state.pob).reshape(self.opt.state_siz))
                    # add to the transition table
                    trans.add(state_with_history.reshape(-1), action_onehot,
                              next_state_with_history.reshape(-1),
                              next_state.reward, next_state.terminal)
                    # mark next state as current state
                    state_with_history = np.copy(next_state_with_history)
                    state = next_state

                    if display:
                        if win_all is None:
                            plt.subplot(121)
                            win_all = plt.imshow(state.screen)
                            plt.subplot(122)
                            win_pob = plt.imshow(state.pob)
                        else:
                            win_all.set_data(state.screen)
                            win_pob.set_data(state.pob)
                        plt.pause(self.opt.disp_interval)
                        plt.draw()
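The restore code in Example #2 can only find the tensor 'x:0' and the "Q" collection because the training script exported them under exactly those names. A minimal save-side sketch in the same TF1 style; the shapes, the two dense layers, and the './weights/model' prefix are assumptions, not taken from this page:

import tensorflow as tf

hist_len, state_siz, act_num = 4, 900, 5   # assumed values of the Options fields

x = tf.placeholder(tf.float32, shape=(None, hist_len * state_siz), name='x')
h = tf.layers.dense(x, 64, activation=tf.nn.relu)
Q = tf.layers.dense(h, act_num)
tf.add_to_collection("Q", Q)   # makes tf.get_collection("Q")[0] work after restore

saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # ... training loop ...
    # checkpoint later picked up by tf.train.latest_checkpoint('./weights/')
    saver.save(sess, './weights/model')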
Example #3
for step in range(steps):
    if state.terminal or epi_step >= opt.early_stop:
        epi_step = 0
        nepisodes += 1
        # reset the game
        state = sim.newGame(opt.tgt_y, opt.tgt_x)
        # and reset the history
        state_with_history[:] = 0
        append_to_hist(state_with_history,
                       rgb2gray(state.pob).reshape(opt.state_siz))
        next_state_with_history = np.copy(state_with_history)

    # Take action with highest Q-value
    action = np.argmax(
        agent.predict(sess, state_with_history.T[np.newaxis, ..., np.newaxis]))
    action_onehot = trans.one_hot_action(action)
    next_state = sim.step(action)
    # append to history
    append_to_hist(next_state_with_history,
                   rgb2gray(next_state.pob).reshape(opt.state_siz))
    # add to the transition table
    trans.add(state_with_history.reshape(-1), action_onehot,
              next_state_with_history.reshape(-1), next_state.reward,
              next_state.terminal)
    # mark next state as current state
    state_with_history = np.copy(next_state_with_history)
    state = next_state

    # Training: sample a minibatch from the transition table
    state_batch, action_batch, next_state_batch, reward_batch, terminal_batch = trans.sample_minibatch(
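The training step that follows the minibatch sample is not shown above. A typical DQN-style update on such a batch would look roughly like the sketch below; the discount factor and the agent.train_step helper are assumptions and not part of the listing:

gamma = 0.99  # assumed discount factor
# Q-values of the successor states from the current network
next_q = agent.predict(sess, next_state_batch)
# Bellman targets: r + gamma * max_a' Q(s', a'), with no bootstrap on terminal states
targets = reward_batch.ravel() + gamma * np.max(next_q, axis=1) * (1.0 - terminal_batch.ravel())
# hypothetical helper that regresses the Q-value of the taken action towards the target
loss = agent.train_step(sess, state_batch, action_batch, targets)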
Example #4
def play(args):
    # 0. initialization
    sim = Simulator(opt.map_ind, opt.cub_siz, opt.pob_siz, opt.act_num)
    model = model_create()

    # optionally continue training from a previously saved model
    ##model.load_weights(opt.weights_fil)
    ##print('weights loaded to the model')

    # setup a transition table that is filled during training
    maxlen = opt.early_stop
    trans = TransitionTable(opt.state_siz, opt.act_num, opt.hist_len, maxlen)
    if args.mode == "train":
        print "Training mode"

        if opt.disp_on:
            win_all = None
            win_pob = None

        epi_step = 0
        nepisodes = 0

        state = sim.newGame(opt.tgt_y, opt.tgt_x)
        state_with_history = np.zeros((opt.hist_len, opt.state_siz))
        append_to_hist(state_with_history,
                       rgb2gray(state.pob).reshape(opt.state_siz))
        next_state_with_history = np.copy(state_with_history)
        loss = 0.
        reward_acc = 0.

        loss_list = []
        reward_acc_list = []
        epi_step_list = []
        reward_acc_new = []
        reward_acc_track = 0
        start = timer()
        #Training
        for step in range(steps):

            if state.terminal or epi_step >= opt.early_stop:
                state_batch, action_batch = trans.sample_minibatch(epi_step)
                state_batch = state_batch.reshape(epi_step, img_rows, img_cols,
                                                  opt.hist_len)
                reward_sample_weight = np.zeros(
                    (epi_step, ), dtype=np.float32) + reward_acc
                loss = model.train_on_batch(state_batch,
                                            action_batch,
                                            sample_weight=reward_sample_weight)
                print('Episode %d, step %d, total reward %.5f, loss %.8f' %
                      (nepisodes, epi_step, reward_acc, loss))

                # keep track of these values
                epi_step_list.append(epi_step)
                reward_acc_list.append(reward_acc)
                ##                loss_list.append(loss)

                epi_step = 0
                nepisodes += 1
                # reset the game
                state = sim.newGame(opt.tgt_y, opt.tgt_x)
                # and reset the history
                state_with_history[:] = 0
                append_to_hist(state_with_history,
                               rgb2gray(state.pob).reshape(opt.state_siz))
                next_state_with_history = np.copy(state_with_history)
                reward_acc = 0
                loss = 0
                trans = TransitionTable(opt.state_siz, opt.act_num,
                                        opt.hist_len, maxlen)

            #Save the weights every 1,000,000 steps
            if (step + 1) % 1000000 == 0:
                model.save_weights(opt.weights_fil, overwrite=True)
                print('Saved weights')
                with open(opt.network_fil, "w") as outfile:
                    json.dump(model.to_json(), outfile)

            epi_step += 1
            #sample an action from the policy network
            action = np.argmax(
                model.predict(
                    (state_with_history).reshape(1, img_rows, img_cols,
                                                 opt.hist_len)))

            #one hot encoding
            action_onehot = trans.one_hot_action(action)

            #Take next step in the environment according to the action selected
            next_state = sim.step(action)

            # append state to history
            append_to_hist(next_state_with_history,
                           rgb2gray(next_state.pob).reshape(opt.state_siz))

            #add to the transition table
            trans.add(state_with_history.reshape(-1), action_onehot)
            # mark next state as current state
            state_with_history = np.copy(next_state_with_history)
            state = next_state
            reward_acc += state.reward
            reward_acc_track += state.reward
            reward_acc_new.append(reward_acc_track)
            print "Total Steps:", step
            print('Episode %d, step %d, action %d, reward %.5f' %
                  (nepisodes, epi_step, action, state.reward))

            if opt.disp_on:
                if win_all is None:
                    plt.subplot(121)
                    win_all = plt.imshow(state.screen)
                    plt.subplot(122)
                    win_pob = plt.imshow(state.pob)
                else:
                    win_all.set_data(state.screen)
                    win_pob.set_data(state.pob)
                plt.pause(opt.disp_interval)
                plt.draw()

        end = timer()
        sec = int(end - start)
        hours = sec // 3600
        rem = sec - hours * 3600
        mins = rem // 60
        secs = rem - mins * 60

        print('Training time: %d:%02d:%02d' % (hours, mins, secs))

        with open('episode_steps', 'wb') as f:
            pickle.dump(epi_step_list, f)
            print('saved episode steps')

        with open('accum_reward_episodes', 'wb') as f:
            pickle.dump(reward_acc_list, f)
            print('saved accumulated reward for each episode')


##        with open('loss','wb') as f:
##            pickle.dump(loss_list,f)
##            print 'saved losses'

        with open('accum_reward_steps', 'wb') as f:
            pickle.dump(reward_acc_new, f)
            print('saved accumulated reward for all steps')

        #Save the weights
        model.save_weights(opt.weights_fil, overwrite=True)
        print('Saved weights')
        with open(opt.network_fil, "w") as outfile:
            json.dump(model.to_json(), outfile)

    ### run
    if args.mode == 'run':

        print "Running mode"
        model.load_weights(opt.weights_fil)
        print('weights loaded to the model')
        opt.disp_on = True
        win_all = None
        win_pob = None
        state = sim.newGame(opt.tgt_y, opt.tgt_x)
        state_with_history = np.zeros((opt.hist_len, opt.state_siz))
        append_to_hist(state_with_history,
                       rgb2gray(state.pob).reshape(opt.state_siz))
        next_state_with_history = np.copy(state_with_history)
        epi_step = 0
        nepisodes = 0
        n_reached = 0.0
        reward_acc_test = 0
        reward_acc_list_test = []

        print('Test Phase')
        for test_step in range(test_steps):

            if state.terminal or epi_step > opt.early_stop:
                if state.terminal:
                    print('Episode:', nepisodes + 1, 'agent reached the target')
                    n_reached += 1
                else:
                    print('Episode:', nepisodes + 1, 'agent failed')
                epi_step = 0
                nepisodes += 1
                # reset the game
                state = sim.newGame(opt.tgt_y, opt.tgt_x)
                # and reset the history
                state_with_history[:] = 0
                append_to_hist(state_with_history,
                               rgb2gray(state.pob).reshape(opt.state_siz))
                next_state_with_history = np.copy(state_with_history)

            epi_step += 1
            action = np.argmax(
                model.predict(
                    (state_with_history).reshape(1, img_rows, img_cols,
                                                 opt.hist_len)))
            action_onehot = trans.one_hot_action(action)
            #Take next step according to the action selected
            next_state = sim.step(action)
            # append state to history
            append_to_hist(next_state_with_history,
                           rgb2gray(next_state.pob).reshape(opt.state_siz))

            # mark next state as current state
            state_with_history = np.copy(next_state_with_history)
            state = next_state
            reward_acc_test += state.reward

            if opt.disp_on:
                if win_all is None:
                    plt.subplot(121)
                    win_all = plt.imshow(state.screen)
                    plt.subplot(122)
                    win_pob = plt.imshow(state.pob)
                else:
                    win_all.set_data(state.screen)
                    win_pob.set_data(state.pob)
                plt.pause(opt.disp_interval)
                plt.draw()
        print('Agent reached the target %d out of %d episodes (%.1f%%)' %
              (n_reached, nepisodes, (n_reached / nepisodes) * 100))
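model_create() is not shown in Example #4, but the way the model is used fixes its interface: input of shape (img_rows, img_cols, opt.hist_len), one softmax output per action, and training via train_on_batch on one-hot actions with the accumulated reward as sample weight (a REINFORCE-like update). A minimal Keras sketch that matches this usage; the layer sizes and the optimizer are assumptions:

from keras.models import Sequential
from keras.layers import Conv2D, Flatten, Dense

def model_create():
    model = Sequential()
    model.add(Conv2D(32, (3, 3), activation='relu',
                     input_shape=(img_rows, img_cols, opt.hist_len)))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    # softmax over the opt.act_num actions, i.e. a policy network
    model.add(Dense(opt.act_num, activation='softmax'))
    # categorical cross-entropy on one-hot actions, scaled per sample by the
    # episode reward via sample_weight, gives the policy-gradient-style update
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    return model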
Example #5
    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # epsilon-greedy action selection: greedy Q action 80% of the time,
    # a random action otherwise

    if np.random.uniform() * 100 > 20:
        flattened_state_with_history = np.float32(
            state_with_history.reshape(1, opt.hist_len * opt.state_siz))

        action_tf = tf.argmax(input=Q_s, axis=1)
        action = sess.run(action_tf,
                          feed_dict={x: flattened_state_with_history})

        action = action[0]
    else:
        action = randrange(opt.act_num)

    action_onehot = trans.one_hot_action(action)
    next_state = sim.step(action)
    epi_step += 1
    # append to history
    append_to_hist(next_state_with_history,
                   rgb2gray(next_state.pob).reshape(opt.state_siz))
    # add to the transition table
    trans.add(state_with_history.reshape(-1), action_onehot,
              next_state_with_history.reshape(-1), next_state.reward,
              next_state.terminal)
    # mark next state as current state
    state_with_history = np.copy(next_state_with_history)
    state = next_state
    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # TODO: here you would train your agent
    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
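One thing to watch in Example #5: tf.argmax is built inside the step loop, so a new op is added to the graph on every step. The same epsilon-greedy choice can reuse the existing Q_s tensor and do the argmax in NumPy (epsilon = 0.2 matches the 20% branch above):

epsilon = 0.2
if np.random.uniform() < epsilon:
    action = randrange(opt.act_num)
else:
    q_values = sess.run(Q_s, feed_dict={
        x: np.float32(state_with_history.reshape(1, opt.hist_len * opt.state_siz))
    })
    action = int(np.argmax(q_values[0]))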
Example #6
    if nepisodes <= FULL_RANDOM_EPISODES:
        # pure random exploration during the warm-up episodes
        action = randrange(opt.act_num)
    else:
        if use_conv:
            action = agent.act(np.array([state_with_history]))
        else:
            action = agent.act(np.array([state_with_history.reshape(-1)]))

    epi_step += 1
    next_state = sim.step(action)
    # append to history
    append_to_hist(next_state_with_history,
                   rgb2gray(next_state.pob).reshape(opt.state_siz))
    # add to the transition table
    trans.add(state_with_history.reshape(-1), trans.one_hot_action(action),
              next_state_with_history.reshape(-1), next_state.reward,
              next_state.terminal)

    # mark next state as current state
    state_with_history = np.copy(next_state_with_history)
    episode_reward += next_state.reward
    state = next_state

    if nepisodes > FULL_RANDOM_EPISODES:
        agent.train(trans.sample_minibatch())

    if opt.disp_on and disp_progress:

        if win_all is None:
            plt.subplot(121)
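Finally, every example on this page relies on a TransitionTable replay buffer whose implementation is not shown; its constructor even differs between the snippets (some pass a minibatch size, some do not). A minimal ring-buffer sketch with the most common interface used above (one_hot_action, add, sample_minibatch):

import numpy as np

class TransitionTable:
    def __init__(self, state_siz, act_num, hist_len,
                 minibatch_size, max_transitions):
        self.act_num = act_num
        self.minibatch_size = minibatch_size
        self.max = max_transitions
        self.top = 0     # next write position in the ring buffer
        self.size = 0    # number of transitions stored so far
        self.states = np.zeros((max_transitions, state_siz * hist_len), dtype=np.float32)
        self.actions = np.zeros((max_transitions, act_num), dtype=np.float32)
        self.next_states = np.zeros((max_transitions, state_siz * hist_len), dtype=np.float32)
        self.rewards = np.zeros((max_transitions, 1), dtype=np.float32)
        self.terminals = np.zeros((max_transitions, 1), dtype=np.float32)

    def one_hot_action(self, action):
        onehot = np.zeros((1, self.act_num), dtype=np.float32)
        onehot[0, int(action)] = 1.0
        return onehot

    def add(self, state, action_onehot, next_state, reward, terminal):
        i = self.top
        self.states[i] = state
        self.actions[i] = action_onehot
        self.next_states[i] = next_state
        self.rewards[i] = reward
        self.terminals[i] = float(terminal)
        self.top = (self.top + 1) % self.max
        self.size = min(self.size + 1, self.max)

    def sample_minibatch(self, batch_size=None):
        # uniform random sample of stored transitions
        n = batch_size if batch_size is not None else self.minibatch_size
        idx = np.random.randint(0, self.size, size=n)
        return (self.states[idx], self.actions[idx], self.next_states[idx],
                self.rewards[idx], self.terminals[idx])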