def loader(name):
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)

    controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name), controller=controller, reward=R, sparse=False)

    score_logger = ScoreLogger('PI ADJUST ANALYSIS')
    # observation_space = env.observation_space.shape[0]

    run = 0
    while True:
        run += 1
        state = env.reset()
        step = 0
        while True:
            step += 1
            env.render()

            #TODO RUN PI ADJUST
            action = utils.policy(env, pilco, state, False)
            # TODO RUN PI ADJUST COMMENT THE NEXT LINE
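            # (hypothetical sketch, mirroring true_loader further down: load a pi_adjust
            #  model, build the [state, action] feature vector and add the predicted
            #  correction to the PILCO action)
            # a = np.ndarray.tolist(state) + np.ndarray.tolist(action)
            # action = action + pi_adjust.predict(np.array(a).reshape(1, -1))[0]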

            state_next, reward, terminal, info = env.step(action)
            # reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break

    env.env.close()
def plot_pilco_source_learning_curve():
    env = gym.make('continuous-cartpole-v0')
    env.seed(73)

    pilcos = ['initial'] + [str(i) for i in range(6)]

    rewards = []
    for i, p in enumerate(pilcos):
        controller = RbfController(state_dim=state_dim,
                                   control_dim=control_dim,
                                   num_basis_functions=bf,
                                   max_action=max_action)
        R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
        pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(p),
                           controller=controller,
                           reward=R,
                           sparse=False)

        score_logger = ScoreLogger('Score for Model {:d}'.format(i))
        state = env.reset()
        step = 0

        xs = []
        angles = []

        while True:
            xs.append(state[0])
            angles.append(state[2])
            step += 1

            env.render()

            u_action = utils.policy(env, pilco, state, False)
            state_copy = state

            a = np.ndarray.tolist(state_copy)
            a.extend(np.ndarray.tolist(u_action))

            state_next, reward, terminal, info = env.step(u_action)
            reward = reward if not terminal else -reward
            state = state_next

            if terminal:
                print('Run: {:d}, score: {:d}'.format(i, step))
                score_logger.add_score(step, i)
                break

        rewards.append(step)

        plt.plot(xs, angles)
        plt.savefig('pilco-{:d}_states_plot'.format(i), bbox_inches="tight")
        plt.close()

    env.close()

    plt.plot([i for i, _ in enumerate(pilcos)], rewards)
    plt.savefig('pilco_rewards_plot', bbox_inches="tight")
    plt.close()

    return rewards, xs, angles
Example #3
def cartpole():
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])  # reshape to a (1, observation_space) row
        step = 0
        while True:
            step += 1
            action = dqn_solver.act(state)
            state_next, reward, terminal, _ = env.step(action)
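            # penalize the transition that ends the episode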
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])  # reshape to a (1, observation_space) row
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " +
                      str(dqn_solver.exploration_rate) + ", score: " +
                      str(step))
                score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()
def msPacman():
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    print('start', dqn_solver.exploration_rate)
    run = 0
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [480, observation_space])
        step = 0
        while True:
            step += 1
            #env.render()
            action = dqn_solver.act(state)
            state_next, reward, done, info = env.step(action)
            reward = reward if not done else -reward
            state_next = np.reshape(state_next, [480, observation_space])
            dqn_solver.remember(state, action, reward, state_next, done)
            state = state_next
            if done:
                print("Run: " + str(run) + ", exploration: " +
                      str(dqn_solver.exploration_rate) + ", score: " +
                      str(step))
                score_logger.add_score(step, run, dqn_solver.exploration_rate)
                print(step, run)
                dqn_solver.updateExploration_rate()
                break
            dqn_solver.experience_replay()
def see_progression(pilco_name='saved/pilco-continuous-cartpole-5',
                    transfer_name='{:d}true_dyn_pi_adj.pkl',
                    adjust=True):
    env = gym.make('continuous-cartpole-v99')
    env.seed(1)
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco(pilco_name,
                       controller=controller,
                       reward=R,
                       sparse=False)

    rewards = []

    for i in range(10):
        print('Running {:s}'.format(transfer_name.format(i)))
        if adjust:
            with open(transfer_name.format(i), 'rb') as inp2:
                pi_adjust = pickle.load(inp2)

        score_logger = ScoreLogger('Score for Model {:d}'.format(i))
        state = env.reset()
        step = 0
        while True:
            step += 1

            env.render()

            u_action = utils.policy(env, pilco, state, False)
            state_copy = state

            a = np.ndarray.tolist(state_copy)
            a.extend(np.ndarray.tolist(u_action))

            if adjust:
                pi_adjust_action = pi_adjust.predict(
                    np.array(a).reshape(1, -1))[0]
            else:
                pi_adjust_action = 0  # no adjustment: evaluate the raw PILCO policy

            state_next, reward, terminal, info = env.step(u_action +
                                                          pi_adjust_action)
            reward = reward if not terminal else -reward
            state = state_next

            if terminal:
                print('Run: {:d}, score: {:d}'.format(i, step))
                score_logger.add_score(step, i)
                break

        rewards.append(step)

    env.close()
    return rewards
Example #6
def source_loader(name):
    Rs = np.empty(10).reshape(1, 10)
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller,
                       reward=R,
                       sparse=False)

    env = gym.make('continuous-cartpole-v99')

    pi_adjust = None

    score_logger = ScoreLogger('PI ADJUST ANALYSIS')
    run = 0
    avg_reward = 0
    while run != 101:
        run += 1
        if (run % 20 == 0):
            print('run:  ', run)
        state = env.reset()
        # print(state)
        # input()
        step = 0
        while True:
            step += 1
            # env.render()

            # TODO RUN PI ADJUST
            u_action = utils.policy(env, pilco, state, False)
            state_copy = state

            # TODO RUN PI ADJUST COMMENT THE NEXT LINE

            state_next, reward, terminal, info = env.step(u_action)
            reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                # print("Run: "  + ", score: " + str(step))
                score_logger.add_score(step, run)
                avg_reward = avg_reward + step
                break
    avg_reward = avg_reward / run
    env.env.close()
    return (avg_reward)
def true_loader(name):
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller,
                       reward=R,
                       sparse=False)

    with open('9true_dyn_pi_adj.pkl', 'rb') as inp2:
        pi_adjust = pickle.load(inp2)

    # with open('10_pi_adj.pkl', 'rb') as inp2:
    #     good_pi = pickle.load(inp2)

    score_logger = ScoreLogger('PI ADJUST ANALYSIS')
    run = 0
    while True:
        run += 1
        state = env.reset()
        # print(state)
        # input()
        step = 0
        while True:
            step += 1
            env.render()

            u_action = utils.policy(env, pilco, state, False)
            state_copy = state

            a = np.ndarray.tolist(state_copy)
            a.extend(np.ndarray.tolist(u_action))
            action = pi_adjust.predict(np.array(a).reshape(1, -1))[0]

            state_next, reward, terminal, info = env.step(action + u_action)
            reward = reward if not terminal else -reward
            state = state_next

            if terminal:
                print("Run: " + ", score: " + str(step))
                score_logger.add_score(step, run)
                break

    env.env.close()
Example #9
def loader():
    with open('CartPole-v1_dqn_solver.pkl', 'rb') as infile:
        dqn_solver = pickle.load(infile)

    env = gym.make(ENV_NAME)
    env.seed(73)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    run = 0
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        # print(state)
        # input()
        step = 0
        while True:
            step += 1
            env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print(
                    "Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()
        if (run % 50 == 0):
            save_object(dqn_solver, 'v1_in_v99_dqn_solver.pkl')
    save_object(dqn_solver, 'v1_in_v99_dqn_solver.pkl')
    env.env.close()
Example #10
def cartpole():
    env = gym.make(ENV_NAME)
    env.seed(73)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = cartpole_agent_dqn(observation_space, action_space)
    run = 0
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        # print(state)
        # input()
        step = 0
        while True:
            step += 1
            env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " +
                      str(dqn_solver.exploration_rate) + ", score: " +
                      str(step))
                score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()
        if (run % 50 == 0):
            save_object(dqn_solver, ENV_NAME + '_' + 'dqn_solver.pkl')

    save_object(dqn_solver, ENV_NAME + '_' + 'dqn_solver.pkl')
    env.env.close()
Example #11
def run(solver='static'):
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    run = 0

    if solver == 'static':
        static_solver = StaticSolver()
        while True:
            run += 1
            state = env.reset()
            state = np.reshape(state, [1, observation_space])
            step = 0
            while True:
                step += 1
                env.render()
                action = static_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                state_next = np.reshape(state_next, [1, observation_space])
                state = state_next
                if terminal:
                    print('Run: ' + str(run) + ', score: ' + str(
                        step))
                    score_logger.add_score(step, run)
                    break
    elif solver == 'dqn':
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        while True:
            run += 1
            state = env.reset()
            state = np.reshape(state, [1, observation_space])
            step = 0
            while True:
                step += 1
                env.render()
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print('Run: ' + str(run) + ', exploration: ' + str(dqn_solver.exploration_rate) + ', score: ' + str(
                        step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
Example #12
class DQNAgent:
    def __init__(self, dir_path=None):

        def initialize():
            # create environment and initial parameters
            self.env = gym.make(self.env_name)
            self.env.seed(self.seed)
            self.env_eval = gym.make(self.env_name)
            self.observation_space_size = self.env.observation_space.shape[0]
            self.action_space_size = self.env.action_space.n
            self.reward_threshold = self.env.spec.reward_threshold
            self.score_max = self.env.spec.max_episode_steps
            self.exploration_rate = self.exploration_max
            self.memory = deque(maxlen=self.memory_size)
            self.tnet_counter = 0
            self.step_counter = 0

            # create ScoreLogger
            self.score_logger = ScoreLogger(self.dir_path, self.window_size, self.reward_threshold)

        if dir_path is None:
            # settings
            self.env_name = ENV_NAME
            self.exploration_max = EXPLORATION_MAX
            self.exploration_min = EXPLORATION_MIN
            self.exploration_decay = EXPLORATION_DECAY
            self.memory_size = MEMORY_SIZE
            self.memory_min = MEMORY_MIN
            self.minibatch_size = MINIBATCH_SIZE
            self.batch_size = BATCH_SIZE
            self.learning_rate = LEARNING_RATE
            self.gamma = GAMMA
            self.window_size = WINDOW_SIZE
            self.seed = SEED
            self.update_target_q_after_n_steps = UPDATE_TARGET_Q_AFTER_N_STEPS
            self.tau = TAU
            self.num_episodes_eval = NUM_EPISODES_EVAL
            self.steps_per_eval = STEPS_PER_EVAL
            self.exploration_rate_eval = EXPLORATION_RATE_EVAL
            self.seed_eval = SEED_EVAL
            self.frames_per_step = FRAMES_PER_STEP

            # create new directory to store settings and results
            run = 0
            while True:
                run += 1
                if not os.path.exists(f"./experiments/{ENV_NAME}_{run}"):
                    self.dir_path = f"./experiments/{ENV_NAME}_{run}"
                    os.mkdir(self.dir_path)
                    break

            # save settings
            with open(os.path.join(self.dir_path, "settings.json"), "w") as file:
                json.dump(self.__dict__, file)

            initialize()
            self.score_logger.log(f"Results of experiments stored in: {self.dir_path}")

            # create model and store model and visualization
            # self.qnet is online model, self.tnet is target model
            self.qnet = Sequential()
            self.qnet.add(Dense(256, input_shape=(self.observation_space_size,), activation='relu'))
            self.qnet.add(Dense(self.action_space_size, activation='linear'))
            self.qnet.compile(loss="huber_loss", optimizer=Adam(learning_rate=self.learning_rate))
            
            self.qnet.save(os.path.join(self.dir_path, "model.HDF5"))
            plot_model(self.qnet, to_file=os.path.join(self.dir_path, "model.png"), show_shapes=True)

            self.tnet = clone_model(self.qnet)
            self.tnet.set_weights(self.qnet.get_weights())
        else:
            with open(os.path.join(dir_path, "settings.json"), "r") as file:
                self.__dict__ = json.load(file)
            
            initialize()

            model_name = "model_best_5840.HDF5"
            self.qnet = load_model(os.path.join(self.dir_path, model_name))
            self.score_logger.log(f"{os.path.join(self.dir_path, model_name)} loaded")

    def train(self):        
        episode = 0
        episode_train = 0
        frame = 0
        temp = True
        while True:
            state = self.env.reset()
            state = np.reshape(state, (1, self.observation_space_size))
            episode += 1
            score = 0
            done = False
            while not done:
                action = self.act(state)
                state_new, reward, done, info = self.env.step(action)
                state_new = np.reshape(state_new, (1, self.observation_space_size))
                score += reward
                frame += 1
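                # if the episode only ended because the step limit was hit, store the
                # transition as non-terminal so the target still bootstraps from state_new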
                if score >= self.score_max:
                    self.memory.append((state, action, reward, state_new, not done))
                else:
                    self.memory.append((state, action, reward, state_new, done))
                state = state_new
                
                if len(self.memory) >= self.memory_min:
                    if frame % self.frames_per_step == 0:
                        temp = True
                        self.experience_replay()

                    if self.step_counter % self.steps_per_eval == 0 and temp:
                        temp = False
                        self.evaluate()

            if len(self.memory) >= self.memory_min:
                episode_train += 1
                self.score_logger.log(f"\nEpisode: {episode_train} ({episode}), exploration: {self.exploration_rate}, score: {score}")
                self.score_logger.add_score(score, episode, episode_train)
                if episode_train % 64 == 0:
                    self.qnet.save(os.path.join(self.dir_path, "model.HDF5"))
                    self.score_logger.log("Model Saved")
                
                if self.score_logger.save_best_model:
                    self.qnet.save(os.path.join(self.dir_path, "model_best.HDF5"))
                    self.score_logger.save_best_model = False
                    self.score_logger.log("Best model replaced")
                    self.score_logger.solved()
    
    def act(self, state, exploration_rate=None):
        if exploration_rate is None:
            exploration_rate = self.exploration_rate
        if np.random.rand() < exploration_rate:
            return self.env.action_space.sample()
        q_values = self.qnet.predict(state)
        return np.argmax(q_values[0])
    
    def experience_replay(self):
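        # sample a random minibatch and fit the online network toward Double-DQN style
        # targets: qnet selects the next action, tnet provides its value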
        batch = random.sample(self.memory, self.minibatch_size)
        x = np.zeros((self.minibatch_size, self.observation_space_size))
        y = np.zeros((self.minibatch_size, self.action_space_size))
        for i, (state, action, reward, state_new, done) in enumerate(batch):
            target = self.qnet.predict(state)
            if done:
                target[0][action] = reward
            else:
                target[0][action] = reward + \
                    self.gamma*self.tnet.predict(state_new)[0][np.argmax(self.qnet.predict(state_new)[0])]
            x[i, :] = state[0]
            y[i, :] = target[0]
        self.qnet.fit(x, y, batch_size=self.batch_size, verbose=0)

        if self.tnet_counter >= self.update_target_q_after_n_steps:
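            # soft (Polyak) update: blend the online weights into the target network with factor tau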
            w_qnet = self.qnet.get_weights()
            w_tnet = self.tnet.get_weights()

            for i in range(len(w_tnet)):
                w_tnet[i] = w_qnet[i]*self.tau + w_tnet[i]*(1-self.tau)
            self.tnet.set_weights(w_tnet)
            self.tnet_counter = 0
        self.tnet_counter += 1

        self.exploration_rate = np.amax((self.exploration_rate*self.exploration_decay, self.exploration_min))
        self.step_counter += 1

    def evaluate(self):
        self.env_eval.seed(self.seed_eval)
        scores = []
        for i in range(self.num_episodes_eval):
            state = self.env_eval.reset()
            state = np.reshape(state, (1, self.observation_space_size))
            score = 0
            done = False
            while not done:
                action = self.act(state, self.exploration_rate_eval)
                state, reward, done, info = self.env_eval.step(action)
                state = np.reshape(state, (1, self.observation_space_size))
                score += reward
            scores.append(score)
        self.score_logger.add_evaluation(scores, self.step_counter)
    
    def simulate(self, exploration_rate=0.0, verbose=False):
        state = self.env.reset()
        state = np.reshape(state, (1, self.observation_space_size))
        score = 0
        while True:
            self.env.render()
            action = self.act(state, exploration_rate)
            if verbose:
                with np.printoptions(precision=5, sign=' ', floatmode='fixed', suppress=True):
                    self.score_logger.log(f"State: {state[0]}, Output model: {self.qnet.predict(state)[0]}, Action: {action}, score: {score}")
            state, reward, done, info = self.env.step(action)
            score += reward
            state = np.reshape(state, (1, self.observation_space_size))
            time.sleep(0.02)
            if done:
                self.score_logger.log(f"Episode finished, score: {score}")
                break
        self.env.close()
Example #13
def piadjust(NT):
    with open('GOODv1.pkl', 'rb') as inp:
        dqn_solver = pickle.load(inp)

    env_S = gym.make(ENV_NAME)
    env_S.seed(73)
    score_logger_S = ScoreLogger(ENV_NAME)
    observation_space_S = env_S.observation_space.shape[0]

    env_T = gym.make(ENV_NAMET)
    env_T.seed(73)
    score_logger_T = ScoreLogger(ENV_NAMET)
    observation_space_T = env_T.observation_space.shape[0]

    # TODO: implement pi-adjust
    D_S = sampler(dqn_solver, env_S, 1000)
    D_S = noiser(D_S, [0, 2])
    print('D_S sampling done')

    D_T = None
    i = 0
    pi_adj = dqn_solver

    while i < NT:
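        # one adjustment iteration: roll out in the target env, fit an inverse-dynamics
        # GP on the pooled target data, relabel each source transition with the action
        # the target system would have needed, then fit the correction policy pi_adj on it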
        D_adj = []

        if i == 0:
            D_i_T = sampler(dqn_solver, env_T, 1000)

        elif i != 0:
            D_i_T = sampler_adj(pi_adj, dqn_solver, env_T, 1000)

        if D_T is not None:
            # print(D_i_T.shape, D_T.shape)
            D_T = np.concatenate((D_i_T, D_T))
        elif D_T is None:
            D_T = D_i_T
        print('Goin for inverse dyn')
        gpr = inverse_dyn(D_T)
        print('inverse dyn done')

        for samp in D_S:

            x_s = np.ndarray.tolist(samp[0])[0]
            x_s1 = np.ndarray.tolist(samp[2])[0]
            u_t_S = samp[1]
            # print(u_t_S)

            a = np.ndarray.tolist(samp[0])[0]
            a.extend(np.ndarray.tolist(samp[2])[0])
            # print( np.array(a).reshape(1, 8)  )

            u_t_T = gpr.predict(np.array(a).reshape(1, 8), return_std=False)
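            # map the continuous inverse-dynamics prediction onto the discrete action
            # space (positive -> action 1, negative -> action 0)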

            if u_t_T > 0:
                u_t_T = 1

            elif u_t_T < 0:
                u_t_T = 0

            # print('\n\n', dqn_solver.act(  np.array(a[0:4]).reshape([1,4] ) ))
            # print( np.array(a[0:4]).reshape([1,4] )
            # print(i, '    ', D_adj)
            D_adj.append((x_s, u_t_S, u_t_T))

        # print(i, '    ',x_s, u_t_S, u_t_T)
        print('Goin for L3')
        pi_adj = L3(D_adj)
        print('L3 Done')
        # x_s.append(u_t_S)
        # print(pi_adj.predict(np.array(x_s).reshape(1,-1)))
        print(i)
        i = i + 1
        if (i % 1 == 0):
            save_object(pi_adj, str(i) + '_pi_adj.pkl')

    env_S.env.close()
    env_T.env.close()

    return (pi_adj)
Example #14
def connect4dqn(folder):
    env = Connect4()
    os.chdir(folder)
    score_logger_random = ScoreLogger('AI_vs_random', average_score_to_solve=1000)
    score_logger_ai = ScoreLogger('AI_vs_{}'.format(EVAL_AI), average_score_to_solve = 11)
    # only 10 games are played per evaluation, but the ScoreLogger would early-stop once a score of 10 was reached 10 times in a row, hence the threshold of 11

#    player1won = 0
#    player2won = 0
    observation_space = env.reset().shape
    action_space = env.validMoves().size
    # Assign GPU to DGX
    config = tf.ConfigProto(
        device_count = {'GPU': 1}
    )
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    set_session(sess)

    solver = getattr(import_module('{}.dqn'.format(folder)), 'DQNSolver')
    dqn_solver = solver(observation_space, action_space)

    run = 0
    state = env.reset()  # moved up one loop level; otherwise player two would not be able to start if player one wins
    while True:
        state = env.soft_reset()  # dirty workaround: creates an empty board without touching the turn counter, so the loser can start the next round
        run += 1
        if run % SAVE_EVERY_K_GAMES == 0:
            print('Saving weights and starting evaluation...')
            dqn_solver.save()
            score, ties = evaluate.ai_vs_random(env, dqn_solver, eval_ctr=run,
                                                numberOfGames = NUMBER_OF_EVAL_GAMES,
                                                games_recorded_per_eval = GAMES_RECORDED_PER_EVAL)
            score_logger_random.add_score(score + ties, run) #logging ties as success

            eval_solver = getattr(import_module('{}.dqn'.format(EVAL_AI)), 'DQNSolver')
            eval_dqn_solver = eval_solver(observation_space, action_space)
            eval_dqn_solver.exploration_rate = 0

            ai1_win, ai2_win, tieCounter = evaluate.ai_vs_ai(env, ai1=dqn_solver, ai1_name=folder,
                                                             ai2=eval_dqn_solver, ai2_name=EVAL_AI,
                                                             eval_ctr=run,
                                                             numberOfGames = NUMBER_OF_AI_EVAL_GAMES,
                                                             games_recorded_per_eval = GAMES_RECORDED_PER_EVAL)
            del eval_dqn_solver
            score_logger_ai.add_score(ai1_win + tieCounter, run)  # logging ties as success

        step = 0

        while True:
            step += 1
            player = env.getNextPlayer()

            if player == 1:
                action_player1 = dqn_solver.act(state, env)
                state_next, reward_player1, terminal, info = env.makeMove(player, action_player1, DEMO_MODE)
                state_copy = np.copy(state)
                state_next_copy = np.copy(state_next)
                if terminal:
                    dqn_solver.pop()  # if player 1 wins, pop player 2's last move from memory and give it a negative reward
                    dqn_solver.remember(normalized_state, action_player2, reward_player1*-1, normalized_state_next, terminal)
                dqn_solver.remember(state, action_player1, reward_player1, state_next, terminal)
                state = state_next
            else:
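                # roll the player channels so the shared network always sees the board
                # from the side to move (the 'normalized' state)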
                normalized_state = np.roll(state, 1, axis = -1)
                action_player2 = dqn_solver.act(normalized_state, env)
                state_next, reward_player2, terminal, info = env.makeMove(player, action_player2, DEMO_MODE)
                normalized_state_next = np.roll(state_next, 1, axis = -1)
                if terminal:
                    dqn_solver.pop()  # if player 2 wins, pop player 1's last move from memory and give it a negative reward
                    dqn_solver.remember(state_copy, action_player1, reward_player2*-1, state_next_copy, terminal)
                dqn_solver.remember(normalized_state, action_player2, reward_player2, normalized_state_next, terminal)
                state = state_next

            if terminal:
#                if player == 1:
#                    player1won += 1
#                else:
#                    player2won += 1
#                try:
#                    winRatio = player1won/player2won
#                except ZeroDivisionError:
#                    winRatio = 0
#                print('Win ratio: {}'.format(winRatio)) #debug stuff
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", moves: " + str(step))
                break

        dqn_solver.experience_replay()
    y_i = r_i + 𝛾 * max(Q(next_state, action; 𝜃_target))
    Loss: (y_i - Q(state, action; 𝜃))^2
    Every C step, 𝜃_target <- 𝜃
"""
import os
import numpy as np
import tensorflow as tf
import random
from collections import deque
import deep_q_network as dqn
from point_and_click_env import Env
from score_logger import ScoreLogger
from typing import List

env = Env()
score_logger = ScoreLogger('mouse model', 1000, 100000)

# Constants defining our neural network
INPUT_SIZE = env.observation_space.shape[0]
OUTPUT_SIZE = env.action_space.n

DISCOUNT_RATE = 0.95
REPLAY_MEMORY = 100000
BATCH_SIZE = 32
TARGET_UPDATE_FREQUENCY = 1000
MAX_EPISODES = 4000000
SAVE_PERIOD = 10000
LOG_PERIOD = 10000
E_DECAY = 0.9998
E_MIN = 0.05
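
A minimal sketch (not part of the original listing) of the target y_i and loss stated in the docstring above: q_next_target stands for a hypothetical 1-D array of target-network Q-values for next_state, and DISCOUNT_RATE plays the role of 𝛾.

def td_target(reward, q_next_target, done, gamma=DISCOUNT_RATE):
    # y_i = r_i + 𝛾 * max_a Q(next_state, a; 𝜃_target); no bootstrap on terminal states
    return reward if done else reward + gamma * np.max(q_next_target)

def td_loss(y_i, q_state_action):
    # squared TD error for the action actually taken
    return (y_i - q_state_action) ** 2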
Example #16
def loader(name):
    Rs = np.empty(10).reshape(1, 10)
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller,
                       reward=R,
                       sparse=False)

    for pick in range(1, 11):
        env = gym.make('continuous-cartpole-v99')

        with open(str(pick) + '_pi_adj.pkl', 'rb') as inp2:
            pi_adjust = pickle.load(inp2)

        score_logger = ScoreLogger('PI ADJUST ANALYSIS')
        run = 0
        avg_reward = 0
        while run != 101:
            run += 1
            if (run % 20 == 0):
                print('run:  ', run)
            state = env.reset()
            # print(state)
            # input()
            step = 0
            while True:
                step += 1
                #env.render()

                #TODO RUN PI ADJUST
                u_action = utils.policy(env, pilco, state, False)
                state_copy = state

                a = np.ndarray.tolist(state_copy)
                a.extend(np.ndarray.tolist(u_action))
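                # predict the adjusted action and clamp it to the continuous action range [-1, 1]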
                action = pi_adjust.predict(np.array(a).reshape(1, -1))
                action = action[0]
                if action[0] > 1:
                    action[0] = 1
                elif action[0] < -1:
                    action[0] = -1
                # TODO RUN PI ADJUST COMMENT THE NEXT LINE

                state_next, reward, terminal, info = env.step(action)
                reward = reward if not terminal else -reward
                state = state_next
                if terminal:
                    # print("Run: "  + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    avg_reward = avg_reward + step
                    break
        avg_reward = avg_reward / run
        env.env.close()
        Rs[0][pick - 1] = avg_reward
    return (Rs)