Example no. 1
def looping(qt=None, epsilon=config.epsilon, visu=False):
    plt.ion()
    cart = CartPoleEnv()
    data = []
    data_rm = []
    if (qt is None):
        qt = initialize_Qtable()
    for episode in range(config.episodes):
        cart.reset()
        turn = 0
        end = False
        epsilon = epsilon * 0.9999
        while not end:
            current_state = cart.state
            action = choose_action(current_state, qt, epsilon)
            new_state, reward, end, _ = cart.step(action)
            if end:
                reward = -10
            update_qt_new(qt, current_state, reward, action, new_state)
            turn += 1
            if (visu):
                cart.render()
        data.append(turn)
        data_rm.append(np.mean(data[-100:]))
        print("Episode: ", episode, "\tTurn:", turn, "\t Epsilon:", epsilon)
        if episode % config.graph_update == 0 and episode != 0:
            graph(data, data_rm)
        # if (episode + 1) % 100 == 0 and input("continue (y/n)") != "y":
        #     break
    cart.close()
    return (data, qt)
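
Example no. 1 calls choose_action, initialize_Qtable and update_qt_new, which live elsewhere in that project. Purely to illustrate the epsilon-greedy step, here is a minimal sketch (my assumption about how choose_action behaves, not the author's code; the Q-table layout is guessed):

import random
import numpy as np

def choose_action_sketch(state_key, qt, epsilon, n_actions=2):
    # With probability epsilon take a random action (exploration),
    # otherwise take the action with the highest Q-value (exploitation).
    # qt is assumed to map a (discretized) state key to an array of Q-values.
    if random.random() < epsilon:
        return random.randint(0, n_actions - 1)
    return int(np.argmax(qt[state_key]))
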
Example no. 2
    def step(self, action):
        self.state = self.state + np.asarray(action)
        env = CartPoleEnv(self.state[0], self.state[1], self.state[2],
                          self.state[3], self.state[4])

        episode_count = len(self.action_record)
        model_diff = 0
        for i in range(episode_count):
            ob = env.reset()
            traj_state = []
            for j in range(len(self.action_record[i])):
                # Replay the recorded actions; trajectories that terminate early ("done") are the tricky case here
                action = self.action_record[i][j]
                ob, reward, done, _ = env.step(action)
                traj_state.append(ob)
                if done:
                    break
            if not done:
                model_diff = model_diff + 1  # penalty for not done
            model_diff = model_diff + self._traj_diff(np.asarray(traj_state),
                                                      self.state_record[i])
        reward = -model_diff - self.status
        self.status = -model_diff
        done = False
        return np.array(self.state), reward, done, {}
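
The penalty in Example no. 2 comes from self._traj_diff, which is not shown. A minimal sketch of one plausible trajectory-distance measure (an assumption; the real method may weight states or handle unequal lengths differently):

import numpy as np

def traj_diff_sketch(sim_states, recorded_states):
    # Mean squared difference between two state trajectories,
    # compared only over their overlapping prefix.
    n = min(len(sim_states), len(recorded_states))
    if n == 0:
        return 0.0
    a = np.asarray(sim_states[:n], dtype=float)
    b = np.asarray(recorded_states[:n], dtype=float)
    return float(np.mean((a - b) ** 2))
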
Example no. 3
def loop(qt=None, epsilon=1, visu=False):
    plt.ion()
    cart = CartPoleEnv()
    data = []
    data_rm = []
    config.epsilon = epsilon
    if (qt is None):
        qt = initialize_Qtable()
    for episode in range(config.episodes):
        cart.reset()
        turn = 0
        s = cart.state
        end = False
        epsilon_tmp = config.epsilon
        while not end:
            config.epsilon *= 0.97
            if (visu):
                cart.render()
            a = choose_action(s, qt)
            _, _, end, _ = cart.step(a)
            l_val = bellman_q(s, qt, dummy_cart(s), action=a)
            # print(l_val)
            update_qt(qt, s, a, l_val)
            s = cart.state
            turn += 1
        data.append(turn)
        data_rm.append(np.mean(data[-100:]))
        print("Episode: ", episode, "\tTurn:", turn, "\t Epsilon:",
              config.epsilon)
        config.epsilon = epsilon_tmp
        if episode % config.graph_update == 0 and episode != 0:
            graph(data, data_rm)
        # if (episode + 1) % 100 == 0 and input("continue (y/n)") != "y":
        #     break
    cart.close()
    return (data, qt)
Example no. 4
# -*- coding: utf-8 -*-
"""Untitled0.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1lky0vjWP1y9GVXlg3VUukjP5nR9ry3SQ
"""

from cartpole import CartPoleEnv
import math
import numpy as np

env = CartPoleEnv()
env.reset()

def discretize(val, bounds, n_states):
    # Map a continuous value into one of n_states discrete bins
    # bounded by bounds = (low, high).
    if val <= bounds[0]:
        discrete_val = 0
    elif val >= bounds[1]:
        discrete_val = n_states - 1
    else:
        discrete_val = int(round((n_states - 1) * ((val - bounds[0]) / (bounds[1] - bounds[0]))))
    return discrete_val

def discretize_state(vals, s_bounds, n_s):
    # Discretize each component of the observation with its own bounds and bin count.
    discrete_vals = []
    for i in range(len(n_s)):
        discrete_vals.append(discretize(vals[i], s_bounds[i], n_s[i]))
    return np.array(discrete_vals, dtype=int)  # np.int was removed in recent NumPy
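
As a quick usage note for Example no. 4 (the bounds and bin counts below are illustrative, not the notebook's actual configuration):

# Hypothetical bounds and bin counts for the 4-dimensional CartPole state:
# cart position, cart velocity, pole angle, pole angular velocity.
state_bounds = [(-2.4, 2.4), (-3.0, 3.0), (-0.21, 0.21), (-3.0, 3.0)]
n_bins = [6, 6, 12, 12]

obs = env.state  # assumes this custom CartPoleEnv exposes .state, as in the other examples
print(discretize_state(obs, state_bounds, n_bins))  # e.g. [3 3 6 5]
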
Example no. 5
def dummy_cart(s, cart=None):
    if cart is None:
        cart = CartPoleEnv()
    cart.reset()
    cart.state = s
    return cart
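
For context, Example no. 3 passes dummy_cart(s) to bellman_q so that a hypothetical step can be simulated without disturbing the real environment. A hedged usage sketch (the probing code is my illustration, not the project's bellman_q):

from cartpole import CartPoleEnv

real = CartPoleEnv()
real.reset()
s = real.state

# Probe one hypothetical action on a throwaway copy; `real` is left untouched.
sim = dummy_cart(s)
next_state, reward, done, _ = sim.step(0)
sim.close()
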
Example no. 6
        self.model.load_weights(filename)


if __name__ == '__main__':
    env = CartPoleEnv()
    agent = DQN(24, 24, env)
    # agent.load_model('bot.h5')

    episode_data = []
    score_data = []
    episode_data_ = []
    score_data_ = []

    # Learning
    for episode in range(5000):
        state = env.reset()
        state = np.reshape(state, [1, 4])  # add a batch dimension: [a, b, c, d] -> [[a, b, c, d]]

        for t in range(1000):
            action, force = agent.act(state)

            # the agent also returns a force index; apply the corresponding force magnitude to the environment
            if force == 2:
                env.force_mag = 6
            elif force == 3:
                env.force_mag = 8
            else:
                env.force_mag = 10

            # perform one step
            state_, reward, done, info = env.step(action)
Example no. 7
from cartpole import CartPoleEnv
import numpy as np
cart = CartPoleEnv()
cart.reset()

for _ in range(1000):

    # Calculate the Gradients

    # Update Thetas

    # Sample u trajectory

    # Apply u[0] to the actual system
    cart.step(10)  # Apply Some force

    # Update the New State in the Learner

    # Shift the Thetas

    # Simulate
    cart.render()

cart.close()
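
Example no. 7 is only a skeleton: the gradient, theta, and trajectory-sampling steps are left as comments. As a stand-in while those pieces are missing, a simple hand-tuned PD controller on the pole angle keeps the loop meaningful (my own placeholder, not the planner the comments describe; it assumes cart.state is [x, x_dot, theta, theta_dot] and that step() accepts a signed force, as suggested by cart.step(10)):

from cartpole import CartPoleEnv
import numpy as np

cart = CartPoleEnv()
cart.reset()

for _ in range(1000):
    # Read the current state: cart position/velocity, pole angle/angular velocity.
    x_pos, x_dot, theta, theta_dot = np.asarray(cart.state, dtype=float)

    # PD feedback on the pole angle: push the cart toward the side the pole
    # is falling to. Gains are illustrative and may need tuning.
    force = float(np.clip(20.0 * theta + 2.0 * theta_dot, -10.0, 10.0))
    cart.step(force)

    cart.render()

cart.close()
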
Example no. 8
def main():
    # Define dimensions of the networks
    
    meta_value_input_dim =  STATE_DIM + TASK_CONFIG_DIM # 7
    task_config_input_dim = STATE_DIM + ACTION_DIM + 1 # 7

    # init meta value network with a task config network
    meta_value_network = MetaValueNetwork(input_size = meta_value_input_dim,hidden_size = 80,output_size = 1)
    task_config_network = TaskConfigNetwork(input_size = task_config_input_dim,hidden_size = 30,num_layers = 1,output_size = 3)
    meta_value_network.cuda()
    task_config_network.cuda()

    if os.path.exists("meta_value_network_cartpole.pkl"):
        meta_value_network.load_state_dict(torch.load("meta_value_network_cartpole.pkl"))
        print("load meta value network success")
    if os.path.exists("task_config_network_cartpole.pkl"):
        task_config_network.load_state_dict(torch.load("task_config_network_cartpole.pkl"))
        print("load task config network success")

    meta_value_network_optim = torch.optim.Adam(meta_value_network.parameters(),lr=0.001)
    task_config_network_optim = torch.optim.Adam(task_config_network.parameters(),lr=0.001)

    # init a task generator for data fetching
    task_list = [CartPoleEnv(np.random.uniform(L_MIN,L_MAX)) for task in range(TASK_NUMS)]
    [task.reset() for task in task_list]

    task_lengths = [task.length for task in task_list]
    print("task length:",task_lengths)

    for episode in range(EPISODE):
        # ----------------- Training ------------------

        if (episode+1) % 10 ==0 :
            # renew the tasks
            task_list = [CartPoleEnv(np.random.uniform(L_MIN,L_MAX)) for task in range(TASK_NUMS)]
            task_lengths = [task.length for task in task_list]
            print("task length:",task_lengths)
            [task.reset() for task in task_list]

        # fetch pre data samples for task config network
        # [task_nums, sample_nums, x+y]
        
        actor_network_list = [ActorNetwork(STATE_DIM,40,ACTION_DIM) for i in range(TASK_NUMS)]
        [actor_network.cuda() for actor_network in actor_network_list]
        actor_network_optim_list = [torch.optim.Adam(actor_network.parameters(),lr = 0.01) for actor_network in actor_network_list]

        # sample pre state, action, reward for the task config network
        pre_states = []
        pre_actions = []
        pre_rewards = []
        for i in range(TASK_NUMS):
            states,actions,rewards,_,_ = roll_out(actor_network_list[i],task_list[i],SAMPLE_NUMS)
            pre_states.append(states)
            pre_actions.append(actions)
            pre_rewards.append(rewards)


        for step in range(STEP):

            for i in range(TASK_NUMS):
                # init task config [1, sample_nums,task_config] task_config size=3
                pre_data_samples = torch.cat((pre_states[i][-9:],pre_actions[i][-9:],torch.Tensor(pre_rewards[i])[-9:]),1).unsqueeze(0)
                task_config = task_config_network(Variable(pre_data_samples).cuda()) # [1,3]

                states,actions,rewards,is_done,final_state = roll_out(actor_network_list[i],task_list[i],SAMPLE_NUMS)
                final_r = 0
                if not is_done:
                    value_inputs = torch.cat((Variable(final_state.unsqueeze(0)).cuda(),task_config.detach()),1)
                    final_r = meta_value_network(value_inputs).cpu().data.numpy()[0]

                # train actor network
                actor_network_optim_list[i].zero_grad()
                states_var = Variable(states).cuda()
                
                actions_var = Variable(actions).cuda()
                task_configs = task_config.repeat(1,len(rewards)).view(-1,3)
                log_softmax_actions = actor_network_list[i](states_var)
                vs = meta_value_network(torch.cat((states_var,task_configs.detach()),1)).detach()
                # calculate qs
                qs = Variable(torch.Tensor(discount_reward(rewards,0.99,final_r))).cuda()

                advantages = qs - vs
                actor_network_loss = - torch.mean(torch.sum(log_softmax_actions*actions_var,1)* advantages) #+ entropy #+ actor_criterion(actor_y_samples,target_y)
                actor_network_loss.backward()
                torch.nn.utils.clip_grad_norm(actor_network_list[i].parameters(),0.5)

                actor_network_optim_list[i].step()

                # train value network

                meta_value_network_optim.zero_grad()

                target_values = qs
                values = meta_value_network(torch.cat((states_var,task_configs),1))
                criterion = nn.MSELoss()
                meta_value_network_loss = criterion(values,target_values)
                meta_value_network_loss.backward()
                torch.nn.utils.clip_grad_norm(meta_value_network.parameters(),0.5)

                meta_value_network_optim.step()                
                
                # keep the latest rollout as the "pre" samples for the next task-config inference
                
                pre_states[i] = states
                pre_actions[i] = actions
                pre_rewards[i] = rewards

                if (step + 1) % 100 == 0:
                    result = 0
                    test_task = CartPoleEnv(length = task_list[i].length)
                    for test_epi in range(10):
                        state = test_task.reset()
                        for test_step in range(200):
                            softmax_action = torch.exp(actor_network_list[i](Variable(torch.Tensor([state])).cuda()))
                            #print(softmax_action.data)
                            action = np.argmax(softmax_action.cpu().data.numpy()[0])
                            next_state,reward,done,_ = test_task.step(action)
                            result += reward
                            state = next_state
                            if done:
                                break
                    print("episode:",episode,"task:",i,"step:",step+1,"test result:",result/10.0)

        
        if (episode+1) % 10 == 0 :
            # Save meta value network
            torch.save(meta_value_network.state_dict(),"meta_value_network_cartpole.pkl")
            torch.save(task_config_network.state_dict(),"task_config_network_cartpole.pkl")
            print("save networks for episode:",episode)
Example no. 9
from cartpole import CartPoleEnv

env = CartPoleEnv(length=1.0)

env.reset()

for step in range(1000):
    action = 0
    next_state, reward, done, _ = env.step(action)

    if done:
        print("done reward:", reward)
        break
Example no. 10
def main():
    # Define dimensions of the networks

    meta_value_input_dim =  STATE_DIM + TASK_CONFIG_DIM # 7
    task_config_input_dim = STATE_DIM + ACTION_DIM + 1 # 7

    # init meta value network with a task config network
    meta_value_network = MetaValueNetwork(input_size = meta_value_input_dim,hidden_size = 80,output_size = 1)
    task_config_network = TaskConfigNetwork(input_size = task_config_input_dim,hidden_size = 30,num_layers = 1,output_size = 3)
    meta_value_network.cuda()
    task_config_network.cuda()

    if os.path.exists("meta_value_network_cartpole.pkl"):
        meta_value_network.load_state_dict(torch.load("meta_value_network_cartpole.pkl"))
        print("load meta value network success")
    if os.path.exists("task_config_network_cartpole.pkl"):
        task_config_network.load_state_dict(torch.load("task_config_network_cartpole.pkl"))
        print("load task config network success")


    task_lengths = np.linspace(L_MIN,L_MAX,TASK_NUMS)

    datas = []

    for task_length in task_lengths:

        data_i = {}
        data_i["task_length"] = task_length

        data_i_episode = {}
        for episode in range(EPISODE):
            task = CartPoleEnv(length = task_length)
            task.reset()

            data_i_episode["episode"] = episode

            # ----------------- Training ------------------

            # fetch pre data samples for task config network
            # [task_nums, sample_nums, x+y]

            actor_network = ActorNetwork(STATE_DIM,40,ACTION_DIM)
            actor_network.cuda()
            actor_network_optim = torch.optim.Adam(actor_network.parameters(),lr = 0.01)
            '''
            if os.path.exists("actor_network.pkl"):
                actor_network.load_state_dict(torch.load("actor_network.pkl"))
                print("load actor_network success")
            '''
            # sample pre state, action, reward for the task config network


            pre_states,pre_actions,pre_rewards,_,_ = roll_out(actor_network,task,SAMPLE_NUMS)


            test_results = []
            train_games = []
            for step in range(STEP):

                # init task config [1, sample_nums,task_config] task_config size=3
                pre_data_samples = torch.cat((pre_states[-9:],pre_actions[-9:],torch.Tensor(pre_rewards)[-9:]),1).unsqueeze(0)
                task_config = task_config_network(Variable(pre_data_samples).cuda()) # [1,3]

                states,actions,rewards,is_done,final_state = roll_out(actor_network,task,SAMPLE_NUMS)
                final_r = 0
                if not is_done:
                    value_inputs = torch.cat((Variable(final_state.unsqueeze(0)).cuda(),task_config.detach()),1)
                    final_r = meta_value_network(value_inputs).cpu().data.numpy()[0]
                # train actor network
                actor_network_optim.zero_grad()
                states_var = Variable(states).cuda()

                actions_var = Variable(actions).cuda()
                task_configs = task_config.repeat(1,len(rewards)).view(-1,3)
                log_softmax_actions = actor_network(states_var)
                vs = meta_value_network(torch.cat((states_var,task_configs.detach()),1)).detach()
                # calculate qs
                qs = Variable(torch.Tensor(discount_reward(rewards,0.99,final_r))).cuda()

                advantages = qs - vs
                actor_network_loss = - torch.mean(torch.sum(log_softmax_actions*actions_var,1)* advantages) #+ entropy #+ actor_criterion(actor_y_samples,target_y)
                actor_network_loss.backward()
                torch.nn.utils.clip_grad_norm(actor_network.parameters(),0.5)

                actor_network_optim.step()

                pre_states = states
                pre_actions = actions
                pre_rewards = rewards

                # testing
                if (step + 1) % 10 == 0:
                    # testing
                    result = 0
                    test_task = CartPoleEnv(length = task.length)
                    for test_epi in range(10):
                        state = test_task.reset()
                        for test_step in range(200):
                            softmax_action = torch.exp(actor_network(Variable(torch.Tensor([state])).cuda()))
                            #print(softmax_action.data)
                            action = np.argmax(softmax_action.cpu().data.numpy()[0])
                            next_state,reward,done,_ = test_task.step(action)
                            result += reward
                            state = next_state
                            if done:
                                break
                    aver_result = result/10.0
                    test_results.append(aver_result)
                    train_games.append(task.episodes)
                    print("task length:",task_length,"episode:",episode,"step:",step+1,"result:",aver_result)

            data_i_episode["test_results"] = test_results
            data_i_episode["train_games"] = train_games
        data_i["results"] = data_i_episode
        datas.append(data_i)

    save_to_json('mvn_cartpole_test_100.json', datas)
Example no. 11
class CuteLearning():
    def __init__(self):
        self.plot_data = PlotData()
        self.env = CartPoleEnv()
        self.main_net = DQN()
        self.target_net = deepcopy(self.main_net)
        self.epsilon = config.epsilon
        self.eps_decay = 0.995
        self.visu = False
        self.visu_update = False  # 300
        self.visu_window = 5
        self.memory = Memory(memory_size = 30)
        self.batch_size = 5

    def reward_optimisation(self, state, end):
        reward = 0 if end else 1
        return reward
    
    def choose_action(self, q_values):
        if (random.random() > self.epsilon):
            return(np.argmax(q_values))
        else:
            return random.randint(0,1)

    def make_batch(self):
        batch = self.memory.get_batch(self.batch_size)
        states = []
        targets = []
        for s, a, r, ns, done in batch:
            states.append(s)
            q_values = self.target_net.predict(s).tolist()
            if done:
                q_values[a] = r
            else:
                q_values_next = self.target_net.predict(ns)
                q_values[a] = r + net_config.gamma * torch.max(q_values_next).item()
            targets.append(q_values)
        return states, targets

    def updato(self):
        states, targets = self.make_batch()
        self.main_net.update(states, targets)

    def upodato(self, state, reward, next_state, done):
        pass  # placeholder: single-transition update left unimplemented

    def learn(self, episodes = 10000, replay = False):
        episode = 0
        tmp = self.epsilon
        while (episode < episodes):
            done = False
            turn = 0
            state = self.env.reset()
            self.epsilon = self.epsilon * self.eps_decay  # decay the exploration rate each episode
            while not done:
                q_values = self.main_net.model(torch.Tensor(state)).tolist()
                action = self.choose_action(q_values)
                new_state, reward, done, _ = self.env.step(action)
                self.memory.add_data((state, action, reward, new_state, done))
                state = new_state
                turn += 1
                self.updato()


            print("turn:", turn)
            episode += 1
            if (episode % net_config.n_update == 0):
                self.target_net = deepcopy(self.main_net)
        self.epsilon = tmp
    def save(self):
        pass


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    Cutie = CuteLearning()
    Cutie.learn()
Example no. 12
class CuteLearning():
    def __init__(self):
        self.plot_data = PlotData()
        self.cart = CartPoleEnv()
        self.cart.reset()
        self.predi_net = DQN()
        self.updat_net = deepcopy(self.predi_net)
        self.turn = 0
        self.episode = 0
        self.epsilon = config.epsilon
        self.eps_decay = 0.99
        self.visu = False
        self.visu_update = False  # 300
        self.visu_window = 5
        self.consecutive_wins = 0
        self.best_consecutive_wins = 0
        self.last_save = 0
        self.memory = []

    def reward_optimisation(self, state, end):
        reward = -25 if end else 1
        if reward == 1:
            # Angle reward modification
            angle_r = 0.418 / 2
            reward += (((abs(angle_r - abs(state[2])) / angle_r) * 2) - 1) * 2
            # Position reward modification
            pos_r = 0.418 / 2
            reward += (((abs(pos_r - abs(state[0])) / pos_r) * 2) - 1) * 2
        return reward

    def learn(self):
        self.episode = 0
        n = 0
        while self.episode < 10:
            self.turn = 0
            end = False
            states = []
            targets = []
            while not end:
                # 1. Init
                state = self.cart.state
                # 2. Choose action
                q_values = self.predi_net.predict(state).tolist()
                a = choose_action_net(q_values, self.epsilon)
                # 3. Perform action
                next_state, _, end, _ = self.cart.step(a)
                # 4. Measure reward
                reward = self.reward_optimisation(next_state, end)
                q_values_next = self.predi_net.predict(next_state)
                # 5. Calcul Q-Values
                q_values[a] = reward + net_config.gamma * \
                    torch.max(q_values_next).item()

                self.turn += 1
                self.memory.append((state, a, next_state, reward, end))
                # self.updat_net.update(state, q_values)
                states.append(state)
                targets.append(q_values)
                if self.turn % 20 == 0 or end:  # batch update every 20 turns or at episode end
                    self.updat_net.update(states, targets)
                    states = []
                    targets = []

                if self.turn >= 500:
                    end = True
                if self.visu:
                    self.cart.render()

            self.episode += 1
            self.replay(20)
            if self.episode % net_config.n_update == 0 and self.episode:
                print("Update")
                self.predi_net.model.load_state_dict(
                    self.updat_net.model.state_dict())
            self.end()
            n += 1

        self.save()
        self.cart.close()
        self.plot_data.clear()

    def replay(self, size):
        if size > len(self.memory):
            size = len(self.memory)
        data = random.sample(self.memory, size)
        states = []
        targets = []
        for state, action, next_state, reward, done in data:
            q_values = self.predi_net.predict(state)
            if done:
                q_values[action] = reward
            else:
                # The only difference from simple replay is this line:
                # the next Q-values come from predi_net, which is only synced
                # periodically, so it plays the role of the target network.
                q_values_next = self.predi_net.predict(next_state)
                q_values[action] = reward + net_config.gamma * torch.max(
                    q_values_next).item()
            states.append(state)
            targets.append(q_values)
        self.updat_net.update(states, targets)

    def end(self):
        self.plot_data.new_data(self.turn)
        if self.turn > 195:
            self.consecutive_wins += 1
            if self.best_consecutive_wins < self.consecutive_wins:
                self.best_consecutive_wins = self.consecutive_wins
            if self.consecutive_wins > 200:
                self.save()
                print(("WIN IN " + str(self.episode) + " EPISODES\n") * 100)
        else:
            self.consecutive_wins = 0
            if self.last_save * 1.2 < self.best_consecutive_wins and 50 <= self.best_consecutive_wins:
                self.save()
                self.last_save = self.best_consecutive_wins
        print("Episode: ", self.episode, "\tTurn:", self.turn, "\tEpsilon:",
              self.epsilon, "\tWins: ", "{:3}".format(self.consecutive_wins),
              "/", self.best_consecutive_wins)
        self.turn = 0
        self.cart.reset()
        if self.episode % config.graph_update == 0 and self.episode != 0:
            self.plot_data.graph()
        if self.visu_update:
            if self.episode % self.visu_update == 0:
                self.visu = True
            if self.episode % self.visu_update == self.visu_window:
                self.visu = False
                self.cart.close()
        self.epsilon = max(self.epsilon * self.eps_decay, 0.01)

    def save(self):
        pass
Example no. 13
import gym
from cartpole import CartPoleEnv

env = CartPoleEnv()

observation = env.reset()

total_reward = 0

print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

for t in range(200):
    env.render()
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    total_reward += reward
    if done:
        break

env.close()

print(total_reward)
Example no. 14
    # directory, including one with existing data -- all monitor files
    # will be namespaced). You can also dump to a tempdir if you'd
    # like: tempfile.mkdtemp().
    outdir = '/tmp/random-agent-results'
    #env = wrappers.Monitor(env, directory=outdir, force=True)
    env.seed(0)
    agent = RandomAgent(env.action_space)

    episode_count = 100
    reward = 0
    done = False

    action_record = []
    state_record = []
    for i in range(episode_count):
        ob = env.reset()
        #action = agent.act(ob, reward, done)
        traj_action = []
        traj_state = []
        while True:
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            traj_action.append(action)
            traj_state.append(ob)
            print(done)
            #env.render()
            if done:
                break
        action_record.append(traj_action)
        state_record.append(traj_state)
    action_record = np.asarray(action_record)