Example no. 1
    def step(self, action):
        self.state = self.state + np.asarray(action)
        env = CartPoleEnv(self.state[0], self.state[1], self.state[2],
                          self.state[3], self.state[4])

        episode_count = len(self.action_record)
        model_diff = 0
        for i in range(episode_count):
            ob = env.reset()
            traj_state = []
            for j in range(len(self.action_record[i])):
                # Trajectories that terminate early make the comparison tricky here
                action = self.action_record[i][j]
                ob, reward, done, _ = env.step(action)
                traj_state.append(ob)
                if done:
                    break
            if not done:
                model_diff = model_diff + 1  # penalty for not done
            model_diff = model_diff + self._traj_diff(np.asarray(traj_state),
                                                      self.state_record[i])
        reward = -model_diff - self.status
        self.status = -model_diff
        done = False
        return np.array(self.state), reward, done, {}
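The step method above relies on a self._traj_diff helper that is not shown. A minimal sketch, assuming it measures the mean squared difference between the replayed trajectory and the recorded one over their common prefix (the helper body is an assumption, not the original author's code):

    import numpy as np

    def _traj_diff(self, traj_state, recorded_state):
        # Hypothetical: compare only the overlapping prefix, since a replayed
        # trajectory may terminate earlier or later than the recorded one.
        recorded_state = np.asarray(recorded_state)
        n = min(len(traj_state), len(recorded_state))
        if n == 0:
            return 0.0
        diff = traj_state[:n] - recorded_state[:n]
        return float(np.mean(np.sum(diff ** 2, axis=1)))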
Example no. 2
def looping(qt=None, epsilon=config.epsilon, visu=False):
    plt.ion()
    cart = CartPoleEnv()
    data = []
    data_rm = []
    if (qt is None):
        qt = initialize_Qtable()
    for episode in range(config.episodes):
        cart.reset()
        turn = 0
        end = False
        epsilon = epsilon * 0.9999
        while not end:
            current_state = cart.state
            action = choose_action(current_state, qt, epsilon)
            new_state, reward, end, _ = cart.step(action)
            if end:
                reward = -10
            update_qt_new(qt, current_state, reward, action, new_state)
            turn += 1
            if (visu):
                cart.render()
        data.append(turn)
        data_rm.append(np.mean(data[-100:]))
        print("Episode: ", episode, "\tTurn:", turn, "\t Epsilon:", epsilon)
        if episode % config.graph_update == 0 and episode != 0:
            graph(data, data_rm)
        # if ((episode + 1) % 100 == 0 and input("continue (y/n)") != "y"):
        #     break
    cart.close()
    return (data, qt)
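The loop above assumes choose_action and update_qt_new helpers that are not shown. A minimal sketch, assuming the Q-table maps a discretized state (via a hypothetical discretize helper) to one Q-value per action, and using the standard one-step Q-learning update:

import random
import numpy as np

def choose_action(state, qt, epsilon):
    # Epsilon-greedy over the Q-table (the table layout is an assumption).
    if random.random() < epsilon:
        return random.randint(0, 1)
    return int(np.argmax(qt[discretize(state)]))

def update_qt_new(qt, state, reward, action, new_state, alpha=0.1, gamma=0.99):
    # Standard tabular update: Q(s,a) += alpha * (r + gamma * max Q(s',.) - Q(s,a))
    s, s_ = discretize(state), discretize(new_state)
    qt[s][action] += alpha * (reward + gamma * np.max(qt[s_]) - qt[s][action])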
Example no. 3
def loop(qt=None, epsilon=1, visu=False):
    plt.ion()
    cart = CartPoleEnv()
    data = []
    data_rm = []
    config.epsilon = epsilon
    if (qt is None):
        qt = initialize_Qtable()
    for episode in range(config.episodes):
        cart.reset()
        turn = 0
        s = cart.state
        end = False
        epsilon_tmp = config.epsilon
        while not end:
            config.epsilon *= 0.97
            if (visu):
                cart.render()
            a = choose_action(s, qt)
            _, _, end, _ = cart.step(a)
            l_val = bellman_q(s, qt, dummy_cart(s), action=a)
            # print(l_val)
            update_qt(qt, s, a, l_val)
            s = cart.state
            turn += 1
        data.append(turn)
        data_rm.append(np.mean(data[-100:]))
        print("Episode: ", episode, "\tTurn:", turn, "\t Epsilon:",
              config.epsilon)
        config.epsilon = epsilon_tmp
        if episode % config.graph_update == 0 and episode != 0:
            graph(data, data_rm)
        # if ((episode + 1) % 100 == 0 and input("continue (y/n)") != "y"):
        #     break
    cart.close()
    return (data, qt)
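This variant computes its target through bellman_q and dummy_cart, neither of which is shown. A minimal sketch, assuming dummy_cart clones the environment at the given state so the one-step Bellman target can be computed without disturbing the real cart, and again assuming a discretize helper for the table lookup (the -10 terminal penalty mirrors Example no. 2; all of this is an assumption):

import numpy as np

def dummy_cart(state):
    # Hypothetical: a throwaway copy of the environment placed at `state`.
    env = CartPoleEnv()
    env.reset()
    env.state = state
    return env

def bellman_q(state, qt, env_copy, action, gamma=0.99):
    # Hypothetical one-step Bellman target: r + gamma * max_a' Q(s', a').
    new_state, reward, end, _ = env_copy.step(action)
    if end:
        return -10
    return reward + gamma * np.max(qt[discretize(new_state)])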
Example no. 4
    
V = np.zeros(n_s)

for i in range(episodes):

    print("episode=", i)

    obs = env.reset()
    s = discretize_state(obs, s_bounds, n_s)

    finished = False
    time_step = 0

    # long episodes are cut off after 200 steps
    while not finished and not time_step == 200:

        # stochastic policy - probability 0.5 for both actions (left, right)
        action = np.random.randint(0, 2)
        obs, reward, finished, info = env.step(action)
        state_new = discretize_state(obs, s_bounds, n_s)
        V[s] = V[s] + alpha * (reward + gamma * V[state_new] - V[s])
        s = state_new

        time_step += 1

        # TO BE COMPLETED

print(V)
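This snippet and Example no. 10 both call a discretize_state(obs, s_bounds, n_s) helper that is not shown. A minimal sketch, assuming s_bounds holds a (low, high) pair per observation dimension and n_s is a matching tuple of bin counts, so that V = np.zeros(n_s) can be indexed directly with the returned tuple:

import numpy as np

def discretize_state(obs, s_bounds, n_s):
    # Hypothetical binning: clip each dimension to its bounds, then map it
    # to one of `bins` equal-width intervals.
    idx = []
    for value, (low, high), bins in zip(obs, s_bounds, n_s):
        value = min(max(value, low), high)
        idx.append(int((value - low) / (high - low) * (bins - 1)))
    return tuple(idx)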
Example no. 5
        state = env.reset()
        state = np.reshape(state, [1, 4])  # reshape [a, b, c, d] into [[a, b, c, d]] to match the network's batch input

        for t in range(1000):
            action, force = agent.act(state)

            # given Q values for each force and max arg of it, apply the corresponding force to the model
            if force == 2:
                env.force_mag = 6
            elif force == 3:
                env.force_mag = 8
            else:
                env.force_mag = 10

            # perform one step
            state_, reward, done, info = env.step(action)
            state_ = np.reshape(state_, [1, 4])

            # reward function
            reward = reward if not done else -20  # additional penalty for a loss
            reward -= 1.0 * abs(state_[0][0])  # penalty for moving too far away, increasing linearly

            # archive the step and perform fitting to the model
            agent.remember(state, reward, action, state_, done)
            # agent.replay()

            # state = next state
            state = state_
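The agent.act call above returns both a discrete cart action and a force index, but the agent itself is not shown. A purely illustrative sketch of such a policy, assuming a Q-network with six outputs (two push directions times three force levels) and epsilon-greedy exploration; the 6-way layout and the model/epsilon attributes are assumptions, not the original author's code:

import random
import numpy as np

def act(self, state):
    # Hypothetical: q_values has 6 entries, one per (action, force-level) pair.
    if random.random() < self.epsilon:
        idx = random.randrange(6)
    else:
        idx = int(np.argmax(self.model.predict(state)[0]))
    action = idx % 2       # 0: push left, 1: push right
    force = idx // 2 + 2   # 2, 3, 4 -> mapped to force_mag 6, 8, 10 in the loop above
    return action, force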
Example no. 6
from cartpole import CartPoleEnv
import numpy as np
cart = CartPoleEnv()
cart.reset()

for _ in range(1000):

    # Calculate the Gradients

    # Update Thetas

    # Sample u trajectory

    # Apply u[0] to the actual system
    cart.step(10)  # Apply Some force

    # Update the New State in the Learner

    # Shift the Thetas

    # Simulate
    cart.render()

cart.close()
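The skeleton above outlines a receding-horizon control loop. A minimal sketch of the "Sample u trajectory / Apply u[0]" steps, assuming a learned one-step model predict_next(state, u) and a quadratic cost on pole angle and cart position (the model, the cost, and all names here are assumptions):

import numpy as np

def plan_action(state, predict_next, horizon=15, n_samples=64, force_limit=10.0):
    # Hypothetical random-shooting MPC: sample candidate force sequences,
    # roll them out through the learned model, keep the cheapest one,
    # and return only its first force (the plan is re-made every step).
    best_u0, best_cost = 0.0, np.inf
    for _ in range(n_samples):
        u_traj = np.random.uniform(-force_limit, force_limit, size=horizon)
        s, cost = np.asarray(state, dtype=float), 0.0
        for u in u_traj:
            s = predict_next(s, u)
            cost += s[2] ** 2 + 0.1 * s[0] ** 2  # pole angle and cart position
        if cost < best_cost:
            best_u0, best_cost = u_traj[0], cost
    return best_u0

In the loop above, cart.step(plan_action(cart.state, predict_next)) could then replace the fixed cart.step(10) call.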
Example no. 7
def main():
    # Define dimensions of the networks
    
    meta_value_input_dim =  STATE_DIM + TASK_CONFIG_DIM # 7
    task_config_input_dim = STATE_DIM + ACTION_DIM + 1 # 7

    # init meta value network with a task config network
    meta_value_network = MetaValueNetwork(input_size = meta_value_input_dim,hidden_size = 80,output_size = 1)
    task_config_network = TaskConfigNetwork(input_size = task_config_input_dim,hidden_size = 30,num_layers = 1,output_size = 3)
    meta_value_network.cuda()
    task_config_network.cuda()

    if os.path.exists("meta_value_network_cartpole.pkl"):
        meta_value_network.load_state_dict(torch.load("meta_value_network_cartpole.pkl"))
        print("load meta value network success")
    if os.path.exists("task_config_network_cartpole.pkl"):
        task_config_network.load_state_dict(torch.load("task_config_network_cartpole.pkl"))
        print("load task config network success")

    meta_value_network_optim = torch.optim.Adam(meta_value_network.parameters(),lr=0.001)
    task_config_network_optim = torch.optim.Adam(task_config_network.parameters(),lr=0.001)

    # init a task generator for data fetching
    task_list = [CartPoleEnv(np.random.uniform(L_MIN,L_MAX)) for task in range(TASK_NUMS)]
    [task.reset() for task in task_list]

    task_lengths = [task.length for task in task_list]
    print("task length:",task_lengths)

    for episode in range(EPISODE):
        # ----------------- Training ------------------

        if (episode+1) % 10 ==0 :
            # renew the tasks
            task_list = [CartPoleEnv(np.random.uniform(L_MIN,L_MAX)) for task in range(TASK_NUMS)]
            task_lengths = [task.length for task in task_list]
            print("task length:",task_lengths)
            [task.reset() for task in task_list]

        # fetch pre data samples for task config network
        # [task_nums,sample_nums,x+y`]
        
        actor_network_list = [ActorNetwork(STATE_DIM,40,ACTION_DIM) for i in range(TASK_NUMS)]
        [actor_network.cuda() for actor_network in actor_network_list]
        actor_network_optim_list = [torch.optim.Adam(actor_network.parameters(),lr = 0.01) for actor_network in actor_network_list]

        # sample pre state,action,reward for task config
        pre_states = []
        pre_actions = []
        pre_rewards = []
        for i in range(TASK_NUMS):
            states,actions,rewards,_,_ = roll_out(actor_network_list[i],task_list[i],SAMPLE_NUMS)
            pre_states.append(states)
            pre_actions.append(actions)
            pre_rewards.append(rewards)


        for step in range(STEP):

            for i in range(TASK_NUMS):
                # init task config [1, sample_nums,task_config] task_config size=3
                pre_data_samples = torch.cat((pre_states[i][-9:],pre_actions[i][-9:],torch.Tensor(pre_rewards[i])[-9:]),1).unsqueeze(0)
                task_config = task_config_network(Variable(pre_data_samples).cuda()) # [1,3]

                states,actions,rewards,is_done,final_state = roll_out(actor_network_list[i],task_list[i],SAMPLE_NUMS)
                final_r = 0
                if not is_done:
                    value_inputs = torch.cat((Variable(final_state.unsqueeze(0)).cuda(),task_config.detach()),1)
                    final_r = meta_value_network(value_inputs).cpu().data.numpy()[0]

                # train actor network
                actor_network_optim_list[i].zero_grad()
                states_var = Variable(states).cuda()
                
                actions_var = Variable(actions).cuda()
                task_configs = task_config.repeat(1,len(rewards)).view(-1,3)
                log_softmax_actions = actor_network_list[i](states_var)
                vs = meta_value_network(torch.cat((states_var,task_configs.detach()),1)).detach()
                # calculate qs
                qs = Variable(torch.Tensor(discount_reward(rewards,0.99,final_r))).cuda()

                advantages = qs - vs
                actor_network_loss = - torch.mean(torch.sum(log_softmax_actions*actions_var,1)* advantages) #+ entropy #+ actor_criterion(actor_y_samples,target_y)
                actor_network_loss.backward()
                torch.nn.utils.clip_grad_norm(actor_network_list[i].parameters(),0.5)

                actor_network_optim_list[i].step()

                # train value network

                meta_value_network_optim.zero_grad()

                target_values = qs
                values = meta_value_network(torch.cat((states_var,task_configs),1))
                criterion = nn.MSELoss()
                meta_value_network_loss = criterion(values,target_values)
                meta_value_network_loss.backward()
                torch.nn.utils.clip_grad_norm(meta_value_network.parameters(),0.5)

                meta_value_network_optim.step()                
                
                # keep the latest roll-out as pre data for the task config network
                
                pre_states[i] = states
                pre_actions[i] = actions
                pre_rewards[i] = rewards

                if (step + 1) % 100 == 0:
                    result = 0
                    test_task = CartPoleEnv(length = task_list[i].length)
                    for test_epi in range(10):
                        state = test_task.reset()
                        for test_step in range(200):
                            softmax_action = torch.exp(actor_network_list[i](Variable(torch.Tensor([state])).cuda()))
                            #print(softmax_action.data)
                            action = np.argmax(softmax_action.cpu().data.numpy()[0])
                            next_state,reward,done,_ = test_task.step(action)
                            result += reward
                            state = next_state
                            if done:
                                break
                    print("episode:",episode,"task:",i,"step:",step+1,"test result:",result/10.0)

        
        if (episode+1) % 10 == 0 :
            # Save meta value network
            torch.save(meta_value_network.state_dict(),"meta_value_network_cartpole.pkl")
            torch.save(task_config_network.state_dict(),"task_config_network_cartpole.pkl")
            print("save networks for episode:",episode)
Example no. 8
from cartpole import CartPoleEnv

env = CartPoleEnv(length=1.0)

env.reset()

for step in range(1000):
    action = 0
    next_state, reward, done, _ = env.step(action)

    if done:
        print "done reward:", reward
        break
Example no. 9
def main():
    # Define dimensions of the networks

    meta_value_input_dim =  STATE_DIM + TASK_CONFIG_DIM # 7
    task_config_input_dim = STATE_DIM + ACTION_DIM + 1 # 7

    # init meta value network with a task config network
    meta_value_network = MetaValueNetwork(input_size = meta_value_input_dim,hidden_size = 80,output_size = 1)
    task_config_network = TaskConfigNetwork(input_size = task_config_input_dim,hidden_size = 30,num_layers = 1,output_size = 3)
    meta_value_network.cuda()
    task_config_network.cuda()

    if os.path.exists("meta_value_network_cartpole.pkl"):
        meta_value_network.load_state_dict(torch.load("meta_value_network_cartpole.pkl"))
        print("load meta value network success")
    if os.path.exists("task_config_network_cartpole.pkl"):
        task_config_network.load_state_dict(torch.load("task_config_network_cartpole.pkl"))
        print("load task config network success")


    task_lengths = np.linspace(L_MIN,L_MAX,TASK_NUMS)

    datas = []

    for task_length in task_lengths:

        data_i = {}
        data_i["task_length"] = task_length

        data_i_episode = {}
        for episode in range(EPISODE):
            task = CartPoleEnv(length = task_length)
            task.reset()

            data_i_episode["episode"] = episode

            # ----------------- Training ------------------

            # fetch pre data samples for task config network
            # [task_nums,sample_nums,x+y`]

            actor_network = ActorNetwork(STATE_DIM,40,ACTION_DIM)
            actor_network.cuda()
            actor_network_optim = torch.optim.Adam(actor_network.parameters(),lr = 0.01)
            '''
            if os.path.exists("actor_network.pkl"):
                actor_network.load_state_dict(torch.load("actor_network.pkl"))
                print("load actor_network success")
            '''
            # sample pre state, action, reward for task config


            pre_states,pre_actions,pre_rewards,_,_ = roll_out(actor_network,task,SAMPLE_NUMS)


            test_results = []
            train_games = []
            for step in range(STEP):

                # init task config [1, sample_nums,task_config] task_config size=3
                pre_data_samples = torch.cat((pre_states[-9:],pre_actions[-9:],torch.Tensor(pre_rewards)[-9:]),1).unsqueeze(0)
                task_config = task_config_network(Variable(pre_data_samples).cuda()) # [1,3]

                states,actions,rewards,is_done,final_state = roll_out(actor_network,task,SAMPLE_NUMS)
                final_r = 0
                if not is_done:
                    value_inputs = torch.cat((Variable(final_state.unsqueeze(0)).cuda(),task_config.detach()),1)
                    final_r = meta_value_network(value_inputs).cpu().data.numpy()[0]
                # train actor network
                actor_network_optim.zero_grad()
                states_var = Variable(states).cuda()

                actions_var = Variable(actions).cuda()
                task_configs = task_config.repeat(1,len(rewards)).view(-1,3)
                log_softmax_actions = actor_network(states_var)
                vs = meta_value_network(torch.cat((states_var,task_configs.detach()),1)).detach()
                # calculate qs
                qs = Variable(torch.Tensor(discount_reward(rewards,0.99,final_r))).cuda()

                advantages = qs - vs
                actor_network_loss = - torch.mean(torch.sum(log_softmax_actions*actions_var,1)* advantages) #+ entropy #+ actor_criterion(actor_y_samples,target_y)
                actor_network_loss.backward()
                torch.nn.utils.clip_grad_norm(actor_network.parameters(),0.5)

                actor_network_optim.step()

                pre_states = states
                pre_actions = actions
                pre_rewards = rewards

                # testing
                if (step + 1) % 10 == 0:
                    # testing
                    result = 0
                    test_task = CartPoleEnv(length = task.length)
                    for test_epi in range(10):
                        state = test_task.reset()
                        for test_step in range(200):
                            softmax_action = torch.exp(actor_network(Variable(torch.Tensor([state])).cuda()))
                            #print(softmax_action.data)
                            action = np.argmax(softmax_action.cpu().data.numpy()[0])
                            next_state,reward,done,_ = test_task.step(action)
                            result += reward
                            state = next_state
                            if done:
                                break
                    aver_result = result/10.0
                    test_results.append(aver_result)
                    train_games.append(task.episodes)
                    print("task length:",task_length,"episode:",episode,"step:",step+1,"result:",aver_result)

            data_i_episode["test_results"] = test_results
            data_i_episode["train_games"] = train_games
        data_i["results"] = data_i_episode
        datas.append(data_i)

    save_to_json('mvn_cartpole_test_100.json', datas)
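Both training scripts also depend on a roll_out(actor_network, task, sample_nums) helper that is not shown. A minimal sketch, assuming it collects up to sample_nums transitions with the current policy and returns (states, one-hot actions, rewards, is_done, final_state) in the shapes consumed above (stacked tensors for states and actions, a plain list for rewards); this is an assumption, not the original implementation:

import numpy as np
import torch
from torch.autograd import Variable

def roll_out(actor_network, task, sample_nums):
    states, actions, rewards = [], [], []
    state = task.state
    is_done = False
    for _ in range(sample_nums):
        # the actor outputs log-softmax action probabilities
        log_probs = actor_network(Variable(torch.Tensor([state])).cuda())
        probs = torch.exp(log_probs).cpu().data.numpy()[0]
        action = int(np.random.choice(len(probs), p=probs / probs.sum()))
        one_hot = [int(k == action) for k in range(len(probs))]
        next_state, reward, done, _ = task.step(action)
        states.append(state)
        actions.append(one_hot)
        rewards.append(reward)
        state = next_state
        if done:
            is_done = True
            task.reset()
            break
    return torch.Tensor(states), torch.Tensor(actions), rewards, is_done, torch.Tensor(state)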
Example no. 10
for i in range(episodes):

    print("episode=", i)

    obs = env.reset()
    s = discretize_state(obs, s_bounds, n_s)

    finished = False
    time_step = 0

    # long episodes are cut off after 200 steps
    while not finished and not time_step == 200:

        # stochastic policy - probability 0.5 for both actions (left, right)
        action = np.random.randint(0, 2)

        # TO BE COMPLETED
        obs, reward, done, info = env.step(action)
        s_ = discretize_state(obs, s_bounds, n_s)

        V[s] = V[s] + alpha * (reward + gamma * V[s_] - V[s])
        s = s_  #s = s'
        if done:
            finished = True
            break
        time_step += 1

print(V)

Example no. 11
class CuteLearning():
    def __init__(self):
        self.plot_data = PlotData()
        self.env = CartPoleEnv()
        self.main_net = DQN()
        self.target_net = deepcopy(self.main_net)
        self.epsilon = config.epsilon
        self.eps_decay = 0.995
        self.visu = False
        self.visu_update = False  # 300
        self.visu_window = 5
        self.memory = Memory(memory_size = 30)
        self.batch_size = 5

    def reward_optimisation(self, state, end):
        reward = 0 if end else 1
        return reward
    
    def choose_action(self, q_values):
        if (random.random() > self.epsilon):
            return(np.argmax(q_values))
        else:
            return random.randint(0,1)

    def make_batch(self):
        batch = self.memory.get_batch(self.batch_size)
        states = []
        targets = []
        for s, a, r, ns, done in batch:
            states.append(s)
            q_values = self.target_net.predict(s).tolist()
            if done:
                q_values[a] = r
            else:
                q_values_next = self.target_net.predict(ns)
                q_values[a] = r + net_config.gamma * torch.max(q_values_next).item()
            targets.append(q_values)
        return states, targets

    def updato(self):
        states, targets = self.make_batch()
        self.main_net.update(states, targets)

    def upodato(self, state, reward, next_state, done):
        # Left unfinished in the original; `pass` keeps the class syntactically valid.
        pass

    def learn(self, episodes = 10000, replay = False):
        episode = 0
        tmp = self.epsilon
        while (episode < episodes):
            done = False
            turn = 0
            state = self.env.reset()
            self.epsilon = self.epsilon * self.eps_decay
            while not done:
                q_values = self.main_net.model(torch.Tensor(state)).tolist()
                action = self.choose_action(q_values)
                new_state, reward, done, _ = self.env.step(action)
                self.memory.add_data((state, action, reward, new_state, done))
                state = new_state
                turn += 1
                self.updato()


            print("turn:", turn)
            episode += 1
            if (episode % net_config.n_update == 0):
                self.target_net = deepcopy(self.main_net)
        self.epsilon = tmp
    def save(self):
        pass


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    Cutie = CuteLearning()
    Cutie.learn()
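The Memory class used above (memory_size, add_data, get_batch) is not shown. A minimal sketch, assuming a bounded FIFO buffer with uniform random sampling:

import random
from collections import deque

class Memory:
    # Hypothetical replay buffer matching the calls in CuteLearning.
    def __init__(self, memory_size=30):
        self.buffer = deque(maxlen=memory_size)

    def add_data(self, transition):
        # transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def get_batch(self, batch_size):
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))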
Example no. 12
class CuteLearning():
    def __init__(self):
        self.plot_data = PlotData()
        self.cart = CartPoleEnv()
        self.cart.reset()
        self.predi_net = DQN()
        self.updat_net = deepcopy(self.predi_net)
        self.turn = 0
        self.episode = 0
        self.epsilon = config.epsilon
        self.eps_decay = 0.99
        self.visu = False
        self.visu_update = False  #300
        self.visu_window = 5
        self.consecutive_wins = 0
        self.best_consecutive_wins = 0
        self.last_save = 0
        self.memory = []

    def reward_optimisation(self, state, end):
        reward = -25 if end else 1
        if reward == 1:
            # Angle reward modification
            angle_r = 0.418 / 2
            reward += (((abs(angle_r - abs(state[2])) / angle_r) * 2) - 1) * 2
            # Position reward modification
            pos_r = 0.418 / 2
            reward += (((abs(pos_r - abs(state[0])) / pos_r) * 2) - 1) * 2
        return reward

    def learn(self):
        self.episode = 0
        n = 0
        while self.episode < 10:
            self.turn = 0
            end = False
            states = []
            targets = []
            while not end:
                # 1. Init
                state = self.cart.state
                # 2. Choose action
                q_values = self.predi_net.predict(state).tolist()
                a = choose_action_net(q_values, self.epsilon)
                # 3. Perform action
                next_state, _, end, _ = self.cart.step(a)
                # 4. Measure reward
                reward = self.reward_optimisation(next_state, end)
                q_values_next = self.predi_net.predict(next_state)
                # 5. Calcul Q-Values
                q_values[a] = reward + net_config.gamma * \
                    torch.max(q_values_next).item()

                self.turn += 1
                self.memory.append((state, a, next_state, reward, end))
                # self.updat_net.update(state, q_values)
                states.append(state)
                targets.append(q_values)
                if (self.turn % 20 == 0 and self.turn) or end:
                    self.updat_net.update(states, targets)
                    states = []
                    targets = []

                if self.turn >= 500:
                    end = True
                if self.visu:
                    self.cart.render()

            self.episode += 1
            self.replay(20)
            if self.episode % net_config.n_update == 0 and self.episode:
                print("Update")
                self.predi_net.model.load_state_dict(
                    self.updat_net.model.state_dict())
            self.end()
            n += 1

        self.save()
        self.cart.close()
        self.plot_data.clear()

    def replay(self, size):
        if size > len(self.memory):
            size = len(self.memory)
        data = random.sample(self.memory, size)
        states = []
        targets = []
        for state, action, next_state, reward, done in data:
            q_values = self.predi_net.predict(state)
            if done:
                q_values[action] = reward
            else:
                # The only difference from simple replay is this line:
                # the next Q-values are predicted with the target (predi) network.
                q_values_next = self.predi_net.predict(next_state)
                q_values[action] = reward + net_config.gamma * torch.max(
                    q_values_next).item()
            states.append(state)
            targets.append(q_values)
        self.updat_net.update(states, targets)

    def end(self):
        self.plot_data.new_data(self.turn)
        if self.turn > 195:
            self.consecutive_wins += 1
            if self.best_consecutive_wins < self.consecutive_wins:
                self.best_consecutive_wins = self.consecutive_wins
            if self.consecutive_wins > 200:
                self.save()
                print(("WIN IN " + str(self.episode) + " EPISODES\n") * 100)
        else:
            self.consecutive_wins = 0
            if self.last_save * 1.2 < self.best_consecutive_wins and 50 <= self.best_consecutive_wins:
                self.save()
                self.last_save = self.best_consecutive_wins
        print("Episode: ", self.episode, "\tTurn:", self.turn, "\tEpsilon:",
              self.epsilon, "\tWins: ", "{:3}".format(self.consecutive_wins),
              "/", self.best_consecutive_wins)
        self.turn = 0
        self.cart.reset()
        if self.episode % config.graph_update == 0 and self.episode != 0:
            self.plot_data.graph()
        if self.visu_update:
            if self.episode % self.visu_update == 0:
                self.visu = True
            if self.episode % self.visu_update == self.visu_window:
                self.visu = False
                self.cart.close()
        self.epsilon = max(self.epsilon * self.eps_decay, 0.01)

    def save(self):
        pass
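The DQN class shared by Examples no. 11 and 12 (with model, predict and update) is not shown. A minimal sketch, assuming a small fully connected PyTorch network trained with an MSE loss on the supplied targets; the layer sizes and learning rate are assumptions:

import numpy as np
import torch
import torch.nn as nn

class DQN:
    # Hypothetical wrapper matching the predict/update/model calls above.
    def __init__(self, state_dim=4, n_actions=2, lr=1e-3):
        self.model = nn.Sequential(
            nn.Linear(state_dim, 64), nn.ReLU(),
            nn.Linear(64, n_actions))
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = nn.MSELoss()

    def predict(self, state):
        # Q-values for a single state, without tracking gradients.
        with torch.no_grad():
            return self.model(torch.as_tensor(state, dtype=torch.float32))

    def update(self, states, targets):
        # One gradient step towards the supplied TD targets.
        states = torch.as_tensor(np.asarray(states), dtype=torch.float32)
        targets = torch.stack([torch.as_tensor(t, dtype=torch.float32) for t in targets])
        self.optimizer.zero_grad()
        loss = self.criterion(self.model(states), targets)
        loss.backward()
        self.optimizer.step()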
Example no. 13
import gym
from cartpole import CartPoleEnv

env = CartPoleEnv()

observation = env.reset()

total_reward = 0

print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

for t in range(200):
    env.render()
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    total_reward += reward
    if done:
        break

env.close()

print(total_reward)
Example no. 14
    agent = RandomAgent(env.action_space)

    episode_count = 100
    reward = 0
    done = False

    action_record = []
    state_record = []
    for i in range(episode_count):
        ob = env.reset()
        #action = agent.act(ob, reward, done)
        traj_action = []
        traj_state = []
        while True:
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            traj_action.append(action)
            traj_state.append(ob)
            print(done)
            #env.render()
            if done:
                break
        action_record.append(traj_action)
        state_record.append(traj_state)
    action_record = np.asarray(action_record)
    state_record = np.asarray(state_record)
    record = {'actions': action_record, 'states':state_record}

    record_file = open('record.pickle', 'wb')
    pickle.dump(record, record_file)
    # Note there's no env.render() here. But the environment still can open window and
    # render if asked by env.monitor: it calls env.render('rgb_array') to record video.
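The RandomAgent used above is not defined in this snippet. A minimal sketch, following the act(observation, reward, done) signature used in the loop:

class RandomAgent:
    # Hypothetical agent that ignores its inputs and samples actions uniformly.
    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, observation, reward, done):
        return self.action_space.sample()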