def step(self, action):
    self.state = self.state + np.asarray(action)
    env = CartPoleEnv(self.state[0], self.state[1], self.state[2], self.state[3], self.state[4])
    episode_count = len(self.action_record)
    model_diff = 0
    for i in range(episode_count):
        ob = env.reset()
        traj_state = []
        for j in range(len(self.action_record[i])):
            # trajectories that terminate (done) early are the tricky case here
            action = self.action_record[i][j]
            ob, reward, done, _ = env.step(action)
            traj_state.append(ob)
            if done:
                break
        if not done:
            model_diff = model_diff + 1  # penalty for not terminating
        model_diff = model_diff + self._traj_diff(np.asarray(traj_state), self.state_record[i])
    reward = -model_diff - self.status
    self.status = -model_diff
    done = False
    return np.array(self.state), reward, done, {}
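# The step() above calls a `_traj_diff` helper that is not shown. A minimal sketch of
# what it could look like, assuming it scores how far the simulated trajectory drifts
# from the recorded one as a mean squared distance over the shared prefix; the name
# comes from the snippet above, the implementation here is an assumption.
import numpy as np

def _traj_diff(sim_traj, recorded_traj):
    """Hypothetical helper: mean squared distance over the overlapping time steps."""
    n = min(len(sim_traj), len(recorded_traj))
    if n == 0:
        return 0.0
    diff = np.asarray(sim_traj[:n]) - np.asarray(recorded_traj[:n])
    return float(np.mean(np.sum(diff ** 2, axis=1)))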
def looping(qt=None, epsilon=config.epsilon, visu=False):
    plt.ion()
    cart = CartPoleEnv()
    data = []
    data_rm = []
    if qt is None:
        qt = initialize_Qtable()
    for episode in range(config.episodes):
        cart.reset()
        turn = 0
        end = False
        epsilon = epsilon * 0.9999
        while not end:
            current_state = cart.state
            action = choose_action(current_state, qt, epsilon)
            new_state, reward, end, _ = cart.step(action)
            if end:
                reward = -10
            update_qt_new(qt, current_state, reward, action, new_state)
            turn += 1
            if visu:
                cart.render()
        data.append(turn)
        data_rm.append(np.mean(data[-100:]))
        print("Episode: ", episode, "\tTurn:", turn, "\t Epsilon:", epsilon)
        if episode % config.graph_update == 0 and episode != 0:
            graph(data, data_rm)
        # if ((episode + 1) % 100 == 0 and input("continue (y/n)" != "y")):
        #     break
    cart.close()
    return (data, qt)
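# `looping` relies on a `choose_action` helper that is not shown. A minimal epsilon-greedy
# sketch, assuming the Q-table `qt` maps a discretized state key to one Q-value per
# action; the `discretize` parameter is a placeholder for whatever discretization the
# Q-table actually uses.
import random
import numpy as np

def choose_action(state, qt, epsilon, discretize=tuple):
    """Hypothetical epsilon-greedy action selection over a tabular Q function."""
    key = discretize(state)              # placeholder state discretization
    if random.random() < epsilon:
        return random.randint(0, 1)      # explore: random CartPole action
    return int(np.argmax(qt[key]))       # exploit: greedy action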
def loop(qt=None, epsilon=1, visu=False):
    plt.ion()
    cart = CartPoleEnv()
    data = []
    data_rm = []
    config.epsilon = epsilon
    if qt is None:
        qt = initialize_Qtable()
    for episode in range(config.episodes):
        cart.reset()
        turn = 0
        s = cart.state
        end = False
        epsilon_tmp = config.epsilon
        while not end:
            config.epsilon *= 0.97
            if visu:
                cart.render()
            a = choose_action(s, qt)
            _, _, end, _ = cart.step(a)
            l_val = bellman_q(s, qt, dummy_cart(s), action=a)
            # print(l_val)
            update_qt(qt, s, a, l_val)
            s = cart.state
            turn += 1
        data.append(turn)
        data_rm.append(np.mean(data[-100:]))
        print("Episode: ", episode, "\tTurn:", turn, "\t Epsilon:", config.epsilon)
        config.epsilon = epsilon_tmp
        if episode % config.graph_update == 0 and episode != 0:
            graph(data, data_rm)
        # if ((episode + 1) % 100 == 0 and input("continue (y/n)" != "y")):
        #     break
    cart.close()
    return (data, qt)
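# `bellman_q` and `dummy_cart` above are not shown. One plausible reading, sketched
# here purely as an assumption: `dummy_cart(s)` returns a throwaway CartPoleEnv copy
# placed in state `s`, and `bellman_q` simulates one step of `action` in it to form
# the one-step target r + gamma * max_a' Q(s', a'). The `discretize` parameter is a
# placeholder for the Q-table's real state key.
import numpy as np

def dummy_cart(state):
    """Hypothetical: a disposable environment copy set to the given state."""
    cart = CartPoleEnv()
    cart.reset()
    cart.state = np.asarray(state, dtype=float)
    return cart

def bellman_q(state, qt, cart, action, gamma=0.99, discretize=tuple):
    """Hypothetical one-step Bellman target for Q(state, action)."""
    next_state, reward, done, _ = cart.step(action)
    if done:
        return reward
    return reward + gamma * np.max(qt[discretize(next_state)])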
V = np.zeros(n_s)
for i in range(episodes):
    print("episode=", i)
    obs = env.reset()
    s = discretize_state(obs, s_bounds, n_s)
    finished = False
    time_step = 0
    # cut long episodes off after 200 steps
    while not finished and time_step < 200:
        # stochastic policy: probability 0.5 for each action (left, right)
        action = np.random.randint(0, 2)
        obs, reward, finished, info = env.step(action)
        state_new = discretize_state(obs, s_bounds, n_s)
        # TD(0) update of the state-value estimate
        V[s] = V[s] + alpha * (reward + gamma * V[state_new] - V[s])
        s = state_new
        time_step += 1
        # TO BE COMPLETED
print(V)
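# `discretize_state(obs, s_bounds, n_s)` is used above but not shown. A minimal sketch,
# assuming `s_bounds` is a list of (low, high) pairs per observation dimension and `n_s`
# is the total number of discrete states laid out on a uniform grid; both assumptions
# are mine, not taken from the snippet.
import numpy as np

def discretize_state(obs, s_bounds, n_s):
    """Hypothetical: map a continuous observation to a single integer bucket index."""
    bins_per_dim = int(round(n_s ** (1.0 / len(obs))))   # grid resolution per dimension
    index = 0
    for value, (low, high) in zip(obs, s_bounds):
        clipped = min(max(value, low), high - 1e-9)
        bucket = int((clipped - low) / (high - low) * bins_per_dim)
        index = index * bins_per_dim + bucket
    return index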
state = env.reset()
state = np.reshape(state, [1, 4])  # reshape from [a, b, c, d] to [[a, b, c, d]]
for t in range(1000):
    # the agent returns the chosen action and, from the argmax of its Q-values,
    # which force magnitude to apply to the model
    action, force = agent.act(state)
    if force == 2:
        env.force_mag = 6
    elif force == 3:
        env.force_mag = 8
    else:
        env.force_mag = 10
    # perform one step
    state_, reward, done, info = env.step(action)
    state_ = np.reshape(state_, [1, 4])
    # reward function
    reward = reward if not done else -20          # additional penalty for a loss
    reward -= 1.0 * abs(state_[0][0])             # penalty for moving too far away, increasing linearly
    # archive the step and perform fitting to the model
    agent.remember(state, reward, action, state_, done)
    # agent.replay()
    # state = next state
    state = state_
from cartpole import CartPoleEnv
import numpy as np

cart = CartPoleEnv()
cart.reset()

for _ in range(1000):
    # Calculate the gradients
    # Update the thetas
    # Sample a u trajectory
    # Apply u[0] to the actual system
    cart.step(10)  # apply some force
    # Update the new state in the learner
    # Shift the thetas
    # Simulate
    cart.render()

cart.close()
def main():
    # Define dimensions of the networks
    meta_value_input_dim = STATE_DIM + TASK_CONFIG_DIM   # 7
    task_config_input_dim = STATE_DIM + ACTION_DIM + 1   # 7

    # init meta value network with a task config network
    meta_value_network = MetaValueNetwork(input_size=meta_value_input_dim, hidden_size=80, output_size=1)
    task_config_network = TaskConfigNetwork(input_size=task_config_input_dim, hidden_size=30, num_layers=1, output_size=3)
    meta_value_network.cuda()
    task_config_network.cuda()

    if os.path.exists("meta_value_network_cartpole.pkl"):
        meta_value_network.load_state_dict(torch.load("meta_value_network_cartpole.pkl"))
        print("load meta value network success")
    if os.path.exists("task_config_network_cartpole.pkl"):
        task_config_network.load_state_dict(torch.load("task_config_network_cartpole.pkl"))
        print("load task config network success")

    meta_value_network_optim = torch.optim.Adam(meta_value_network.parameters(), lr=0.001)
    task_config_network_optim = torch.optim.Adam(task_config_network.parameters(), lr=0.001)

    # init a task generator for data fetching
    task_list = [CartPoleEnv(np.random.uniform(L_MIN, L_MAX)) for task in range(TASK_NUMS)]
    [task.reset() for task in task_list]
    task_lengths = [task.length for task in task_list]
    print("task length:", task_lengths)

    for episode in range(EPISODE):
        # ----------------- Training ------------------
        if (episode + 1) % 10 == 0:
            # renew the tasks
            task_list = [CartPoleEnv(np.random.uniform(L_MIN, L_MAX)) for task in range(TASK_NUMS)]
            task_lengths = [task.length for task in task_list]
            print("task length:", task_lengths)
            [task.reset() for task in task_list]

        # fetch pre data samples for task config network
        # [task_nums, sample_nums, x+y`]
        actor_network_list = [ActorNetwork(STATE_DIM, 40, ACTION_DIM) for i in range(TASK_NUMS)]
        [actor_network.cuda() for actor_network in actor_network_list]
        actor_network_optim_list = [torch.optim.Adam(actor_network.parameters(), lr=0.01) for actor_network in actor_network_list]

        # sample pre state, action, reward for task config
        pre_states = []
        pre_actions = []
        pre_rewards = []
        for i in range(TASK_NUMS):
            states, actions, rewards, _, _ = roll_out(actor_network_list[i], task_list[i], SAMPLE_NUMS)
            pre_states.append(states)
            pre_actions.append(actions)
            pre_rewards.append(rewards)

        for step in range(STEP):
            for i in range(TASK_NUMS):
                # init task config [1, sample_nums, task_config], task_config size = 3
                pre_data_samples = torch.cat((pre_states[i][-9:], pre_actions[i][-9:], torch.Tensor(pre_rewards[i])[-9:]), 1).unsqueeze(0)
                task_config = task_config_network(Variable(pre_data_samples).cuda())  # [1,3]

                states, actions, rewards, is_done, final_state = roll_out(actor_network_list[i], task_list[i], SAMPLE_NUMS)
                final_r = 0
                if not is_done:
                    value_inputs = torch.cat((Variable(final_state.unsqueeze(0)).cuda(), task_config.detach()), 1)
                    final_r = meta_value_network(value_inputs).cpu().data.numpy()[0]

                # train actor network
                actor_network_optim_list[i].zero_grad()
                states_var = Variable(states).cuda()
                actions_var = Variable(actions).cuda()
                task_configs = task_config.repeat(1, len(rewards)).view(-1, 3)
                log_softmax_actions = actor_network_list[i](states_var)
                vs = meta_value_network(torch.cat((states_var, task_configs.detach()), 1)).detach()
                # calculate qs
                qs = Variable(torch.Tensor(discount_reward(rewards, 0.99, final_r))).cuda()

                advantages = qs - vs
                actor_network_loss = - torch.mean(torch.sum(log_softmax_actions * actions_var, 1) * advantages)  # + entropy  # + actor_criterion(actor_y_samples, target_y)
                actor_network_loss.backward()
                torch.nn.utils.clip_grad_norm(actor_network_list[i].parameters(), 0.5)
                actor_network_optim_list[i].step()

                # train value network
                meta_value_network_optim.zero_grad()
                target_values = qs
                values = meta_value_network(torch.cat((states_var, task_configs), 1))
                criterion = nn.MSELoss()
                meta_value_network_loss = criterion(values, target_values)
                meta_value_network_loss.backward()
                torch.nn.utils.clip_grad_norm(meta_value_network.parameters(), 0.5)
                meta_value_network_optim.step()

                # refresh the pre-data samples for the next step
                pre_states[i] = states
                pre_actions[i] = actions
                pre_rewards[i] = rewards

                if (step + 1) % 100 == 0:
                    result = 0
                    test_task = CartPoleEnv(length=task_list[i].length)
                    for test_epi in range(10):
                        state = test_task.reset()
                        for test_step in range(200):
                            softmax_action = torch.exp(actor_network_list[i](Variable(torch.Tensor([state])).cuda()))
                            # print(softmax_action.data)
                            action = np.argmax(softmax_action.cpu().data.numpy()[0])
                            next_state, reward, done, _ = test_task.step(action)
                            result += reward
                            state = next_state
                            if done:
                                break
                    print("episode:", episode, "task:", i, "step:", step + 1, "test result:", result / 10.0)

        if (episode + 1) % 10 == 0:
            # Save meta value network
            torch.save(meta_value_network.state_dict(), "meta_value_network_cartpole.pkl")
            torch.save(task_config_network.state_dict(), "task_config_network_cartpole.pkl")
            print("save networks for episode:", episode)
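# The training loop above calls `discount_reward(rewards, gamma, final_r)`, which is not
# shown. A sketch of the usual discounted-return bootstrap, assuming `final_r` is the
# value estimate of the state following the last sampled step.
import numpy as np

def discount_reward(rewards, gamma, final_r):
    """Hypothetical: per-step discounted returns, bootstrapped from final_r."""
    discounted = np.zeros(len(rewards))
    running = final_r
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted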
from cartpole import CartPoleEnv

env = CartPoleEnv(length=1.0)
env.reset()
for step in range(1000):
    action = 0
    next_state, reward, done, _ = env.step(action)
    if done:
        print("done reward:", reward)
        break
def main():
    # Define dimensions of the networks
    meta_value_input_dim = STATE_DIM + TASK_CONFIG_DIM   # 7
    task_config_input_dim = STATE_DIM + ACTION_DIM + 1   # 7

    # init meta value network with a task config network
    meta_value_network = MetaValueNetwork(input_size=meta_value_input_dim, hidden_size=80, output_size=1)
    task_config_network = TaskConfigNetwork(input_size=task_config_input_dim, hidden_size=30, num_layers=1, output_size=3)
    meta_value_network.cuda()
    task_config_network.cuda()

    if os.path.exists("meta_value_network_cartpole.pkl"):
        meta_value_network.load_state_dict(torch.load("meta_value_network_cartpole.pkl"))
        print("load meta value network success")
    if os.path.exists("task_config_network_cartpole.pkl"):
        task_config_network.load_state_dict(torch.load("task_config_network_cartpole.pkl"))
        print("load task config network success")

    task_lengths = np.linspace(L_MIN, L_MAX, TASK_NUMS)
    datas = []
    for task_length in task_lengths:
        data_i = {}
        data_i["task_length"] = task_length
        data_i_episode = {}
        for episode in range(EPISODE):
            task = CartPoleEnv(length=task_length)
            task.reset()
            data_i_episode["episode"] = episode

            # ----------------- Training ------------------
            # fetch pre data samples for task config network
            # [task_nums, sample_nums, x+y`]
            actor_network = ActorNetwork(STATE_DIM, 40, ACTION_DIM)
            actor_network.cuda()
            actor_network_optim = torch.optim.Adam(actor_network.parameters(), lr=0.01)
            '''
            if os.path.exists("actor_network.pkl"):
                actor_network.load_state_dict(torch.load("actor_network.pkl"))
                print("load actor_network success")
            '''

            # sample pre state, action, reward for task config
            pre_states, pre_actions, pre_rewards, _, _ = roll_out(actor_network, task, SAMPLE_NUMS)

            test_results = []
            train_games = []
            for step in range(STEP):
                # init task config [1, sample_nums, task_config], task_config size = 3
                pre_data_samples = torch.cat((pre_states[-9:], pre_actions[-9:], torch.Tensor(pre_rewards)[-9:]), 1).unsqueeze(0)
                task_config = task_config_network(Variable(pre_data_samples).cuda())  # [1,3]

                states, actions, rewards, is_done, final_state = roll_out(actor_network, task, SAMPLE_NUMS)
                final_r = 0
                if not is_done:
                    value_inputs = torch.cat((Variable(final_state.unsqueeze(0)).cuda(), task_config.detach()), 1)
                    final_r = meta_value_network(value_inputs).cpu().data.numpy()[0]

                # train actor network
                actor_network_optim.zero_grad()
                states_var = Variable(states).cuda()
                actions_var = Variable(actions).cuda()
                task_configs = task_config.repeat(1, len(rewards)).view(-1, 3)
                log_softmax_actions = actor_network(states_var)
                vs = meta_value_network(torch.cat((states_var, task_configs.detach()), 1)).detach()
                # calculate qs
                qs = Variable(torch.Tensor(discount_reward(rewards, 0.99, final_r))).cuda()

                advantages = qs - vs
                actor_network_loss = - torch.mean(torch.sum(log_softmax_actions * actions_var, 1) * advantages)  # + entropy  # + actor_criterion(actor_y_samples, target_y)
                actor_network_loss.backward()
                torch.nn.utils.clip_grad_norm(actor_network.parameters(), 0.5)
                actor_network_optim.step()

                pre_states = states
                pre_actions = actions
                pre_rewards = rewards

                # testing
                if (step + 1) % 10 == 0:
                    result = 0
                    test_task = CartPoleEnv(length=task.length)
                    for test_epi in range(10):
                        state = test_task.reset()
                        for test_step in range(200):
                            softmax_action = torch.exp(actor_network(Variable(torch.Tensor([state])).cuda()))
                            # print(softmax_action.data)
                            action = np.argmax(softmax_action.cpu().data.numpy()[0])
                            next_state, reward, done, _ = test_task.step(action)
                            result += reward
                            state = next_state
                            if done:
                                break
                    aver_result = result / 10.0
                    test_results.append(aver_result)
                    train_games.append(task.episodes)
                    print("task length:", task_length, "episode:", episode, "step:", step + 1, "result:", aver_result)

            data_i_episode["test_results"] = test_results
            data_i_episode["train_games"] = train_games
        data_i["results"] = data_i_episode
        datas.append(data_i)
    save_to_json('mvn_cartpole_test_100.json', datas)
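# Both meta-training scripts above call `roll_out(actor_network, task, sample_nums)`,
# which is not shown. A sketch of a plausible rollout under these assumptions: it samples
# up to `sample_nums` steps with the current policy and returns tensors of states and
# one-hot actions, a reward list, a done flag, and the final state for bootstrapping.
# The signature, return types, and `action_dim` default are assumptions.
import numpy as np
import torch

def roll_out(actor_network, task, sample_nums, action_dim=2):
    """Hypothetical on-policy rollout matching the calls in the loops above."""
    states, actions, rewards = [], [], []
    state = task.state
    is_done = False
    for _ in range(sample_nums):
        states.append(state)
        log_softmax_action = actor_network(torch.Tensor([state]).cuda())
        probs = torch.exp(log_softmax_action).cpu().data.numpy()[0]
        action = np.random.choice(action_dim, p=probs)
        one_hot = [int(a == action) for a in range(action_dim)]
        next_state, reward, done, _ = task.step(action)
        actions.append(one_hot)
        rewards.append(reward)
        state = next_state
        if done:
            is_done = True
            task.reset()
            break
    return torch.Tensor(states), torch.Tensor(actions), rewards, is_done, torch.Tensor(state)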
for i in range(episodes):
    print("episode=", i)
    obs = env.reset()
    s = discretize_state(obs, s_bounds, n_s)
    finished = False
    time_step = 0
    # cut long episodes off after 200 steps
    while not finished and time_step < 200:
        # stochastic policy: probability 0.5 for each action (left, right)
        action = np.random.randint(0, 2)  # TO BE COMPLETED
        obs, reward, done, info = env.step(action)
        s_ = discretize_state(obs, s_bounds, n_s)
        # TD(0) update of the state-value estimate
        V[s] = V[s] + alpha * (reward + gamma * V[s_] - V[s])
        s = s_  # s = s'
        if done:
            finished = True
            break
        time_step += 1
print(V)
class CuteLearning():
    def __init__(self):
        self.plot_data = PlotData()
        self.env = CartPoleEnv()
        self.main_net = DQN()
        self.target_net = deepcopy(self.main_net)
        self.epsilon = config.epsilon
        self.eps_decay = 0.995
        self.visu = False
        self.visu_update = False  # 300
        self.visu_window = 5
        self.memory = Memory(memory_size=30)
        self.batch_size = 5

    def reward_optimisation(self, state, end):
        reward = 0 if end else 1
        return reward

    def choose_action(self, q_values):
        if random.random() > self.epsilon:
            return np.argmax(q_values)
        else:
            return random.randint(0, 1)

    def make_batch(self):
        batch = self.memory.get_batch(self.batch_size)
        states = []
        targets = []
        for s, a, r, ns, done in batch:
            states.append(s)
            q_values = self.target_net.predict(s).tolist()
            if done:
                q_values[a] = r
            else:
                q_values_next = self.target_net.predict(ns)
                q_values[a] = r + net_config.gamma * torch.max(q_values_next).item()
            targets.append(q_values)
        return states, targets

    def updato(self):
        states, targets = self.make_batch()
        self.main_net.update(states, targets)

    def upodato(self, state, reward, next_state, done):
        # unimplemented stub
        pass

    def learn(self, episodes=10000, replay=False):
        episode = 0
        tmp = self.epsilon
        while episode < episodes:
            done = False
            turn = 0
            state = self.env.reset()
            self.epsilon = self.epsilon * self.eps_decay  # decay epsilon each episode
            while not done:
                q_values = self.main_net.model(torch.Tensor(state)).tolist()
                action = self.choose_action(q_values)
                new_state, reward, done, _ = self.env.step(action)
                self.memory.add_data((state, action, reward, new_state, done))
                state = new_state
                turn += 1
                self.updato()
            print("turn:", turn)
            episode += 1
            if episode % net_config.n_update == 0:
                self.target_net = deepcopy(self.main_net)
        self.epsilon = tmp

    def save(self):
        pass


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    Cutie = CuteLearning()
    Cutie.learn()
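# The `Memory` replay buffer used above (via `add_data` and `get_batch`) is not shown.
# A minimal sketch, assuming a bounded FIFO buffer with uniform random sampling; the
# internals here are an assumption, only the call sites come from the class above.
import random
from collections import deque

class Memory:
    """Hypothetical replay buffer matching the calls made in CuteLearning."""
    def __init__(self, memory_size=30):
        self.buffer = deque(maxlen=memory_size)

    def add_data(self, transition):
        # transition is (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def get_batch(self, batch_size):
        batch_size = min(batch_size, len(self.buffer))
        return random.sample(self.buffer, batch_size)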
class CuteLearning():
    def __init__(self):
        self.plot_data = PlotData()
        self.cart = CartPoleEnv()
        self.cart.reset()
        self.predi_net = DQN()
        self.updat_net = deepcopy(self.predi_net)
        self.turn = 0
        self.episode = 0
        self.epsilon = config.epsilon
        self.eps_decay = 0.99
        self.visu = False
        self.visu_update = False  # 300
        self.visu_window = 5
        self.consecutive_wins = 0
        self.best_consecutive_wins = 0
        self.last_save = 0
        self.memory = []

    def reward_optimisation(self, state, end):
        reward = -25 if end else 1
        if reward == 1:
            # Angle reward modification
            angle_r = 0.418 / 2
            reward += (((abs(angle_r - abs(state[2])) / angle_r) * 2) - 1) * 2
            # Position reward modification
            pos_r = 0.418 / 2
            reward += (((abs(pos_r - abs(state[0])) / pos_r) * 2) - 1) * 2
        return reward

    def learn(self):
        self.episode = 0
        n = 0
        while self.episode < 10:
            self.turn = 0
            end = False
            states = []
            targets = []
            while not end:
                # 1. Init
                state = self.cart.state
                # 2. Choose action
                q_values = self.predi_net.predict(state).tolist()
                a = choose_action_net(q_values, self.epsilon)
                # 3. Perform action
                next_state, _, end, _ = self.cart.step(a)
                # 4. Measure reward
                reward = self.reward_optimisation(next_state, end)
                q_values_next = self.predi_net.predict(next_state)
                # 5. Compute the Q-value target
                q_values[a] = reward + net_config.gamma * torch.max(q_values_next).item()
                self.turn += 1
                self.memory.append((state, a, next_state, reward, end))
                # self.updat_net.update(state, q_values)
                states.append(state)
                targets.append(q_values)
                if (self.turn % 20 and self.turn) or end:
                    self.updat_net.update(states, targets)
                    states = []
                    targets = []
                if self.turn >= 500:
                    end = True
                if self.visu:
                    self.cart.render()
            self.episode += 1
            self.replay(20)
            if self.episode % net_config.n_update == 0 and self.episode:
                print("Update")
                self.predi_net.model.load_state_dict(self.updat_net.model.state_dict())
            self.end()
            n += 1
        self.save()
        self.cart.close()
        self.plot_data.clear()

    def replay(self, size):
        if size > len(self.memory):
            size = len(self.memory)
        data = random.sample(self.memory, size)
        states = []
        targets = []
        for state, action, next_state, reward, done in data:
            q_values = self.predi_net.predict(state)
            if done:
                q_values[action] = reward
            else:
                # The only difference from simple replay is this line:
                # it ensures that the next Q-values are predicted with the target network.
                q_values_next = self.predi_net.predict(next_state)
                q_values[action] = reward + net_config.gamma * torch.max(q_values_next).item()
            states.append(state)
            targets.append(q_values)
        self.updat_net.update(states, targets)

    def end(self):
        self.plot_data.new_data(self.turn)
        if self.turn > 195:
            self.consecutive_wins += 1
            if self.best_consecutive_wins < self.consecutive_wins:
                self.best_consecutive_wins = self.consecutive_wins
            if self.consecutive_wins > 200:
                self.save()
                print(("WIN IN " + str(self.episode) + " EPISODES\n") * 100)
        else:
            self.consecutive_wins = 0
        if self.last_save * 1.2 < self.best_consecutive_wins and 50 <= self.best_consecutive_wins:
            self.save()
            self.last_save = self.best_consecutive_wins
        print("Episode: ", self.episode, "\tTurn:", self.turn,
              "\tEpsilon:", self.epsilon,
              "\tWins: ", "{:3}".format(self.consecutive_wins), "/", self.best_consecutive_wins)
        self.turn = 0
        self.cart.reset()
        if self.episode % config.graph_update == 0 and self.episode != 0:
            self.plot_data.graph()
        if self.visu_update:
            if self.episode % self.visu_update == 0:
                self.visu = True
            if self.episode % self.visu_update == self.visu_window:
                self.visu = False
                self.cart.close()
        self.epsilon = max(self.epsilon * self.eps_decay, 0.01)

    def save(self):
        pass
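# `choose_action_net(q_values, epsilon)` used above is not shown. A minimal
# epsilon-greedy sketch over the network's Q-value outputs; the implementation
# is an assumption, only the call signature comes from the class above.
import random
import numpy as np

def choose_action_net(q_values, epsilon):
    """Hypothetical epsilon-greedy choice between the two CartPole actions."""
    if random.random() < epsilon:
        return random.randint(0, 1)   # explore
    return int(np.argmax(q_values))   # exploit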
import gym
from cartpole import CartPoleEnv

env = CartPoleEnv()
observation = env.reset()
total_reward = 0
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)
for t in range(200):
    env.render()
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    total_reward += reward
    if done:
        break
env.close()
print(total_reward)
agent = RandomAgent(env.action_space)

episode_count = 100
reward = 0
done = False

action_record = []
state_record = []
for i in range(episode_count):
    ob = env.reset()
    # action = agent.act(ob, reward, done)
    traj_action = []
    traj_state = []
    while True:
        action = agent.act(ob, reward, done)
        ob, reward, done, _ = env.step(action)
        traj_action.append(action)
        traj_state.append(ob)
        print(done)
        # env.render()
        if done:
            break
    action_record.append(traj_action)
    state_record.append(traj_state)

action_record = np.asarray(action_record)
state_record = np.asarray(state_record)
record = {'actions': action_record, 'states': state_record}
record_file = open('record.pickle', 'wb')
pickle.dump(record, record_file)

# Note there's no env.render() here. But the environment still can open window and
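# `RandomAgent` above is not defined in this snippet. A minimal sketch, assuming it
# simply samples from the action space it was constructed with (mirroring the classic
# Gym example agent); the class body here is an assumption.
class RandomAgent:
    """Hypothetical agent that ignores observations and acts at random."""
    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, observation, reward, done):
        return self.action_space.sample()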