import numpy as np
import optuna
import pandas as pd
import torch
from torch.utils.tensorboard import SummaryWriter

# `Agent`, `env`, and `epsilon_decay` are project-local and assumed to be
# defined or imported elsewhere in this module.


def objective(trial):
    file = pd.ExcelFile(r'sasd_a2c.xlsx')
    state_index_oh = file.parse('state_index')
    MaxEpisodes = 2000
    Env = env()
    EPSILON = 1
    Total_Reward = []
    Avg_Rewards = []

    # Hyperparameters sampled by Optuna (suggest_float replaces the
    # deprecated suggest_uniform).
    fc1_dims = trial.suggest_categorical('fc1_dims', [15, 20, 30])
    lr = trial.suggest_float('lr', 5e-6, 1e-4)
    gamma = trial.suggest_categorical('gamma', [0.97, 0.98, 0.99])
    lr_ns = trial.suggest_float('lr_ns', 1e-4, 1e-2)
    lr_r = trial.suggest_float('lr_r', 1e-4, 5e-3)
    lr_d = trial.suggest_float('lr_d', 1e-4, 1e-3)

    # Both agents share the sampled hyperparameters
    # (fc1=32, lr=0.0009, gamma=0.98 worked well in earlier manual runs).
    agent1 = Agent(state_size=9, action_size=10, fc1_dims=fc1_dims, lr=lr,
                   batch_size=64, buffer_size=100000, gamma=gamma, tau=0.002,
                   lr_ns=lr_ns, lr_r=lr_r, lr_d=lr_d)
    agent2 = Agent(state_size=9, action_size=10, fc1_dims=fc1_dims, lr=lr,
                   batch_size=64, buffer_size=100000, gamma=gamma, tau=0.002,
                   lr_ns=lr_ns, lr_r=lr_r, lr_d=lr_d)

    writer = SummaryWriter()
    writer.add_graph(agent1.q_network,
                     torch.from_numpy(state_index_oh.iloc[:, 2:].values).float())
    writer.add_graph(agent2.q_network,
                     torch.from_numpy(state_index_oh.iloc[:, 2:].values).float())

    agent1.memory.buffer_reset()
    agent2.memory.buffer_reset()

    for ep in range(MaxEpisodes):
        state = Env.reset()
        done = False
        stepscounter = 0
        ep_reward = 0
        # Look up the one-hot encoding of the current state index.
        state_OH = state_index_oh.iloc[state.int().numpy(), 2:].values.reshape(-1)

        while not done:
            stepscounter += 1

            # Agent 1 acts first; its action index is scaled by 10 for the env.
            action1 = agent1.act(state_OH, EPSILON)
            action = action1 * 10
            new_state, reward, done, obs = Env.next_state(action)
            ep_reward += reward
            new_state_OH = state_index_oh.iloc[
                new_state.int().numpy(), 2:].values.reshape(-1)
            # Rescale the reward from [-50, 50] to [-1, 1] and store the
            # one-hot next state (not the raw index) for replay consistency.
            agent1.memory.store_transition(
                state_OH, action1, 2 * ((reward.item() + 50) / 100) - 1,
                new_state_OH, done)
            state_OH = new_state_OH
            if done:
                break

            # Agent 2 acts second and must choose a different action.
            action2 = agent2.act(state_OH, EPSILON)
            if action2 == action1:
                continue
            action = action2
            new_state, reward, done, obs = Env.next_state(action)
            ep_reward += reward
            new_state_OH = state_index_oh.iloc[
                new_state.int().numpy(), 2:].values.reshape(-1)
            agent2.memory.store_transition(
                state_OH, action2, 2 * ((reward.item() + 50) / 100) - 1,
                new_state_OH, done)
            state_OH = new_state_OH

            # One replay-buffer update per environment step.
            agent1.learn()
            agent2.learn()

        # Per-episode model training and simulated-experience updates.
        update_model1 = agent1.train_model(1)
        update_model2 = agent2.train_model(2)
        for _ in range(5):
            agent1.sim_learn(1)
            agent2.sim_learn(2)

        output1 = obs[0].item()
        output2 = obs[1].item()
        input1 = obs[2].item()
        input2 = obs[3].item()

        EPSILON = epsilon_decay(eps=EPSILON)
        Total_Reward.append(ep_reward)
        avg_reward = np.mean(Total_Reward[-100:])
        Avg_Rewards.append(avg_reward)

        totalresult = ('episode: %d Total_Reward %.2f Average_Reward %.2f '
                       'Steps %d Model Training Data: %s%s'
                       % (ep + 1, ep_reward, avg_reward, stepscounter,
                          update_model1, update_model2))
        print(f'\r{totalresult}', end='\r')

        writer.add_scalar('reward/episode', ep_reward, ep)
        writer.add_scalar('Avgreward/episode', avg_reward, ep)
        writer.add_scalar('output1/episode', output1, ep)
        writer.add_scalar('output2/episode', output2, ep)
        writer.add_scalar('input1/episode', input1, ep)
        writer.add_scalar('input2/episode', input2, ep)

        # Report intermediate results so Optuna can prune unpromising trials.
        trial.report(avg_reward, ep)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    # Close the writer only after the last episode has been logged.
    writer.close()
    return avg_reward
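# The objective above reports `avg_reward` every episode, so a pruning-enabled
# study can stop weak trials early. A minimal driver sketch: the pruner choice
# and trial count here are illustrative assumptions, not from the original code.
if __name__ == '__main__':
    study = optuna.create_study(
        direction='maximize',  # objective() returns the 100-episode average reward
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=100),
    )
    study.optimize(objective, n_trials=50)
    print('best value:', study.best_value)
    print('best params:', study.best_params)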
# --- Second excerpt: a single-agent training loop on a Gym-style env. ---
# The first line below is the tail of an Agent(...) constructor whose opening
# arguments lie above this excerpt.
                input_dims=[input_dims], lr=0.003)

scores, avg_scores, eps_history = [], [], []
epochs = 500

for epoch in range(epochs):
    score = 0
    done = False
    state_old = env.reset()

    while not done:  # iterate over every timestep (state) in the episode
        env.render()
        action = agent.choose_action(state_old)
        state_new, reward, done, info = env.step(action)
        score += reward
        agent.store_transition(state_old, action, reward, state_new, done)
        agent.learn()
        state_old = state_new

    scores.append(score)
    eps_history.append(agent.epsilon)
    avg_score = np.mean(scores[-100:])
    avg_scores.append(avg_score)
    print("epoch: ", epoch,
          "score: %.2f " % score,
          "avg_score: %.2f " % avg_score,
          "epsilon: %.2f" % agent.epsilon)
    simple_plot(scores, avg_scores, epoch)

env.close()
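# `simple_plot` is not defined in this excerpt. A minimal sketch of what such a
# helper might look like, assuming it draws the raw per-epoch scores and their
# 100-epoch running average with matplotlib; the filename and styling are
# illustrative.
import matplotlib.pyplot as plt


def simple_plot(scores, avg_scores, epoch):
    """Plot per-epoch scores against their 100-epoch running average."""
    plt.figure(figsize=(8, 4))
    plt.plot(scores, alpha=0.4, label='score')
    plt.plot(avg_scores, label='avg score (last 100)')
    plt.xlabel('epoch')
    plt.ylabel('score')
    plt.title(f'Training progress through epoch {epoch}')
    plt.legend()
    plt.savefig('training_progress.png')
    plt.close()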