    # return the index of the maximum q value, breaking ties at random
    @staticmethod
    def arg_max(q_list):
        max_index_list = []
        max_value = q_list[0]
        for index, value in enumerate(q_list):
            if value > max_value:
                max_index_list.clear()
                max_value = value
                max_index_list.append(index)
            elif value == max_value:
                max_index_list.append(index)
        return random.choice(max_index_list)


if __name__ == "__main__":
    env = Env()
    agent = QLearningAgent(actions=list(range(env.n_actions)))

    for episode in range(1000):
        state = env.reset()

        while True:
            env.render()

            # take an action and advance the environment one step
            action = agent.get_action(str(state))
            next_state, reward, done = env.step(action)

            # with the sample <s, a, r, s'>, the agent learns a new q function
            agent.learn(str(state), action, reward, str(next_state))

            state = next_state
            env.print_value_all(agent.q_table)

            # if the episode ends, break
            if done:
                break
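# ---------------------------------------------------------------------------
# For reference, a minimal sketch of the Q-learning update assumed inside
# `agent.learn` above. The names `q_table`, `learning_rate`, and
# `discount_factor` are assumptions for illustration, not taken from this file.
# Update rule: Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
def q_learning_update(q_table, state, action, reward, next_state,
                      learning_rate=0.01, discount_factor=0.9):
    current_q = q_table[state][action]
    # off-policy target: bootstrap from the greedy action in the next state
    target_q = reward + discount_factor * max(q_table[next_state])
    q_table[state][action] = current_q + learning_rate * (target_q - current_q)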
    # return the index of the maximum q value, breaking ties at random
    @staticmethod
    def arg_max(q_list):
        max_index_list = []
        max_value = q_list[0]
        for index, value in enumerate(q_list):
            if value > max_value:
                max_index_list.clear()
                max_value = value
                max_index_list.append(index)
            elif value == max_value:
                max_index_list.append(index)
        return random.choice(max_index_list)


if __name__ == "__main__":
    env = Env()
    agent = SARSAgent(actions=list(range(env.n_actions)))

    for episode in range(1000):
        # reset the environment and initialize the state
        state = env.reset()
        # get the action for the current state from the agent
        action = agent.get_action(str(state))

        while True:
            env.render()

            # take the action and advance the environment one step
            next_state, reward, done = env.step(action)
            next_action = agent.get_action(str(next_state))

            # with the sample <s, a, r, s', a'>, the agent learns a new q function
            agent.learn(str(state), action, reward, str(next_state), next_action)

            state = next_state
            action = next_action

            # print the q function of all states on screen
            env.print_value_all(agent.q_table)

            # if the episode ends, break
            if done:
                break
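# ---------------------------------------------------------------------------
# For contrast with Q-learning, a minimal sketch of the on-policy SARSA update
# assumed inside `agent.learn` above; parameter names are assumptions. The
# target uses the action a' actually chosen next, not the greedy one:
# Q(s,a) <- Q(s,a) + alpha * (r + gamma * Q(s',a') - Q(s,a))
def sarsa_update(q_table, state, action, reward, next_state, next_action,
                 learning_rate=0.01, discount_factor=0.9):
    current_q = q_table[state][action]
    next_q = q_table[next_state][next_action]
    q_table[state][action] = current_q + learning_rate * (
        reward + discount_factor * next_q - current_q)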
def main():
    expert_demo = pickle.load(open('./Expert dataset 1/expert_20x20_1.p', "rb"))
    demonstrations = np.array(expert_demo[0])
    print("demonstrations.shape", demonstrations.shape)
    print(expert_demo[1])
    print(expert_demo[0])
    print(np.array(expert_demo[0]).shape)

    # expert_x = int(expert_demo[1][0])
    # expert_y = int(expert_demo[1][1])
    expert_x = int(expert_demo[0][0])
    expert_y = int(expert_demo[0][1])

    env = Env(expert_x, expert_y)
    # env.seed(args.seed)
    # torch.manual_seed(args.seed)

    num_inputs = 6
    num_actions = 8
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    vdb = VDB(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    vdb_optim = optim.Adam(vdb.parameters(), lr=args.learning_rate)

    # load demonstrations
    k = 1
    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        vdb.load_state_dict(ckpt['vdb'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        # expert_demo = pickle.load(open('./paper/{}.p'.format((iter+1) % expert_sample_size), "rb"))
        print(iter)
        expert_demo = pickle.load(
            open('./Expert dataset 1/expert_20x20_{}.p'.format(np.random.randint(1, 50)), "rb"))
        tmp = expert_demo.pop(-1)
        demonstrations = np.array(expert_demo)
        print(demonstrations, demonstrations.shape)
        tot_sample_size = len(demonstrations) + 10

        ##########################
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        # while steps < args.total_sample_size:
        while steps < tot_sample_size:
            # env.delete_graph()
            state = env.reset()
            # time.sleep(1)
            score = 0

            # state = running_state(state)
            state1 = state

            for _ in range((tot_sample_size + 1) * 2):
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action2 = np.argmax(get_action(mu, std)[0])
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action2)
                irl_reward = get_reward(vdb, state, action)

                # ###### for video recording
                # if iter > 11500:
                #     time.sleep(0.015)
                # #####

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                # next_state = running_state(next_state)
                state = next_state
                score += reward

                if done:
                    break

            ##########################
            env.draw_graph()
            env.render()
            ##########################

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), vdb.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_vdb(vdb, memory, vdb_optim, demonstrations, 0, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                train_discrim_flag = False
        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        # save a periodic checkpoint every 100 iterations
        if iter % 100 == 0:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar')
            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'vdb': vdb.state_dict(),
                'z_filter_n': running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)

    # save the final model after training
    score_avg = int(score_avg)

    model_path = os.path.join(os.getcwd(), 'save_model')
    if not os.path.isdir(model_path):
        os.makedirs(model_path)

    ckpt_path = os.path.join(model_path, 'ckpt_last_model.pth.tar')
    save_checkpoint({
        'actor': actor.state_dict(),
        'critic': critic.state_dict(),
        'vdb': vdb.state_dict(),
        'z_filter_n': running_state.rs.n,
        'z_filter_m': running_state.rs.mean,
        'z_filter_s': running_state.rs.sum_square,
        'args': args,
        'score': score_avg
    }, filename=ckpt_path)
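# ---------------------------------------------------------------------------
# A sketch of the `get_reward` helper called in the rollout loop above. It
# assumes the common GAIL/VAIL convention that the VDB discriminator returns
# (prob, mu, logvar), where `prob` is the probability that the (state, action)
# pair came from the learner, so the imitation reward is -log(prob). This
# mirrors typical VAIL implementations and is not verified against this repo.
import math

def get_reward_sketch(vdb, state, action):
    state_action = torch.cat([torch.Tensor(state), torch.Tensor(action)])
    with torch.no_grad():
        prob, _, _ = vdb(state_action.unsqueeze(0))
    return -math.log(prob.item())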
    def run(self):
        global episode
        # create the environment
        env = Env(mode=False, show=False)

        step = 0

        while episode < EPISODES:
            done = False
            score = 0

            observe = env.reset()
            next_observe = observe

            # stay idle for a random number (1~30) of initial frames
            for _ in range(random.randint(1, 30)):
                observe = next_observe
                _, _ = env.step(0)
                next_observe = env.render(show=False)

            state = pre_processing(observe)
            history = np.stack(
                (state, state, state, state, state, state, state, state), axis=2)
            history = np.reshape([history], (1, 84, 84, 8))

            while not done:
                step += 1
                self.t += 1
                observe = next_observe
                self.image = observe

                action, policy = self.get_action(history)

                # 0: stay, 1: up, 2: right, 3: down, 4: left
                # execute one step with the selected action
                reward, done = env.step(action)
                next_observe = env.render(show=False)

                # preprocess the post-step frame at each timestep (may raise)
                try:
                    next_state = pre_processing(next_observe)
                except Exception:
                    next_state = state
                    print("Error caught!!")

                next_state = np.reshape([next_state], (1, 84, 84, 1))
                next_history = np.append(next_state, history[:, :, :, :7], axis=3)

                # track the maximum of the policy distribution
                self.avg_p_max += np.amax(
                    self.actor.predict(np.float32(history / 255.)))

                score += reward
                # reward = np.clip(reward, -1., 1.)

                # store the sample
                self.append_sample(history, action, reward)

                history = next_history

                # train when the episode ends or the maximum number of timesteps is reached
                if self.t >= self.t_max or done:
                    self.train_model(done)
                    self.update_local_model()
                    self.t = 0

                if done:
                    # record training statistics for each episode
                    episode += 1
                    print("episode:", episode, "  score:", score, "  step:",
                          step, "  actor:#", self.id)

                    stats = [score, self.avg_p_max / float(step), step]
                    for i in range(len(stats)):
                        self.sess.run(self.update_ops[i], feed_dict={
                            self.summary_placeholders[i]: float(stats[i])
                        })
                    summary_str = self.sess.run(self.summary_op)
                    self.summary_writer.add_summary(summary_str, episode + 1)
                    self.avg_p_max = 0
                    self.avg_loss = 0
                    step = 0
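# ---------------------------------------------------------------------------
# A sketch of the `pre_processing` helper assumed above. Since the history
# tensor is built from 84x84 single-channel frames, the preprocessing
# presumably converts the RGB frame to grayscale and resizes it; OpenCV is an
# assumed dependency here, not confirmed by this file.
import cv2

def pre_processing_sketch(observe):
    gray = cv2.cvtColor(observe, cv2.COLOR_RGB2GRAY)
    # resize to the 84x84 resolution expected by the history tensor
    return np.uint8(cv2.resize(gray, (84, 84)))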
if __name__ == "__main__":
    logger = get_logger('td.log')
    env = Env()
    agent = TDAgent(actions=list(range(env.n_actions)))

    MAX_EPISODES = 1000  # maximum number of episodes
    success_cnt = 0
    fail_cnt = 0
    total_step = 0

    for episode in range(MAX_EPISODES):
        # start of episode: reset the environment and set state to the initial state
        state = env.reset()

        while True:
            env.render()  # draw the screen

            action = agent.get_action(state)
            next_state, reward, done = env.step(action)
            agent.update(state, next_state, reward, done)

            state = next_state
            total_step += 1

            if done:
                # update the value function of the terminal state
                agent.update(next_state, next_state, reward, done)
                break
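# ---------------------------------------------------------------------------
# A minimal sketch of the TD(0) value update assumed inside `agent.update`
# above; `value_table`, `learning_rate`, and `discount_factor` are assumed
# names. Update rule: V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s)),
# with V(s') treated as 0 at terminal states.
def td_update_sketch(value_table, state, next_state, reward, done,
                     learning_rate=0.01, discount_factor=0.9):
    target = reward if done else reward + discount_factor * value_table[next_state]
    value_table[state] += learning_rate * (target - value_table[state])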
    step = 0

    while episode < EPISODES:
        # time.clock() was removed in Python 3.8; use a monotonic timer instead
        tick = time.perf_counter()

        done = False
        score = 0

        observe = env.reset()
        next_observe = observe

        # stay idle for a random number (1~20) of initial frames
        for _ in range(random.randint(1, 20)):
            observe = next_observe
            _, _ = env.step(0)
            next_observe = env.render()

            # pace the idle frames to roughly one frame per 0.05 s
            dt = time.perf_counter() - tick
            time.sleep((0.05 - dt) if (0.05 - dt) > 0 else 0)
            tick = time.perf_counter()

        state = pre_processing(observe)
        history = np.stack((state, state, state, state), axis=2)
        history = np.reshape([history], (1, 84, 84, 4))

        while not done:
            dt = time.perf_counter() - tick
            step += 1
            observe = next_observe
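# ---------------------------------------------------------------------------
# The 0.05 s budget above caps the idle-frame loop at roughly 20 fps. A small
# helper (hypothetical, not part of this file) makes the pacing intent explicit:
def sleep_to_frame_budget(tick, budget=0.05):
    dt = time.perf_counter() - tick
    if budget - dt > 0:
        time.sleep(budget - dt)
    return time.perf_counter()  # fresh tick for the next frame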
        while not done:
            # take an action
            action = network.forward(state)
            # if args.env == 'mount' and action == 1: action += 1
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward

            # distribute the reward to all neurons
            network.distribute_task_reward(reward)
            # must come after distribute
            network.store(False)

            if args.render and e // 250 == 1:
                env.render()

            state = next_state

        if args.neuron_type == 'DQN':
            # hack to get state, next_state to play nicely in DQN
            action = network.forward(next_state)
            # distribute the reward to all neurons
            network.distribute_task_reward(reward)
            # must come after distribute
            network.store(done=True)

        # save episode rewards
        network.end_episode(ep_reward)
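# ---------------------------------------------------------------------------
# The DQN branch above calls forward(next_state) once more before the terminal
# store. A plausible reading is that each neuron caches its latest input on
# forward() and store() pairs it with the previous cache into an
# (s, a, r, s', done) transition. A generic sketch of that pattern, with
# hypothetical names that are not this repository's API:
class TransitionBufferSketch:
    def __init__(self):
        self.prev = None       # (state, action) cached by the previous forward()
        self.curr = None       # (state, action) from the most recent forward()
        self.transitions = []

    def forward(self, state, action):
        self.prev, self.curr = self.curr, (state, action)

    def store(self, reward, done):
        if self.prev is not None:
            state, action = self.prev
            next_state, _ = self.curr
            self.transitions.append((state, action, reward, next_state, done))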