"--memory-size", dest="memory_size", default=5000, type=int, help='Numpber of memory size (default: 5000)') args = parser.parse_args() start_date = utils.format(args.start_date) end_date = utils.format(args.end_date) env = Env(start_date, end_date) agent = Agent(env.actions, len(env.columns), env.state_size, args.memory_size) if args.load: print("[Agent] load model") agent.load_model() terminal = False n_epochs = args.n_epochs loops = -1 e = 0 total_frame = 0 do_replay_count = 0 max_step = 0 start_replay = False optimistic = 3 optimistic_num = len(env.actions) * optimistic def optimistic_action(env, epoch, optimistic):
from dqn_agent import ReplayBuffer from dqn_agent import Agent LOAD_PREV_MODEL = False SET_EPS = False MODEL_FILE_NAME = "MODEL_CHECKPOINT.34994.model" max_games = 1000 max_steps = 1000 my_agent = Agent(37, action_size, 0) if LOAD_PREV_MODEL: my_agent.load_model(MODEL_FILE_NAME, 37, action_size, 0) #my_buffer = ReplayBuffer(4, 1000, 10, 0) epsilon = 0.0 min_eps = 0.1 all_scores = [] #env_info = env.reset(train_mode=False)[brain_name] # reset the environment #state = env_info.vector_observations[0] # get the current state #score = 0 # initialize the score for idx_games in range(1, max_games): env_info = env.reset(train_mode=False)[brain_name] # reset the environment state = env_info.vector_observations[0] # get the current state score = 0 # initialize the score print("GAME NUMBER: " + str(idx_games))
def main(limit_episode, limit_step, seed=0, load=False): rospy.init_node('dqn_pendulum_myrrbot7') pub = rospy.Publisher("/myrrbot7/joint1_cotroller/command", Float64, queue_size=1) loop_rate = rospy.Rate(hz) result_path = "/home/amsl/ros_catkin_ws/src/deep_actor_critic/actor_critic_for_swingup/test_results/results/dqn_result.txt" model_path = "/home/amsl/ros_catkin_ws/src/deep_actor_critic/actor_critic_for_swingup/test_results/models/dqn_" init_theta = 0.0 init_omega = 0.0 state = get_state(init_theta, init_omega) state_dash = get_state(init_theta, init_omega) action = 0.0 reward = 0.0 action_list = [ np.array([a], dtype=np.float64) for a in [min_torque, max_torque] ] n_st = len(state[0]) n_act = len(action_list) Q_list = np.array([]) max_Q = 0.0 ave_Q = 0.0 reward_list = np.array([]) ave_reward = 0.0 total_reward = 0.0 temp_result = np.array([[]]) test_result = np.array([[]]) evaluation_flag = False wait_flag = True agent = Agent(n_st, n_act, seed) if load: agent.load_model(model_path) episode_count = 0 time = 1 count = 0 wait_count = 0 while not rospy.is_shutdown(): if wait_flag: wait_count += 1 # print "wait_count : ", wait_count state = get_state(init_theta, init_omega) state_dash = get_state(init_theta, init_omega) reset_client(init_theta) action = 0.0 pub.publish(action) if wait_count == 0.5 * hz: init_theta = uniform(-1 * math.pi, math.pi) if wait_count % hz == 0: wait_count = 0 wait_flag = False # print "Please Wait 1 second" # print "state : ", state # print "state_dash : ", state_dash else: if not evaluation_flag: # print "Now Learning!!!!!" # print "episode : ", episode_count # print "time : ", time # print "state : ", state act_i, q = agent.get_action(state, False) Q_list = np.append(Q_list, [q]) # print "act_i : ", act_i action = action_list[act_i] # print "action : ", action pub.publish(action) theta, omega = get_joint_properties('myrrbot7_joint1') # print "theta : %f, omega : %f" % (theta, omega) state_dash = get_state(theta, omega) # print "state_dash : ", state_dash reward = get_reward(theta, omega, action) reward_list = np.append(reward_list, [reward]) # print "reward : ", reward ep_end = get_ep_end(theta, time, limit_step) # print "ep_end : ", ep_end agent.stock_experience(count, state, act_i, reward, state_dash, ep_end) agent.train(count) time += 1 count += 1 if ep_end: max_Q = np.max(Q_list) ave_Q = np.average(Q_list) # print "max_Q : ", max_Q # print "ave_Q : ",ave_Q ave_reward = np.average(reward_list) total_reward = np.sum(reward_list) # print "ave_reward : ", ave_reward # print "total_reward : ", total_reward print "Episode : %d\t/Reward Sum : %f\tEpsilon : %f\tLoss : %f\t/Average Q : %f\t/Time Step : %d" % ( episode_count, total_reward, agent.epsilon, agent.loss, np.sum(Q_list) / float(time), agent.step + 1) Q_list = np.array([]) reward_list = np.array([]) temp_result = np.array(([[ episode_count, max_Q, ave_Q, ave_reward, total_reward ]]), dtype=np.float32) if episode_count == 0: # print "test_result : ", test_result test_result = temp_result # print "test_result : ", test_result else: test_result = np.r_[test_result, temp_result] save_result(result_path, test_result) agent.save_model(model_path) if episode_count % 1 == 0: evaluation_flag = False episode_count += 1 time = 0 wait_flag = True else: # print "Now evaluation!!!" # print "episode : ", episode_count-1 # print "time : ", time # print "state : ", state act_i, q = agent.get_action(state, True) Q_list = np.append(Q_list, [q]) # print "act_i : ", act_i # print "Q_list : ", Q_list action = action_list[act_i] # print "action : ", action pub.publish(action) theta, omega = get_joint_properties('myrrbot7_joint1') # print "theta : %f, omega : %f" % (theta, omega) state_dash = get_state(theta, omega) # print "state_dash : ", state_dash reward = get_reward(theta, omega, action) # print "reward : ", reward reward_list = np.append(reward_list, [reward]) # print "reward_list : ", reward_list ep_end = get_ep_end(theta, time, limit_step) # print "ep_end : ", ep_end if ep_end: max_Q = np.max(Q_list) ave_Q = np.average(Q_list) # print "max_Q : ", max_Q # print "ave_Q : ",ave_Q ave_reward = np.average(reward_list) total_reward = np.sum(reward_list) # print "ave_reward : ", ave_reward # print "total_reward : ", total_reward print "Episode : %d\t/Reward Sum : %f\tEpsilon : %f\tLoss : %f\t/Average Q : %f\t/Time Step : %d" % ( episode_count - 1, total_reward, agent.epsilon, agent.loss, np.sum(Q_list) / float(time + 1), agent.step) Q_list = np.array([]) reward_list = np.array([]) time = 0 wait_flag = True evaluation_flag = False temp_result = np.array(([[ episode_count - 1, max_Q, ave_Q, ave_reward, total_reward ]]), dtype=np.float32) if episode_count - 1 == 0: # print "test_result : ", test_result test_result = temp_result # print "test_result : ", test_result else: test_result = np.r_[test_result, temp_result] save_result(result_path, test_result) agent.save_model(model_path) time += 1 loop_rate.sleep()
# examine the state space state = env_info.vector_observations[0] print('States look like:', state) state_size = len(state) print('States have length:', state_size) ################################## ########### LOAD AGENT ########### ################################## agent = Agent(state_size=state_size, action_size=action_size, dueling=dueling, seed=0) agent.load_model(path_to_model) env_info = env.reset(train_mode=False)[brain_name] # reset the environment state = env_info.vector_observations[0] # get the current state score = 0 # initialize the score while True: action = agent.act(state, 0) # select an action env_info = env.step(action)[ brain_name] # send the action to the environment next_state = env_info.vector_observations[0] # get the next state reward = env_info.rewards[0] # get the reward done = env_info.local_done[0] # see if episode has finished score += reward # update the score state = next_state # roll over the state to next time step if done: # exit loop if episode finished break