# Imports assumed by this snippet (the original file's import block is not shown here).
import gym
import numpy as np
import torch
import matplotlib.pyplot as plt

import REINFORCE
import importance_sampling


def main():
    # `args` is expected to come from an argparse parser at module level (not shown).
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # Choose between plain REINFORCE and the importance-sampling variant.
    if args.reinforce:
        agent = REINFORCE.Agent(args, env.observation_space.shape[0], env.action_space)
    else:
        agent = importance_sampling.Agent(args, env.observation_space.shape[0], env.action_space)

    trajs = []
    result = []
    for i_episode in range(args.num_episodes):
        s_t = torch.Tensor([env.reset()])
        states = []
        actions = []
        log_probs = []
        rewards = []

        # Roll out one episode.
        for t in range(args.max_steps):
            a_t, log_prob = agent.action(s_t)
            s_t1, r_t, done, _ = env.step(a_t.numpy()[0][0])
            states.append(s_t)
            actions.append(a_t)
            log_probs.append(log_prob)
            rewards.append(r_t)
            s_t = torch.Tensor([s_t1])
            if done:
                break

        # Keep at most `num_trajs` trajectories for the off-policy update.
        if len(trajs) >= args.num_trajs:
            trajs.pop(0)
        if args.reinforce:  # use the most recent trajectory only
            trajs = []
        trajs.append((states, actions, rewards, log_probs))
        agent.train_(trajs)

        print("Episode: {}, reward: {}".format(i_episode, sum(rewards)))
        result.append(sum(rewards))

    # Plot the learning curve.
    plt.plot(range(len(result)), result)
    plt.ylabel('reward')
    plt.xlabel('episodes')
    plt.grid(True)
    plt.show()

    env.close()
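# `agent.train_(trajs)` above is defined in the REINFORCE / importance_sampling modules,
# which are not shown in this snippet. As a reference, here is a minimal sketch of the
# on-policy REINFORCE update such a method typically performs; the attribute names
# `agent.gamma` and `agent.optimizer` are assumptions, and the importance-sampling variant
# would additionally reweight stored trajectories by likelihood ratios (not shown here).
def reinforce_update(agent, trajs):
    loss = 0.0
    for states, actions, rewards, log_probs in trajs:
        # Discounted return-to-go for every step of the trajectory.
        returns = []
        G = 0.0
        for r in reversed(rewards):
            G = r + agent.gamma * G
            returns.insert(0, G)
        returns = torch.tensor(returns)
        # Normalizing returns is a common variance-reduction trick (skip for 1-step episodes).
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)
        for log_prob, G_t in zip(log_probs, returns):
            loss = loss - (log_prob * G_t).sum()
    agent.optimizer.zero_grad()
    loss.backward()
    agent.optimizer.step()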
def main(): """ Main script. Default environment: CartPole-v0 """ parser = argparse.ArgumentParser() parser.add_argument('--env', type=str, default='CartPole-v0') parser.add_argument('--gamma', type=float, default=0.99) parser.add_argument('--lr', type=float, default=0.01) parser.add_argument('--num_episodes', type=int, default=1e3) parser.add_argument('--render', type=bool, default=True) args = parser.parse_args() with tf.Session() as sess: agent = REINFORCE(args, sess) print("Training agent...\n") agent.train() print("Training completed successfully.\nPrinting Results.\n") agent.print_results()
# Imports assumed by this snippet; Servo, degree_gyro_q_l and REINFORCE are project-local
# modules, and the shared-memory handles (memory_degree, memory_ang_vel, memory_acc_degree,
# memory_semaphore) plus the other globals are set up elsewhere in the file.
import sys
import time
import threading

import numpy as np
import tensorflow as tf

import Servo
import degree_gyro_q_l
import REINFORCE


def main():
    a = Servo.servo()
    b = degree_gyro_q_l.acc()

    global count
    global init_pwm_1
    global init_pwm_2
    global start_time
    global memory_degree
    global memory_ang_vel
    global memory_acc_degree
    global memory_semaphore
    global sess
    global model_load
    global done_episode
    global np_PG_data

    max_episodes = 2000
    pwm_1 = init_pwm_1
    pwm_2 = init_pwm_2

    #init = tf.global_variables_initializer()
    sess = tf.Session()

    if True:
        #pdb.set_trace()
        agent = REINFORCE.REINFORCEAgnet(sess, input_size, output_size, name="main")

        if not model_load:
            tf.global_variables_initializer().run(session=sess)
        else:
            saver = tf.train.Saver()
            saver.restore(sess, "./TF_Data/" + sys.argv[1])
            print "'%s' model is loaded" % (sys.argv[1])

        for episode in range(max_episodes):
            print "new episode initialization"
            done = False
            done_episode = False
            score = 0
            pwm_left = init_pwm_1
            pwm_right = init_pwm_2

            # Watchdog: done_timer() flips done_episode after 10 seconds to end the episode.
            timer = threading.Timer(10, done_timer)
            timer.start()
            print "\n\n"

            while not done:
                # Read the current state (pitch angle, angular velocity) from shared memory.
                memory_semaphore.acquire(10)
                degree = memory_degree.read()
                acc_gyro_pitch = float(degree.rstrip('\x00'))
                ang_vel = memory_ang_vel.read()
                p_ang_vel = float(ang_vel.rstrip('\x00'))
                acc_degree = memory_acc_degree.read()
                acc_pitch = float(acc_degree.rstrip('\x00'))
                memory_semaphore.release()

                state = np.array([acc_gyro_pitch, p_ang_vel])
                print "\t\t\t<state> degree: %s, \tangular velocity: %s" % (state[0], state[1])
                #state = np.reshape(state, [1, 4])

                # Sample an action from the policy and apply it to the motors.
                action = agent.predict(state)
                pwm_left, pwm_right = step_action(action, pwm_left, pwm_right)
                pwm_left, pwm_right = safe_pwm(pwm_left, pwm_right)
                print "\t\t\t\t\t\t\t\t<action-motor> left: %s, right: %s <= %s" % (pwm_left, pwm_right, action_print(action))
                a.servo_1(pwm_left)
                a.servo_2(pwm_right)
                time.sleep(0.05)

                ## Get new state and reward from environment
                memory_semaphore.acquire(10)
                degree = memory_degree.read()
                acc_gyro_pitch = float(degree.rstrip('\x00'))
                ang_vel = memory_ang_vel.read()
                p_ang_vel = float(ang_vel.rstrip('\x00'))
                acc_degree = memory_acc_degree.read()
                acc_pitch = float(acc_degree.rstrip('\x00'))
                memory_semaphore.release()

                next_state = np.array([acc_gyro_pitch, p_ang_vel])
                print "\t\t\t<next-state> degree: %s, \tangular velocity: %s" % (next_state[0], next_state[1])

                reward = reward_check(next_state)
                #reward = reward_check(state, next_state)
                if done_episode == True:
                    done = done_episode
                agent.append_sample(state, action, reward)
                score += reward
                #state = copy.deepcopy(next_state)

            if done:
                # Episode finished: run the policy-gradient update and log the result.
                loss = agent.update(keep_prob=0.7)
                if episode == 0:
                    np_PG_data = np.array([[episode, loss, score]])
                else:
                    np_PG_data = np.append(np_PG_data, [[episode, loss, score]], axis=0)
                score = round(score, 2)
                print "episode: %s loss: %s score: %s" % (episode, loss, score)
                time.sleep(3)
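# reward_check() and done_timer() are defined elsewhere in the file. The sketch below shows
# one plausible shaping for a self-balancing robot, purely as an illustration: the threshold
# values, coefficients, and the handling of the global done_episode flag are assumptions,
# not the project's actual reward function.
FALL_LIMIT_DEG = 30.0  # assumed pitch angle beyond which the robot is considered fallen


def reward_check_sketch(next_state):
    global done_episode
    pitch, ang_vel = next_state[0], next_state[1]
    if abs(pitch) > FALL_LIMIT_DEG:
        # Falling over ends the episode with a penalty.
        done_episode = True
        return -10.0
    # Otherwise reward staying upright, slightly penalizing large angles and fast rotation.
    return 1.0 - 0.01 * abs(pitch) - 0.001 * abs(ang_vel)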
# In[1]:

import Cliff_Walk_Environment
import REINFORCE

# In[2]:

# Defining the hyper-parameters
NUMBER_OF_EPISODES_TO_CONSIDER = 10000
GAMMA = 0.9

# In[3]:

env = Cliff_Walk_Environment.Cliff_Walking_Environment(START_STATE='36', END_STATE='47')
reinforce_agent = REINFORCE.REINFORCE_Agent(env=env)

# In[4]:

episode_lengths = []
episode_rewards = []
for episode_iterator in range(NUMBER_OF_EPISODES_TO_CONSIDER):
    episode = reinforce_agent.generateEpisode()
    episode_lengths.append(len(episode))
    episode_rewards.append(reinforce_agent.getDiscountedReturn(episode))
    #print(reinforce_agent.weights)
    #env.printEnvironment(episode)
    if (episode_iterator % 500) == 0 and not (episode_iterator == 0):
        print('Episodes done', str(episode_iterator))
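# generateEpisode() and getDiscountedReturn() belong to REINFORCE_Agent, which is not shown
# in these cells. As a reference for how a whole-episode discounted return is computed with
# GAMMA, here is a minimal stand-alone version; it assumes each episode step exposes its
# reward at index 2 (state, action, reward), which is only an assumption about the layout.
def discounted_return(episode, gamma=GAMMA):
    G = 0.0
    for t, step in enumerate(episode):
        reward = step[2]  # assumed (state, action, reward) ordering
        G += (gamma ** t) * reward
    return G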
import REINFORCE
import sys

stepSize = 0.05
numHidden = 1
hiddenSize = 16
maxEpisode = 50000
activation = 'relu'

# The run identifier comes from the command line when given (note it stays a string in that
# case); otherwise fall back to 0.
try:
    run = sys.argv[1]
except IndexError:
    run = 0

batchEpisodeNumber = 10

REINFORCE.learn(run, stepSize, numHidden, maxEpisode, activation, hiddenSize, batchEpisodeNumber)
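# REINFORCE.learn() is called above with its hyper-parameters passed positionally, so a simple
# sweep can reuse the same signature. The helper below is not called anywhere; the candidate
# step sizes are arbitrary examples, not settings from the original experiments.
def sweep_step_sizes(candidates=(0.01, 0.05, 0.1)):
    for i, candidate_step in enumerate(candidates):
        REINFORCE.learn('sweep%d' % i, candidate_step, numHidden, maxEpisode,
                        activation, hiddenSize, batchEpisodeNumber)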