Code example #1
import gym
import numpy as np
import torch
import matplotlib.pyplot as plt

# REINFORCE and importance_sampling are project-local modules providing the Agent classes;
# `args` (env_name, seed, reinforce, num_episodes, max_steps, num_trajs) is assumed to be
# parsed at module level.
import REINFORCE
import importance_sampling


def main():

    env = gym.make(args.env_name)

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if args.reinforce:
        agent = REINFORCE.Agent(args, env.observation_space.shape[0],
                                env.action_space)
    else:
        agent = importance_sampling.Agent(args, env.observation_space.shape[0],
                                          env.action_space)

    trajs = []
    result = []

    for i_episode in range(args.num_episodes):

        s_t = torch.Tensor([env.reset()])

        states = []
        actions = []
        log_probs = []
        rewards = []

        for t in range(args.max_steps):
            a_t, log_prob = agent.action(s_t)
            s_t1, r_t, done, _ = env.step(a_t.numpy()[0][0])
            states.append(s_t)
            actions.append(a_t)
            log_probs.append(log_prob)
            rewards.append(r_t)
            s_t = torch.Tensor([s_t1])

            if done:
                break

        if len(trajs) >= args.num_trajs:
            trajs.pop(0)

        if args.reinforce:
            # REINFORCE uses only the most recent trajectory
            trajs = []

        trajs.append((states, actions, rewards, log_probs))
        agent.train_(trajs)

        print("Episode: {}, reward: {}".format(i_episode, sum(rewards)))
        result.append(sum(rewards))
    """plot"""
    plt.plot(range(len(result)), result)
    plt.ylabel('reward')
    plt.xlabel('episodes')
    plt.grid(True)
    plt.show()

    env.close()
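Code example #1 only collects trajectories and defers all learning to agent.train_. For reference, here is a minimal sketch of the agent interface the loop assumes (action returning an action plus its log-probability, train_ consuming a list of trajectories). The two-layer network and the args.gamma / args.lr attributes are assumptions for illustration, and the importance_sampling.Agent variant would additionally reweight the log-probabilities of older trajectories rather than treating them all as on-policy.

# Minimal sketch of the Agent interface assumed by code example #1; only the
# method names come from the snippet above, everything else is illustrative.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

class Agent:
    def __init__(self, args, num_inputs, action_space):
        self.gamma = args.gamma                      # assumed attribute of args
        self.policy = nn.Sequential(
            nn.Linear(num_inputs, 128), nn.ReLU(),
            nn.Linear(128, action_space.n), nn.Softmax(dim=-1))
        self.optimizer = optim.Adam(self.policy.parameters(), lr=args.lr)

    def action(self, s_t):
        # Sample an action (shaped [1, 1], since the caller indexes [0][0])
        # together with its log-probability.
        dist = Categorical(self.policy(s_t))
        a_t = dist.sample().unsqueeze(0)
        return a_t, dist.log_prob(a_t)

    def train_(self, trajs):
        loss = 0
        for states, actions, rewards, log_probs in trajs:
            G, returns = 0.0, []
            for r in reversed(rewards):              # discounted returns, computed backwards
                G = r + self.gamma * G
                returns.insert(0, G)
            for log_prob, G_t in zip(log_probs, returns):
                loss = loss - (log_prob * G_t).sum() # REINFORCE objective: -log pi(a|s) * G_t
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()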
Code example #2
import argparse

import tensorflow as tf

# Assumes the agent class lives in a project-local REINFORCE module.
from REINFORCE import REINFORCE


def main():
    """
        Main script.
        Default environment: CartPole-v0
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--num_episodes', type=int, default=1000)
    # argparse's type=bool treats any non-empty string (including "False") as True,
    # so rendering is exposed as an on-by-default flag that can be switched off.
    parser.add_argument('--no-render', dest='render', action='store_false')
    args = parser.parse_args()

    with tf.Session() as sess:
        agent = REINFORCE(args, sess)
        print("Training agent...\n")
        agent.train()
        print("Training completed successfully.\nPrinting Results.\n")
        agent.print_results()
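Code example #2 delegates everything to a REINFORCE class constructed from (args, sess). A minimal TF1-style sketch of such a class follows; only the constructor signature and the train / print_results method names come from the snippet, while the graph, placeholders, and rollout loop below are assumptions for illustration.

# Illustrative sketch of the REINFORCE class assumed by code example #2.
import gym
import numpy as np
import tensorflow as tf

class REINFORCE:
    def __init__(self, args, sess):
        self.args, self.sess = args, sess
        self.env = gym.make(args.env)
        obs_dim = self.env.observation_space.shape[0]
        n_actions = self.env.action_space.n
        self.states = tf.placeholder(tf.float32, [None, obs_dim])
        self.actions = tf.placeholder(tf.int32, [None])
        self.returns = tf.placeholder(tf.float32, [None])
        hidden = tf.layers.dense(self.states, 64, tf.nn.relu)
        logits = tf.layers.dense(hidden, n_actions)
        self.probs = tf.nn.softmax(logits)
        # log pi(a_t|s_t) via the sparse cross-entropy trick
        log_probs = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.actions, logits=logits)
        self.loss = -tf.reduce_mean(log_probs * self.returns)
        self.train_op = tf.train.AdamOptimizer(args.lr).minimize(self.loss)
        self.episode_rewards = []

    def train(self):
        self.sess.run(tf.global_variables_initializer())
        for _ in range(int(self.args.num_episodes)):
            states, actions, rewards = [], [], []
            s, done = self.env.reset(), False
            while not done:
                if self.args.render:
                    self.env.render()
                p = self.sess.run(self.probs, {self.states: [s]})[0]
                a = int(np.random.choice(len(p), p=p))
                s2, r, done, _ = self.env.step(a)
                states.append(s); actions.append(a); rewards.append(r)
                s = s2
            # discounted returns G_t for the finished episode
            G, returns = 0.0, []
            for r in reversed(rewards):
                G = r + self.args.gamma * G
                returns.insert(0, G)
            self.sess.run(self.train_op, {self.states: states,
                                          self.actions: actions,
                                          self.returns: returns})
            self.episode_rewards.append(sum(rewards))

    def print_results(self):
        print("mean reward over last 100 episodes:",
              np.mean(self.episode_rewards[-100:]))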
Code example #3
import sys
import threading
import time

import numpy as np
import tensorflow as tf

# Project-local modules: servo/IMU drivers and the REINFORCE agent.
import Servo
import degree_gyro_q_l
import REINFORCE

# input_size, output_size, the init_pwm_* values, the memory_* shared-memory handles,
# model_load, and helpers such as step_action, safe_pwm, action_print, reward_check
# and done_timer are defined elsewhere in this script.


def main():
    a = Servo.servo()
    b = degree_gyro_q_l.acc()
    global count
    global init_pwm_1
    global init_pwm_2
    global start_time
    global memory_degree
    global memory_ang_vel
    global memory_acc_degree
    global memory_semaphore
    global sess
    global model_load
    global done_episode
    global np_PG_data

    max_episodes = 2000

    pwm_1 = init_pwm_1
    pwm_2 = init_pwm_2

    sess = tf.Session()
    agent = REINFORCE.REINFORCEAgnet(sess, input_size, output_size, name="main")
    if not model_load:
        tf.global_variables_initializer().run(session=sess)
    else:
        saver = tf.train.Saver()
        saver.restore(sess, "./TF_Data/" + sys.argv[1])
        print "'%s' model is loaded" % (sys.argv[1])

    for episode in range(max_episodes):

        print "new episode initialization"
        done = False
        done_episode = False
        score = 0

        pwm_left = init_pwm_1
        pwm_right = init_pwm_2

        # End the episode after 10 seconds via the done_timer callback.
        threading.Timer(10, done_timer).start()
        print "\n\n"
        while not done:
            # Read the current pitch angle and angular velocity from shared memory.
            memory_semaphore.acquire(10)
            degree = memory_degree.read()
            acc_gyro_pitch = float(degree.rstrip('\x00'))
            ang_vel = memory_ang_vel.read()
            p_ang_vel = float(ang_vel.rstrip('\x00'))
            acc_degree = memory_acc_degree.read()
            acc_pitch = float(acc_degree.rstrip('\x00'))
            memory_semaphore.release()

            state = np.array([acc_gyro_pitch, p_ang_vel])
            print "\t\t\t<state> degree: %s, \tangular velocity: %s" % (state[0], state[1])

            # Sample an action from the policy and apply it to the motors.
            action = agent.predict(state)
            pwm_left, pwm_right = step_action(action, pwm_left, pwm_right)
            pwm_left, pwm_right = safe_pwm(pwm_left, pwm_right)
            print "\t\t\t\t\t\t\t\t<action-motor> left: %s, right: %s <= %s" % (pwm_left, pwm_right, action_print(action))
            a.servo_1(pwm_left)
            a.servo_2(pwm_right)

            time.sleep(0.05)

            # Get the new state and reward from the environment.
            memory_semaphore.acquire(10)
            degree = memory_degree.read()
            acc_gyro_pitch = float(degree.rstrip('\x00'))
            ang_vel = memory_ang_vel.read()
            p_ang_vel = float(ang_vel.rstrip('\x00'))
            acc_degree = memory_acc_degree.read()
            acc_pitch = float(acc_degree.rstrip('\x00'))
            memory_semaphore.release()

            next_state = np.array([acc_gyro_pitch, p_ang_vel])
            print "\t\t\t<next-state> degree: %s, \tangular velocity: %s" % (next_state[0], next_state[1])
            reward = reward_check(next_state)
            if done_episode:
                done = done_episode

            agent.append_sample(state, action, reward)
            score += reward

            if done:
                # Episode finished: run the REINFORCE update and log the result.
                loss = agent.update(keep_prob=0.7)
                if episode == 0:
                    np_PG_data = np.array([[episode, loss, score]])
                else:
                    np_PG_data = np.append(np_PG_data, [[episode, loss, score]], axis=0)
                score = round(score, 2)
                print "episode: %s  loss: %s  score: %s" % (episode, loss, score)
                time.sleep(3)
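The training loop above depends on several helpers defined elsewhere in the script (step_action, safe_pwm, action_print, reward_check, done_timer). Purely as a hypothetical illustration, here is a sketch of the two that drive episode termination and the learning signal; the 10-second timer matches the threading.Timer call above, but the 30-degree threshold and the reward shaping are made-up values.

# Hypothetical helper sketches for code example #3; thresholds are illustrative only.
def done_timer():
    # Callback fired by threading.Timer(10, done_timer): end the current episode.
    global done_episode
    done_episode = True

def reward_check(state):
    # state = [pitch angle in degrees, angular velocity]
    pitch = abs(state[0])
    if pitch > 30.0:                 # robot has tipped over
        return -1.0
    return 1.0 - pitch / 30.0        # closer to upright gives a larger reward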
Code example #4

# In[1]:

import Cliff_Walk_Environment
import REINFORCE

# In[2]:

#Defining the hyper-parameters
NUMBER_OF_EPISODES_TO_CONSIDER = 10000
GAMMA = 0.9

# In[3]:

env = Cliff_Walk_Environment.Cliff_Walking_Environment(START_STATE='36',
                                                       END_STATE='47')
reinforce_agent = REINFORCE.REINFORCE_Agent(env=env)

# In[4]:

episode_lengths = []
episode_rewards = []
for episode_iterator in range(NUMBER_OF_EPISODES_TO_CONSIDER):
    episode = reinforce_agent.generateEpisode()

    episode_lengths.append(len(episode))
    episode_rewards.append(reinforce_agent.getDiscountedReturn(episode))
    #print(reinforce_agent.weights)
    #env.printEnvironment(episode)

    if episode_iterator % 500 == 0 and episode_iterator != 0:
        print('Episodes done', episode_iterator)
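The notebook relies on REINFORCE_Agent.getDiscountedReturn to score each episode. Only the method name appears above; assuming an episode is a list of (state, action, reward) steps, the standard discounted return it would compute looks like this:

# Sketch of the discounted-return computation assumed above; the episode format
# (list of (state, action, reward) tuples) is an assumption.
def get_discounted_return(episode, gamma=GAMMA):
    g = 0.0
    for _, _, reward in reversed(episode):
        g = reward + gamma * g       # G_t = r_t + gamma * G_{t+1}
    return g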
Code example #5
import REINFORCE
import sys

stepSize = 0.05
numHidden = 1
hiddenSize = 16
maxEpisode = 50000
activation = 'relu'

try:
    run = sys.argv[1]
except IndexError:
    run = 0

batchEpisodeNumber = 10

REINFORCE.learn(run, stepSize, numHidden, maxEpisode, activation, hiddenSize,
                batchEpisodeNumber)