Example #1
class Worker(mp.Process):
    def __init__(self, global_network, optimizer, global_ep, global_ep_r,
                 res_queue, worker_name, pybullet_client, urdf_path):
        super(Worker, self).__init__()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.worker_name = 'worker_%i' % worker_name
        self.g_ep, self.g_ep_r, self.res_queue = global_ep, global_ep_r, res_queue

        self.global_network = global_network
        self.optimizer = optimizer
        # self.env = gym.make('Pendulum-v0').unwrapped
        robot = snake.Snake(pybullet_client, urdf_path)
        self.env = SnakeGymEnv(robot)

        self.local_network = model.Network(self.env.observation_space.shape[0],
                                           self.env.action_space.shape[0])
        # self.local_optimizer = torch.optim.Adam(lr=lr)

    def run(self):
        total_step = 1
        while self.g_ep.value < MAX_EP:
            state = self.env.reset()
            buffer_states, buffer_actions, buffer_rewards = [], [], []
            episode_reward = 0.0

            for t in range(MAX_EPISODE_STEPS):
                state = torch.FloatTensor(state).to(self.device)
                action = self.local_network.choose_action(state)
                next_state, reward, done, _ = self.env.step(action)

                if t == MAX_EPISODE_STEPS - 1:
                    done = True

                episode_reward += reward
                buffer_actions.append(action)
                buffer_states.append(state)
                buffer_rewards.append(reward)  # normalize

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update global and assign to local net
                    # sync
                    push_and_pull(self.optimizer, self.local_network,
                                  self.global_network, done, next_state,
                                  buffer_states, buffer_actions,
                                  buffer_rewards, GAMMA)
                    buffer_states, buffer_actions, buffer_rewards = [], [], []

                    # if done:  # done and print information
                    record(self.g_ep, self.g_ep_r, episode_reward,
                           self.res_queue, self.worker_name)
                    # break
                state = next_state
                total_step += 1

        self.res_queue.put(None)


# if __name__ == '__main__':
# 	print("Cannot test easily!!")
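The helpers push_and_pull and record are imported from elsewhere in the repository and are not part of this excerpt. As a rough guide, the sketch below shows what they typically do in A3C code of this style; the loss_func method, the position of the value output in forward, and the 0.99/0.01 moving-average factor are assumptions, not the repository's actual implementation.

import numpy as np
import torch


def push_and_pull(opt, local_net, global_net, done, next_state,
                  buffer_s, buffer_a, buffer_r, gamma):
    # Bootstrap from the value head of the local network (0 if terminal).
    if done:
        v_next = 0.0
    else:
        with torch.no_grad():
            state_t = torch.as_tensor(next_state, dtype=torch.float32).unsqueeze(0)
            v_next = local_net.forward(state_t)[-1].item()

    # n-step discounted value targets, computed backwards over the buffer.
    targets = []
    for r in buffer_r[::-1]:
        v_next = r + gamma * v_next
        targets.append(v_next)
    targets.reverse()

    # Assumed: the network exposes a combined actor-critic loss_func(s, a, v_t).
    loss = local_net.loss_func(
        torch.stack([torch.as_tensor(s, dtype=torch.float32) for s in buffer_s]),
        torch.as_tensor(np.array(buffer_a), dtype=torch.float32),
        torch.as_tensor(targets, dtype=torch.float32).unsqueeze(1))

    # Push: copy local gradients onto the shared global parameters and step.
    opt.zero_grad()
    loss.backward()
    for lp, gp in zip(local_net.parameters(), global_net.parameters()):
        gp._grad = lp.grad
    opt.step()

    # Pull: re-synchronize the worker's local copy with the updated global net.
    local_net.load_state_dict(global_net.state_dict())


def record(global_ep, global_ep_r, ep_r, res_queue, name):
    # Bump the shared episode counter and keep a moving-average episode reward.
    with global_ep.get_lock():
        global_ep.value += 1
    with global_ep_r.get_lock():
        if global_ep_r.value == 0.:
            global_ep_r.value = ep_r
        else:
            global_ep_r.value = global_ep_r.value * 0.99 + ep_r * 0.01
    res_queue.put(global_ep_r.value)
    print(name, '| episode:', global_ep.value, '| reward: %.1f' % global_ep_r.value)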
Example #2
    def __init__(self, global_network, optimizer, global_ep, global_ep_r,
                 res_queue, worker_name, pybullet_client, urdf_path):
        super(Worker, self).__init__()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.worker_name = 'worker_%i' % worker_name
        self.g_ep, self.g_ep_r, self.res_queue = global_ep, global_ep_r, res_queue

        self.global_network = global_network
        self.optimizer = optimizer
        # self.env = gym.make('Pendulum-v0').unwrapped
        robot = snake.Snake(pybullet_client, urdf_path)
        self.env = SnakeGymEnv(robot)

        self.local_network = model.Network(self.env.observation_space.shape[0],
                                           self.env.action_space.shape[0])
Example #3
def create_env(p, args):  #creates a single environment for testing
    urdf_path = os.path.join(BASE_DIR, os.pardir, "snake/snake.urdf")
    robot = snake.Snake(p, urdf_path, args=args)
    return SnakeGymEnv(robot, args=args)
Example #4
 def _thunk():
     robot = snake.Snake(p, urdf_path, args=args)
     env_snake = SnakeGymEnv(robot, args=args)
     return env_snake
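This thunk is the standard pattern for SubprocVecEnv, which expects zero-argument environment constructors. The enclosing helper is not shown in the excerpt; the sketch below is one plausible shape for it, consistent with the utils.make_env(p, urdf_path, args=args) call in the training script further below, but its exact body is an assumption.

def make_env(p, urdf_path, args=None):
    # SubprocVecEnv builds the environment inside its own worker process, so the
    # outer function only captures the arguments and hands back the thunk.
    def _thunk():
        robot = snake.Snake(p, urdf_path, args=args)
        env_snake = SnakeGymEnv(robot, args=args)
        return env_snake

    return _thunk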
Example #5
def create_env(p, args):
    urdf_path = os.path.join(BASE_DIR, os.pardir, "snake/snake.urdf")
    robot = snake.Snake(p, urdf_path, args=args)
    return SnakeGymEnv(robot, args=args)
Example #6
import matplotlib.pyplot as plt

import pybullet as p
import sys
import os
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(BASE_DIR, os.pardir))
from SnakeGymEnv import SnakeGymEnv
import snake
from datetime import datetime

p.connect(p.DIRECT)
# Create env for policy testing.
urdf_path = os.path.join(BASE_DIR, os.pardir, "snake/snake.urdf")
robot = snake.Snake(p, urdf_path)
env = SnakeGymEnv(robot)
# env = gym.make('Pendulum-v0')

if __name__ == '__main__':
    global_network = model.Network(env.observation_space.shape[0],
                                   env.action_space.shape[0])

    global_network.share_memory()
    optimizer = SharedAdam(global_network.parameters(), lr=0.0002)
    global_ep, global_ep_r, res_queue = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue()

    workers = [
        Worker(global_network, optimizer, global_ep, global_ep_r, res_queue, i,
               p, urdf_path) for i in range(16)
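        # NOTE: the excerpt above ends mid-statement. The lines below are a
        # plausible completion based on the standard A3C start/collect/join
        # pattern that this Worker/res_queue setup follows, not the
        # repository's verbatim code.
    ]

    [w.start() for w in workers]

    # Collect moving-average episode rewards until a worker pushes the None sentinel.
    results = []
    while True:
        r = res_queue.get()
        if r is not None:
            results.append(r)
        else:
            break
    [w.join() for w in workers]

    # Plot the collected rewards (matplotlib is already imported above).
    plt.plot(results)
    plt.ylabel('Moving average episode reward')
    plt.xlabel('Episode')
    plt.show()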
Example #7
 def create_env(self, urdf_path):
     self.robot = snake.Snake(p, urdf_path)
     self.env = SnakeGymEnv(self.robot)
Example #8
class Logger:
    def __init__(self,
                 log_dir,
                 urdf_path=os.path.join(os.pardir, "snake/snake.urdf")):
        self.log_dir = log_dir
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")

        p.connect(p.GUI)
        self.create_env(urdf_path)
        self.net = self.create_model()

    def create_env(self, urdf_path):
        self.robot = snake.Snake(p, urdf_path)
        self.env = SnakeGymEnv(self.robot)

    def create_model(self):
        # Create network/policy.
        num_inputs = self.env.observation_space.shape[0]
        num_outputs = self.env.action_space.shape[0]
        hidden_size = 256
        net = ActorCritic(num_inputs, num_outputs, hidden_size).to(self.device)
        return net

    def restore_model(self, weight):
        checkpoint = torch.load(weight, map_location='cpu')
        self.net.load_state_dict(checkpoint['model'])

    def test_env(self, file_name, max_step=100):
        state = self.env.reset()
        done = False
        total_reward = 0
        steps = 0

        cdist = 3.0
        cyaw = 90
        cpitch = -89

        basePos = self.robot.getBasePosition()
        p.resetDebugVisualizerCamera(cameraDistance=cdist,
                                     cameraYaw=cyaw,
                                     cameraPitch=cpitch,
                                     cameraTargetPosition=basePos)

        loggingId = p.startStateLogging(p.STATE_LOGGING_VIDEO_MP4, file_name)
        while steps < max_step:
            state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            dist, _ = self.net(state)
            next_state, reward, done, _ = self.env.step(
                dist.sample().cpu().numpy()[0])  #Hack
            print("Step No: {:3d}, Reward: {:2.3f} and done: {}".format(
                steps, reward, done))
            state = next_state
            total_reward += reward
            steps += 1
        p.stopStateLogging(loggingId)

    # Uses all the models in the folder 'log/models'
    def log_all_videos(self):
        weights = [w for w in os.listdir(os.path.join(self.log_dir, 'models'))
                   if w.endswith('.pth')]  # Find all the .pth files.
        files = [os.path.join(self.log_dir, 'models', w[:-4] + '.mp4')
                 for w in weights]  # Create names/paths for the video files.
        weights = [os.path.join(self.log_dir, 'models', w)
                   for w in weights]  # Store paths for all weight files.
        for w, f in zip(weights, files):
            self.restore_model(w)  # Restore model.
            self.test_env(f)  # Record video for 100 steps.

    def log_video(self, weight, file_name):
        self.restore_model(weight)
        self.test_env(file_name)
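A minimal usage sketch for the class above. The 'log' directory name is only an illustration; it just needs a models/ sub-folder with .pth checkpoints, as the comment on log_all_videos indicates.

if __name__ == '__main__':
    logger = Logger('log')   # connects to PyBullet and builds the env and policy
    logger.log_all_videos()  # records one 100-step .mp4 per checkpoint in log/models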
Example #9
import pybullet as p
import numpy as np
import os
import snake
from SnakeGymEnv import SnakeGymEnv
import time

p.connect(p.GUI)
robot = snake.Snake(p, "snake/snake.urdf")
env = SnakeGymEnv(robot)
obs = env.reset()

print(obs)

R = 0.0
for i in range(60):
    obs, r, d, _ = env.step([0.5] * 8)
    R += r
    print(d)
    env.render()
    print('Reward: {}'.format(r))
    print("ith step ", i)
print("Total Reward: {}".format(R))
time.sleep(5)
Example #10
def running_test(log_dir, max_steps=100, create_video=False):
	# Parameters
	urdf_path = os.path.join(os.pardir, "snake/snake.urdf")
	hidden_size = [256,256]
	use_cuda = torch.cuda.is_available()
	device = torch.device("cuda" if use_cuda else "cpu")


	# Create test environment.
	if create_video: p.connect(p.GUI)
	else: p.connect(p.DIRECT)  # headless connection when no video is needed
	cdist = 1.5
	cyaw = -30
	cpitch = -90
	# p.resetDebugVisualizerCamera(cameraDistance=cdist, cameraYaw=cyaw, cameraPitch=cpitch, cameraTargetPosition=[1.28,0,0])
	robot = snake.Snake(p, urdf_path)
	env = SnakeGymEnv(robot)
	robot.mode = 'test'
	env.mode = 'test'

	# Check availability of cuda
	use_cuda = torch.cuda.is_available()
	device = torch.device("cuda" if use_cuda else "cpu")

	# State space and action space
	num_inputs = env.observation_space.shape[0]
	num_outputs = env.action_space.shape[0]

	# Create network/policy.
	net = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)

	checkpoint = torch.load(os.path.join(log_dir,'models/weights_01000.pth'), map_location='cpu')
	net.load_state_dict(checkpoint['model'])

	if create_video: frames = []
	state = env.reset()

	if create_video: frames.append(env.render())
	# if create_video: frames.append(img)
	done = False
	total_reward = 0
	steps = 0
	print_('Test Started...', color='r', style='bold')
	STATES = []
	link_positions = []

	while steps < max_steps:
		state = torch.FloatTensor(state).unsqueeze(0).to(device)
		dist, _ = net(state)
		# print(dist.sample().cpu().numpy()[0])
		next_state, reward, done, info = env.step(dist.sample().cpu().numpy()[0]) #Hack

		# print(info.keys())
		STATES = STATES + info['internal_observations']
		if create_video: frames = frames + info['frames']
		link_positions = link_positions + info['link_positions']

		print("Step No: {:3d}, Reward: {:2.3f} and done: {}".format(steps, reward, done))
		state = next_state
		total_reward += reward
		steps += 1
		STATES.append(state)
	print_('Total Reward: {}'.format(total_reward), color='bl', style='bold')
	print('Test Ended!')
	print(len(link_positions))
	np.savetxt('rl_policy_linkpositions_2.txt', np.array(link_positions))
	if create_video: log_video(frames)
	return STATES

# state = running_test()
# print(state)
# print(type(state))
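log_video is not defined in this excerpt. Below is a minimal sketch of a compatible helper, assuming the collected frames are RGB arrays and that imageio (with its ffmpeg plugin) is available; it is not the repository's actual implementation.

import imageio


def log_video(frames, file_name='rl_policy_test.mp4', fps=30):
    # Hypothetical helper: write the collected RGB frames to an mp4 file.
    imageio.mimsave(file_name, frames, fps=fps)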
Example #11
def train(args):
	# hyper-params:
	frame_idx  		 = 0
	hidden_size      = args.hidden_size
	lr               = args.lr
	num_steps        = args.num_steps
	mini_batch_size  = args.mini_batch_size
	ppo_epochs       = args.ppo_epochs
	threshold_reward = args.threshold_reward
	max_frames 		 = args.max_frames
	# test_rewards 	 = []
	num_envs 		 = args.num_envs
	test_epochs		 = args.test_epochs
	resume_training	 = args.resume_training
	best_test_reward = 0.0
	urdf_path		 = os.path.join(BASE_DIR, os.pardir, "snake/snake.urdf")
	log_dir 		 = args.log_dir

	now = datetime.now()
	log_dir = log_dir + '_' + now.strftime('%d_%m_%Y_%H_%M_%S')

	# Check cuda availability.
	use_cuda = torch.cuda.is_available()
	device = torch.device("cuda" if use_cuda else "cpu")


	p.connect(p.DIRECT)
	writer = SummaryWriter(log_dir)

	# Create training log.
	textio = utils.IOStream(os.path.join(log_dir, 'train.log'), args=args)
	# textio.log_params(device, num_envs, lr, threshold_reward)	
	utils.logFiles(log_dir)

	# create multiple environments.
	envs = [utils.make_env(p, urdf_path, args=args) for i in range(num_envs)]
	envs = SubprocVecEnv(envs)

	# pdb.set_trace()	# Debug
	num_inputs = envs.observation_space.shape[0]
	num_outputs = envs.action_space.shape[0]

	# Create Policy/Network
	net = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
	optimizer = optim.Adam(net.parameters(), lr=lr)

	# If use pretrained policy.
	if resume_training:
		if os.path.exists(resume_training):
			checkpoint = torch.load(resume_training)
			frame_idx = checkpoint['frame_idx']
			net.load_state_dict(checkpoint['model'])
			best_test_reward = checkpoint['best_test_reward']

	# Initial Reset for Environment.
	state = envs.reset()
	early_stop = False

	# Create env for policy testing.
	robot = snake.Snake(p, urdf_path, args=args)
	env = SnakeGymEnv(robot, args=args)

	print_('\nTraining Begins ...', color='r', style='bold')
	textio.log('Training Begins ...')
	while frame_idx < max_frames and not early_stop:
		print_('\nTraining Policy!', color='r', style='bold')
		textio.log('\n############## Epoch: %0.5d ##############'%(int(frame_idx/20)))

		# Memory buffers
		log_probs = []
		values    = []
		states    = []
		actions   = []
		rewards   = []
		masks     = []
		entropy   = 0
		total_reward = 0.0

		for i in range(num_steps):
			print('Steps taken: {} & Epoch: {}\r'.format(i, int(frame_idx/20)), end="")
			state = torch.FloatTensor(state).to(device)

			# Find action using policy.
			dist, value = net(state)
			action = dist.sample()
			action = action #HACK

			# Take actions and find MDP.
			next_state, reward, done, _ = envs.step(action.cpu().numpy())
			total_reward += sum(reward)
			textio.log('Steps: {} and Reward: {}'.format(int(frame_idx%20), total_reward))

			# Calculate log(policy)
			log_prob = dist.log_prob(action)
			entropy += dist.entropy().mean()

			# Create Experiences
			log_probs.append(log_prob)
			values.append(value)
			rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
			masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
			states.append(state)
			actions.append(action)
			
			# Update state.
			state = next_state
			frame_idx += 1

			# Test Trained Policy.
			if frame_idx % 40 == 0:
				print_('\n\nEvaluate Policy!', color='bl', style='bold')
				test_reward = np.mean([utils.test_env(env, net, test_idx) for test_idx in range(test_epochs)])

				# test_rewards.append(test_reward)
				# utils.plot(frame_idx, test_rewards)	# not required due to tensorboardX.
				writer.add_scalar('test_reward', test_reward, frame_idx)
				
				print_('\nTest Reward: {}\n'.format(test_reward), color='bl', style='bold')
				textio.log('Test Reward: {}'.format(test_reward))

				# Save various factors of training.
				snap = {'frame_idx': frame_idx,
						'model': net.state_dict(),
						'best_test_reward': best_test_reward,
						'optimizer' : optimizer.state_dict()}

				if best_test_reward < test_reward:
					save_checkpoint(snap, os.path.join(log_dir, 'weights_bestPolicy.pth'))
					best_test_reward = test_reward
				save_checkpoint(snap, os.path.join(log_dir,'weights.pth'))
				if test_reward > threshold_reward: early_stop = True
			if frame_idx % 1000 == 0:
				if not os.path.exists(os.path.join(log_dir, 'models')): os.mkdir(os.path.join(log_dir, 'models'))
				save_checkpoint(snap, os.path.join(log_dir, 'models', 'weights_%0.5d.pth'%frame_idx))

				
		# Calculate Returns
		next_state = torch.FloatTensor(next_state).to(device)
		_, next_value = net(next_state)
		returns = compute_gae(next_value, rewards, masks, values)

		# Concatenate experiences for multiple environments.
		returns   = torch.cat(returns).detach()
		log_probs = torch.cat(log_probs).detach()
		values    = torch.cat(values).detach()
		states    = torch.cat(states)
		actions   = torch.cat(actions)
		advantage = returns - values
		
		writer.add_scalar('reward/episode', total_reward, frame_idx)
		textio.log('Total Training Reward: {}'.format(total_reward))

		# Update the Policy.
		ppo_update(net, optimizer, ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage, writer, frame_idx)
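
compute_gae, ppo_update and save_checkpoint come from elsewhere in the repository. For reference, the sketch below shows the standard generalized-advantage-estimation recursion that a function with this call signature usually implements; the gamma and tau defaults are assumptions.

def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    # Walk the rollout backwards, accumulating discounted TD residuals and
    # masking out environments whose episode has finished.
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns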