Code Example #1
File: ES_DDPG.py Project: shilx001/ERL_improvement
 def __init__(self,
              env_name='Hopper-v2',
              total_episodes=1000,
              learning_steps=1000,
              gpu=0,
              update_time=1,
              gamma=1,
              episode_length=1000,
              total_steps=int(1e6),
              lr=1e-3,
              action_bound=1,
              num_samples=10,
              noise=0.02,
              std_dev=0.03,
              batch_size=100,
              elite_percentage=0.2,
              mutate=0.9,
              crossover=0.2,
              hidden_size=64,
              seed=1,
              namescope='default'):
     self.env = gym.make(env_name)
     np.random.seed(seed)
     self.env.seed(seed)
     tf.set_random_seed(seed)
     self.input_size = self.env.observation_space.shape[0]
     self.output_size = self.env.action_space.shape[0]
     self.total_episodes = total_episodes
     self.episode_length = episode_length
     self.total_steps = total_steps
     self.update_time = update_time
     self.lr = lr
     self.gamma = gamma
     self.action_bound = action_bound
     self.num_samples = num_samples
     self.noise = noise
     self.stddev = std_dev
     self.batch_size = batch_size
     self.elite_percentage = elite_percentage
     self.mutate = mutate
     self.crossover = crossover
     self.hidden_size = hidden_size
     self.normalizer = utils.Normalizer(self.input_size)
     self.namescope = namescope
     # config = tf.ConfigProto(device_count={'GPU': gpu})
     self.learning_steps = learning_steps
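     # Inner DDPG learner used alongside the evolutionary search in this ES+DDPG agent (stored as td3_agent).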
     self.td3_agent = ddpg.DDPG(self.input_size,
                                self.output_size,
                                1,
                                namescope=self.namescope,
                                hidden_size=hidden_size,
                                seed=seed)
Code Example #2
def run(total_eps=2, message=False, render=False, map_path="Maps/map.png",\
        model_path="save/", gif_path="out/", gif_name="test.gif"):
    if not os.path.exists(gif_path):
        os.makedirs(gif_path)
    images = []
    RL = ddpg.DDPG(
        model = [models.PolicyNet, models.QNet],
        learning_rate = [0.0001, 0.0001],
        reward_decay = 0.99,
        memory_size = 10000,
        batch_size = 64)

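    # Restore pretrained actor/critic weights from model_path before evaluation.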
    RL.save_load_model("load", model_path)

    env = NavigationEnv(path=map_path)
    for eps in range(total_eps):
        step = 0
        max_success_rate = 0
        success_count = 0

        state = env.initialize()
        r_eps = []
        acc_reward = 0.
            
        while True:
            # Choose action and run
            action = RL.choose_action(state, eval=True)
            state_next, reward, done = env.step(action)
            im = env.render(gui=render)
            im_pil = Image.fromarray(cv2.cvtColor(np.uint8(im*255),cv2.COLOR_BGR2RGB))
            images.append(im_pil)

            # Record and print information
            r_eps.append(reward)
            acc_reward += reward
            
            if message:
                print('\rEps: {:2d}| Step: {:4d} | action:{:+.2f}| R:{:+.2f}| Reps:{:.2f}  '\
                        .format(eps, step, action[0], reward, acc_reward), end='')
            
            state = state_next.copy()
            step += 1
            if done or step>600:
                if message:
                    print()
                break

    print("Save evaluation GIF ...")
    images[0].save(gif_path+gif_name,
        save_all=True, append_images=images[1:], optimize=True, duration=40, loop=0)
Code Example #3
File: train.py Project: Gouet/DDPG_pytorch
def main(arglist):
    env = gym.make(arglist.scenario)
    writer = SummaryWriter(log_dir='./logs/')

    critic = agent.Critic(env.observation_space.shape[0], env.action_space.shape[0]).to(device)
    actor = agent.Actor(env.observation_space.shape[0], 2).to(device)
    target_critic = agent.Critic(env.observation_space.shape[0], env.action_space.shape[0], arglist.tau).to(device)
    target_actor = agent.Actor(env.observation_space.shape[0], 2, arglist.tau).to(device)
    
    actor.eval()
    critic.eval()
    target_actor.eval()
    target_critic.eval()

    ddpg_algo = ddpg.DDPG(actor, critic, target_actor, target_critic, arglist.gamma, arglist.batch_size, arglist.eval)
    ddpg_algo.load('./saved/actor_' + str(arglist.load_episode_saved), './saved/critic_' + str(arglist.load_episode_saved))

    for episode in range(arglist.max_episode):
        obs = env.reset()
        done = False
        j = 0
        ep_ave_max_q_value = 0
        total_reward = 0
        while not done:
            if not arglist.eval:
                env.render()
            
            action = ddpg_algo.act(obs)

            obs2, reward, done, info = env.step(action)
            total_reward += reward

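            # In this script, arglist.eval gates the training update as well as the checkpointing and logging below.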
            if arglist.eval:
                ep_ave_max_q_value += ddpg_algo.train(action, [reward], obs, obs2, [done])
            obs = obs2
            j += 1

        if arglist.eval and episode % arglist.saved_episode == 0 and episode > 0:
            critic.save_model('./saved/critic_' + str(episode))
            actor.save_model('./saved/actor_' + str(episode))

        if arglist.eval:
            print('average_max_q: ', ep_ave_max_q_value / float(j), 'reward: ', total_reward, 'episode:', episode)
            writer.add_scalar('Average_max_q', ep_ave_max_q_value / float(j), episode)
            writer.add_scalar('Reward', total_reward, episode)

    env.close()
Code Example #4
File: ARS_DDPG.py Project: shilx001/ERL_improvement
 def __init__(self,
              env_name='Hopper-v2',
              total_episodes=1000,
              action_bound=1,
              episode_length=1000,
              learning_rate=0.02,
              weight=0.01,
              learning_steps=100,
              num_samples=8,
              noise=0.02,
              bc_index=[],
              std_dev=0.03,
              syn_step=1,
              meta_population_size=5,
              seed=1,
              hidden_size=300):
     self.env = gym.make(env_name)
     np.random.seed(seed)
     self.env.seed(seed)
     self.action_bound = action_bound
     self.input_size = self.env.observation_space.shape[0]
     self.output_size = self.env.action_space.shape[0]
     self.total_episodes = total_episodes
     self.episode_length = episode_length
     self.lr = learning_rate
     self.num_samples = num_samples
     self.noise = noise
     self.meta_population_size = meta_population_size
     self.seed = seed
     self.syn_step = syn_step
     self.learning_steps = learning_steps
     self.bc_index = bc_index
     self.weight = weight
     self.normalizer = utils.Normalizer(self.env.observation_space.shape[0])
     self.hidden_size = hidden_size
     self.stddev = std_dev
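     # Inner DDPG learner trained alongside the ARS-style random search in this ARS+DDPG agent.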
     self.td3_agent = ddpg.DDPG(self.input_size,
                                self.output_size,
                                1,
                                hidden_size=self.hidden_size,
                                seed=seed)
     self.num_best_deltas = 4
Code Example #5
    def create_agents(self, env, arglist):
        #workers = []
        algo_mode = self._algo_mode_from_agents(env)

        obs_shapes = [env.get_env().observation_space[i].shape for i in range(env.get_env().n)]
        actions_shape_n = [env.get_env().action_space[i].n for i in range(env.get_env().n)]
        actions_n = 0
        obs_shape_n = 0
        for actions in actions_shape_n:
            actions_n += actions
        for obs_shape in obs_shapes:
            obs_shape_n += obs_shape[0]

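        # One learner per agent: the MADDPG branch uses the joint observation/action sizes, the DDPG branch per-agent sizes.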
        for i, action_space, observation_space, algo in zip(
                range(len(env.get_env().action_space)),
                env.get_env().action_space,
                env.get_env().observation_space,
                algo_mode):
            if isinstance(action_space, Box):
                discrete_action = False
            else:
                discrete_action = True

            if algo == ddpg.MADDPG:
                print('MADDPG load.')
                critic = agent.Critic(obs_shape_n, actions_n).to(device)
                actor = agent.Actor(observation_space.shape[0], action_space.n).to(device)
                target_critic = agent.Critic(obs_shape_n, actions_n, arglist.tau).to(device)
                target_actor = agent.Actor(observation_space.shape[0], action_space.n, arglist.tau).to(device)
            else:
                print('DDPG load.')
                critic = agent.Critic(observation_space.shape[0], action_space.n).to(device)
                actor = agent.Actor(observation_space.shape[0], action_space.n).to(device)
                target_critic = agent.Critic(observation_space.shape[0], action_space.n, arglist.tau).to(device)
                target_actor = agent.Actor(observation_space.shape[0], action_space.n, arglist.tau).to(device)

            actor.eval()
            critic.eval()
            target_actor.eval()
            target_critic.eval()

            ddpg_algo = ddpg.DDPG(i, actor, critic, target_actor, target_critic, arglist.gamma, arglist.batch_size, arglist.eval, discrete_action, alg_mode=algo)
            ddpg_algo.load('./saved/actor' + str(i) + '_' + str(arglist.load_episode_saved), './saved/critic' + str(i) + '_' + str(arglist.load_episode_saved))

            self.workers.append(ddpg_algo)
Code Example #6
    def __init__(self, numChans, states, numSteps):
        self.actions = np.zeros((numChans + 1, numChans))
        for k in range(0, numChans):
            self.actions[k + 1, k] = 1
        self.numChans = numChans
        self.numActions = np.shape(self.actions)[0]
        self.actionTally = np.zeros(numChans + 1)
        self.actionHist = np.zeros((numSteps, numChans))
        self.actionHistInd = np.zeros(numSteps)

        self.goodChans = np.ones(numChans)

        self.states = states
        self.numStates = np.shape(states)[0]

        self.stateHist = np.zeros((numSteps, numChans))
        self.stateTally = np.zeros(self.numStates)

        self.rewardHist = np.zeros(numSteps)
        self.rewardTally = np.zeros(numChans + 1)
        self.cumulativeReward = np.zeros(numSteps)
        self.rewardTrans = np.zeros(
            (self.numActions, self.numStates, self.numStates))

        self.exploreHist = []

        self.policy = np.zeros(numChans)

        self.n_actions = numChans + 1
        self.n_features = numChans

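        # DDPG learner sized to the channel-selection action and feature dimensions defined above.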
        self.ddpg_ = ddpg.DDPG(self, self.n_actions, self.n_features,
                               self.n_actions + 1)

        self.type = "ddpg"
        self.hyperType = "learning"

        self.var = 1
Code Example #7
    def __init__(self):

        # Create the environment and render
        self._env = Env(True)
        # Extract the dimensions of states and actions
        observation_dim = self._env.observation_space.low.size
        action_dim = self._env.action_space.low.size

        self._device = 'cpu'
        # Uncomment if you do training on the GPU
        # self._device = 'cuda:0'

        hidden_sizes = [256] * 2
        self._q_net = networks.QvalueNetwork(
            hidden_sizes=hidden_sizes,
            input_size=observation_dim + action_dim).to(device=self._device)
        self._target_q_net = networks.QvalueNetwork(
            hidden_sizes=hidden_sizes,
            input_size=observation_dim + action_dim).to(device=self._device)
        self._policy_net = networks.PolicyNetwork(
            hidden_sizes=hidden_sizes,
            input_size=observation_dim,
            output_size=action_dim).to(device=self._device)
        self._target_policy_net = networks.PolicyNetwork(
            hidden_sizes=hidden_sizes,
            input_size=observation_dim,
            output_size=action_dim).to(device=self._device)
        # Target update rate
        tau = 0.001

        # Set to true if you want to slow down the simulator
        self._slow_simulation = False

        # Create the ddpg agent
        self.agent = ddpg.DDPG(q_net=self._q_net,
                               target_q_net=self._target_q_net,
                               policy_net=self._policy_net,
                               target_policy_net=self._target_policy_net,
                               tau=tau,
                               device=self._device)

        # Create a replay buffer - we use here one from a popular framework
        self._replay = replay.SimpleReplayBuffer(
            max_replay_buffer_size=1000000,
            observation_dim=observation_dim,
            action_dim=action_dim,
            env_info_sizes={},
        )

        # Stores the cumulative rewards
        self._rewards = []
        self._rewards_test = []

        # The following logging works only on Linux at the moment - it might cause issues if you use Windows

        folder = 'experiment_data_test_runs'
        # Generate a random hash string - a unique identifier in case we start
        # multiple experiments at the same time
        rand_id = hashlib.md5(os.urandom(128)).hexdigest()[:8]
        self._file_path = './' + folder + '/' + time.ctime().replace(
            ' ', '_') + '__' + rand_id

        # Create experiment folder
        if not os.path.exists(self._file_path):
            os.makedirs(self._file_path)
Code Example #8
File: ddpg-example.py Project: henryjon/my-rl
import gym

import ddpg

test = False

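# Set test = True for a quick smoke run (tiny replay buffer and short training below).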
ddpg_alg = ddpg.DDPG(
    env_fn=lambda: gym.make("HalfCheetah-v2"),
    pi_hidden_sizes=(64, ),
    q_hidden_sizes=(64, ),
    gamma=0.99,
    rho=0.995,
    action_noise=0.1,
    replay_buffer_size=3 if test else 1_000_000,
    log_dir="/Users/harrygiles/tmp/my-rl/ddpg",
    exploration_period=10_000,
)

ddpg_alg.train(
    epochs=2 if test else 50,
    epoch_size=10 if test else 5_000,
    batch_size=10 if test else 100,
    max_episode_length=1_000,
)
Code Example #9
from nav_environment import NavigationEnv
import ddpg
import models
import numpy as np
import os
import eval_ddpg

batch_size = 64
eval_eps = 50
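# DDPG agent for the navigation environment; is_train / render / load_model below select the run mode.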
RL = ddpg.DDPG(
    model = [models.PolicyNet, models.QNet],
    learning_rate = [0.0001, 0.0001],
    reward_decay = 0.99,
    memory_size = 10000,
    batch_size = batch_size)

is_train = True
render = True
load_model = False
'''
is_train = False
render = True
load_model = True
'''
gif_path = "out/"
model_path = "save/"
if not os.path.exists(model_path):
    os.makedirs(model_path)

if load_model:
    print("Load model ...", model_path)
Code Example #10
import matplotlib.pyplot as plt
import json
import cv2
import models

#%%
env = GSlamContBot2DWrapper.Bot2DEnv(obs_size=128, 
                            grid_size=3, 
                            map_path="Image/map9.png",
                            task="Exploration")
memory_size = 1000
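# DDPG agent for the exploration task; var and var_decay presumably schedule the exploration noise.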
RL = ddpg.DDPG(
    actor_net = models.ActorExp,
    critic_net = models.CriticExp,
    n_actions = 2,
    learning_rate = [0.0001, 0.0002],
    reward_decay = 0.95,
    memory_size = memory_size,
    batch_size = 64,
    var = 2,
    var_decay = 0.9999,)

#%%
seq_size = 3
if __name__ == '__main__':
    total_step = 0
    reward_rec = []

    for eps in range(1000):
        state = env.reset()
        state_m = cv2.resize(state["map"], (64,64), interpolation=cv2.INTER_LINEAR)
        state_m = np.tile(np.expand_dims(state_m,-1),(1,1,seq_size))
Code Example #11
if __name__ == "__main__":

    env = gym.make(ENV)

    with tf.Session() as sess:
        training = "-n" in sys.argv

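        # Agent with a 3-dimensional observation and 1-dimensional action (the saved model path suggests a pendulum task).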
        actor = ddpg.DDPG(3,
                          1,
                          memory=0.99,
                          actor_lr=0.001,
                          critic_lr=0.001,
                          tau=0.1,
                          exp_batch=1024,
                          training=training)

        saver = tf.train.Saver()
        if "-n" in sys.argv:
            sess.run(tf.global_variables_initializer())
        else:
            saver.restore(sess, "model/pendelum")
            print("Restored...")

        try:
            if "-p" in sys.argv:
                print("Playing...")
                gym_wrapper.play(env, actor, a_mod=action_modifier)
Code Example #12
File: run.py Project: jingw2/policy_gradient
def train_PG(
    exp_name, 
    env_name, 
    n_iters, 
    gamma, 
    min_timesteps_per_batch, 
    max_path_length, 
    lr, 
    normalize_advantages, 
    nn_baseline,
    seed, 
    n_layers,
    hidden_size,
    discrete,
    logdir,
    method,
    method_args):

    start = time.time()

    # env
    # env = gym.make(env_name)
    #TODO:
    env = ChallengeSeqDecEnvironment(experimentCount=3005, userID="jingw2", \
        timeout=5, realworkercount=4)
    env.state_size = 1
    env.action_size = 2

    # set up logger
    setup_logger(logdir, locals())

    # random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)
    if hasattr(env, 'seed'):
        env.seed(seed)

    # set attributes
    if isinstance(env, gym.Env):
        max_path_length = max_path_length or env.spec.max_episode_steps
        discrete = isinstance(env.action_space, gym.spaces.Discrete)
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n if discrete else env.action_space.shape[0]
    else:
        if hasattr(env, 'state_size'):
            state_size = env.state_size
        else:
            raise Exception("Environment must have attribute state_size or be a gym.Env!")
        if hasattr(env, 'action_size'):
            action_size = env.action_size
        else:
            raise Exception("Environment must have attribute action_size or be a gym.Env!")
    
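    # Shared configuration dictionaries passed to whichever agent (SAC, DDPG, or vanilla PG) is selected below.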
    net_args = {
        "n_layers": n_layers,
        "state_size": state_size,
        "action_size": action_size,
        "discrete": discrete,
        "hidden_size": hidden_size,
        "learing_rate": lr,
        "output_activation": None
    }

    trajectory_args = {
        "max_path_length": max_path_length,
        "min_timesteps_per_batch": min_timesteps_per_batch
    }

    reward_args = {
        "gamma": gamma,
        "nn_baseline": nn_baseline,
        "normalize_advantage": normalize_advantages
    }

    if method == "sac":
        agent = sac.SAC(net_args, trajectory_args, reward_args, method_args)
    elif method == "ddpg":
        agent = ddpg.DDPG(net_args, trajectory_args, reward_args, method_args)
    elif method == "vpg":
        agent = Agent(net_args, trajectory_args, reward_args)

    # create networks 
    agent.build_net()

    total_timesteps = 0
    for it in range(n_iters):
        print("=============Iteration {}==============".format(it))
        paths, timesteps_this_batch = agent.sample_trajectories(it, env)
        #TODO:
        env = ChallengeSeqDecEnvironment(experimentCount=3005, userID="jingw2", \
            timeout=5, realworkercount=4)
        total_timesteps += timesteps_this_batch

        states = np.concatenate([path["state"] for path in paths])
        actions = np.concatenate([path["action"] for path in paths])
        rewards = [path["reward"] for path in paths]
        # next_states = np.concatenate([path["next_state"] for path in paths])

        states_input = torch.Tensor(states).float()
        actions_input = torch.Tensor(actions).float()
        if method == "vpg":
            q_n, adv = agent.estimate_return(states_input, rewards)
            agent.train_op(states_input, actions_input, q_n, adv)
        else:
            agent.train_op()

        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]

        best_idx = np.argmax(returns)
        best_path = paths[best_idx]
        best_policy = {}
        for i in range(5):
            best_policy[str(i+1)] = best_path["action"][i].tolist()
        data = {"method": method, "best_policy": [best_policy], "best_reward": returns[best_idx]}
        data = pd.DataFrame(data)
        if os.path.exists("best_policy_pg.csv"):
            policy_df = pd.read_csv("best_policy_pg.csv")
            policy_df.loc[len(policy_df)] = [method, best_policy, returns[best_idx]]
        else:
            policy_df = data
        policy_df.to_csv("best_policy_pg.csv", index=False)

        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", it)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
Code Example #13
    epochs = 1000
    steps = 1000
    updateTargetNetwork = 5000
    explorationRate = 0
    minibatch_size = 32
    learnStart = 32
    learningRate = 0.0001
    discountFactor = 0.99
    memorySize = 1000000
    network_inputs = 12
    network_outputs = 3
    network_structure = [30, 30]
    current_epoch = 0
    max_margin = 2

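    # Note: this rebinds the name ddpg from the imported module to the agent instance.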
    ddpg = ddpg.DDPG(network_inputs, network_outputs, memorySize, learningRate, discountFactor, learnStart, max_margin)
    ddpg.init_net_works(network_structure)

    restart_cnt = 0
    stepCounter = 0

    for epoch in range(1, epochs + 1):
        restart_cnt += 1
        random_des = random.randint(0, 3)
        env.env.set_des(des_list[random_des])
        print ('set des: ' + str(des_list[random_des]))
        observation = env.reset()

        done = False
        episode_step = 0
Code Example #14
File: main.py Project: sdoloris/rl_robotic_control
if __name__ == '__main__':
	env = gym.make(env_name)
	env.seed(0)
	random.seed(0)
	np.random.seed(0)
	
	# Make a directory to store the learned policies
	dirname = datetime.datetime.now().isoformat()
	os.mkdir(dirname)
	
	replay_buffer = replay_buffer.ReplayBuffer(buffer_size)
	sample_batch = replay_buffer.get_batch
	
	
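	# Note: replay_buffer and ddpg are rebound here from the imported modules to instances.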
	ddpg = ddpg.DDPG(env, replay_buffer, sample_batch, train_iter, gamma, tau,
		batch_size, n_train, n_episode)
		
		
	for epoch in range(n_epoch):
		print("Start training epoch", epoch)
		for cycle in range(n_cycles):
			for episode in range(n_episode):
				state = env.reset()
				state = np.concatenate((state['observation'], state['achieved_goal'], state['desired_goal']))
				tot_reward = 0
				ddpg.reset_noise()
				for step in range(env.spec.timestep_limit):
					if random.random() < 0.2:
						action = env.action_space.sample()
					else: action = ddpg.noise_action(state)
					obs, reward, done, info = env.step(action)