Example #1
    def __init__(self, env, device, model_dir, args):
        self.env = env
        self.env_name = args.env_name
        self.seed = args.seed
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.max_action = float(self.env.action_space.high[0])
        self.batch_size = args.batch_size
        self.max_timesteps = args.max_timesteps
        self.gaussian_std = args.gaussian_std
        self.start_timesteps = args.start_timesteps
        self.eval_freq = args.eval_freq
        self.rand_action_p = args.rand_action_p

        self.model_dir = os.path.join(model_dir,
                                      f"{args.env_name}_{args.seed}")

        self.algo = DDPG(self.state_dim, self.action_dim, self.max_action,
                         device)

        self.storage = ReplayBuffer(self.state_dim, self.action_dim, device)

        self.eval_rewards = []

        self.total_steps = 0
        self.episodes = 0
        self.episode_steps = 0
        self.episode_rewards = 0

        self.state = None
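
A note on the pieces this constructor wires together: the ReplayBuffer it instantiates is not part of the excerpt. Below is a minimal, self-contained sketch of a buffer with the same constructor signature, assuming a simple add()/sample() interface and a PyTorch device; the repository's actual implementation may differ.

import numpy as np
import torch


class ReplayBuffer:
    def __init__(self, state_dim, action_dim, device, max_size=int(1e6)):
        self.device = device
        self.max_size = max_size
        self.ptr = 0
        self.size = 0
        self.state = np.zeros((max_size, state_dim), dtype=np.float32)
        self.action = np.zeros((max_size, action_dim), dtype=np.float32)
        self.next_state = np.zeros((max_size, state_dim), dtype=np.float32)
        self.reward = np.zeros((max_size, 1), dtype=np.float32)
        self.not_done = np.zeros((max_size, 1), dtype=np.float32)

    def add(self, state, action, next_state, reward, done):
        # Ring buffer: overwrite the oldest entry once full.
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.next_state[self.ptr] = next_state
        self.reward[self.ptr] = reward
        self.not_done[self.ptr] = 1.0 - float(done)
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        # Uniformly sample a batch and move it to the target device.
        idx = np.random.randint(0, self.size, size=batch_size)
        batch = (self.state[idx], self.action[idx], self.next_state[idx],
                 self.reward[idx], self.not_done[idx])
        return tuple(torch.as_tensor(b, device=self.device) for b in batch)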
Example #2
    def __init__(self):
        self.observation_space = SA_OBS_SPACE
        self.action_space = SA_ACTION_SPACE
        # self.agent = agent
        self.agent = DDPG(env, is_batch_norm, CA_OBS_SPACE, CA_ACTION_SPACE,
                          CA_ACTION_BOUND)
        self.dqn_solver = DQNSolver(SA_OBS_SPACE, SA_ACTION_SPACE)

        logging.basicConfig(filename="logs/log.log",
                            format='%(asctime)s %(message)s',
                            filemode='w+')
        self.logger = logging.getLogger()
        self.logger.setLevel(logging.DEBUG)
Example #3
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = np.mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = np.mean(
                    eval_episode_rewards_history)
                combined_stats['eval/Q'] = np.mean(eval_qs)
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
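
train() above receives its exploration noise ready-made through the action_noise argument. Below is a minimal sketch of an Ornstein-Uhlenbeck noise object with the __call__()/reset() interface the DDPG agent expects; baselines ships its own implementation, so treat this class and its name as illustrative only.

import numpy as np


class OUActionNoise:
    def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.reset()

    def __call__(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        self.x = (self.x + self.theta * (self.mu - self.x) * self.dt +
                  self.sigma * np.sqrt(self.dt) * np.random.randn(*self.mu.shape))
        return self.x

    def reset(self):
        self.x = np.zeros_like(self.mu)


# Example: action_noise = OUActionNoise(mu=np.zeros(env.action_space.shape[-1]))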
Example #4
def run_pusher3dof(args, sim=True, vanilla=False):
    try:
        from hyperdash import Experiment

        hyperdash_support = True
    except ImportError:
        hyperdash_support = False

    env = NormalizedEnv(gym.make(args.env))

    torques = [1.0] * 3  # if real
    colored = False

    if sim:
        torques = [args.t0, args.t1, args.t2]
        colored = True

    if not vanilla:
        env.env._init(
            torques=torques,
            colored=colored
        )

    if args.seed > 0:
        np.random.seed(args.seed)
        env.seed(args.seed)

    nb_states = env.observation_space.shape[0]
    nb_actions = env.action_space.shape[0]

    agent = DDPG(nb_states, nb_actions, args)
    evaluate = Evaluator(
        args.validate_episodes,
        args.validate_steps,
        args.output,
        max_episode_length=args.max_episode_length
    )

    exp = None

    if args.mode == 'train':
        if hyperdash_support:
            prefix = "real"
            if sim: prefix = "sim"

            exp = Experiment("s2r-pusher3dof-ddpg-{}".format(prefix))
            import socket

            exp.param("host", socket.gethostname())
            exp.param("type", prefix)  # sim or real
            exp.param("vanilla", vanilla)  # vanilla or not
            exp.param("torques", torques)
            exp.param("folder", args.output)

            for arg in ["env", "max_episode_length", "train_iter", "seed", "resume"]:
                arg_val = getattr(args, arg)
                exp.param(arg, arg_val)

        train(args, args.train_iter, agent, env, evaluate,
              args.validate_steps, args.output,
              max_episode_length=args.max_episode_length, debug=args.debug, exp=exp)

        # when done
        if exp is not None:
            exp.end()

    elif args.mode == 'test':
        test(args.validate_episodes, agent, env, evaluate, args.resume,
             visualize=args.vis, debug=args.debug, load_best=args.best)

    else:
        raise RuntimeError('undefined mode {}'.format(args.mode))
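
run_pusher3dof() wraps the gym environment in NormalizedEnv, which the excerpt does not define. A minimal sketch of such a wrapper, assuming it only rescales agent actions from [-1, 1] into the environment's own bounds; the project's actual wrapper may differ (for example by also normalizing observations).

import gym
import numpy as np


class NormalizedEnv(gym.ActionWrapper):
    """Rescale actions from [-1, 1] into the wrapped env's action range."""

    def action(self, action):
        low, high = self.action_space.low, self.action_space.high
        return low + (np.clip(action, -1.0, 1.0) + 1.0) * 0.5 * (high - low)

    def reverse_action(self, action):
        low, high = self.action_space.low, self.action_space.high
        return 2.0 * (action - low) / (high - low) - 1.0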
Example #5
        "fthigh": 120,
        "fshin": 60,
        "ffoot": 30
    },
    colored=False
)

if args.seed > 0:
    np.random.seed(args.seed)
    env.seed(args.seed)

nb_states = env.observation_space.shape[0]
nb_actions = env.action_space.shape[0]


agent = DDPG(nb_states, nb_actions, args)
evaluate = Evaluator(args.validate_episodes,
    args.validate_steps, args.output, max_episode_length=args.max_episode_length)

exp = None

if args.mode == 'train':
    exp = Experiment("sim2real-ddpg-real-cheetah")
    for arg in ["env", "rate", "prate", "hidden1", "hidden2", "warmup", "discount",
                "bsize", "rmsize", "window_length", "tau", "ou_theta", "ou_sigma", "ou_mu",
                "validate_episodes", "max_episode_length", "validate_steps", "init_w",
                "train_iter", "epsilon", "seed", "resume"]:
        arg_val = getattr(args, arg)
        exp.param(arg, arg_val)

    import socket
    exp.param("host", socket.gethostname())
Example #6
es = OUStrategy(env_spec=env.spec)

qf = ContinuousMLPQFunction(
    env_spec=env.spec,
    hidden_sizes=(100, 100),
    hidden_nonlinearity=tf.nn.relu,
)
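
The excerpt passes a `policy` to DDPG that it never defines. One plausible definition, assuming rllab's deterministic MLP policy (the exact module path and constructor arguments differ between the Theano and TF trees, so treat this as an illustrative assumption), would slot in here:

from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy

policy = DeterministicMLPPolicy(
    env_spec=env.spec,
    name="policy",
    hidden_sizes=(100, 100),
    hidden_nonlinearity=tf.nn.relu,
)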

algo = DDPG(env=env,
            policy=policy,
            es=es,
            qf=qf,
            batch_size=64,
            max_path_length=env.horizon,
            epoch_length=1000,
            min_pool_size=10000,
            n_epochs=args.num_epochs,
            discount=0.99,
            scale_reward=args.reward_scale,
            qf_learning_rate=1e-3,
            policy_learning_rate=1e-4,
            plot=False)

run_experiment_lite(
    algo.train(),
    log_dir=None if args.use_ec2 else args.data_dir,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
Example #7
def main():
    if sys.platform.startswith('win'):
        # Add the _win_handler function to the windows console's handler function list
        win32api.SetConsoleCtrlHandler(_win_handler, True)
    if os.path.exists(
            os.path.join(config_file.config['config_file'], 'config.yaml')):
        config = sth.load_config(config_file.config['config_file'])
    else:
        config = config_file.config
        print('No config.yaml found, falling back to the default config_file.config.')

    hyper_config = config['hyper parameters']
    train_config = config['train config']
    record_config = config['record config']

    basic_dir = record_config['basic_dir']
    last_name = record_config['project_name'] + '/' \
        + record_config['remark'] \
        + record_config['run_id']
    cp_dir = record_config['checkpoint_basic_dir'] + last_name
    cp_file = cp_dir + '/rb'
    log_dir = record_config['log_basic_dir'] + last_name
    excel_dir = record_config['excel_basic_dir'] + last_name
    config_dir = record_config['config_basic_dir'] + last_name
    sth.check_or_create(basic_dir, 'basic')
    sth.check_or_create(cp_dir, 'checkpoints')
    sth.check_or_create(log_dir, 'logs(summaries)')
    sth.check_or_create(excel_dir, 'excel')
    sth.check_or_create(config_dir, 'config')

    logger = create_logger(
        name='logger',
        console_level=logging.INFO,
        console_format='%(levelname)s : %(message)s',
        logger2file=record_config['logger2file'],
        file_name=os.path.join(log_dir, 'log.txt'),
        file_level=logging.WARNING,
        file_format=
        '%(lineno)d - %(asctime)s - %(module)s - %(funcName)s - %(levelname)s - %(message)s'
    )
    if train_config['train']:
        sth.save_config(config_dir, config)

    if train_config['unity_mode']:
        env = UnityEnvironment()
    else:
        env = UnityEnvironment(
            file_name=train_config['unity_file'],
            no_graphics=bool(train_config['train']),
            base_port=train_config['port'])
    brain_name = env.external_brain_names[0]
    brain = env.brains[brain_name]
    # set the memory use proportion of GPU
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # tf_config.gpu_options.per_process_gpu_memory_fraction = 0.5
    tf.reset_default_graph()
    graph = tf.Graph()
    with graph.as_default() as g:
        with tf.Session(graph=g, config=tf_config) as sess:
            logger.info('Algorithm: {0}'.format(
                train_config['algorithm'].name))
            if train_config['algorithm'] == config_file.algorithms.ppo_sep_ac:
                from ppo.ppo_base import PPO_SEP
                model = PPO_SEP(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                logger.info('PPO_SEP initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.ppo_com:
                from ppo.ppo_base import PPO_COM
                model = PPO_COM(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                logger.info('PPO_COM initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.sac:
                from sac.sac import SAC
                model = SAC(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                logger.info('SAC initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.sac_no_v:
                from sac.sac_no_v import SAC_NO_V
                model = SAC_NO_V(sess=sess,
                                 s_dim=brain.vector_observation_space_size,
                                 a_counts=brain.vector_action_space_size[0],
                                 hyper_config=hyper_config)
                logger.info('SAC_NO_V initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.ddpg:
                from ddpg.ddpg import DDPG
                model = DDPG(sess=sess,
                             s_dim=brain.vector_observation_space_size,
                             a_counts=brain.vector_action_space_size[0],
                             hyper_config=hyper_config)
                logger.info('DDPG initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.td3:
                from td3.td3 import TD3
                model = TD3(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                logger.info('TD3 initialize success.')
            recorder = Recorder(log_dir,
                                excel_dir,
                                record_config,
                                logger,
                                max_to_keep=5,
                                pad_step_number=True,
                                graph=g)
            episode = init_or_restore(cp_dir, sess, recorder, cp_file)
            try:
                if train_config['train']:
                    # Choose the trainer once: the off-policy loop when a replay
                    # buffer is enabled, otherwise the on-policy loop. Both accept
                    # the same keyword arguments.
                    trainer = (train_OffPolicy
                               if train_config['use_replay_buffer'] else
                               train_OnPolicy)
                    trainer(sess=sess,
                            env=env,
                            brain_name=brain_name,
                            begin_episode=episode,
                            model=model,
                            recorder=recorder,
                            cp_file=cp_file,
                            hyper_config=hyper_config,
                            train_config=train_config)
                    tf.train.write_graph(g,
                                         cp_dir,
                                         'raw_graph_def.pb',
                                         as_text=False)
                    export_model(cp_dir, g)
                else:
                    inference(env, brain_name, model, train_config)
            except Exception as e:
                logger.error(e)
            finally:
                env.close()
    recorder.close()
    sys.exit()
Example #8
def main():
    if sys.platform.startswith('win'):
        win32api.SetConsoleCtrlHandler(_win_handler, True)

    if train_config['unity_mode']:
        env = UnityEnvironment()
    else:
        env = UnityEnvironment(
            file_name=train_config['unity_file'],
            no_graphics=bool(train_config['train']),
            base_port=train_config['port'])
    brain_name = env.external_brain_names[0]
    brain = env.brains[brain_name]
    # set the memory use proportion of GPU
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # tf_config.gpu_options.per_process_gpu_memory_fraction = 0.5
    tf.reset_default_graph()
    graph = tf.Graph()
    with graph.as_default() as g:
        with tf.Session(graph=g, config=tf_config) as sess:
            print('Algorithm: {0}'.format(train_config['algorithm'].name))
            if train_config['algorithm'] == algorithms.ppo_sep_ac:
                from ppo.ppo_base import PPO_SEP
                model = PPO_SEP(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                print('PPO_SEP initialize success.')
            elif train_config['algorithm'] == algorithms.ppo_com:
                from ppo.ppo_base import PPO_COM
                model = PPO_COM(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                print('PPO_COM initialize success.')
            elif train_config['algorithm'] == algorithms.sac:
                from sac.sac import SAC
                model = SAC(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                print('SAC initialize success.')
            elif train_config['algorithm'] == algorithms.sac_no_v:
                from sac.sac_no_v import SAC_NO_V
                model = SAC_NO_V(sess=sess,
                                 s_dim=brain.vector_observation_space_size,
                                 a_counts=brain.vector_action_space_size[0],
                                 hyper_config=hyper_config)
                print('SAC_NO_V initialize success.')
            elif train_config['algorithm'] == algorithms.ddpg:
                from ddpg.ddpg import DDPG
                model = DDPG(sess=sess,
                             s_dim=brain.vector_observation_space_size,
                             a_counts=brain.vector_action_space_size[0],
                             hyper_config=hyper_config)
                print('DDPG initialize success.')
            elif train_config['algorithm'] == algorithms.td3:
                from td3.td3 import TD3
                model = TD3(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                print('TD3 initialize success.')
            sess.run(tf.global_variables_initializer())
            try:
                if train_config['train']:
                    # Choose the trainer once: the off-policy loop when a replay
                    # buffer is enabled, otherwise the on-policy loop. Both accept
                    # the same keyword arguments.
                    trainer = (train_OffPolicy
                               if train_config['use_replay_buffer'] else
                               train_OnPolicy)
                    trainer(sess=sess,
                            env=env,
                            brain_name=brain_name,
                            begin_episode=0,
                            model=model,
                            hyper_config=hyper_config,
                            train_config=train_config)
                else:
                    inference(env, brain_name, model, train_config)
            except Exception as e:
                print(e)
            finally:
                env.close()
    sys.exit()
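
Examples #7 and #8 repeat the same if/elif construction for every algorithm. Below is a hedged sketch of an equivalent table-driven dispatch, reusing only the names that appear in the snippets above; the build_model() helper itself is illustrative and not part of the original code.

import importlib

ALGORITHM_TABLE = {
    algorithms.ppo_sep_ac: ('ppo.ppo_base', 'PPO_SEP'),
    algorithms.ppo_com: ('ppo.ppo_base', 'PPO_COM'),
    algorithms.sac: ('sac.sac', 'SAC'),
    algorithms.sac_no_v: ('sac.sac_no_v', 'SAC_NO_V'),
    algorithms.ddpg: ('ddpg.ddpg', 'DDPG'),
    algorithms.td3: ('td3.td3', 'TD3'),
}


def build_model(sess, brain, hyper_config, algorithm):
    # Keep the lazy imports of the original by resolving the class at call time.
    module_name, class_name = ALGORITHM_TABLE[algorithm]
    model_cls = getattr(importlib.import_module(module_name), class_name)
    return model_cls(sess=sess,
                     s_dim=brain.vector_observation_space_size,
                     a_counts=brain.vector_action_space_size[0],
                     hyper_config=hyper_config)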
Example #9
def run_reacher(args, sim=True):
    try:
        from hyperdash import Experiment

        hyperdash_support = True
    except ImportError:
        hyperdash_support = False

    env = NormalizedEnv(gym.make(args.env))

    torques = [200, 200]  # if real
    colors = None
    if sim:
        torques = [args.t0, args.t1]
        colors = {
            "arenaBackground": ".27 .27 .81",
            "arenaBorders": "1.0 0.8 0.4",
            "arm0": "0.9 0.6 0.9",
            "arm1": "0.9 0.9 0.6"
        }

    env.env.env._init(  # real robot
        torque0=torques[0],  # torque of joint 1
        torque1=torques[1],  # torque of joint 2
        topDown=True,
        colors=colors)

    if args.seed > 0:
        np.random.seed(args.seed)
        env.seed(args.seed)

    nb_states = env.observation_space.shape[0]
    nb_actions = env.action_space.shape[0]

    agent = DDPG(nb_states, nb_actions, args)
    evaluate = Evaluator(args.validate_episodes,
                         args.validate_steps,
                         args.output,
                         max_episode_length=args.max_episode_length)

    exp = None

    if args.mode == 'train':
        if hyperdash_support:
            prefix = "real"
            if sim: prefix = "sim"

            exp = Experiment("s2r-reacher-ddpg-{}".format(prefix))
            import socket

            exp.param("host", socket.gethostname())
            exp.param("type", prefix)  # sim or real
            exp.param("torques", [torques[0], torques[1]])
            exp.param("folder", args.output)

            for arg in [
                    "env", "max_episode_length", "train_iter", "seed", "resume"
            ]:
                arg_val = getattr(args, arg)
                exp.param(arg, arg_val)

        train(args,
              args.train_iter,
              agent,
              env,
              evaluate,
              args.validate_steps,
              args.output,
              max_episode_length=args.max_episode_length,
              debug=args.debug,
              exp=exp)

        # when done
        if exp is not None:
            exp.end()

    elif args.mode == 'test':
        test(args.validate_episodes,
             agent,
             env,
             evaluate,
             args.resume,
             visualize=args.vis,
             debug=args.debug,
             load_best=args.best)

    else:
        raise RuntimeError('undefined mode {}'.format(args.mode))
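
Examples #4 and #9 construct an Evaluator that the excerpts never show. Below is a minimal sketch with the same constructor signature, assuming it is simply called with an environment and a greedy policy callable and returns the mean validation return; the project's real class likely does more (e.g. saving result plots), so treat the __call__ interface as an assumption.

import numpy as np


class Evaluator:
    def __init__(self, num_episodes, interval, save_path='', max_episode_length=None):
        self.num_episodes = num_episodes
        self.interval = interval  # how often (in env steps) train() runs validation
        self.save_path = save_path
        self.max_episode_length = max_episode_length

    def __call__(self, env, policy, visualize=False):
        returns = []
        for _ in range(self.num_episodes):
            obs = env.reset()
            done, steps, total = False, 0, 0.0
            while not done:
                action = policy(obs)
                obs, reward, done, _ = env.step(action)
                total += reward
                steps += 1
                if visualize:
                    env.render()
                if self.max_episode_length and steps >= self.max_episode_length:
                    break
            returns.append(total)
        return np.mean(returns)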