def start(args, counter=None):

    # Initialize the actor, critic and difference networks
    conf = open_config_file(args)
    if "learning_rate" in conf['experiment']:
        critic_learning_rate = conf["experiment"]["learning_rate"]
        actor_learning_rate = critic_learning_rate / 10

    if "learning_rate" in conf['experiment']:
        counter = conf["experiment"]["difference_model"]
    print actor_learning_rate, critic_learning_rate

    with tf.Graph().as_default() as ddpg:
        actor = ActorNetwork(OBSERVATION_DIMS, ACTION_DIMS, 1,
                             actor_learning_rate, TAU)
        critic = CriticNetwork(OBSERVATION_DIMS, ACTION_DIMS,
                               critic_learning_rate, TAU,
                               actor.get_num_trainable_vars())
    if counter:
        with tf.Graph().as_default() as diff_model:
            model = DifferenceModel(STATE_DIMS + ACTION_DIMS, STATE_DIMS)
            train(args,
                  ddpg,
                  actor,
                  critic,
                  counter=counter,
                  diff_model=diff_model,
                  model=model)
    else:
        train(args, ddpg, actor, critic)
Example #2
def main(_):
    with tf.Session() as sess:
        env = gym.make(ENV_NAME)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        print(env.observation_space)
        print(env.action_space)

        state_dim = env.observation_space.shape[0]

        try:
            action_dim = env.action_space.shape[0]
            action_bound = env.action_space.high
            # Ensure action bound is symmetric
            assert (env.action_space.high == -env.action_space.low)
            discrete = False
            print('Continuous Action Space')
        except AttributeError:
            action_dim = env.action_space.n
            action_bound = 1
            discrete = True
            print('Discrete Action Space')

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU)

        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars())

        noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        reward = Reward(REWARD_FACTOR, GAMMA)
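
The try/except above distinguishes continuous (Box) from discrete action spaces by probing for .shape. A minimal sketch of an equivalent, more explicit check, assuming the standard gym.spaces types (the helper name is hypothetical):

import gym

def action_space_info(env):
    """Return (action_dim, action_bound, discrete) for Box or Discrete action spaces."""
    space = env.action_space
    if isinstance(space, gym.spaces.Box):
        # Continuous: one dimension per action component, bounded by space.high
        return space.shape[0], space.high, False
    if isinstance(space, gym.spaces.Discrete):
        # Discrete: n possible actions, no meaningful bound
        return space.n, 1, True
    raise ValueError("Unsupported action space: {}".format(space))
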
Example #3
def main(_):

    with tf.compat.v1.Session() as sess:
        env = StageWorld(LASER_BEAM, map_type)
        np.random.seed(RANDOM_SEED)
        tf.compat.v1.set_random_seed(RANDOM_SEED)

        state_dim = LASER_BEAM * LASER_HIST + SPEED + TARGET

        action_dim = ACTION
        #action_bound = [0.25, np.pi/6] #bounded acceleration
        action_bound = [0.5, np.pi / 3]  #bounded velocity
        switch_dim = SWITCH

        discrete = False
        print('Continuous Action Space')

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU)

        critic = CriticNetwork(sess, state_dim, action_dim, switch_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

        noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        reward = Reward(REWARD_FACTOR, GAMMA)

        try:
            train(sess, env, actor, critic, noise, reward, discrete,
                  action_bound)
        except KeyboardInterrupt:
            pass
Example #4
def main(_):
    with tf.Session() as sess:

        env = gym.make(ENV_NAME)
        # np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = env.action_space.high
        # Ensure action bound is symmetric
        assert (env.action_space.high == -env.action_space.low)

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU)

        critic = CriticNetwork(sess, state_dim, action_dim, action_bound,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

        if GYM_MONITOR_EN:
            if not RENDER_ENV:
                env = Monitor(env,
                              MONITOR_DIR,
                              video_callable=False,
                              force=True)
            else:
                env = Monitor(env, MONITOR_DIR, force=True)

        train(sess, env, actor, critic)
Example #5
def main(_):
    with tf.Session() as sess:
        env = gym.make(ENV_NAME)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        print(env.observation_space)
        print(env.action_space)

        state_dim = env.observation_space.shape[0]

        try:
            action_dim = env.action_space.shape[0]
            action_bound = env.action_space.high
            # Ensure action bound is symmetric
            assert (env.action_space.high == -env.action_space.low)
            discrete = False
            print('Continuous Action Space')
        except:  # the original exception type cannot catch what is raised here, so catch everything
            action_dim = env.action_space.n
            action_bound = 1
            discrete = True
            print('Discrete Action Space')

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU)

        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

        noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        reward = Reward(REWARD_FACTOR, GAMMA)

        if GYM_MONITOR_EN:
            if not RENDER_ENV:
                env = gym.wrappers.Monitor(env,
                                           MONITOR_DIR,
                                           video_callable=False,
                                           force=True)  # switched to the new wrapper API
            # env.monitor.start(MONITOR_DIR, video_callable=False, force=True)
            else:
                env = gym.wrappers.Monitor(env, MONITOR_DIR, force=True)  # switched to the new wrapper API
            # env.monitor.start(MONITOR_DIR, force=True)

        try:
            train(sess, env, actor, critic, noise, reward, discrete)
        except KeyboardInterrupt:
            pass

        if GYM_MONITOR_EN:
            env.close()  # the Monitor wrapper is closed via env.close(); env.monitor.close() is the old API
Example #6
    def __init__(self,
                 exp_dir,
                 exp_thresh,
                 state_dim,
                 action_dim,
                 learn_rate,
                 betas,
                 _device,
                 _gamma,
                 load_weights=False):
        """
            exp_dir : directory containing the expert episodes
         exp_thresh : parameter to control number of episodes to load 
                      as expert based on returns (lower means more episodes)
          state_dim : dimesnion of state 
         action_dim : dimesnion of action
         learn_rate : learning rate for optimizer 
            _device : GPU or cpu
            _gamma  : discount factor
     _load_weights  : load weights from directory
        """

        # storing runtime device
        self.device = _device

        # discount factor
        self.gamma = _gamma

        # Expert trajectory
        self.expert = ExpertTrajectories(exp_dir, exp_thresh, gamma=self.gamma)

        # Defining the actor and its optimizer
        self.actor = ActorNetwork(state_dim).to(self.device)
        self.optim_actor = torch.optim.Adam(self.actor.parameters(),
                                            lr=learn_rate,
                                            betas=betas)

        # Defining the discriminator and its optimizer
        self.disc = Discriminator(state_dim, action_dim).to(self.device)
        self.optim_disc = torch.optim.Adam(self.disc.parameters(),
                                           lr=learn_rate,
                                           betas=betas)

        if not load_weights:
            self.actor.apply(init_weights)
            self.disc.apply(init_weights)
        else:
            self.load()

        # Loss function criterion
        self.criterion = torch.nn.BCELoss()
Example #7
 def __init__(self, env, sess):
     self.env = env
     self.sess = sess
     self.memory_buffer = ReplayMemory(BUFFER_SIZE, 20000, BATCH_SIZE,
                                       env.observation_space.shape,
                                       env.action_space.shape)
     self.learning_rate = LR
     self.tau = TAU
     self.buffer_size = BUFFER_SIZE
     self.batch_size = BATCH_SIZE
     self.discount = 0.99
     self.states_ph = tf.placeholder(
         tf.float32, shape=((None, ) + self.env.observation_space.shape))
     self.actions_ph = tf.placeholder(tf.float32,
                                      shape=((None, ) +
                                             self.env.action_space.shape))
     self.is_training_ph = tf.placeholder_with_default(True, shape=None)
     self.Actor = ActorNetwork(env=self.env,
                               states=self.states_ph,
                               LR=self.learning_rate,
                               TAU=self.tau,
                               discount=self.discount,
                               scope="actor_main",
                               batch_size=self.batch_size,
                               is_training=self.is_training_ph)
     self.Critic = CriticNetwork(env=self.env,
                                 states=self.states_ph,
                                 actions=self.actions_ph,
                                 LR=self.learning_rate,
                                 TAU=self.tau,
                                 discount=self.discount,
                                 scope="critic_main",
                                 batch_size=self.batch_size,
                                 is_training=self.is_training_ph)
     self.Actor_target = ActorNetwork(env=self.env,
                                      states=self.states_ph,
                                      LR=0.001,
                                      TAU=self.tau,
                                      discount=self.discount,
                                      scope="actor_target",
                                      batch_size=self.batch_size,
                                      is_training=self.is_training_ph)
     self.Critic_target = CriticNetwork(env=self.env,
                                        states=self.states_ph,
                                        actions=self.actions_ph,
                                        LR=self.learning_rate,
                                        TAU=self.tau,
                                        discount=self.discount,
                                        scope="critic_target",
                                        batch_size=self.batch_size,
                                        is_training=self.is_training_ph)
Example #8
def main(_):
    with tf.Session() as sess:
        env = gym.make(ENV_NAME)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        print(env.observation_space)
        print(env.action_space)

        state_dim = env.observation_space.shape[0]

        try:
            action_dim = env.action_space.shape[0]
            action_bound = env.action_space.high
            # Ensure action bound is symmetric
            assert (env.action_space.high == -env.action_space.low)
            discrete = False
            print "Continuous Action Space"
        except AttributeError:
            action_dim = env.action_space.n
            action_bound = 1
            discrete = True
            print "Discrete Action Space"

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU)
        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

        noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        reward = Reward(REWARD_FACTOR, GAMMA)

        if GYM_MONITOR_EN:
            if not RENDER_ENV:
                env = wrappers.Monitor(env, MONITOR_DIR, video_callable=False, force=True)
            else:
                env = wrappers.Monitor(env, MONITOR_DIR, force=True)

        try:
            train(sess, env, actor, critic, noise, reward, discrete)
        except KeyboardInterrupt:
            pass

        #if GYM_MONITOR_EN:
        #env.monitor.close()
        env.close()

    gym.upload(MONITOR_DIR, api_key="sk_JObiOSHpRjw48FpWvI1GA")
Example #9
 def __init__(self, env, args):
     self.env = env
     self.memory_buffer = ReplayBuffer(args.buffer_size)
     self.learning_rate_actor = args.lr_actor
     self.learning_rate_critic = args.lr_critic
     self.tau = args.TAU
     self.batch_size = args.batch_size
     self.discount = args.discount
     self.states_ph = tf.placeholder(tf.float32, shape=(None, 1))
     self.actions_ph = tf.placeholder(tf.float32,
                                      shape=((None, ) +
                                             self.env.action_space.shape))
     self.is_training_ph = tf.placeholder_with_default(True, shape=None)
     self.Actor = ActorNetwork(env=self.env,
                               states=self.states_ph,
                               LR=self.learning_rate_actor,
                               TAU=self.tau,
                               discount=self.discount,
                               scope="actor_main",
                               batch_size=self.batch_size,
                               is_training=self.is_training_ph)
     self.Critic = CriticNetwork(env=self.env,
                                 states=self.states_ph,
                                 actions=self.actions_ph,
                                 LR=self.learning_rate_critic,
                                 TAU=self.tau,
                                 discount=self.discount,
                                 scope="critic_main",
                                 batch_size=self.batch_size,
                                 is_training=self.is_training_ph)
     self.Actor_target = ActorNetwork(env=self.env,
                                      states=self.states_ph,
                                      LR=self.learning_rate_actor,
                                      TAU=self.tau,
                                      discount=self.discount,
                                      scope="actor_target",
                                      batch_size=self.batch_size,
                                      is_training=self.is_training_ph)
     self.Critic_target = CriticNetwork(env=self.env,
                                        states=self.states_ph,
                                        actions=self.actions_ph,
                                        LR=self.learning_rate_critic,
                                        TAU=self.tau,
                                        discount=self.discount,
                                        scope="critic_target",
                                        batch_size=self.batch_size,
                                        is_training=self.is_training_ph)
Example #10
def main(args):
    now = datetime.utcnow().strftime("%b_%d_%H_%M_%S")
    monitor_dir = os.path.join('videos', args['env'],
                               "no-of-update_" + args["no_of_updates"],
                               "random_seed" + str(args["random_seed"]))
    logger = Logger(logname=args['env'], args=args, now=now)
    with tf.Session() as sess:
        env = gym.make(args['env'])
        monitor_env = gym.make(args['env'])
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env.seed(int(args['random_seed']))
        monitor_env.seed(int(args['random_seed']))

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = env.action_space.high

        print("****** state dimension", state_dim)
        print("****** actions dimension", action_dim)
        print("****** actions high bound", action_bound)

        # Ensure action bound is symmetric
        assert (np.array_equal(env.action_space.high, -env.action_space.low))

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             float(args['actor_lr']), float(args['tau']),
                             int(args['minibatch_size']))

        critic = CriticNetwork(sess, state_dim, action_dim,
                               float(args['critic_lr']), float(args['tau']),
                               float(args['gamma']),
                               actor.get_num_trainable_vars())

        actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))

        if args['use_gym_monitor']:
            monitor_env = wrappers.Monitor(monitor_env,
                                           monitor_dir,
                                           force=True)

        train(sess, env, args, actor, critic, actor_noise, logger, monitor_env)
        logger.close()
        if args['use_gym_monitor']:
            env.monitor.close()
            monitor_env.monitor.close()
Example #11
def main(_):
    t1 = time.time()
    # Training the model
    with tf.Session() as sess:

        env = PowerSystem()
        # System Info
        state_dim = 20  # For now we only consider the current on each line as the state
        action_dim = 4  # The number of generators
        action_bound = np.array([[-1, 1], [-0.675, 0.675]])

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU)

        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

        saver = tf.train.Saver()

        noise = Noise(DELTA, SIGMA, OU_A, OU_MU)

        # Training the model
        train(sess, env, actor, critic, noise, action_bound)

        # # save the variables
        save_path = saver.save(sess, model_path)
        # print("[+] Model saved in file: %s" % save_path)

    # # Testing the model
    # with tf.Session() as sess:
    #
    #     env = PowerSystem()
    #     # System Info
    #     state_dim = 11  # We only consider the Current of all line as state at this moment
    #     action_dim = 2  # The number of generators
    #     action_bound = np.array([[-1, 1], [-0.675, 0.675]])
    #
    #     actor = ActorNetwork(sess, state_dim, action_dim, action_bound, ACTOR_LEARNING_RATE, TAU)
    #     saver = tf.train.Saver()
    #     load_path = saver.restore(sess, model_path)
    #     test(env, actor)
    print('Running time: {} minutes.'.format((time.time() - t1) / 60))
Example #12
    def __init__(self, state_size, batch_size, is_eval=False):
        self.state_size = state_size  # normalized previous days
        self.action_size = 3  # sit, buy, sell
        self.memory_size = 1000000  # Replay memory size
        self.batch_size = batch_size
        self.replay_memory = ExperienceReplay(self.memory_size,
                                              self.batch_size)
        self.inventory = []
        self.is_eval = is_eval  # Whether or not Training is ongoing
        self.gamma = 0.99  # Discount factor in Bellman equation

        # Actor Policy model mapping states to actions
        self.actor = ActorNetwork(
            self.state_size,
            self.action_size)  # Instantiates the Actor networks

        # Critic(Value) Model that maps state action pairs to Q values.

        self.critic = CriticNetwork(
            self.state_size, self.action_size)  # Instantiate the critic model
Example #13
def run():
   """Build networks, create environment and train agent."""

   # Generate a Torcs environment
   env = TorcsEnv(vision=False, throttle=True, gear_change=False)

   with tf.Session() as sess:
  
      np.random.seed(args['seed'])
      tf.set_random_seed(args['seed'])
      
      # Actor and actor target
      n_params = 0
      actor = ActorNetwork(sess=sess, scope='actor_net', state_size=args['state_size'], 
                           action_size=args['action_size'], batch_size=args['batch_size'], 
                           lr=args['actor_lr'], n_params=n_params)
      n_params += actor.get_num_params()
      actor_target = ActorNetwork(sess=sess, scope='actor_net_target', state_size=args['state_size'], 
                           action_size=args['action_size'], batch_size=args['batch_size'], 
                           lr=args['actor_lr'], n_params=n_params)
      
      # Critic and critic target
      n_params += actor_target.get_num_params()
      critic = CriticNetwork(sess=sess, scope='critic_net', state_size=args['state_size'], 
                           action_size=args['action_size'], lr=args['critic_lr'], n_params=n_params)
      n_params += critic.get_num_params()
      critic_target = CriticNetwork(sess=sess, scope='critic_net_target', state_size=args['state_size'], 
                           action_size=args['action_size'], lr=args['critic_lr'], n_params=n_params)
      
      # Restore network params
      saver = tf.train.Saver()
      saver.restore(sess, os.path.join(os.path.join(args['resources'], "network"), args['file']+'_model'))

      # Train DDPG on Torcs
      test(sess, env, actor, actor_target, critic, critic_target)
Example #14
    def __init__(self,
                 sess,
                 is_train,
                 dim_state,
                 dim_action,
                 num_paths,
                 actor_learn_rate,
                 critic_learn_rate,
                 tau,
                 buffer_size,
                 mini_batch,
                 ep_begin,
                 epsilon_end,
                 gamma,
                 max_epoch,
                 seed=66):
        self.__is_train = is_train
        self.__dim_state = dim_state
        self.__dim_action = dim_action
        self.__mini_batch = mini_batch
        self.__ep_begin = ep_begin
        self.__gamma = gamma
        self.__max_epoch = max_epoch

        self.__actor = ActorNetwork(sess, dim_state, dim_action, 1.0,
                                    actor_learn_rate, tau, num_paths)
        self.__critic = CriticNetwork(sess, dim_state, dim_action,
                                      critic_learn_rate, tau)

        self.__replay = ReplayBuffer(buffer_size, seed)

        self.__explorer = Explorer(ep_begin, epsilon_end, max_epoch,
                                   dim_action, num_paths, seed)

        self.__state_curt = np.zeros(dim_state)
        self.__action_curt = self.__explorer.convert_action(
            np.ones(dim_action))

        self.__episode = 0
        self.__step = 0
Example #15
def start(env, **config):

    # Initialize the actor, critic and difference networks
    with tf.Graph().as_default() as ddpg:

        # setup random number generators for predictability
        print("Random seed ", config['seed'])
        random.seed(config['seed'])
        np.random.seed(random.randint(0, 10000))
        tf.set_random_seed(random.randint(0, 10000))
        env.seed(random.randint(0, 10000))

        obs_dim = env.observation_space.shape[-1]
        act_dim = env.action_space.shape[-1]

        actor = ActorNetwork(obs_dim, act_dim, 1, config)
        critic = CriticNetwork(obs_dim, act_dim, config,
                               actor.get_num_trainable_vars())

        if config["tensorboard"] == True:
            dir_path = os.path.dirname(os.path.realpath(__file__))
            tf.summary.FileWriter(dir_path, ddpg)

    with tf.Graph().as_default() as balancing_graph:
        balancing_actor = ActorNetwork(obs_dim, act_dim, 1, config)
        balancing_critic = CriticNetwork(
            obs_dim, act_dim, config, balancing_actor.get_num_trainable_vars())

    print(actor.target_inputs.graph is tf.get_default_graph())
    print(balancing_actor.target_inputs.graph is tf.get_default_graph())
    print(balancing_actor.target_inputs.graph is actor.target_inputs.graph)

    train(env, ddpg, actor, critic, balancing_graph, balancing_actor, **config)
Example #16
def start(args, counter=None):

    # Initialize the actor, critic and difference networks

    with tf.Graph().as_default() as ddpg:
        actor = ActorNetwork(OBSERVATION_DIMS, ACTION_DIMS, 1,
                             global_params.actor_learning_rate, TAU)
        critic = CriticNetwork(OBSERVATION_DIMS, ACTION_DIMS,
                               global_params.critic_learning_rate, TAU,
                               actor.get_num_trainable_vars())
    if counter:
        with tf.Graph().as_default() as diff_model:
            model = DifferenceModel(STATE_DIMS + ACTION_DIMS, STATE_DIMS)
            train(args,
                  ddpg,
                  actor,
                  critic,
                  counter=counter,
                  diff_model=diff_model,
                  model=model)
    else:
        train(args, ddpg, actor, critic)
Example #17
def main(_):
    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    # with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    with tf.Session() as sess:
        env = StageWorld(LASER_BEAM)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)

        state_dim = LASER_BEAM * LASER_HIST + SPEED + TARGET

        action_dim = ACTION
        action_bound = [0.5, np.pi / 3]
        switch_dim = SWITCH

        discrete = False
        print('Continuous Action Space')
        with tf.name_scope("Actor"):
            actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                                 ACTOR_LEARNING_RATE, TAU)
        with tf.name_scope("Critic"):
            critic = CriticNetwork(sess,
                                   state_dim,
                                   action_dim,
                                   switch_dim,
                                   CRITIC_LEARNING_RATE,
                                   TAU,
                                   actor.get_num_trainable_vars(),
                                   baseline_rate=10.,
                                   control_variance_flag=CONTROL_VARIANCE)

        noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        reward = Reward(REWARD_FACTOR, GAMMA)

        try:
            train(sess, env, actor, critic, noise, reward, discrete,
                  action_bound)
        except KeyboardInterrupt:
            pass
Example #18
    def __init__(self, task, sess):
        self.sess = sess
        self.env = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.actor_lr = 0.0001
        self.tau = 0.001
        self.minibatch_size = 64
        self.critic_lr = 0.001
        self.gamma = 0.99
        self.buffer_size = 1000000
        self.random_seed = 1234
        self.summary_dir = "/"
        #self.max_episode = 100
        #self.max_episode_len = 100
        self.mu = 0

        self.actor = ActorNetwork(self.sess, self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_lr, self.tau, self.minibatch_size)

        self.critic = CriticNetwork(self.sess, self.state_size,
                                    self.action_size, self.critic_lr, self.tau,
                                    self.gamma,
                                    self.actor.get_num_trainable_vars())

        # Initialize replay memory
        self.replay_buffer = ReplayBuffer(self.buffer_size, self.random_seed)
        self.sess.run(tf.global_variables_initializer())
        self.actor.update_target_network()
        self.critic.update_target_network()

        self.noise = OUNoise(self.action_size, self.mu)

        self.sess.run(tf.global_variables_initializer())
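
ReplayBuffer here (and in several later snippets) is constructed with a capacity and a seed and used via add(s, a, r, terminal, s2), size() and sample_batch(batch_size). A minimal self-contained sketch with that interface, assuming uniform random sampling (not the project's actual implementation):

import random
from collections import deque
import numpy as np

class ReplayBuffer:
    """FIFO experience replay with uniform random sampling."""

    def __init__(self, buffer_size, seed=None):
        self.buffer = deque(maxlen=buffer_size)  # old experiences are evicted automatically
        self.rng = random.Random(seed)

    def add(self, s, a, r, terminal, s2):
        self.buffer.append((s, a, r, terminal, s2))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        batch = self.rng.sample(list(self.buffer), min(batch_size, len(self.buffer)))
        s, a, r, t, s2 = map(np.array, zip(*batch))
        return s, a, r, t, s2
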
Example #19
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.Session()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
        self.sess.run(tf.global_variables_initializer())
        #target_param <- eval_param
        self.actor_network.update_target()
        self.critic_network.update_target()
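
OUNoise is used for exploration in several of these snippets but never defined. A minimal sketch of an Ornstein-Uhlenbeck process exposing the noise()/reset()/sample() calls seen above; the default theta and sigma values are assumptions, not the projects' settings:

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dim) * self.mu

    def reset(self):
        # Return the process to its mean; typically called at the start of an episode.
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); the state drifts back toward mu.
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state

    def sample(self):
        # Some of the snippets call this method sample() instead of noise().
        return self.noise()
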
Example #20
def main():
    tf.reset_default_graph()
    with tf.Session() as sess:
        env = Env(players=5)
        np.random.seed(1)
        tf.set_random_seed(1)
        state_dim = env.observation_space
        action_dim = env.action_space

        # Ensure action bound is symmetric
        crlr = 0.001
        aclr = 0.001
        tau = 0.001
        actor = ActorNetwork(sess, state_dim, action_dim, 15, 32, tau, aclr)
        critic = CriticNetwork(sess, state_dim, action_dim, 15, 32, tau, crlr)
        actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros((env.players,
                                                                action_dim)))

        train(sess, env, actor, critic, actor_noise)
Example #21
def main():
    tf.reset_default_graph()

    with tf.Session() as sess:

        embeddings = generate_embedding_mat(vocab_size, embedding_size)

        embed_file = './data/emb_mat.pkl'
        if not os.path.exists(embed_file):
            save_embedding_info(embed_file)

        classifier = ModelAPI(model_dir, embed_path)
        classifier.load_config()
        classifier.config["token_emb_mat"] = embeddings
        model = SelfAttRNN()
        classifier.build_graph(sess, model, "/gpu:2")

        actor = ActorNetwork(sess, lstm_dim, optimizer, lr, embeddings)

        saver = tf.train.Saver()
        model_file = "./checkpoints/{}".format(model_name)
        restore_model(sess, saver, model_file)
        #        params = get_model_params(sess)
        #        get_simplify(sess, actor)

        epoch = 5
        try:
            for e in range(epoch):
                if use_RL:
                    train(sess,
                          actor,
                          classifier,
                          batchsize,
                          classifier_trainable=True)
                else:
                    train_classifier(sess, classifier)

                saver.save(sess, model_file)

        except KeyboardInterrupt:
            print(
                '[INFO] Interrupt manually, try saving checkpoint for now...')
            saver.save(sess, model_file)
Example #22
def start(env, pt=None, cl_mode=None, norm_complexity=0, **config):

    # block warnings from tf.saver if needed
    if config['mp_debug']:
        tf.logging.set_verbosity(tf.logging.ERROR)

    # Initialize the actor, critic and difference networks
    with tf.Graph().as_default() as ddpg:

        # setup random number generators for predictability
        print("Random seed ", config['seed'])
        random.seed(config['seed'])
        np.random.seed(random.randint(0, 1000000))
        tf.set_random_seed(random.randint(0, 1000000))
        env.seed(random.randint(0, 1000000))
        print("Random seed verification (numpy) ", np.random.randint(10000))

        obs_dim = env.observation_space.shape[-1]
        act_dim = env.action_space.shape[-1]

        actor = ActorNetwork(obs_dim, act_dim, 1, config)
        critic = CriticNetwork(obs_dim, act_dim, config,
                               actor.get_num_trainable_vars())

        if config["tensorboard"] == True:
            dir_path = os.path.dirname(os.path.realpath(__file__))
            tf.summary.FileWriter(dir_path, ddpg)

        # create curriculum switching network
        if not config["cl_structure"] or not config["cl_stages"]:
            cl_nn = None
        else:
            cl_nn = CurriculumNetwork(pt.get_v_size(), config, cl_mode)

    if config["compare_with"]:
        with tf.Graph().as_default() as compare_with_graph:
            compare_with_actor = ActorNetwork(obs_dim, act_dim, 1, config)
            CriticNetwork(obs_dim, act_dim, config,
                          compare_with_actor.get_num_trainable_vars())
        print(actor.target_inputs.graph is tf.get_default_graph())
        print(compare_with_actor.target_inputs.graph is tf.get_default_graph())
        print(
            compare_with_actor.target_inputs.graph is actor.target_inputs.graph
        )
        return compare(env, ddpg, actor, critic, compare_with_graph,
                       compare_with_actor, cl_nn, pt, cl_mode, **config)

    return train(env, ddpg, actor, critic, cl_nn, pt, cl_mode, norm_complexity,
                 **config)
Example #23
            actions, action_pos = sampling_random(lenth, paction)
        
        if len(actions) != args.maxlenth:
            print(inputs)
        #predict
        out = critic.predict_target([inputs], [actions], [action_pos], [lenth], [len(action_pos)])
        if np.argmax(out) == np.argmax(solution):
            acc += 1
    return float(acc) / len(test_data)

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config = config) as sess:
    #model
    critic = LSTM_CriticNetwork(sess, args.dim, args.optimizer, args.lr, args.tau, args.grained, args.attention, args.maxlenth, args.dropout, word_vector) 
    actor = ActorNetwork(sess, args.dim, args.optimizer, args.lr, args.tau, critic.get_num_trainable_vars())
    state_size = critic.state_size

    #print variables
    for item in tf.trainable_variables():
        print((item.name, item.get_shape()))
    
    saver = tf.train.Saver()
    
    #LSTM pretrain
    if args.RLpretrain != '':
        pass
    elif args.LSTMpretrain == '':
        sess.run(tf.global_variables_initializer())
        for i in range(0,2):
            train(sess, actor, critic, train_data, args.batchsize, args.samplecnt, RL_trainable=False)
Example #24
    def __init__(self):

        # Initialize our session
        self.session = tf.Session()
        self.graph = self.session.graph

        with self.graph.as_default():

            # View the state batches
            self.visualize_input = VISUALIZE_BUFFER
            if self.visualize_input:
                self.viewer = CostmapVisualizer()

            # Hardcode input size and action size
            self.height = 86
            self.width = self.height
            self.depth = 4
            self.action_dim = 2

            # Initialize the current action and the old action and old state for setting experiences
            self.old_state = np.zeros((self.width, self.height, self.depth), dtype='int8')
            self.old_action = np.ones(2, dtype='float')
            self.network_action = np.zeros(2, dtype='float')
            self.noise_action = np.zeros(2, dtype='float')
            self.action = np.zeros(2, dtype='float')

            # Initialize the grad inverter object to keep the action bounds
            self.action_bounds = [[0.3, 0.3],
                                  [-0.3, -0.3]]
            self.grad_inv = GradInverter(self.action_bounds)

            # Initialize summary writers to plot variables during training
            self.summary_op = tf.merge_all_summaries()
            self.summary_writer = tf.train.SummaryWriter(os.path.expanduser('~')+'/tensorboard_data')

            # Initialize actor and critic networks
            self.actor_network = ActorNetwork(self.height, self.action_dim, self.depth, self.session,
                                              self.summary_writer)
            self.critic_network = CriticNetwork(self.height, self.action_dim, self.depth, self.session,
                                                self.summary_writer)

            # Initialize the saver to save the network params
            self.saver = tf.train.Saver()

            # initialize the experience data manger
            self.data_manager = DataManager(self.session.graph, self.session, BATCH_SIZE)

            # Should we load the pre-trained params?
            # If so: Load the full pre-trained net
            # Else:  Initialize all variables the overwrite the conv layers with the pretrained filters
            if PRE_TRAINED_NETS:
                self.saver.restore(self.session, NET_LOAD_PATH)
            else:
                self.session.run(tf.initialize_all_variables())
                self.critic_network.restore_pretrained_weights(FILTER_LOAD_PATH)
                self.actor_network.restore_pretrained_weights(FILTER_LOAD_PATH)

            threads = tf.train.start_queue_runners(sess=self.session)
            time.sleep(1)

            # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
            self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA)
            self.noise_flag = True

            # Initialize time step
            self.training_step = 0

            # Flag: don't learn the first experience
            self.first_experience = True

            # After the graph has been filled add it to the summary writer
            self.summary_writer.add_graph(self.graph)
Example #25
class DDPG():
    def __init__(self, task, sess):
        self.sess = sess
        self.env = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.actor_lr = 0.0001
        self.tau = 0.001
        self.minibatch_size = 64
        self.critic_lr = 0.001
        self.gamma = 0.99
        self.buffer_size = 1000000
        self.random_seed = 1234
        self.summary_dir = "/"
        #self.max_episode = 100
        #self.max_episode_len = 100
        self.mu = 0

        self.actor = ActorNetwork(self.sess, self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_lr, self.tau, self.minibatch_size)

        self.critic = CriticNetwork(self.sess, self.state_size,
                                    self.action_size, self.critic_lr, self.tau,
                                    self.gamma,
                                    self.actor.get_num_trainable_vars())

        # Initialize replay memory
        self.replay_buffer = ReplayBuffer(self.buffer_size, self.random_seed)
        self.sess.run(tf.global_variables_initializer())
        self.actor.update_target_network()
        self.critic.update_target_network()

        self.noise = OUNoise(self.action_size, self.mu)

        self.sess.run(tf.global_variables_initializer())

    def reset_episode(self):
        #self.actor_noise.reset()
        state = self.env.reset()
        self.last_state = state
        self.ep_ave_max_q = 0
        self.ep_reward = 0
        return state

    def step(self, s, a, r, terminal, s2):
        # Save experience / reward
        #self.memory.add(self.last_state, action, reward, next_state, done)
        #summary_ops, summary_vars = self.build_summaries()
        self.replay_buffer.add(np.reshape(s, (self.actor.s_dim, )),
                               np.reshape(a, (self.actor.a_dim, )), r,
                               terminal, np.reshape(s2, (self.actor.s_dim, )))
        # Learn, if enough samples are available in memory
        if self.replay_buffer.size() > self.minibatch_size:

            s_batch, a_batch, r_batch, t_batch, s2_batch = self.replay_buffer.sample_batch(
                self.minibatch_size)
            #self.train(s_batch, a_batch, r_batch, t_batch, s2_batch)
            target_q = self.critic.predict_target(
                s2_batch, self.actor.predict_target(s2_batch))

            y_i = []
            for k in range(self.minibatch_size):
                if t_batch[k]:
                    y_i.append(r_batch[k])
                else:
                    y_i.append(r_batch[k] + self.critic.gamma * target_q[k])

                    # Update the critic given the targets
            predicted_q_value, _ = self.critic.train(
                s_batch, a_batch, np.reshape(y_i, (self.minibatch_size, 1)))

            #self.ep_ave_max_q += np.amax(predicted_q_value)

            # Update the actor policy using the sampled gradient
            a_outs = self.actor.predict(s_batch)
            grads = self.critic.action_gradients(s_batch, a_outs)
            self.actor.train(s_batch, grads[0])

            # Update target networks
            self.actor.update_target_network()
            self.critic.update_target_network()

        # Roll over last state and action
        self.last_state = s2
        '''
        self.ep_reward +=r
        
        if terminal:
            
            summary_str = self.sess.run(
            , feed_dict={summary_vars[0]: self.ep_reward, summary_vars[1]: self.ep_ave_max_q / float(j)})

            writer.add_summary(summary_str, i)
            #writer.flush()
            
            print('| Reward: {:d} |Qmax: {:.4f}'.format(int(self.ep_reward), \
                             (self.ep_ave_max_q / float(j))))
             '''

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])

        actions = self.actor.predict(states)[0]
        #actornoises = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.action_size))
        #print(actions)

        return actions + self.noise.sample()  # add some noise for exploration

    def train(self, s_batch, a_batch, r_batch, t_batch, s2_batch):

        target_q = self.critic.predict_target(
            s2_batch, self.actor.predict_target(s2_batch))

        y_i = []
        for k in range(self.minibatch_size):
            if t_batch[k]:
                y_i.append(r_batch[k])
            else:
                y_i.append(r_batch[k] + self.critic.gamma * target_q[k])

                # Update the critic given the targets
        predicted_q_value, _ = self.critic.train(
            s_batch, a_batch, np.reshape(y_i, (self.minibatch_size, 1)))

        #self.ep_ave_max_q += np.amax(predicted_q_value)

        # Update the actor policy using the sampled gradient
        a_outs = self.actor.predict(s_batch)
        grads = self.critic.action_gradients(s_batch, a_outs)
        self.actor.train(s_batch, grads[0])

        # Update target networks
        self.actor.update_target_network()
        self.critic.update_target_network()

    def build_summaries(self):
        episode_reward = tf.Variable(0.)
        tf.summary.scalar("Reward", episode_reward)
        episode_ave_max_q = tf.Variable(0.)
        tf.summary.scalar("Qmax Value", episode_ave_max_q)

        summary_vars = [episode_reward, episode_ave_max_q]
        summary_ops = tf.summary.merge_all()

        return summary_ops, summary_vars
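
update_target_network() is called throughout but its body is not shown. In DDPG it is typically a soft (Polyak) update that moves each target variable toward its main-network counterpart at rate tau. A minimal TF1-style sketch under that assumption; the scope names are hypothetical and variables are paired by creation order:

import tensorflow as tf

def build_soft_update_op(main_scope, target_scope, tau):
    """Create an op performing t <- tau * m + (1 - tau) * t for every trainable variable pair."""
    main_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=main_scope)
    target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=target_scope)
    updates = [t.assign(tau * m + (1.0 - tau) * t)
               for m, t in zip(main_vars, target_vars)]
    return tf.group(*updates)

# Usage sketch: build once, e.g. soft_update = build_soft_update_op("actor_main", "actor_target", tau=0.001),
# then run sess.run(soft_update) after each training step.
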
Example #26
class DDPG:

    def __init__(self):

        # Make sure all the directories exist
        if not tf.gfile.Exists(TFLOG_PATH):
            tf.gfile.MakeDirs(TFLOG_PATH)
        if not tf.gfile.Exists(EXPERIENCE_PATH):
            tf.gfile.MakeDirs(EXPERIENCE_PATH)
        if not tf.gfile.Exists(NET_SAVE_PATH):
            tf.gfile.MakeDirs(NET_SAVE_PATH)

        # Initialize our session
        self.session = tf.Session()
        self.graph = self.session.graph

        with self.graph.as_default():

            # View the state batches
            self.visualize_input = VISUALIZE_BUFFER
            if self.visualize_input:
                self.viewer = CostmapVisualizer()

            # Hardcode input size and action size
            self.height = 86
            self.width = self.height
            self.depth = 4
            self.action_dim = 2

            # Initialize the current action and the old action and old state for setting experiences
            self.old_state = np.zeros((self.width, self.height, self.depth), dtype='int8')
            self.old_action = np.ones(2, dtype='float')
            self.network_action = np.zeros(2, dtype='float')
            self.noise_action = np.zeros(2, dtype='float')
            self.action = np.zeros(2, dtype='float')

            # Initialize the grad inverter object to keep the action bounds
            self.grad_inv = GradInverter(A0_BOUNDS, A1_BOUNDS, self.session)

            # Make sure the directory for the data files exists
            if not tf.gfile.Exists(DATA_PATH):
                tf.gfile.MakeDirs(DATA_PATH)

            # Initialize summary writers to plot variables during training
            self.summary_op = tf.merge_all_summaries()
            self.summary_writer = tf.train.SummaryWriter(TFLOG_PATH)

            # Initialize actor and critic networks
            self.actor_network = ActorNetwork(self.height, self.action_dim, self.depth, self.session,
                                              self.summary_writer)
            self.critic_network = CriticNetwork(self.height, self.action_dim, self.depth, self.session,
                                                self.summary_writer)

            # Initialize the saver to save the network params
            self.saver = tf.train.Saver()

            # initialize the experience data manger
            self.data_manager = DataManager(BATCH_SIZE, EXPERIENCE_PATH, self.session)

            # Uncomment if collecting a buffer for the autoencoder
            # self.buffer = deque()

            # Should we load the pre-trained params?
            # If so: Load the full pre-trained net
            # Else:  Initialize all variables the overwrite the conv layers with the pretrained filters
            if PRE_TRAINED_NETS:
                self.saver.restore(self.session, NET_LOAD_PATH)
            else:
                self.session.run(tf.initialize_all_variables())

            tf.train.start_queue_runners(sess=self.session)
            time.sleep(1)

            # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
            self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA)
            self.noise_flag = True

            # Initialize time step
            self.training_step = 0

            # Flag: don't learn the first experience
            self.first_experience = True

            # After the graph has been filled add it to the summary writer
            self.summary_writer.add_graph(self.graph)

    def train(self):

        # Check if the buffer is big enough to start training
        if self.data_manager.enough_data():

            # get the next random batch from the data manger
            state_batch, \
                action_batch, \
                reward_batch, \
                next_state_batch, \
                is_episode_finished_batch = self.data_manager.get_next_batch()

            state_batch = np.divide(state_batch, 100.0)
            next_state_batch = np.divide(next_state_batch, 100.0)

            # Are we visualizing the first state batch for debugging?
            # If so: We have to scale up the values for grey scale before plotting
            if self.visualize_input:
                state_batch_np = np.asarray(state_batch)
                state_batch_np = np.multiply(state_batch_np, -100.0)
                state_batch_np = np.add(state_batch_np, 100.0)
                self.viewer.set_data(state_batch_np)
                self.viewer.run()
                self.visualize_input = False

            # Calculate y for the td_error of the critic
            y_batch = []
            next_action_batch = self.actor_network.target_evaluate(next_state_batch)
            q_value_batch = self.critic_network.target_evaluate(next_state_batch, next_action_batch)

            for i in range(0, BATCH_SIZE):
                if is_episode_finished_batch[i]:
                    y_batch.append([reward_batch[i]])
                else:
                    y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

            # Now that we have the y batch lets train the critic
            self.critic_network.train(y_batch, state_batch, action_batch)

            # Get the action batch so we can calculate the action gradient with it
            # Then get the action gradient batch and adapt the gradient with the gradient inverting method
            action_batch_for_gradients = self.actor_network.evaluate(state_batch)
            q_gradient_batch = self.critic_network.get_action_gradient(state_batch, action_batch_for_gradients)
            q_gradient_batch = self.grad_inv.invert(q_gradient_batch, action_batch_for_gradients)

            # Now we can train the actor
            self.actor_network.train(q_gradient_batch, state_batch)

            # Save model if necessary
            if self.training_step > 0 and self.training_step % SAVE_STEP == 0:
                self.saver.save(self.session, NET_SAVE_PATH, global_step=self.training_step)

            # Update time step
            self.training_step += 1

        self.data_manager.check_for_enqueue()

    def get_action(self, state):

        # normalize the state
        state = state.astype(float)
        state = np.divide(state, 100.0)

        # Get the action
        self.action = self.actor_network.get_action(state)

        # Are we using noise?
        if self.noise_flag:
            # scale noise down to 0 at training step 3000000
            if self.training_step < MAX_NOISE_STEP:
                self.action += (MAX_NOISE_STEP - self.training_step) / MAX_NOISE_STEP * self.exploration_noise.noise()
            # if action value lies outside of action bounds, rescale the action vector
            if self.action[0] < A0_BOUNDS[0] or self.action[0] > A0_BOUNDS[1]:
                self.action *= np.fabs(A0_BOUNDS[0]/self.action[0])
            if self.action[1] < A1_BOUNDS[0] or self.action[1] > A1_BOUNDS[1]:
                self.action *= np.fabs(A1_BOUNDS[0]/self.action[1])

        # Life q value output for this action and state
        self.print_q_value(state, self.action)

        return self.action

    def set_experience(self, state, reward, is_episode_finished):

        # Make sure we're saving a new old_state for the first experience of every episode
        if self.first_experience:
            self.first_experience = False
        else:
            self.data_manager.store_experience_to_file(self.old_state, self.old_action, reward, state,
                                                       is_episode_finished)

            # Uncomment if collecting data for the auto_encoder
            # experience = (self.old_state, self.old_action, reward, state, is_episode_finished)
            # self.buffer.append(experience)

        if is_episode_finished:
            self.first_experience = True
            self.exploration_noise.reset()

        # Safe old state and old action for next experience
        self.old_state = state
        self.old_action = self.action

    def print_q_value(self, state, action):

        string = "-"
        q_value = self.critic_network.evaluate([state], [action])
        stroke_pos = int(30 * q_value[0][0] + 30)  # int so it can repeat the string below
        if stroke_pos < 0:
            stroke_pos = 0
        elif stroke_pos > 60:
            stroke_pos = 60
        print('[' + stroke_pos * string + '|' + (60 - stroke_pos) * string + ']',
              "Q: ", q_value[0][0], "\tt: ", self.training_step)
Example #27
class GAIL:
    def __init__(self,
                 exp_dir,
                 exp_thresh,
                 state_dim,
                 action_dim,
                 learn_rate,
                 betas,
                 _device,
                 _gamma,
                 load_weights=False):
        """
            exp_dir : directory containing the expert episodes
         exp_thresh : parameter to control number of episodes to load 
                      as expert based on returns (lower means more episodes)
          state_dim : dimesnion of state 
         action_dim : dimesnion of action
         learn_rate : learning rate for optimizer 
            _device : GPU or cpu
            _gamma  : discount factor
     _load_weights  : load weights from directory
        """

        # storing runtime device
        self.device = _device

        # discount factor
        self.gamma = _gamma

        # Expert trajectory
        self.expert = ExpertTrajectories(exp_dir, exp_thresh, gamma=self.gamma)

        # Defining the actor and its optimizer
        self.actor = ActorNetwork(state_dim).to(self.device)
        self.optim_actor = torch.optim.Adam(self.actor.parameters(),
                                            lr=learn_rate,
                                            betas=betas)

        # Defining the discriminator and its optimizer
        self.disc = Discriminator(state_dim, action_dim).to(self.device)
        self.optim_disc = torch.optim.Adam(self.disc.parameters(),
                                           lr=learn_rate,
                                           betas=betas)

        if not load_weights:
            self.actor.apply(init_weights)
            self.disc.apply(init_weights)
        else:
            self.load()

        # Loss function criterion
        self.criterion = torch.nn.BCELoss()

    def get_action(self, state):
        """
            obtain action for a given state using actor network 
        """
        state = torch.tensor(state, dtype=torch.float,
                             device=self.device).view(1, -1)
        return self.actor(state).cpu().data.numpy().flatten()

    def update(self, n_iter, batch_size=100):
        """
            train discriminator and actor for mini-batch
        """
        # memory to store
        disc_losses = np.zeros(n_iter, dtype=float)
        act_losses = np.zeros(n_iter, dtype=float)

        for i in range(n_iter):

            # Get expert state and actions batch
            exp_states, exp_actions = self.expert.sample(batch_size)
            exp_states = torch.FloatTensor(exp_states).to(self.device)
            exp_actions = torch.FloatTensor(exp_actions).to(self.device)

            # Get state, and actions using actor
            states, _ = self.expert.sample(batch_size)
            states = torch.FloatTensor(states).to(self.device)
            actions = self.actor(states)
            '''
                train the discriminator
            '''
            self.optim_disc.zero_grad()

            # label tensors
            exp_labels = torch.full((batch_size, 1), 1.0, device=self.device)
            policy_labels = torch.full((batch_size, 1), 0.0, device=self.device)

            # with expert transitions
            prob_exp = self.disc(exp_states, exp_actions)
            exp_loss = self.criterion(prob_exp, exp_labels)

            # with policy actor transitions
            prob_policy = self.disc(states, actions.detach())
            policy_loss = self.criterion(prob_policy, policy_labels)

            # use backprop
            disc_loss = exp_loss + policy_loss
            disc_losses[i] = disc_loss.mean().item()

            disc_loss.backward()
            self.optim_disc.step()
            '''
                train the actor
            '''
            self.optim_actor.zero_grad()
            loss_actor = -self.disc(states, actions)
            act_losses[i] = loss_actor.mean().detach().item()

            loss_actor.mean().backward()
            self.optim_actor.step()

        print("Finished training minibatch")

        return act_losses, disc_losses

    def save(
            self,
            directory='/home/aman/Programming/RL-Project/Deterministic-GAIL/weights',
            name='GAIL'):
        torch.save(self.actor.state_dict(),
                   '{}/{}_actor.pth'.format(directory, name))
        torch.save(self.disc.state_dict(),
                   '{}/{}_discriminator.pth'.format(directory, name))

    def load(
            self,
            directory='/home/aman/Programming/RL-Project/Deterministic-GAIL/weights',
            name='GAIL'):
        print(os.getcwd())
        self.actor.load_state_dict(
            torch.load('{}/{}_actor.pth'.format(directory, name)))
        self.disc.load_state_dict(
            torch.load('{}/{}_discriminator.pth'.format(directory, name)))

    def set_mode(self, mode="train"):

        if mode == "train":
            self.actor.train()
            self.disc.train()
        else:
            self.actor.eval()
            self.disc.eval()
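
# A minimal, self-contained sketch of the adversarial objectives trained in
# update() above. The tiny networks, sizes and random data below are
# illustrative stand-ins, not the ActorNetwork/Discriminator classes used in
# this example:
#
import torch
import torch.nn as nn

state_dim, action_dim, batch = 4, 2, 8

# Stand-in networks: the actor maps states to actions, the discriminator
# scores concatenated (state, action) pairs with a probability in (0, 1).
actor = nn.Sequential(nn.Linear(state_dim, 32), nn.Tanh(),
                      nn.Linear(32, action_dim), nn.Tanh())
disc = nn.Sequential(nn.Linear(state_dim + action_dim, 32), nn.Tanh(),
                     nn.Linear(32, 1), nn.Sigmoid())
bce = nn.BCELoss()

exp_states = torch.randn(batch, state_dim)    # dummy "expert" transitions
exp_actions = torch.randn(batch, action_dim)
states = torch.randn(batch, state_dim)
actions = actor(states)

# Discriminator loss: expert pairs labelled 1, policy pairs labelled 0
disc_loss = bce(disc(torch.cat([exp_states, exp_actions], dim=1)),
                torch.ones(batch, 1)) + \
            bce(disc(torch.cat([states, actions.detach()], dim=1)),
                torch.zeros(batch, 1))

# Actor loss: make the policy's pairs look expert-like, i.e. minimise -D(s, pi(s))
actor_loss = -disc(torch.cat([states, actions], dim=1)).mean()
print(disc_loss.item(), actor_loss.item())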
Ejemplo n.º 28
0
weights_dir = "./weights"  # directory to store weights
logs_dir = "./logs"  # directory to store tensorboard logs

# Defining torcs environment
env = TorcsEnv(vision=VISION, throttle=True, gear_change=False)
state_size = 29  # dimension of the state vector
action_size = 3  # dimension of the action vector

exp_num = 1  # experiment number

# Define the GPU device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Selected {} device".format(device))

# Defining the policy agent
actor = ActorNetwork(state_size).to(device)

# Loading the learnt model
actor.load_state_dict(
    torch.load(
        '/home/aman/Programming/RL-Project/Deterministic-GAIL/weights/GAIL_actor.pth'
    ))
actor.eval()

# Defining tensorboard agent
writer = SummaryWriter(logs_dir + "/Testing-{}".format(exp_num))


def write_arr_tb(arr, writer, name, start_index):
    """
    arr         : 1D array to write to tensorboard
Ejemplo n.º 29
0
def trainer(epochs=1000, MINIBATCH_SIZE=40, GAMMA=0.99, epsilon=1.0,
            min_epsilon=0.01, BUFFER_SIZE=10000, train_indicator=True,
            render=False):
    with tf.Session() as sess:


        # configuring environment
        env = gym.make(ENV_NAME)
        # configuring the random processes
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)
        # info of the environment to pass to the agent
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = np.float64(10)  # arbitrary value: the mountain continuous task is treated here as having no action bound
        # Creating agent
        ruido = OUNoise(action_dim, mu=0.4)  # Ornstein-Uhlenbeck exploration noise
        actor = ActorNetwork(sess, state_dim, action_dim, action_bound, ACTOR_LEARNING_RATE, TAU, DEVICE)
        critic = CriticNetwork(sess, state_dim, action_dim, CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars(), DEVICE)


        sess.run(tf.global_variables_initializer())

        # Initialize target network weights
        actor.update_target_network()
        critic.update_target_network()
        # Initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

        goal = 0
        max_state = -1.
        try:
            critic.recover_critic()
            actor.recover_actor()
            print('********************************')
            print('models restored successfully')
            print('********************************')
        except Exception:
            print('********************************')
            print('Failed to restore models')
            print('********************************')


        for i in range(epochs):

            state = env.reset()
            state = np.hstack(state)
            ep_reward = 0
            ep_ave_max_q = 0
            done = False
            step = 0
            max_state_episode = -1
            epsilon -= (epsilon/EXPLORE)
            epsilon = np.maximum(min_epsilon,epsilon)


            while (not done):

                if render:
                    env.render()

                #print('step', step)
                # 1. get action with actor, and add noise
                action_original = actor.predict(np.reshape(state,(1,state_dim))) # + (10. / (10. + i))* np.random.randn(1)
                action = action_original + max(epsilon,0)*ruido.noise()


                # remove comment if you want to see a step by step update
                # print(step,'a',action_original, action,'s', state[0], 'max state', max_state_episode)

                # 2. take action, see next state and reward :
                next_state, reward, done, info = env.step(action)

                if train_indicator:
                    # 3. Save in replay buffer:
                    replay_buffer.add(np.reshape(state, (actor.s_dim,)), np.reshape(action, (actor.a_dim,)), reward,
                                      done, np.reshape(next_state, (actor.s_dim,)))

                    # Keep adding experience to the memory until
                    # there are at least minibatch size samples
                    if replay_buffer.size() > MINIBATCH_SIZE:

                        # 4. sample random minibatch of transitions:
                        s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(MINIBATCH_SIZE)

                        # Calculate targets

                        # 5. Train the critic network on (states, actions, r + gamma * Q(s', a')):
                        # 5.1 Get the target critic prediction Q'(s', a'),
                        #     where a' comes from the target actor: a' = actor_target(s')
                        target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))

                        # 5.2 Build the targets y_i = r_i (+ GAMMA * Q'(s', a') if non-terminal):
                        y_i = []
                        for k in range(MINIBATCH_SIZE):
                            if t_batch[k]:
                                y_i.append(r_batch[k])
                            else:
                                y_i.append(r_batch[k] + GAMMA * target_q[k])


                        # 5.3 Train Critic!
                        predicted_q_value, _ = critic.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                        ep_ave_max_q += np.amax(predicted_q_value)

                        # 6. Train the actor with the critic's action gradients
                        # 6.1 First compute the actions the current actor would take
                        a_outs = actor.predict(s_batch)
                        # 6.2 Get dQ/da from the critic and apply the deterministic policy gradient
                        grads = critic.action_gradients(s_batch, a_outs)
                        actor.train(s_batch, grads[0])

                        # Update target networks
                        actor.update_target_network()
                        critic.update_target_network()


                state = next_state
                if next_state[0] > max_state_episode:
                    max_state_episode = next_state[0]

                ep_reward = ep_reward + reward
                step +=1

            if done:
                ruido.reset()
                if state[0] > 0.45:
                    #print('****************************************')
                    #print('got it!')
                    #print('****************************************')
                    goal += 1

            if max_state_episode > max_state:
                max_state = max_state_episode
            print('Ep', i+1, 'n steps', step, 'R:', round(ep_reward, 3), 'Eps:', round(epsilon, 3), 'Efficiency', round(100. * goal / (i + 1.), 3))


            # print('Efficiency', 100.*((goal)/(i+1.)))


        print('*************************')
        print('saving the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
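
# A compact, self-contained sketch of the target computation in step 5 above,
# using dummy NumPy batches (the rewards, terminal flags and Q-values are made
# up for illustration, not real network output):
#
import numpy as np

GAMMA = 0.99
r_batch = np.array([1.0, 0.5, -1.0])
t_batch = np.array([False, False, True])      # terminal flags
target_q = np.array([10.0, 8.0, 3.0])         # Q'(s', mu'(s')) from the target nets

# y_i = r_i                        if the episode terminated at s'
# y_i = r_i + GAMMA * Q'(s', a')   otherwise
y_i = r_batch + GAMMA * target_q * (~t_batch)
print(y_i)                                    # [10.9, 8.42, -1.0]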
Ejemplo n.º 30
0
    def __init__(self):

        # Make sure all the directories exist
        if not tf.gfile.Exists(TFLOG_PATH):
            tf.gfile.MakeDirs(TFLOG_PATH)
        if not tf.gfile.Exists(EXPERIENCE_PATH):
            tf.gfile.MakeDirs(EXPERIENCE_PATH)
        if not tf.gfile.Exists(NET_SAVE_PATH):
            tf.gfile.MakeDirs(NET_SAVE_PATH)

        # Initialize our session
        self.session = tf.Session()
        self.graph = self.session.graph

        with self.graph.as_default():

            # View the state batches
            self.visualize_input = VISUALIZE_BUFFER
            if self.visualize_input:
                self.viewer = CostmapVisualizer()

            # Hardcode input size and action size
            self.height = 86
            self.width = self.height
            self.depth = 4
            self.action_dim = 2

            # Initialize the current action, plus the previous action and state used when storing experiences
            self.old_state = np.zeros((self.width, self.height, self.depth), dtype='int8')
            self.old_action = np.ones(2, dtype='float')
            self.network_action = np.zeros(2, dtype='float')
            self.noise_action = np.zeros(2, dtype='float')
            self.action = np.zeros(2, dtype='float')

            # Initialize the grad inverter object to keep the action bounds
            self.grad_inv = GradInverter(A0_BOUNDS, A1_BOUNDS, self.session)

            # Make sure the directory for the data files exists
            if not tf.gfile.Exists(DATA_PATH):
                tf.gfile.MakeDirs(DATA_PATH)

            # Initialize summary writers to plot variables during training
            self.summary_op = tf.merge_all_summaries()
            self.summary_writer = tf.train.SummaryWriter(TFLOG_PATH)

            # Initialize actor and critic networks
            self.actor_network = ActorNetwork(self.height, self.action_dim, self.depth, self.session,
                                              self.summary_writer)
            self.critic_network = CriticNetwork(self.height, self.action_dim, self.depth, self.session,
                                                self.summary_writer)

            # Initialize the saver to save the network params
            self.saver = tf.train.Saver()

            # Initialize the experience data manager
            self.data_manager = DataManager(BATCH_SIZE, EXPERIENCE_PATH, self.session)

            # Uncomment if collecting a buffer for the autoencoder
            # self.buffer = deque()

            # Should we load the pre-trained params?
            # If so: Load the full pre-trained net
            # Else:  Initialize all variables, then overwrite the conv layers with the pre-trained filters
            if PRE_TRAINED_NETS:
                self.saver.restore(self.session, NET_LOAD_PATH)
            else:
                self.session.run(tf.initialize_all_variables())

            tf.train.start_queue_runners(sess=self.session)
            time.sleep(1)

            # Initialize an Ornstein-Uhlenbeck random process for action exploration
            self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA)
            self.noise_flag = True

            # Initialize time step
            self.training_step = 0

            # Flag: don't learn from the first experience
            self.first_experience = True

            # After the graph has been filled add it to the summary writer
            self.summary_writer.add_graph(self.graph)
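
# A minimal TF1 sketch of the restore-or-initialize pattern used in the
# constructor above; the flag, checkpoint path and variable below are
# placeholders for illustration, not the author's configuration:
#
import tensorflow as tf

PRE_TRAINED_NETS = False
NET_LOAD_PATH = "./net/pretrained.ckpt"

weights = tf.Variable(tf.zeros([4, 2]), name="weights")
saver = tf.train.Saver()

with tf.Session() as session:
    if PRE_TRAINED_NETS:
        # Load the full pre-trained net from disk
        saver.restore(session, NET_LOAD_PATH)
    else:
        # Start from freshly initialized variables
        session.run(tf.initialize_all_variables())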
Ejemplo n.º 31
0
    rdwords.sort(key = lambda x : x[1], reverse = True)
    outcnt = 0
    for i in range(len(rdwords)):
        if owords[rdwords[i][0]] > 20:
            print(rdwords[i], owords[rdwords[i][0]])
            outcnt += 1
        if outcnt > 20:
            break
    avelenth = float(total_lenth) / float(len(test_data))
    avedis = float(total_dis) / float(len(test_data))
    #print "average length", avelenth
    #print "average distilled length", avedis
    return float(acc) / len(test_data)

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config = config) as sess:
    #model
    critic = LSTM_CriticNetwork(sess, args.dim, args.optimizer, args.lr, args.tau, args.grained, args.maxlenth, args.dropout, word_vector) 
    actor = ActorNetwork(sess, args.dim, args.optimizer, args.lr, args.tau)
    # Print the trainable variables
    for item in tf.trainable_variables():
        print(item.name, item.get_shape())
    
    saver = tf.train.Saver()
    
    saver.restore(sess, "checkpoints/best816")

    print(test(sess, actor, critic, dev_data))

Ejemplo n.º 32
0
class DrlAgent:
    def __init__(self,
                 sess,
                 is_train,
                 dim_state,
                 dim_action,
                 num_paths,
                 actor_learn_rate,
                 critic_learn_rate,
                 tau,
                 buffer_size,
                 mini_batch,
                 ep_begin,
                 epsilon_end,
                 gamma,
                 max_epoch,
                 seed=66):
        self.__is_train = is_train
        self.__dim_state = dim_state
        self.__dim_action = dim_action
        self.__mini_batch = mini_batch
        self.__ep_begin = ep_begin
        self.__gamma = gamma
        self.__max_epoch = max_epoch

        self.__actor = ActorNetwork(sess, dim_state, dim_action, 1.0,
                                    actor_learn_rate, tau, num_paths)
        self.__critic = CriticNetwork(sess, dim_state, dim_action,
                                      critic_learn_rate, tau)

        self.__replay = ReplayBuffer(buffer_size, seed)

        self.__explorer = Explorer(ep_begin, epsilon_end, max_epoch,
                                   dim_action, num_paths, seed)

        self.__state_curt = np.zeros(dim_state)
        self.__action_curt = self.__explorer.convert_action(
            np.ones(dim_action))

        self.__episode = 0
        self.__step = 0

    def target_paras_init(self):
        self.__actor.update_target_paras()
        self.__critic.update_target_paras()

    def predict(self, state, reward):
        action_original = self.__actor.predict([state])[0]
        if not self.__is_train:
            return action_original

        action = self.__explorer.get_act(action_original)
        self.__replay.add(self.__state_curt, self.__action_curt, reward, state)
        self.__state_curt = state
        self.__action_curt = action

        if len(self.__replay) > self.__mini_batch:
            self.train()

        self.__step += 1
        if self.__step >= self.__max_epoch:
            self.__step = 0
            self.__episode += 1
            self.__explorer.reset_ep(self.__ep_begin)
        return action

    def train(self):
        batch_state, batch_action, batch_reward, batch_state_next = self.__replay.sample_batch(
            self.__mini_batch)
        weights = [1.0] * self.__mini_batch
        weights = np.expand_dims(weights, axis=1)
        target_q = self.__critic.predict_target(
            batch_state_next, self.__actor.predict_target(batch_state_next))
        value_q = self.__critic.predict(batch_state, batch_action)

        batch_y = []
        batch_error = []
        for k in range(len(batch_reward)):
            target_y = batch_reward[k] + self.__gamma * target_q[k]
            batch_error.append(abs(target_y - value_q[k]))
            batch_y.append(target_y)

        predicted_q, _ = self.__critic.train(batch_state, batch_action,
                                             batch_y, weights)
        a_outs = self.__actor.predict(batch_state)
        grads = self.__critic.calculate_gradients(batch_state, a_outs)
        weighted_grads = weights * grads[0]
        self.__actor.train(batch_state, weighted_grads)
        self.__actor.update_target_paras()
        self.__critic.update_target_paras()
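
# A compact TF1 sketch of the actor update performed at the end of train()
# above: the critic supplies dQ/da and the actor parameters are moved along
# dQ/da * da/dtheta. The tiny linear "actor", sizes and optimizer below are
# placeholders, not the DrlAgent networks:
#
import numpy as np
import tensorflow as tf

s_dim, a_dim, lr = 3, 1, 1e-3
states = tf.placeholder(tf.float32, [None, s_dim])
action_grads = tf.placeholder(tf.float32, [None, a_dim])    # dQ/da from the critic

theta = tf.Variable(tf.random_normal([s_dim, a_dim]))
actions = tf.matmul(states, theta)                          # minimal linear actor

# Chain rule: dQ/dtheta = dQ/da * da/dtheta; negate so a minimizer ascends Q
actor_grads = tf.gradients(actions, [theta], -action_grads)
train_op = tf.train.AdamOptimizer(lr).apply_gradients(zip(actor_grads, [theta]))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_op, feed_dict={states: np.random.rand(4, s_dim),
                                  action_grads: np.random.rand(4, a_dim)})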
Ejemplo n.º 33
0
    plt.show()

signal.signal(signal.SIGINT, signal_handler)

#env = gym.make('Pendulum-v0')

ob_space = env.observation_space
ac_space = env.action_space
print("Observation space: ", ob_space,  ob_space.dtype)
print("Action space: ", ac_space, ac_space.n)

s_size = ob_space.shape[0]
a_size = ac_space.n
print('size: ' + str(s_size) + '/' + str(a_size))

actor = ActorNetwork(state_size=STATE_SIZE, action_size=ACTION_SIZE, lr=ACTOR_LEARNING_RATE, n_h1=N_H1, n_h2=N_H2, tau=TAU)
critic = CriticNetwork(state_size=STATE_SIZE, action_size=ACTION_SIZE, lr=CRITIC_LEARNING_RATE, n_h1=N_H1, n_h2=N_H2, tau=TAU)
noise = OUProcess(ACTION_SIZE)
exprep = ExpReplay(mem_size=MEM_SIZE, start_mem=START_MEM, state_size=[STATE_SIZE], kth=-1, batch_size=BATCH_SIZE)

sess = tf.Session()
with tf.device('/{}:0'.format('CPU')):
  agent = DDPG(actor=actor, critic=critic, exprep=exprep, noise=noise, action_bound=ACTION_RANGE)
sess.run(tf.initialize_all_variables())

for i in range(NUM_EPISODES):
    cur_state = env.reset()
    cum_reward = 0
    # tensorboard summary
    summary_writer = tf.summary.FileWriter('/tmp/pendulum-log-0'+'/train', graph=tf.get_default_graph())