    def __init__(self, env):

        self.sess = tf.Session()
        K.set_session(self.sess)

        # hyperparameters
        self.GAMMA = 0.95
        self.BATCH_SIZE = 32
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001

        self.env = env
        # get state dimension
        self.state_dim = env.observation_space.shape[0]
        # get action dimension
        self.action_dim = env.action_space.shape[0]
        # get action bound
        self.action_bound = env.action_space.high[0]

        # create actor and critic networks
        self.actor = Actor(self.sess, self.state_dim, self.action_dim,
                           self.action_bound, self.ACTOR_LEARNING_RATE)
        self.critic = Critic(self.state_dim, self.action_dim,
                             self.CRITIC_LEARNING_RATE)

        # initialize for later gradient calculation
        # initialize TF variables before any gradient computation
        self.sess.run(tf.global_variables_initializer())

        # save the results
        self.save_epi_reward = []
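
For reference, the Actor and Critic classes are defined elsewhere in the source; what follows is only a minimal sketch of constructors compatible with the calls above (the tf.keras layers and sizes are assumptions, not the original implementation).

# Hypothetical Actor/Critic stubs matching the constructor signatures used above.
import tensorflow as tf
from tensorflow.keras import layers, models


class Actor:
    def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate):
        self.sess = sess                    # shared TF1.x session
        self.action_bound = action_bound    # scales the tanh output to the action range
        self.learning_rate = learning_rate
        self.model = models.Sequential([
            layers.Dense(64, activation='relu', input_shape=(state_dim,)),
            layers.Dense(action_dim, activation='tanh'),
        ])


class Critic:
    def __init__(self, state_dim, action_dim, learning_rate):
        self.learning_rate = learning_rate
        self.model = models.Sequential([
            layers.Dense(64, activation='relu', input_shape=(state_dim,)),
            layers.Dense(1),                # state-value estimate V(s)
        ])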
Example 2
    def __init__(self, env):
        # set up the TensorFlow session
        self.sess = tf.Session()
        K.set_session(self.sess)

        # hyperparameters
        self.GAMMA = 0.95
        self.BATCH_SIZE = 32
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001

        # environment
        self.env = env
        # get state dimension
        self.state_dim = env.observation_space.shape[0]
        # get action dimension
        self.action_dim = env.action_space.shape[0]
        # get action bound (maximum magnitude of the action)
        self.action_bound = env.action_space.high[0]

        # create actor and critic networks
        self.actor = Actor(self.sess, self.state_dim, self.action_dim,
                           self.action_bound, self.ACTOR_LEARNING_RATE)
        self.critic = Critic(self.state_dim, self.action_dim,
                             self.CRITIC_LEARNING_RATE)

        # initialize for later gradient calculation
        self.sess.run(tf.global_variables_initializer())

        # store the total reward obtained in each episode
        self.save_epi_reward = []
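
A hypothetical usage sketch for the constructor above; the agent class name A2Cagent and the environment id are assumptions (any continuous-action Gym environment with a Box action space would do).

import gym

env = gym.make('Pendulum-v0')   # assumed continuous-action environment
agent = A2Cagent(env)           # assumed name of the class defining this __init__
print(agent.state_dim, agent.action_dim, agent.action_bound)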
Example 3
    def __init__(self, env):

        # hyperparameters
        self.GAMMA = 0.95
        self.BATCH_SIZE = 32
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001

        self.env = env
        # get state dimension
        self.state_dim = env.observation_space.shape[0]
        # get action dimension
        self.action_dim = env.action_space.shape[0]
        # get action bound
        self.action_bound = env.action_space.high[0]

        # create actor and critic networks
        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound,
                           self.ACTOR_LEARNING_RATE)
        self.critic = Critic(self.state_dim, self.action_dim,
                             self.CRITIC_LEARNING_RATE)

        # save the results
        self.save_epi_reward = []
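
Unlike the first two examples, this variant creates no tf.Session and passes none to Actor, which suggests the networks manage their own state (e.g. eager-style tf.keras models). A minimal sketch under that assumption follows; the Model subclassing, layer sizes, and optimizers are assumptions, not the original implementation.

import tensorflow as tf


class Actor(tf.keras.Model):
    def __init__(self, state_dim, action_dim, action_bound, learning_rate):
        super().__init__()
        self.action_bound = action_bound
        self.hidden = tf.keras.layers.Dense(64, activation='relu')
        self.mu = tf.keras.layers.Dense(action_dim, activation='tanh')
        self.opt = tf.keras.optimizers.Adam(learning_rate)

    def call(self, state):
        # scale the tanh output to the environment's action range
        return self.mu(self.hidden(state)) * self.action_bound


class Critic(tf.keras.Model):
    def __init__(self, state_dim, action_dim, learning_rate):  # action_dim unused in this sketch
        super().__init__()
        self.hidden = tf.keras.layers.Dense(64, activation='relu')
        self.v = tf.keras.layers.Dense(1)
        self.opt = tf.keras.optimizers.Adam(learning_rate)

    def call(self, state):
        return self.v(self.hidden(state))   # state-value estimate V(s)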
Example 4
    def __init__(self, env):    # class initializer method

        # hyperparameters
        self.GAMMA = 0.95
        self.BATCH_SIZE = 32
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001

        # environment
        self.env = env
        # get state dimension
        self.state_dim = env.observation_space.shape[0]
        # get action dimension
        self.action_dim = env.action_space.shape[0]
        # get action bound (maximum magnitude of the action)
        self.action_bound = env.action_space.high[0]

        ## A2C algorithm, step 1: initialize the critic network parameters phi and the actor network parameters theta.
        # create actor and critic networks
        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound, self.ACTOR_LEARNING_RATE)
        self.critic = Critic(self.state_dim, self.action_dim, self.CRITIC_LEARNING_RATE)

        # save the results (variable storing the total reward per episode)
        self.save_epi_reward = []
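
On the "initialize phi and theta" step noted above: with Keras-style networks the weights are created and randomly initialized when each model is first built (via a forward pass, or via the explicit tf.global_variables_initializer() call in the TF1.x variants). A hypothetical check, assuming actor and critic are callable Keras models as in the earlier sketches and agent is an instance of the agent class:

import numpy as np

dummy_state = np.zeros((1, agent.state_dim), dtype=np.float32)
agent.actor(dummy_state)     # first call builds the actor  -> creates theta
agent.critic(dummy_state)    # first call builds the critic -> creates phi
print(len(agent.actor.trainable_variables),
      len(agent.critic.trainable_variables))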