def __init__(self, env, N_STATES, N_ACTIONS, STEPS, BATCH_SIZE):
    self.N_STATES = N_STATES
    self.N_ACTIONS = N_ACTIONS
    self.STEPS = STEPS
    self.BATCH_SIZE = BATCH_SIZE  # mini-batch size
    self.critic_net = CriticNet(self.N_STATES, self.N_ACTIONS, self.STEPS, self.BATCH_SIZE)
    self.actor_net = ActorNet(self.N_STATES, self.N_ACTIONS, self.STEPS, self.BATCH_SIZE)
    self.R = []
def __init__(self, ob_shape, ac_shape, ac_max=5.0, ac_min=-5.0):
    self.num_states = ob_shape
    self.num_actions = ac_shape
    self.action_max = ac_max
    self.action_min = ac_min
    self.replay_buffer = deque()
    self.critic_net = CriticNet(self.num_states, self.num_actions, self.action_max, self.action_min)
    self.actor_net = ActorNet(self.num_states, self.num_actions, self.action_max)
def __init__(self, hisar_size, ar_size, action_size, TAU=0.001, is_batch_norm=0,
             write_sum=0, net_size_scale=1, max_load=1, beta0=beta):
    self.hisar_size = hisar_size
    self.load_size = action_size + 1
    self.ar_size = ar_size
    self.state_size = action_size * 2
    self.action_size = action_size
    self.ar_action_size = ar_size + action_size
    #print("net_size_scale: " + str(net_size_scale))
    if is_batch_norm:
        if len(CN_N_HIDDENS) == 2:
            self.critic_net = CriticNet_bn(self.state_size, self.action_size, TAU, write_sum, net_size_scale)
        else:
            self.critic_net = CriticNet_bn_3(self.state_size, self.action_size, TAU, write_sum, net_size_scale)
        self.actor_net = ActorNet_bn(self.state_size, self.action_size, TAU, write_sum, net_size_scale)
        self.ar_pred_net = ARPredNet_bn(self.hisar_size, self.ar_size, write_sum, net_size_scale)  # arrival-rate prediction network
        self.load_map_net = LoadMapNet_bn(self.ar_size, self.action_size, self.load_size, write_sum, net_size_scale)  # load-mapping network
    else:
        self.critic_net = CriticNet(self.state_size, self.action_size, TAU, write_sum, net_size_scale)
        self.actor_net = ActorNet(self.state_size, self.action_size, TAU, write_sum, net_size_scale)
        self.ar_pred_net = ARPredNet(self.hisar_size, self.ar_size, write_sum, net_size_scale)  # arrival-rate prediction network
        self.load_map_net = LoadMapNet(self.ar_size, self.action_size, self.load_size, write_sum, net_size_scale)  # load-mapping network
    self.env = ENV(action_size, max_load=max_load, beta0=beta0)
    #self.k_nearest_neighbors = int(max_actions * k_ratio)
    # Initialize network buffers:
    self.replay_memory_ac = deque()
    self.replay_memory_arp = deque()
    self.replay_memory_lm = deque()
    # Initialize time step:
    self.time_step = 0
    self.counter = 0
    action_max = np.ones(self.action_size).tolist()
    action_min = np.zeros(self.action_size).tolist()
    action_bounds = [action_max, action_min]
    self.grad_inv = grad_inverter(action_bounds)
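Several of the constructors in this collection (this one included) hand `[action_max, action_min]` bounds to a `grad_inverter`. The repos' own class is TensorFlow-based and not shown here; the following is a minimal NumPy sketch of the underlying inverting-gradients rule from Hausknecht & Stone's "Deep Reinforcement Learning in Parameterized Action Space" — the class name `GradInverterSketch` is illustrative, not the repos' API. It scales an upward gradient by the action's remaining headroom and a downward gradient by its distance from the lower bound, so actions are discouraged from saturating at the bounds:

import numpy as np

class GradInverterSketch:
    def __init__(self, action_bounds):
        # Same [maxes, mins] layout as the action_bounds lists above.
        self.p_max = np.asarray(action_bounds[0], dtype=np.float64)
        self.p_min = np.asarray(action_bounds[1], dtype=np.float64)
        self.p_range = self.p_max - self.p_min

    def invert(self, grads, actions):
        grads = np.asarray(grads, dtype=np.float64)
        actions = np.asarray(actions, dtype=np.float64)
        # Gradient pushing an action up -> scale by (p_max - p) / range;
        # gradient pushing it down -> scale by (p - p_min) / range.
        up = (self.p_max - actions) / self.p_range
        down = (actions - self.p_min) / self.p_range
        return np.where(grads >= 0, grads * up, grads * down)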
def __init__(self, num_states, num_actions, action_space_high, action_space_low, is_batch_norm):
    self.num_states = num_states
    self.num_actions = num_actions
    self.action_space_high = action_space_high
    self.action_space_low = action_space_low
    # Batch normalisation disabled; is_batch_norm is accepted but ignored.
    self.critic_net = CriticNet(self.num_states, self.num_actions)
    self.actor_net = ActorNet(self.num_states, self.num_actions)
    # Initialize replay memory
    self.replay_memory = deque()
    # Initialize time step
    self.time_step = 0
    self.counter = 0
    action_max = np.array(action_space_high).tolist()
    action_min = np.array(action_space_low).tolist()
    action_bounds = [action_max, action_min]
    self.grad_inv = grad_inverter(action_bounds)
def __init__(self, env, is_batch_norm=False, is_grad_inverter=True):
    super().__init__(env)
    assert isinstance(env.action_space, Box), "action space must be continuous"
    if is_batch_norm:
        self.critic_net = CriticNet_bn(self.observation_space_size, self.action_space_size)
        self.actor_net = ActorNet_bn(self.observation_space_size, self.action_space_size)
    else:
        self.critic_net = CriticNet(self.observation_space_size, self.action_space_size)
        self.actor_net = ActorNet(self.observation_space_size, self.action_space_size)
    self.is_grad_inverter = is_grad_inverter
    self.replay_memory = deque()
    self.time_step = 0
    action_max = np.array(self.high).tolist()
    action_min = np.array(self.low).tolist()
    action_bounds = [action_max, action_min]
    self.grad_inv = grad_inverter(action_bounds)
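This variant wraps a gym environment directly and asserts a continuous (Box) action space. A hypothetical construction for context, assuming the class is named DDPG (the snippet does not show its name):

import gym

env = gym.make('Pendulum-v0')
agent = DDPG(env, is_batch_norm=False, is_grad_inverter=True)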
def __init__(self, env, is_batch_norm):
    self.env = env
    self.num_states = env.observation_space.shape[0]
    self.num_actions = env.action_space.shape[0]
    if is_batch_norm:
        self.critic_net = CriticNet_bn(self.num_states, self.num_actions)
        self.actor_net = ActorNet_bn(self.num_states, self.num_actions)
    else:
        self.critic_net = CriticNet(self.num_states, self.num_actions)
        self.actor_net = ActorNet(self.num_states, self.num_actions)
    # Initialize replay buffer:
    self.replay_memory = deque()
    # Initialize time step:
    self.time_step = 0
    self.counter = 0
    action_max = np.array(env.action_space.high).tolist()
    action_min = np.array(env.action_space.low).tolist()
    action_bounds = [action_max, action_min]
    self.grad_inv = grad_inverter(action_bounds)
def __init__(self, env, is_batch_norm):
    self.env = env
    self.num_states = 1
    self.num_actions = 3
    if is_batch_norm:
        self.critic_net = CriticNet_bn(self.num_states, self.num_actions)
        self.actor_net = ActorNet_bn(self.num_states, self.num_actions)
    else:
        self.critic_net = CriticNet(self.num_states, self.num_actions)
        self.actor_net = ActorNet(self.num_states, self.num_actions)
    # Initialize replay buffer:
    self.replay_memory = deque()
    # Initialize time step:
    self.time_step = 0
    self.counter = 0
    # Hard-coded per-dimension bounds, upper then lower.
    # NOTE: only two bound pairs are given although num_actions is 3.
    action_max = [75 + 210, 10 + 160]  # i.e. [285, 170]
    action_min = [75, 10]
    action_bounds = [action_max, action_min]
    self.grad_inv = grad_inverter(action_bounds)
def __init__(self, env, is_batch_norm):
    self.env = env
    self.num_states = 32 * 16
    self.num_actions = 2
    if is_batch_norm:
        self.critic_net = CriticNet_bn(self.num_states, self.num_actions)
        self.actor_net = ActorNet_bn(self.num_states, self.num_actions)
    else:
        self.critic_net = CriticNet(self.num_states, self.num_actions)
        self.actor_net = ActorNet(self.num_states, self.num_actions)
    # Initialize replay buffer:
    self.replay_memory = deque()
    # Initialize time step:
    self.time_step = 0
    self.counter = 0
    action_bounds = [[1., 1.], [-1., -1.]]  # [maxes, mins] for the two action dims
    self.grad_inv = grad_inverter(action_bounds)
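Every constructor above seeds an unbounded deque() for replay memory and trims it elsewhere in the agent. A minimal self-contained sketch of the buffer logic such agents typically pair with it, assuming a capacity cap and uniform sampling; ReplayMemorySketch and REPLAY_MEMORY_SIZE are illustrative names, not taken from these repos:

import random
from collections import deque

REPLAY_MEMORY_SIZE = 100000

class ReplayMemorySketch:
    def __init__(self, capacity=REPLAY_MEMORY_SIZE):
        # maxlen makes the deque drop the oldest transition automatically.
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniform mini-batch; never ask for more than is stored.
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))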
RENDER_ENV = True
GYM_MONITOR_EN = True
ENV_NAME = 'Pendulum-v0'
MONITOR_DIR = './results/gym_ddpg'
ACTION_BOUND = 2
ou = OU()

if __name__ == '__main__':
    env = gym.make(ENV_NAME).env
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_bound = env.action_space.high[0]
    actor = ActorNet(state_dim, HIDDEN1_UNITS, HIDDEN2_UNITS, action_dim)
    critic = CriticNet(state_dim, action_dim, HIDDEN1_UNITS, HIDDEN2_UNITS, HIDDEN2_UNITS, action_dim)
    buff = Memory(BUFFER_SIZE, 9)
    step = 0
    reward_result = []
    for i in range(MAX_EPISODES):
        s_t = env.reset()
        s_t = np.reshape(s_t, (1, state_dim))[0]
        total_reward = 0.
        for j in range(MAX_EP_STEPS):
            loss = 0
            if RENDER_ENV:
                env.render()
            a_t = actor.predict(s_t, ACTION_BOUND, target=False)
            action = a_t + ou.sample(a_t[0])
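The script above perturbs the actor's output with ou.sample(...) for exploration. A minimal sketch of an Ornstein-Uhlenbeck noise term in the same stateless style as the call ou.sample(a_t[0]); the class name OUSketch and the parameter defaults (mu, theta, sigma) are assumptions, not the repo's values:

import numpy as np

class OUSketch:
    """Ornstein-Uhlenbeck noise: temporally correlated exploration."""
    def __init__(self, mu=0.0, theta=0.15, sigma=0.2):
        self.mu, self.theta, self.sigma = mu, theta, sigma

    def sample(self, x):
        # Drift pulls the current value x back toward mu;
        # the diffusion term adds Gaussian noise.
        return self.theta * (self.mu - x) + self.sigma * np.random.randn()

With this sketch, action = a_t + ou.sample(a_t[0]) matches the call in the episode loop above.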