def __init__(self, state_size, action_size):
    self.epsilon = 0.8
    self.state_size = state_size
    self.action_size = action_size

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size)
    self.actor_target = Actor(self.state_size, self.action_size)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    # self.exploration_mu = 0
    # self.exploration_theta = 0.15
    # self.exploration_sigma = 0.2
    # self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 20000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.95  # discount factor
    self.tau = 0.002   # for soft update of target parameters

    self.stats = np.array([])

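# The tau above only matters together with a soft-update step that is not shown in this snippet.
# A minimal sketch of such a helper for Keras-style models (the name soft_update is hypothetical,
# not part of the original agent):
def soft_update(local_model, target_model, tau):
    """Blend local weights into the target: target = tau * local + (1 - tau) * target."""
    local_weights = local_model.get_weights()
    target_weights = target_model.get_weights()
    new_weights = [tau * lw + (1.0 - tau) * tw
                   for lw, tw in zip(local_weights, target_weights)]
    target_model.set_weights(new_weights)
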
def __init__(self, gamma, memory, s, a, tau, learningRate=1e-3, criticpath=None, actorpath=None):
    self.gamma = gamma
    self.memory = ReplayMemory(memory)
    self.actor = Actor(state=s, actions=a)
    self.critic = Critic(state=s, actions=a)
    if criticpath is not None:
        self.critic.load_state_dict(torch.load(criticpath))
    if actorpath is not None:
        self.actor.load_state_dict(torch.load(actorpath))
    self.targetActor = Actor(state=s, actions=a)
    self.targetActor.load_state_dict(self.actor.state_dict())
    self.targetCritic = Critic(state=s, actions=a)
    self.targetCritic.load_state_dict(self.critic.state_dict())
    self.tau = tau
    self.actorOptimizer = optim.Adam(self.actor.parameters(), learningRate)
    self.criticOptimizer = optim.Adam(self.critic.parameters(), learningRate)
    # more a dimensionality thing
    self.state = s
    self.action = a
    self.OUarray = np.zeros((1000, self.action), dtype="f")
    self.step = 0

def __init__(self, sess, scale_u, params):
    self.sess = sess
    self.scale_u = scale_u
    self.__dict__.update(params)

    # CREATE INPUT PLACEHOLDERS
    self.create_input_placeholders()

    # INITIALIZE ACTOR & CRITIC MODELS
    self.agents = [Actor(self.sess, self.inputs, i, **self.actor_params)
                   for i in [1, 2, 3]]
    self.critic = Critic(self.sess, self.inputs, **self.critic_params)

    # INITIALIZE EXPLORATION MODEL
    self.noise_params = {k: np.fromstring(v, sep=",", dtype="f")
                         for k, v in self.noise_params.items()}
    self.noise = [Noise(**self.noise_params) for _ in range(3)]

    # INITIALIZE REPLAY BUFFER
    self.memory = Memory(self.memory_size)

    # AVERAGE AGENT POLICIES
    avg_pi = [tf.reduce_mean(i, axis=0)
              for i in zip(*[x.pi.net_params for x in self.agents])]
    self.avg_op = [tf.assign(i, j)
                   for x in self.agents
                   for i, j in zip(x.pi.net_params, avg_pi)]

def __init__(self, state_size, action_size, seed=0):
    '''Initialize the Agent.

    Parameters
    ----------
    state_size : int
        The dimension of each state
    action_size : int
        The dimension of each action
    seed : int
        The random seed used to generate random numbers.
    '''
    self.state_size = state_size
    self.action_size = action_size
    random.seed(seed)

    # Actor proposes the best action for a given state
    self.actor_local = Actor(state_size, action_size, seed).to(device)
    self.actor_target = Actor(state_size, action_size, seed).to(device)

    # Critic evaluates the chosen action
    self.critic_local = Critic(state_size, action_size, seed).to(device)
    self.critic_target = Critic(state_size, action_size, seed).to(device)

    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=ACTOR_LEARNING_RATE)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=CRITIC_LEARNING_RATE,
                                       weight_decay=WEIGHT_DECAY)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    # Noise process
    self.noise = OUNoise(action_size, seed)

    self.t_step = 0

def main():
    no_of_wins = 0
    no_of_ties = 0
    no_of_losses = 0

    print(" algorithm gaining experience")
    game_generator = Experiment(BOARD_DIMENSION)
    generalizer = Generalizer(19)  # 19 because of 18 features + one constant w0
    critic = Critic(generalizer)

    for i in range(NUMBER_OF_EXaMPLES):
        board = game_generator.generate_board()
        performance_system = PerformanceMeasure(board, generalizer, critic, game_generator)
        result = performance_system.improve_system()
        examples, values = critic.fetch_training_examples()
        generalizer.set_training_examples(examples)
        generalizer.set_training_values(values)
        generalizer.LMS_weight_update_rule()
        if result == 100:
            no_of_wins += 1
        if result == -100:
            no_of_losses += 1
        if result == 0:
            no_of_ties += 1

    W = generalizer.get_weights()
    print(no_of_wins, no_of_ties, no_of_losses)

    """ Playing against human... """
    while True:
        human_board = GameBoard(['-', '-', '-', '-', '-', '-', '-', '-', '-'])
        vs_human = human_board.victor()
        while vs_human is None:
            x = int(input(" X coordinate: "))
            y = int(input(" Y coordinate: "))
            human_board._board[x * 3 + y] = 'X'
            computer_position = human_board.maximizer(W)
            human_board._board[computer_position] = 'O'
            print(human_board)
            vs_human = human_board.victor()

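# The LMS_weight_update_rule() call above presumably implements the standard least-mean-squares
# update w_i <- w_i + eta * (V_train(b) - V_hat(b)) * x_i over the 19 weights. A minimal sketch of
# one such step under that assumption (this helper is illustrative, not the Generalizer internals):
import numpy as np

def lms_update(weights, features, v_train, eta=0.1):
    """Nudge the weight vector toward the training value along the board's feature vector."""
    v_hat = float(np.dot(weights, features))
    return weights + eta * (v_train - v_hat) * features
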
def start(GAME_NAME, MAX_EPISODE):
    env = gym.make(GAME_NAME)  # create environment
    actor = Actor(env.observation_space, env.action_space)    # create actor
    critic = Critic(env.observation_space, env.action_space)  # create critic

    reward_per_epi = []
    durations_per_epi = []
    l_A = []
    l_C = []

    RENDER = False
    MAX_EP_STEPS = 1000
    # DISPLAY_REWARD_THRESHOLD = 200
    # print("begin.\n\n")

    for i_episode in range(MAX_EPISODE):
        s = env.reset()
        critic.reset()
        actor.reset()
        track_r = []
        for t in count():
            if RENDER:
                env.render()
            a = actor.choose_action(s)
            s_, r, done, info = env.step(a)
            # if done: r = -20  # penalty if the agent dies
            track_r.append(r)

            td_error, abs_error = critic.learn(s, r, s_)  # Critic learns
            actor.learn(s, a, td_error)                   # Actor learns
            s = s_
            # print("... in episode (%d) step (%d)" % (i_episode + 1, t))

            if is_ipython:
                display.clear_output(wait=True)
                display.display(plt.gcf())
                # env.render()

            if done or t >= MAX_EP_STEPS:
                # Episode finished, record results
                ep_rs_sum = sum(track_r)
                # if 'running_reward' not in globals():
                #     running_reward = ep_rs_sum
                # else:
                #     running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
                # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
                running_reward_avg = ep_rs_sum / float(t)
                reward_per_epi.append(ep_rs_sum)
                durations_per_epi.append(t)
                l_A.append(np.mean(actor._loss_))
                l_C.append(np.mean(critic._loss_))
                # print("episode:", i_episode, " reward:", ep_rs_sum)
                # plot(reward_per_epi, durations_per_epi, l_A, l_C)
                break

    return reward_per_epi, durations_per_epi, l_A, l_C

def __init__(self):
    tf.reset_default_graph()
    self.sess = tf.Session()
    self.actor = Actor(
        self.sess,
        n_features=Config.PLAYER_DIMENSION * (Config.DEFENDER_COUNT + Config.INTRUDER_COUNT),
        lr=Config.LEARNING_RATE_START,
        action_bound=[-math.pi, math.pi])
    self.critic = Critic(
        self.sess,
        n_features=Config.PLAYER_DIMENSION * (Config.DEFENDER_COUNT + Config.INTRUDER_COUNT),
        lr=Config.LEARNING_RATE_START)
    self.sess.run(tf.global_variables_initializer())

def __init__(self, state_size, action_size, num_agents):
    """
    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents in the environment
    """
    random_seed = 10
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    random.seed(random_seed)
    # random.seed() returns None, so store the seed value itself rather than its return value
    self.random_seed = random_seed

    # Replay memory
    self.memory = ReplayBuf(action_size, BUFFER_SIZE, BATCH_SIZE, self.random_seed)

    # Actor Networks
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Make sure the Actor Target Network has the same weight values as the Local Network
    for target, local in zip(self.actor_target.parameters(), self.actor_local.parameters()):
        target.data.copy_(local.data)

    # Critic Network (w/ Target Network); the critic sees the joint state and action of all agents
    self.critic_local = Critic(state_size * num_agents, action_size * num_agents, random_seed).to(device)
    self.critic_target = Critic(state_size * num_agents, action_size * num_agents, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)
    """
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)
    """

    # Make sure the Critic Target Network has the same weight values as the Local Network
    for target, local in zip(self.critic_target.parameters(), self.critic_local.parameters()):
        target.data.copy_(local.data)

    self.noise = Ornstein_Uhlenbeck_Noise(action_size, random_seed)

def __init__(self, env, batchSize=10, bufferSize=100, gamma=0.98, actorLR=1e-4, criticLR=1e-3,
             maxSteps=200, targetUpdate=1e-3, epsilon=1, decay=0.99, rewardScale=1e-3,
             logFile='run.log'):
    self.env = env
    self.gamma = gamma
    self.batchSize = batchSize
    self.bufferSize = bufferSize
    self.maxSteps = maxSteps + 1
    self.rewardScale = rewardScale
    self.epsilon = epsilon
    self.decay = decay

    # Useful helpers.
    self.actionDim = self.env.action_space.shape[0]
    self.stateDim = self.env.observation_space.shape[0]
    self.featureDim = self.actionDim + self.stateDim
    self.minAction = self.env.action_space.low
    self.maxAction = self.env.action_space.high

    # For scaling output action values.
    self.actionBiasZeroOne = self.minAction
    self.actionScaleZeroOne = self.maxAction - self.minAction
    self.actionBiasTanH = (self.maxAction + self.minAction) / 2.0
    self.actionScaleTanH = self.maxAction - self.actionBiasTanH

    # Initialize noise process.
    self.noise = OUNoise(self.actionDim)

    # Initialize replay buffer.
    self.buffer = ReplayBuffer(self.bufferSize)

    # Initialize logging.
    logging.basicConfig(filename=logFile, level=logging.INFO,
                        format='[%(asctime)s] %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')
    logging.info('Initializing DRPG agent with passed settings.')

    # Tensorflow GPU optimization.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(self.sess)

    # Make actor network (creates target model internally).
    self.actor = Actor(self.sess, self.maxSteps, self.featureDim, self.actionDim,
                       self.batchSize, targetUpdate, actorLR,
                       self.actionScaleTanH, self.actionBiasTanH)

    # Make critic network (creates target model internally).
    # Uses criticLR; the original passed actorLR here, leaving criticLR unused.
    self.critic = Critic(self.sess, self.maxSteps, self.featureDim, self.actionDim,
                         self.batchSize, targetUpdate, criticLR)

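# The actionScaleTanH / actionBiasTanH fields above map a tanh-squashed policy output back into the
# environment's action range. A minimal sketch of that mapping (the function name is hypothetical):
import numpy as np

def scale_tanh_action(raw_action, action_scale_tanh, action_bias_tanh):
    """Map raw_action in [-1, 1] to [minAction, maxAction] via scale * a + bias, then clip."""
    scaled = raw_action * action_scale_tanh + action_bias_tanh
    return np.clip(scaled, action_bias_tanh - action_scale_tanh, action_bias_tanh + action_scale_tanh)
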
def load_weights(self, load_from):
    checkpoint = torch.load(load_from)
    critic_params_and_state_dict = checkpoint['critic_params_and_state_dict']
    actor_params_and_state_dict = checkpoint['actor_params_and_state_dict']

    self.actor = Actor(actor_params_and_state_dict['actor_params'])
    self.actor.load_state_dict(actor_params_and_state_dict['state_dict'])

    self.critic = Critic(critic_params_and_state_dict['critic_params'])
    self.critic.load_state_dict(critic_params_and_state_dict['state_dict'])
    return self

def main():
    wins = 0
    ties = 0
    loses = 0

    print " Playing against itself... Please wait... "
    generator = Generator(BOARD_LENGTH)
    generalizer = Generalizer(19)
    critic = Critic(generalizer)

    for i in xrange(NO_OF_TRAINING_EXAMPLES):
        board = generator.generate_board()
        performance_system = PerformanceSystem(board, generalizer, critic, generator)
        result = performance_system.improve_system()
        examples, values = critic.get_training_examples()
        generalizer.set_training_examples(examples)
        generalizer.set_training_values(values)
        generalizer.gradient_descent()
        if result == 100:
            wins += 1
        if result == -100:
            loses += 1
        if result == 0:
            ties += 1

    W = generalizer.get_weights()
    print wins, ties, loses

    """ Playing against human... """
    while True:
        human_board = Board(['-', '-', '-', '-', '-', '-', '-', '-', '-'])
        vs_human = human_board.winner()
        while vs_human is None:
            x = int(raw_input(" X coordinate: "))
            y = int(raw_input(" Y coordinate: "))
            human_board._board[x * 3 + y] = 'X'
            computer_position = human_board.max_learner_utility(W)
            human_board._board[computer_position] = 'O'
            print human_board
            vs_human = human_board.winner()

def __init__(self, env, sess, LEARNING_RATE_ACTOR, LEARNING_RATE_CRITIC, NET_SIZE, MEMORY_LEN,
             REWARD_DISCOUNT, BATCH_SIZE, TAU, EXPLORATION_STEPS, VERBOSE, LOG_DIR_TF):
    self.env = env
    self.sess = sess
    self.observation_space = self.env.observation_space.shape[0]
    self.action_space = self.env.action_space.shape[0]
    self.REWARD_DISCOUNT = REWARD_DISCOUNT
    self.TAU = TAU
    self.BATCH_SIZE = BATCH_SIZE
    self.noise_state = np.zeros(self.action_space)
    self.EXPLORATION_STEPS = EXPLORATION_STEPS
    self.VERBOSE = VERBOSE
    self.LOG_DIR_TF = LOG_DIR_TF

    # Check that the action space is symmetric around zero
    if all(env.action_space.high == abs(env.action_space.low)):
        action_scale = env.action_space.high
    else:
        raise ActionSpaceNotSymmetricException

    self.actor = Actor(self.sess, self.observation_space, self.action_space,
                       LEARNING_RATE_ACTOR, NET_SIZE, TAU, action_scale)
    self.critic = Critic(self.sess, self.observation_space, self.action_space,
                         LEARNING_RATE_CRITIC, NET_SIZE, TAU)

    actor_network_variables = self.actor.network.get_variables()
    critic_q_net_variables = self.critic.q_net.get_variables()
    self.actor_target_update = self.actor.target_network.update_variables(actor_network_variables)
    self.critic_target_update = self.critic.target_q_net.update_variables(critic_q_net_variables)

    self.reward_pl = tf.placeholder(tf.float32, [None, 1], name='Reward_PL')
    self.done_pl = tf.placeholder(tf.bool, [None, 1], name='Done_PL')
    self.labels = tf.where(self.done_pl, self.reward_pl,
                           self.reward_pl + tf.multiply(self.REWARD_DISCOUNT,
                                                        self.critic.target_prediction))

    # self.replay_memory = ReplayMemory(MEMORY_LEN, BATCH_SIZE)
    self.replay_memory = ReplayMemory(MEMORY_LEN, BATCH_SIZE,
                                      self.observation_space, self.action_space)

    self.log_reward_pl = tf.placeholder(tf.float32, name='Reward_log_pl')
    self.reward_f = tf.add(0.0, self.log_reward_pl)
    tf.summary.scalar('reward', self.reward_f)

    init = tf.global_variables_initializer()
    self.sess.run(init)
    self.sess.run(self.actor.network.copy_to(self.actor.target_network))
    self.sess.run(self.critic.q_net.copy_to(self.critic.target_q_net))

    self.writer = tf.summary.FileWriter(self.LOG_DIR_TF, self.sess.graph)
    self.merged = tf.summary.merge_all()

def __init__(self, env, sess):
    # Environment
    self.n_state = env.observation_space.shape[0]
    self.n_action = env.action_space.shape[0]

    # Neural Networks
    self.sess = sess
    self.actor = Actor(self.sess, self.n_state, self.n_action)
    self.critic = Critic(self.sess, self.n_state, self.n_action)

    # Replay Buffer
    self.replay_buffer = ReplayBuffer(BUFFER_SIZE)

    # Ornstein-Uhlenbeck Noise
    self.exploration_noise = OUNoise(self.n_action)

def __init__(self, n_features, actions=None, is_continues=None):
    self.actions = actions
    self.is_continues = is_continues

    self.actor_net = Actor(n_features, actions=actions, is_continues=is_continues)
    self.critic_net = Critic(n_features)
    self.load_weights(self.actor_net)
    self.load_weights(self.critic_net)

    # we need a good teacher, so the teacher should learn faster than the actor
    self.optimizer_actor = torch.optim.Adam(self.actor_net.parameters(), Config.LR_ACTOR, (0.9, 0.99))
    self.optimizer_critic = torch.optim.Adam(self.critic_net.parameters(), Config.LR_CRITIC, (0.9, 0.99))

    self.gamma = Config.REWARD_DECAY

def __init__(self, state_size, action_size, num_agents, random_seed):
    """Initialize an Agent.

    Params
    ======
        state_size (int): state dimension
        action_size (int): action dimension
        num_agents (int): number of simultaneously running agents
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    random.seed(random_seed)

    # Actor Network and its target network
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network and its target network
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)

    # Noise object
    self.noise = OUNoise((num_agents, action_size), random_seed)

    # Replay Memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                               EXPERIENCES_PER_SAMPLING, device, random_seed)

    # Initialize time step (for updating every UPDATE_NN_EVERY steps)
    self.t_step_nn = 0
    # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps)
    self.t_step_mem_par = 0
    # Initialize time step (for updating every UPDATE_MEM_EVERY steps)
    self.t_step_mem = 0

def __init__(self):
    self.env = snake_env()
    self.state_dim = (self.env.size, self.env.size)
    self.action_dim = self.env.action_space
    self.actor = Actor(self.state_dim, self.action_dim, args.actor_lr)
    self.critic = Critic(self.state_dim, args.critic_lr)
    self.gamma = args.gamma

    if args.load_weights:
        self.actor.model.load_weights(args.load_weights)
    if args.dist_move_reward:
        self.env.set_reward(move_reward='-dist')

    # initialize video system only
    self.env.reset()

def run():
    # Build the environment using OpenAI Gym
    env = gym.make('MountainCar-v0')
    env = env.unwrapped

    sess = tf.Session()

    # Create an actor and a critic
    actor = Actor(sess, n_actions=n_actions, n_features=n_features, lr=lr_actor)
    critic = Critic(sess, n_features=n_features, lr=lr_critic)

    # Build the two networks
    actor.build_net()
    critic.build_net()
    sess.run(tf.global_variables_initializer())
    # tf.summary.FileWriter("", sess.graph)

    # Count steps
    step = 0
    # env.render()
    for episode in range(n_episodes):
        s = env.reset()
        # Comment the render() out to speed up training
        # env.render()

        # s returned by gym is a vector; add a batch dimension to make it a matrix
        s = s[np.newaxis, :]
        a = actor.choose_action(s)
        while True:
            step += 1
            # Take a step to get a new transition
            s_, r, done, info = env.step(a)
            # Add a batch dimension to s_ as well
            s_ = s_[np.newaxis, :]
            a_ = actor.choose_action(s_)

            # Calculate the TD error and update both networks
            td_error = critic.learn(s, s_)
            actor.learn(s, a, td_error)

            s = s_
            a = a_  # advance the action along with the state (missing in the original)

            if step % 500 == 0:
                print(step, s_)
            if done:
                print('arrive')
                print(s_)
                break

def __init__(self, n_features, action_bounds):
    self.n_features = n_features
    self.action_bounds = action_bounds

    self.eval_actor_net = Actor(n_features, action_bounds)
    self.load_weights(self.eval_actor_net)
    self.eval_actor_net.train()
    self.target_actor_net = copy.deepcopy(self.eval_actor_net)
    self.target_actor_net.eval()

    self.eval_critic_net1 = Critic(n_features, action_bounds)
    self.load_weights(self.eval_critic_net1)
    self.eval_critic_net1.train()
    self.eval_critic_net2 = Critic(n_features, action_bounds)
    self.load_weights(self.eval_critic_net2)
    self.eval_critic_net2.train()
    self.target_critic_net1 = copy.deepcopy(self.eval_critic_net1)
    self.target_critic_net1.eval()
    self.target_critic_net2 = copy.deepcopy(self.eval_critic_net2)
    self.target_critic_net2.eval()

    self.memory = Memory(Config.MEMORY_CAPACITY)
    self.batch_size = Config.BATCH_SIZE
    self.tau = Config.REPLACEMENT_SOFT_TAU

    # we need a good teacher, so the teacher should learn faster than the actor
    self.optimizer_actor = torch.optim.Adam(self.eval_actor_net.parameters(),
                                            Config.LR_ACTOR, (0.9, 0.99))
    # itertools.chain(self.encoder.parameters(), self.decoder.parameters())
    # self.optimizer_critic = \
    #     torch.optim.Adam([{'params': self.eval_critic_net1.parameters()},
    #                       {'params': self.eval_critic_net2.parameters()}],
    #                      Config.LR_CRITIC, (0.9, 0.99))
    self.optimizer_critic1 = torch.optim.Adam(self.eval_critic_net1.parameters(),
                                              Config.LR_CRITIC, (0.9, 0.99))
    self.optimizer_critic2 = torch.optim.Adam(self.eval_critic_net2.parameters(),
                                              Config.LR_CRITIC, (0.9, 0.99))

    self.gamma = Config.REWARD_DECAY
    self.policy_noise_clip = Config.POLICY_NOISE_CLIP
    self.policy_delay = Config.DELAY_POLICY_UPDATE_ITER
    self.learn_iter = 0

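# The twin critics, policy_noise_clip and policy_delay above are the TD3 ingredients. A minimal
# sketch of how a clipped-noise, clipped-double-Q target is usually built from them (the forward
# signatures critic(states, actions) and actor(states), and a 0/1 float `dones` tensor, are
# assumptions, not taken from the original learn() method; clamping next_actions to the action
# bounds is omitted here):
import torch

def td3_target(target_actor, target_critic1, target_critic2,
               next_states, rewards, dones, gamma, noise_std, noise_clip):
    with torch.no_grad():
        next_actions = target_actor(next_states)
        # Target policy smoothing: add clipped Gaussian noise to the target action.
        noise = (torch.randn_like(next_actions) * noise_std).clamp(-noise_clip, noise_clip)
        next_actions = next_actions + noise
        # Clipped double-Q: take the smaller of the two target critic estimates.
        q_next = torch.min(target_critic1(next_states, next_actions),
                           target_critic2(next_states, next_actions))
        return rewards + gamma * (1.0 - dones) * q_next
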
def __init__(self, state_size, action_size, action_low, action_high):
    # self.task = task
    self.state_size = state_size
    self.action_size = action_size
    self.action_low = action_low
    self.action_high = action_high

    # Learning rates
    self.lr_actor = 1e-4
    self.lr_critic = 1e-3

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.lr_actor)
    self.actor_target = Actor(self.state_size, self.action_size, self.lr_actor)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size, self.lr_critic)
    self.critic_target = Critic(self.state_size, self.action_size, self.lr_critic)

    # Store model architecture of actor and critic locally
    # keras.utils.plot_model(self.actor_local.model, '/home/danie/catkin_ws/src/ddpg/src/actor.png', show_shapes=True)
    # keras.utils.plot_model(self.critic_local.model, '/home/danie/catkin_ws/src/ddpg/src/critic.png', show_shapes=True)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Initialize OU noise
    self.noise = OUNoise(action_size=self.action_size)
    # Currently testing with Gaussian noise instead of OU. Parameters for Gaussian follow
    self.noise_mean = 0.0
    self.noise_stddev = 0.2

    # Initialize replay buffer
    self.buffer_size = 1e6
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Parameters for DDPG
    self.gamma = 0.99  # discount factor
    self.tau = 0.001   # for soft update of target parameters

def __init__(self, params):
    self.action_size = params['action_size']
    self.state_size = params['state_size']
    self.num_agents = params['num_agents']
    self.buffer_size = params['buffer_size']
    self.batch_size = params['batch_size']
    self.__gamma = params['gamma']
    self.__tau = params['tau']
    self.__update_every = params['update_every']
    self.__save_to = params['save_to']
    self.__memory = ReplayBuffer(self.buffer_size, self.batch_size)
    self.__lr = params['lr']
    self.noise_type = params['noise_type']

    actor_params = dict()
    actor_params['arch_params_actor'] = params['arch_params_actor']
    actor_params['action_size'] = self.action_size
    actor_params['state_size'] = self.state_size
    actor_params['eps'] = params['eps']
    actor_params['eps_decay'] = params['eps_decay']
    actor_params['eps_min'] = params['min_eps']
    actor_params['noise_type'] = params['noise_type']

    self.actor = Actor(actor_params)
    self.actor_target = Actor(actor_params)
    self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=self.__lr)
    self.scheduler_actor = optim.lr_scheduler.StepLR(self.optimizer_actor, step_size=100, gamma=0.95)

    critic_params = dict()
    critic_params['arch_params_critic'] = params['arch_params_critic']
    critic_params['action_size'] = self.action_size
    critic_params['state_size'] = self.state_size

    self.critic = Critic(critic_params)
    self.critic_target = Critic(critic_params)
    self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=self.__lr)
    # Step the critic optimizer here (the original mistakenly passed the actor optimizer)
    self.scheduler_critic = optim.lr_scheduler.StepLR(self.optimizer_critic, step_size=100, gamma=0.95)

    self.__t = 0

def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99      # discount factor
    self.tau_actor = 0.1   # for soft update of target parameters
    self.tau_critic = 0.1

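# OUNoise is used throughout these agents but not defined in the snippets. A minimal illustrative
# reimplementation of the Ornstein-Uhlenbeck process matching the (size, mu, theta, sigma)
# constructor used above (not the original class; some of the other agents pass a seed instead):
import numpy as np

class OUNoise:
    """Temporally correlated exploration noise for continuous actions."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Start each episode from the long-run mean.
        self.state = np.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): a mean-reverting random walk.
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state
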
def __init__(self, state_dim, action_dim):
    self.name = 'ActorCritic'
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.time_update = 0
    self.sess = tf.InteractiveSession()

    # Initialize actor and critic networks
    self.actor = Actor(self.sess, state_dim, action_dim)
    self.critic = Critic(self.sess, state_dim, action_dim)

    # Exploration parameters
    self.sigma = 2
    self.sigma_decay = 0.5
    self.sigma_min = 0.1
    self.sigma_decay_step = 50000
    self.sigma_count = 0

    # Save network
    self.saver = tf.train.Saver()

class NetworkAC(object):
    """docstring for NetworkAC."""

    def __init__(self):
        tf.reset_default_graph()
        self.sess = tf.Session()
        self.actor = Actor(
            self.sess,
            n_features=Config.PLAYER_DIMENSION * (Config.DEFENDER_COUNT + Config.INTRUDER_COUNT),
            lr=Config.LEARNING_RATE_START,
            action_bound=[-math.pi, math.pi])
        self.critic = Critic(
            self.sess,
            n_features=Config.PLAYER_DIMENSION * (Config.DEFENDER_COUNT + Config.INTRUDER_COUNT),
            lr=Config.LEARNING_RATE_START)
        self.sess.run(tf.global_variables_initializer())

    def train(self, x, a, y, r):
        td_error = self.critic.learn(x, r, y)  # gradient = grad[r + gamma * V(y_) - V(x_)]
        self.actor.learn(x, a, td_error)       # true_gradient = grad[logPi(s,a) * td_error]

    def predict(self, state):
        action = self.actor.choose_action(state)
        value = self.critic.predict(state)
        return action, value

def __init__(self, state_size=28, action_size=2, gamma=0.9, learning_rate_actor=0.0001,
             learning_rate_critic=0.01, tau=0.001, action_max=[1000, 2], batch_size=32):
    self.state_size = state_size
    self.action_size = action_size
    self.action_max = action_max
    self.batch_size = batch_size
    self.memory = deque(maxlen=5000)
    self.gamma = gamma  # discount rate
    self.learning_rate_actor = learning_rate_actor    # actor learning rate
    self.learning_rate_critic = learning_rate_critic  # critic learning rate
    self.tau = tau  # target transfer factor

    self.gpu_options = tf.GPUOptions()
    self.config = tf.ConfigProto(gpu_options=self.gpu_options)
    self.config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=self.config)
    K.set_session(self.sess)

    self.actor = Actor(state_size=self.state_size, action_size=self.action_size,
                       learning_rate=self.learning_rate_actor, tau=self.tau,
                       sess=self.sess, batch_size=self.batch_size,
                       action_max=self.action_max)
    self.critic = Critic(state_size=self.state_size, action_size=self.action_size,
                         learning_rate=self.learning_rate_critic, gamma=self.gamma,
                         tau=self.tau, sess=self.sess, batch_size=self.batch_size)

    self.grad_avg = 0
    self.grad_a = []
    self.critic_loss_a = []

def __init__(self, a_dim, s_dim):
    self.sess = tf.Session()
    self.a_dim, self.s_dim = a_dim, s_dim
    self.LR_A = 0.001
    self.LR_C = 0.001
    self.CAPACITY = 10000
    self.BATCH_SIZE = 32
    self.BATCH_SIZE_g = 24
    self.SETTING = {
        'GAMMA': 0.9,
        'TAU': 0.01,
        'N_D_MAX': 1 / np.sqrt(self.s_dim),
        'N_D_MIN': -1 / np.sqrt(self.s_dim),
        'F_N_D_MAX': 3e-3,
        'F_N_D_MIN': -3e-3,
        'L2_DECAY': 0.01,
    }

    self.S = tf.placeholder(tf.float32, shape=[None, self.s_dim], name='State')
    self.S_ = tf.placeholder(tf.float32, shape=[None, self.s_dim], name='State_')
    self.R = tf.placeholder(tf.float32, shape=[None, 1], name='Reward')

    self.actor = Actor(self.sess, self.a_dim, self.s_dim, self.LR_A,
                       self.SETTING, self.S, self.S_)
    self.critic = Critic(self.sess, self.a_dim, self.s_dim, self.LR_C,
                         self.SETTING, self.S, self.S_, self.R,
                         self.actor.action, self.actor.action_)
    self.actor.add_grad_to_graph(self.critic.a_grads)

    self.memory = Memory(self.CAPACITY, s_dim * 2 + a_dim + 1, self.BATCH_SIZE)
    # self.memory_g = Memory(self.CAPACITY, s_dim * 2 + a_dim + 1, self.BATCH_SIZE_g)

    self.sess.run(tf.global_variables_initializer())
    tf.summary.FileWriter('logs/', self.sess.graph)

def __init__(self, gamma, s, a, learningRate=1e-3, criticpath=None, actorpath=None):
    self.gamma = gamma
    self.actor = Actor(state=s, actions=a, hidden1=180, hidden2=87)
    self.critic = Critic(state=s, actions=a, hidden1=250, hidden2=100)
    if criticpath is not None:
        self.critic.load_state_dict(torch.load(criticpath))
    if actorpath is not None:
        self.actor.load_state_dict(torch.load(actorpath))
    self.actorOptimizer = optim.Adam(self.actor.parameters(), learningRate)
    self.criticOptimizer = optim.Adam(self.critic.parameters(), learningRate)
    # more a dimensionality thing
    self.state = s
    self.action = a
    self.count = 0

def __init__(self, state_size, action_size, num_agents): """ Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents in the environment """ random_seed = 1 self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.num_agents = num_agents # Replay memory self.memory = ReplayBuf(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) # Noise process self.noise = Ornstein_Uhlenbeck_Noise(action_size, random_seed) # Critic Networks self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Actor Networks self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
def __init__(self):
    self.sess = tf.Session()
    self.critic1 = Critic()
    self.tfs = tf.placeholder(tf.float32, [None, configs.S_DIM], 'state')

    pi, pi_params = self._build_anet('pi', trainable=True)
    oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)

    with tf.variable_scope('sample_action'):
        self.sample_op = tf.squeeze(pi.sample(1), axis=0)  # choosing action
    with tf.variable_scope('update_oldpi'):
        self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]

    self.tfa = tf.placeholder(tf.float32, [None, configs.A_DIM], 'action')
    self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')

    with tf.variable_scope('loss'):
        with tf.variable_scope('surrogate'):
            # ratio = pi.prob(self.tfa) / (oldpi.prob(self.tfa) + 1e-5)
            ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
            surr = ratio * self.tfadv
        # IMPORTANT !!!
        self.aloss = -tf.reduce_mean(
            tf.minimum(surr,
                       tf.clip_by_value(ratio,
                                        1. - configs.epsilon,
                                        1. + configs.epsilon) * self.tfadv))

    with tf.variable_scope('atrain'):
        self.atrain_op = tf.train.AdamOptimizer(configs.A_LR).minimize(self.aloss)

    self.sess.run(tf.global_variables_initializer())

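# A standalone NumPy check of the clipped surrogate objective built above (illustrative only, not
# part of the original class): for a positive advantage the effective ratio is capped at 1 + epsilon,
# for a negative advantage at 1 - epsilon, which is exactly what the tf.minimum/clip_by_value pair does.
import numpy as np

def clipped_surrogate_loss(ratio, advantage, epsilon=0.2):
    """Negative mean of min(ratio * A, clip(ratio, 1 - eps, 1 + eps) * A)."""
    surr = ratio * advantage
    clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantage
    return -np.mean(np.minimum(surr, clipped))
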
def __init__(self, n_features, action_bounds):
    self.n_features = n_features
    self.action_bounds = action_bounds

    self.eval_actor_net = Actor(n_features, action_bounds)
    self.load_weights(self.eval_actor_net)
    self.eval_actor_net.train()
    self.target_actor_net = Actor(n_features, action_bounds)
    self.target_actor_net.eval()

    self.eval_critic_net = Critic(n_features, action_bounds)
    self.load_weights(self.eval_critic_net)
    self.eval_critic_net.train()
    self.target_critic_net = Critic(n_features, action_bounds)
    self.target_critic_net.eval()

    self.memory = Memory(Config.MEMORY_CAPACITY)
    self.batch_size = Config.BATCH_SIZE
    self.tau = Config.REPLACEMENT_SOFT_TAU

    # we need a good teacher, so the teacher should learn faster than the actor
    self.optimizer_actor = torch.optim.Adam(self.eval_actor_net.parameters(),
                                            Config.LR_ACTOR, (0.9, 0.99))
    self.optimizer_critic = torch.optim.Adam(self.eval_critic_net.parameters(),
                                             Config.LR_CRITIC, (0.9, 0.99))

    self.gamma = Config.REWARD_DECAY

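# As with the Keras soft_update sketch earlier, the tau stored above implies a Polyak soft update of
# the target networks somewhere in this agent. A minimal PyTorch sketch of that step (the helper
# name soft_update is hypothetical, not a method of the original class):
import torch

def soft_update(eval_net, target_net, tau):
    """target <- tau * eval + (1 - tau) * target, parameter by parameter."""
    with torch.no_grad():
        for t_param, e_param in zip(target_net.parameters(), eval_net.parameters()):
            t_param.data.copy_(tau * e_param.data + (1.0 - tau) * t_param.data)
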
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0.0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 1000000
    self.batch_size = 128
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.001   # soft-update rate for target parameters; kept low so targets change slowly and training stays stable

    # Score tracker
    self.score = 0.
    self.best_score = -np.inf

GAMMA = 0.9   # reward discount in TD error
LR_A = 0.001  # learning rate for actor
LR_C = 0.01   # learning rate for critic

env = gym.make('CartPole-v0')
env.seed(1)   # reproducible
env = env.unwrapped

n_features = env.observation_space.shape[0]
n_actions = env.action_space.n

sess = tf.Session()
actor = Actor(sess, n_features=n_features, n_actions=n_actions, lr=LR_A)
critic = Critic(sess, n_features=n_features, gamma=GAMMA, lr=LR_C)
sess.run(tf.global_variables_initializer())

if OUTPUT_GRAPH:
    tf.summary.FileWriter("logs/", sess.graph)

for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    track_r = []
    while True:
        if RENDER:
            env.render()
        a = actor.choose_action(s)