owned_squares, current_states = window.get_windows(game_map.contents, myID)
moves = proto.get_action(current_states)
moves = moves.numpy().tolist()
moves = [Move(square, move) for square, move in zip(owned_squares, moves)]
hlt.send_frame(moves)
game_map.get_frame()
new_states = window.get_windows_for_squares(game_map.contents, owned_squares)
# rewards = [reward.reward(s) for s in new_states]
rewards = [
    reward.reward2(current_states[i], new_states[i])
    for i in range(len(owned_squares))
]
# logging.debug(rewards)
tuples = zip(current_states, moves, rewards, new_states)
r.add_tuples(tuples)
if len(r) >= BATCH_SIZE:
    proto.train2(r.get_batch(BATCH_SIZE))
    EPSILON *= 0.99
class DDPG(nn.Module):
    def __init__(
        self,
        state_dim,
        action_dim,
        learning_rate_a=1e-3,
        learning_rate_c=1e-3,
        gamma=0.99,
        update_tau=1e-3,
        batch_size=100,
        buffer_size=10000,
        training_start=1000,
    ):
        super(DDPG, self).__init__()
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.lr_a = learning_rate_a
        self.lr_c = learning_rate_c
        self.gamma = gamma
        self.update_tau = update_tau
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.training_start = training_start
        self.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.actor = Actor(input_dim=self.s_dim,
                           output_dim=self.a_dim,
                           update_tau=self.update_tau).to(self.device)
        self.critic = Critic(state_dim=self.s_dim,
                             action_dim=self.a_dim,
                             update_tau=self.update_tau).to(self.device)
        self.buffer = ReplayBuffer(buffer_size=self.buffer_size)
        self.loss_actor = 0
        self.loss_critic = 0
        self.optimizer_a = optim.Adam(self.actor.eval_net.parameters(),
                                      lr=self.lr_a)
        self.optimizer_c = optim.Adam(self.critic.parameters(), lr=self.lr_c)

    def choose_action(self, s):
        s = torch.Tensor(s).to(self.device)
        return self.actor.get_eval(s).to(
            torch.device('cpu')).detach().numpy().tolist()

    def perceive(self, state, action, reward, state_, done):
        self.buffer.add(state, action, reward, state_, done)
        if self.training_start < self.buffer.count():
            self.Train()

    def get_critic_loss(self, reward, state_next, state, action, done):
        action_next = self.actor.get_target(state_next)
        q_next_tar = self.critic.get_target(s=state_next, a=action_next)
        Q_target = reward + self.gamma * q_next_tar * (1 - done)
        Q_eval = self.critic.get_eval(s=state, a=action)
        return F.mse_loss(Q_target, Q_eval)

    def Train(self):
        minibatch = self.buffer.get_batch(batch_size=self.batch_size)
        state_batch = torch.Tensor([data[0] for data in minibatch]).to(self.device)
        action_batch = torch.Tensor([data[1] for data in minibatch]).to(self.device)
        reward_batch = torch.Tensor([data[2] for data in minibatch]).to(self.device)
        state_next_batch = torch.Tensor([data[3] for data in minibatch]).to(self.device)
        done_batch = torch.Tensor([data[4] for data in minibatch]).to(self.device)

        # train critic
        self.loss_critic = self.get_critic_loss(reward_batch, state_next_batch,
                                                state_batch, action_batch,
                                                done_batch)
        self.optimizer_c.zero_grad()
        self.loss_critic.backward()
        self.optimizer_c.step()

        # train actor: evaluate the critic at the actor's own actions so the
        # policy gradient actually reaches the actor parameters
        self.loss_actor = -self.critic.get_eval(
            state_batch, self.actor.get_eval(state_batch)).mean()
        self.optimizer_a.zero_grad()
        self.loss_actor.backward()
        self.optimizer_a.step()

        # update the target nets
        self.actor.soft_update()
        self.critic.soft_update()
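# The ReplayBuffer used throughout these snippets is never defined here.
# Below is a minimal sketch (an assumption, not the original implementation)
# matching the add / count / get_batch interface the agents rely on, where
# get_batch returns (state, action, reward, next_state, done) tuples.
import random
from collections import deque

class ReplayBuffer(object):
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        # Oldest experiences are dropped automatically once the deque is full.
        self.buffer.append((state, action, reward, next_state, done))

    def count(self):
        return len(self.buffer)

    def get_batch(self, batch_size):
        # Uniform random sample without replacement.
        return random.sample(self.buffer, batch_size)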
class DDPG: """docstring for DDPG""" def __init__(self, state_space, action_dim): self.name = 'DDPG' # name for uploading results self.sess = tf.Session() # Randomly initialize actor network and critic network # with both their target networks self.state_space = state_space self.action_dim = action_dim # 1 self.ac_network = ActorCriticNetwork(self.sess, self.state_space, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def train(self): # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Get Q target label # maxQ(s',a') q_value_batch = self.ac_network.target_q(next_state_batch) # Calculate target maxQ(s,a): y = reward + GAMMA * maxQ(s',a') y_batch = [] batch_size = len(minibatch) for i in range(batch_size): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [batch_size, 1]) # Update eval critic network by minimizing the loss L cost = self.ac_network.train_critic(y_batch, state_batch, action_batch) print('step_%d critic cost:' % self.ac_network.time_step, cost) # Update eval actor policy using the sampled gradient: self.ac_network.train_actor(state_batch) # Update the target networks self.ac_network.update_target() def noise_action(self, state): # Select action a_t according to the current policy and exploration noise action = self.ac_network.actions(state) return action[0] + self.exploration_noise.noise() def action(self, state): action = self.ac_network.actions([state]) return action[0] def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset() def sparse_tensor(self, state_batch, state_space): row = len(state_batch) indices = [] for r in range(row): indices += [(r, c) for c in state_batch[r]] values = [1.0 for i in range(len(indices))] return tf.SparseTensorValue(indices=indices, values=values, dense_shape=[row, state_space])
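# OUNoise is referenced by nearly every agent above and below but not shown.
# A minimal numpy sketch of the Ornstein-Uhlenbeck process with the common
# (action_dim, mu, theta, sigma) parameterization and the noise()/reset()
# interface used here; the exact constants and signature in the original
# implementations are assumptions (one snippet below even calls noise(action)
# with a different signature).
import numpy as np

class OUNoise(object):
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.3):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state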
class RDPG:
    """docstring for RDPG"""

    def __init__(self, env):
        self.name = 'RDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.sess = tf.InteractiveSession()
        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
        self.saver = tf.train.Saver()

    def train(self):
        # Sample a random minibatch of N sequences from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        # Construct histories
        observations = []
        next_observations = []
        actions = []
        rewards = []
        dones = []
        for each in minibatch:
            for i in range(1, len(each.observations)):
                observations.append(self.pad(each.observations[0:i]))
                next_observations.append(self.pad(each.observations[1:i + 1]))
                actions.append(each.actions[0:i - 1])
                rewards.append(each.rewards[0:i])
                if i == len(each.observations) - 1:
                    dones.append(True)
                else:
                    dones.append(False)
        # Calculate y_batch
        next_action_batch = self.actor_network.target_action(observations)
        q_value_batch = self.critic_network.target_q(
            next_observations,
            [self.pad(i + j) for (i, j) in zip(actions, next_action_batch)])
        y_batch = []
        for i in range(len(observations)):
            if dones[i]:
                y_batch.append(rewards[i][-1])
            else:
                y_batch.append(rewards[i][-1] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [len(observations), 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, observations,
                                  [self.pad(i) for i in actions])
        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(observations)
        q_gradient_batch = self.critic_network.gradients(
            observations, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, observations)
        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def save_model(self, path, episode):
        self.saver.save(self.sess, path + "model.ckpt", episode)

    def noise_action(self, history):
        # Select action a_t according to a sequence of observations and actions
        action = self.actor_network.action(history)
        return action + self.exploration_noise.noise()

    def action(self, history):
        action = self.actor_network.action(history)
        return action

    def perceive(self, history, done=False):
        # Store the history sequence in the replay buffer
        self.replay_buffer.add(history)
        # Store histories up to the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()
        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def pad(self, input):
        dim = len(input[0])
        return input + [[0] * dim] * (1000 - len(input))
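# Unlike the other agents, RDPG stores whole episode histories rather than
# single transitions. A rough sketch of how such a history object could be
# collected and handed to perceive; the SimpleNamespace container, the
# gym-style env, and passing the observation list to noise_action are all
# assumptions, since the history type is not shown in the snippet.
from types import SimpleNamespace

def collect_episode(env, agent, max_steps=1000):
    # History container exposing the observations/actions/rewards lists
    # that train() above iterates over.
    history = SimpleNamespace(observations=[], actions=[], rewards=[])
    obs = env.reset()
    history.observations.append(obs)
    for _ in range(max_steps):
        action = agent.noise_action(history.observations)
        obs, reward, done, _ = env.step(action)
        history.observations.append(obs)
        history.actions.append(action)
        history.rewards.append(reward)
        if done:
            break
    agent.perceive(history, done=True)
    return history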
def main(args): if VERBOSE: print '***The Replay Buffer currently always returns the most recent experiences (instead of random), so the batches are constant between the tf and torch nets.' state_dim = 3 action_dim = 1 net = ActorCriticNet(state_dim, action_dim) target_net = copy.deepcopy(net) memory = ReplayBuffer(REPLAY_BUFFER_SIZE) noise = OUNoise(action_dim) criterion = nn.MSELoss() optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, weight_decay=L2) target_optim = optim.Optimizer(target_net.parameters(), {}) # to iterate over target params if VERBOSE: print '***Making gym env (only used to setup TF net).' # load tf net (restoring saved parameters) dtf = ddpg_tf.DDPG_TF(filter_env.makeFilteredEnv(gym.make('Pendulum-v0')), loadfilename='tf_params-0', printVars=False) if VERBOSE: print '***TF net restore complete.' # load control data (only using a every fourth data), and tf net results control_states = np.load('control_states.npy')[::4] control_rewards = np.load('control_rewards.npy')[::4] tf_record = np.load('tf_control_record.npy') # replace torch params with tf params, and run control data, collecting torch net results # first optimization step will occur at i == 50, upon which extra data is recorded to compare tf and torch # using: no bn, REPLAY_BUFFER_SIZE=200, REPLAY_START_SIZE=50, BATCH_SIZE=50, constant replay_buffer_batches (always the most recent experiences) replaceNetParams(dtf, net, target_net) if VERBOSE: print '***Torch net params initialized to TF net params.' original_net = copy.deepcopy(net) # save original net original_target_net = copy.deepcopy(target_net) torch_record = [] loss = -1 first_step = True for i in xrange(len(control_rewards) - 1): state = torch.from_numpy(control_states[i].reshape(1, state_dim)).float() action = net.getAction(Variable(state)).data target_action = target_net.getAction(Variable(state)).data reward = torch.FloatTensor([[control_rewards[i]]]).float() new_state = torch.from_numpy(control_states[i + 1].reshape( 1, state_dim)).float() memory.add(state, action, reward, new_state, True) if memory.count() > REPLAY_START_SIZE: minibatch = memory.get_batch(BATCH_SIZE) state_batch = torch.cat([data[0] for data in minibatch], dim=0) action_batch = torch.cat([data[1] for data in minibatch], dim=0) reward_batch = torch.cat([data[2] for data in minibatch]) next_state_batch = torch.cat([data[3] for data in minibatch], dim=0) done_batch = Tensor([data[4] for data in minibatch]) # calculate y_batch from targets #next_action_batch = target_net.getAction(Variable(next_state_batch)) value_batch = target_net.getValue(Variable(next_state_batch)).data y_batch = reward_batch + GAMMA * value_batch * done_batch if first_step: if VERBOSE: print '***First Optimization Step complete.' 
torch_ys = y_batch torch_batch = minibatch torch_outs = net.getValue(Variable(state_batch)).data # optimize net 1 step loss = criterion(net.getValue(Variable(state_batch)), Variable(y_batch)) optimizer.zero_grad() loss.backward() optimizer.step() loss = loss.data[0] # update targets - using exponential moving averages for group, target_group in zip(optimizer.param_groups, target_optim.param_groups): for param, target_param in zip(group['params'], target_group['params']): target_param.data.mul_(1 - TAU) target_param.data.add_(TAU, param.data) if first_step: first_step_net = copy.deepcopy(net) first_step_target_net = copy.deepcopy(target_net) first_step = False torch_record.append( [action.numpy()[0][0], target_action.numpy()[0][0], loss]) loss = -1 torch_record = np.array(torch_record) torch_outs = torch_outs.numpy().T[0] torch_ys = torch_ys.numpy().T[0] if VERBOSE: print '***Control Data run complete.' # compare torch and tf results # results for each net have 3 columns: [net action prediction, target net action prediction, loss (-1 if there was no training)] sel = np.arange(45, 55) #print calc_error(tf_record[sel,:], torch_record[sel,:]) print 'Result comparison:' print 'control_data_index | tf_net_action | tf_target_net_action | tf_loss | torch_net_action | torch_target_net_action | torch_loss' print np.hstack( [sel[:, np.newaxis], tf_record[sel, :], torch_record[sel, :]]) print '\t(a loss of -1 means no training occured in that step)' # load all tf results from before taking first optimization step tf_ys = np.load('tf_first_step_y_batch.npy') tf_rs = np.load('tf_first_step_reward_batch.npy') tf_ds = np.load('tf_first_step_done_batch.npy') tf_vs = np.load('tf_first_step_value_batch.npy') tf_outs = np.load('tf_first_step_output_values.npy') torch_wd = 1.36607 # weight decay loss of tf net at first optimization step - recorded directly from terminal output of tf net if VERBOSE: print '***Comparing first step stats' # compare tf and torch data from before taking first optimization step # including calculation of manual loss print '\terror in ys (between tf and torch)', calc_error( torch_ys, tf_ys) print '\terror in predictions (between tf and torch)', calc_error( torch_outs, tf_outs) print '\ttorch loss (manually calculated)', np.mean( (torch_ys - torch_outs)**2) print '\ttf loss (manually calculated)', np.mean((tf_ys - tf_outs)**2) print '\ttorch loss', torch_record[50, 2], '(not including weight decay)' print '\ttf loss', tf_record[ 50, 2] - torch_wd, '(not including weight decay)' return 0
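# calc_error is used above to compare the TF and Torch results but is not
# shown. A plausible minimal helper (an assumption about its definition) that
# reports the mean absolute difference between two arrays:
import numpy as np

def calc_error(a, b):
    """Element-wise comparison helper; the original definition is not shown,
    so this mean-absolute-difference version is only an assumption."""
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)
    return np.mean(np.abs(a - b))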
class DDPG:
    """docstring for DDPG"""

    def __init__(self, env):
        mx.random.seed(seed)
        np.random.seed(seed)
        self.env = env
        if flg_gpu:
            self.ctx = mx.gpu(0)
        else:
            self.ctx = mx.cpu()
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.ddpgnet = DDPGNet(self.state_dim, self.action_dim)
        self.exploration_noise = OUNoise(self.action_dim)
        self.replay_buffer = ReplayBuffer(memory_size)
        self.batch_size = batch_size
        self.ddpgnet.init()
        self.train_step = 0

    def train(self):
        # print "train step", self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(self.batch_size)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch,
                                 [self.batch_size, self.action_dim])

        # Calculate y_batch
        next_qvals = self.ddpgnet.get_target_q(next_state_batch).asnumpy()
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * next_qvals[i][0])
        y_batch = np.resize(y_batch, [self.batch_size, 1])

        # Update critic by minimizing the loss L
        self.ddpgnet.update_critic(state_batch, action_batch, y_batch)

        # Update actor by maximizing Q
        self.ddpgnet.update_actor(state_batch)

        self.train_step += 1

        # Update target networks
        self.ddpgnet.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        state = np.reshape(state, (1, self.state_dim))
        action = self.ddpgnet.get_step_action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        state = np.reshape(state, (1, self.state_dim))
        action = self.ddpgnet.get_step_action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > memory_start_size:
            self.train()

        # if self.time_step % 10000 == 0:
        #     self.actor_network.save_network(self.time_step)
        #     self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
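# None of these snippets show the outer episode loop that drives the
# noise_action / perceive interface. A minimal gym-style sketch; the
# environment name and the episode/step counts are placeholders, not taken
# from the source.
import gym

EPISODES = 1000
MAX_STEPS = 200

env = gym.make('Pendulum-v0')   # placeholder environment
agent = DDPG(env)

for episode in range(EPISODES):
    state = env.reset()
    for step in range(MAX_STEPS):
        action = agent.noise_action(state)            # exploratory action
        next_state, reward, done, _ = env.step(action)
        agent.perceive(state, action, reward, next_state, done)
        state = next_state
        if done:
            break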
class DDPG: """docstring for DDPG""" def __init__(self, state_dim, action_dim): """name for uploading resuults""" self.name = 'DDPG' self.time_step = 0 # self.atten_rate = 1 """Randomly initialize actor network and critic network""" """and both their target networks""" self.state_dim = state_dim self.action_dim = action_dim self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) """initialize replay buffer""" self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) """Initialize a random process the Ornstein-Uhlenbeck process for action exploration""" self.exploration_noise = OUNoise(self.action_dim) """Initialize a Treading""" self.threading = threading.Thread(target=self.train, name='LoopThread--DDPG') def train(self): # if self.time_step ==0: # print("Begins Training!!!") #print("Training Begins") self.time_step += 1 """Sample a random minibatch of N transitions from replay buffer""" """take out BATCH_SIZE sets of data""" minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) """resize the action_batch shape to [BATCH_SIZE, self.action_dim]""" action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) """Calculate y_batch(reward)""" next_action_batch = self.actor_network.target_action(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) """Update critic by minimizing the loss L (training)""" self.critic_network.train(y_batch, state_batch, action_batch) """Update the actor policy using the sampled gradient:""" action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) """Update the target networks""" self.actor_network.update_target() self.critic_network.update_target() #print("Training Finished") def noise_action(self, state): """Select action a_t according to the current policy and exploration noise""" action = self.actor_network.action(state) exp_noise = self.exploration_noise.noise() action += exp_noise # action[0] = np.clip(action[0], 0, 1) # action[1] = np.clip(action[1], -1, 1) return action def action(self, state): action = self.actor_network.action(state) # action[0] = np.clip(action[0], 0, 1) # action[1] = np.clip(action[1], -1, 1) return action def perceive(self, state, action, reward, next_state, done): """Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer""" self.replay_buffer.add(state, action, reward, next_state, done) """Store transitions to replay start size then start training""" # if self.replay_buffer.count() % 1000 == 0: # print("The buffer count is ", self.replay_buffer.count()) if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() # self.atten_rate *= 0.99995 if not self.threading.is_alive(): self.threading = threading.Thread(target=self.train, name='LoopThread--DDPG') self.threading.start() """SAVE NETWORK""" if 
self.time_step % 100 == 0: print("Training_time_step:", self.time_step) if self.time_step % 1000 == 0: print("!!!!!!!save model success!!!!!!!!") self.actor_network.save_network(self.time_step) self.critic_network.save_network(self.time_step) """Re-iniitialize the random process when an episode ends""" if done: self.exploration_noise.reset()
class DDPG: """docstring for DDPG""" def __init__(self, sess, data_fname): self.name = 'DDPG' # Randomly initialize actor network and critic network # with both their target networks self.name = 'DDPG' # name for uploading results # Randomly initialize actor network and critic network # with both their target networks self.state_dim = Hp.state_dim self.action_dim = Hp.action_dim print(self.state_dim, self.action_dim) self.sess = sess self.state_input = [ tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord)) for _ in xrange(Hp.categories) ] #tf.placeholder("float",[None,self.state_dim]) self.target_state_input = [ tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord)) for _ in xrange(Hp.categories) ] #tf.placeholder("float",[None,self.state_dim]) self.state_network = StateEnc(self.sess, self.state_input, self.target_state_input) state_batch = self.state_network.encoding next_state_batch = self.state_network.target_encoding weights, biases, w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2 = self.state_network.get_parameters( ) state_network_params = weights + biases + [ w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2 ] self.actor_network = ActorNetwork(self.sess, Hp.n_hidden, self.action_dim, self.state_input, state_batch, next_state_batch, state_network_params) self.critic_network = CriticNetwork(self.sess, Hp.n_hidden, self.action_dim, state_batch, next_state_batch) # initialize replay buffer self.replay_buffer = ReplayBuffer(Hp.REPLAY_BUFFER_SIZE, data_fname) self.summary_str2 = None # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatches = self.replay_buffer.get_batch(Hp.batch_size * Hp.N_TRAIN) print("######### TRAINING #############") for k in range(Hp.N_TRAIN): minibatch = minibatches[k * Hp.batch_size:(k + 1) * Hp.batch_size] state_batch_r = np.asarray([data[0] for data in minibatch]) state_batch = [] for j in range(Hp.categories): new_cat = np.stack(state_batch_r[:, j], axis=0) state_batch.append(new_cat) #state_batch = [np.expand_dims(state_batch, axis=1)] action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch_r = np.asarray([data[3] for data in minibatch]) next_state_batch = [] for j in range(Hp.categories): new_cat = np.stack(next_state_batch_r[:, j], axis=0) next_state_batch.append(new_cat) #next_state_batch = [np.expand_dims(next_state_batch, axis=1)] done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [Hp.batch_size, self.action_dim]) next_action_batch = self.actor_network.target_actions( self.target_state_input, next_state_batch) q_value_batch = self.critic_network.target_q( self.target_state_input, next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + Hp.GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [Hp.batch_size, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, self.state_input, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions( self.state_input, state_batch) q_gradient_batch = self.critic_network.gradients( self.state_input, state_batch, 
action_batch_for_gradients) self.summary_str2 = self.actor_network.train( q_gradient_batch, self.state_input, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() self.state_network.update_target() def noise_action(self, state): # Select action a_t according to the current policy and exploration noise state = [np.expand_dims(el, axis=0) for el in state] action = self.actor_network.action(state) print("no noise ", action) return np.clip( action + self.exploration_noise.noise() * np.array([-17.0, 17.0, 900.0]), [-35.0, 0.0, 0.0], [0.0, 35.0, 2000.0]) def action(self, state): state = [np.expand_dims(el, axis=0) for el in state] action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # Store transitions to replay start size then start training if self.replay_buffer.count() > Hp.REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
]
hlt.send_frame(moves)
game_map.get_frame()
new_targets = window.get_targets(game_map, owned_squares, directions)
done = [int(t.owner == id) for t in new_targets]
new_states = window.prepare_for_input(game_map, new_targets, myID)
rewards = reward.reward(owned_squares, old_targets, new_targets, myID)
# logging.debug(rewards)
for i in range(len(owned_squares)):
    r.add(old_states[i], directions[i], rewards[i], new_states[i], done[i])
if len(r) >= BATCH_SIZE:
    batch = r.get_batch(BATCH_SIZE)
    loss, rewar = model.train(batch)
    writer.save_progress(tm.content["timesteps"], loss, rewar)
# if (timestep % 10 == 0):
#     logging.debug(model.trainable_variables[0])
tm.content["timesteps"] += 1
class DDPG: def __init__(self, env, state_dim, action_dim): self.name = 'DDPG' self.environment = env self.time_step = 0 self.state_dim = state_dim self.action_dim = action_dim self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.linear_noise = OUNoise(1, 0.5, 0.3, 0.6) self.angular_noise = OUNoise(1, 0, 0.6, 0.8) def train(self): minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self, state, epsilon): action = self.actor_network.action(state) noise_t = np.zeros(self.action_dim) noise_t[0] = epsilon * self.linear_noise.noise() noise_t[1] = epsilon * self.angular_noise.noise() action = action + noise_t a_linear = np.clip(action[0], 0, 1) a_linear = round(a_linear, 1) a_angular = np.clip(action[1], -1, 1) a_angular = round(a_angular, 1) #print(a_linear, a_angular) return [a_linear, a_angular] def action(self, state): action = self.actor_network.action(state) a_linear = np.clip(action[0], 0, 1) a_linear = round(a_linear, 1) a_angular = np.clip(action[1], -1, 1) a_angular = round(a_angular, 1) return [a_linear, a_angular] def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) if self.replay_buffer.count() == REPLAY_START_SIZE: print('\n---------------Start training---------------') # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.time_step += 1 self.train() if self.time_step % 10000 == 0 and self.time_step > 0: self.actor_network.save_network(self.time_step) self.critic_network.save_network(self.time_step) if done: self.linear_noise.reset() self.angular_noise.reset() return self.time_step
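# The agent above expects an externally supplied epsilon that scales its OU
# noise, but the schedule itself is not shown. A simple linear-decay sketch;
# the decay constants, bounds, and the gym-style env are placeholders, not
# from the source.
def run_episode(env, agent, epsilon, epsilon_min=0.1, epsilon_decay=2e-5, max_steps=500):
    # Drive one episode, decaying epsilon a little every environment step.
    state = env.reset()
    for _ in range(max_steps):
        a_linear, a_angular = agent.noise_action(state, epsilon)
        next_state, rew, done, _ = env.step([a_linear, a_angular])
        agent.perceive(state, [a_linear, a_angular], rew, next_state, done)
        state = next_state
        epsilon = max(epsilon_min, epsilon - epsilon_decay)
        if done:
            break
    return epsilon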
class DDQN: def __init__(self, model_name, action_dim): self.device = configure.DEVICE self.model_name = model_name self.action_dim = action_dim self.episode = 0 # self.timeStep = 0 self.STARTtrain = False self.epsilon = INITIAL_EPSILON self.img_width = configure.IMAGE_WIDTH self.img_height = configure.IMAGE_HEIGHT self.img_channels = configure.STACKED_FRAMES * 4 self.learning_rate = configure.LEARNING_RATE_START self.tau = configure.TargetNet_Tau self.replaybuffer = ReplayBuffer(REPLAY_MEMORY) self.graph = tf.Graph() with self.graph.as_default() as g: with tf.device(self.device): with tf.variable_scope('Main_net'): self.imageIn, self.conv1, self.conv2, self.conv3, self.pool1, self.conv4, \ self.Advantage, self.Value, self.Qout, self.predict \ = self.__create_graph() with tf.variable_scope('Target_net'): self.imageInT, _, _, _, _, _, _, _, self.QoutT, _ = self.__create_graph( ) self.MainNet_vars = get_variables('Main_net') self.TargetNet_vars = get_variables('Target_net') self.createTrainingMethod() self.createupdateTargetNetOp() self.sess = tf.Session( graph=self.graph, config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=False, gpu_options=tf.GPUOptions(allow_growth=True))) self.sess.run(tf.global_variables_initializer()) if configure.TENSORBOARD: self._create_tensor_board() # if configure.LOAD_CHECKPOINT or configure.SAVE_MODELS: # vars = tf.global_variables() # self.saver = tf.train.Saver({var.name: var for var in vars}, max_to_keep=0) self.saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(self.model_name) if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) print "Successfully loaded:", checkpoint.model_checkpoint_path mypath = str(checkpoint.model_checkpoint_path) stepmatch = re.split('-', mypath)[2] self.episode = int(stepmatch) # pass else: print "Could not find old network weights" # def __create_main_graph(self): # self.imageIn = tf.placeholder(tf.float32, [None, self.img_height, self.img_width, self.img_channels], name='imgIn') # # self.conv1 = self.conv2d_layer(self.imageIn, 8, 32, 'conv1', strides=[1, 4, 4, 1]) # self.conv2 = self.conv2d_layer(self.conv1, 4, 64, 'conv2', strides=[1, 2, 2, 1]) # self.conv3 = self.conv2d_layer(self.conv2, 3, 128, 'conv3', strides=[1, 1, 1, 1]) # self.conv4 = self.conv2d_layer(self.conv3, self.conv3.get_shape()[1].value, 512, 'conv4', strides=[1,1,1,1]) # with tf.variable_scope('A_V'): # self.streamAC, self.streamVC = tf.split(self.conv4, 2, 3) # self.streamA = tf.contrib.layers.flatten(self.streamAC) # self.streamV = tf.contrib.layers.flatten(self.streamVC) # # self.AW = tf.Variable(tf.random_normal([self.streamA, self.action_dim]), name='AW') # self.VW = tf.Variable(tf.random_normal([self.streamV, 1]), name='VW') # self.Advantage = tf.matmul(self.streamA, self.AW, name='Advantage') # self.Value = tf.matmul(self.streamV, self.VW, name='Value') # # with tf.variable_scope('Qout'): # self.Qout = self.Value + tf.subtract( # self.Advantage, tf.reduce_mean(self.Advantage, reduction_indices=1, keep_dims=True)) # # with tf.variable_scope('Predict'): # self.predict = tf.argmax(self.Qout, 1) def __create_graph(self): imageIn = tf.placeholder( tf.float32, [None, self.img_height, self.img_width, self.img_channels], name='imgIn') conv1 = self.conv2d_layer(imageIn, 8, 128, 'conv1', strides=[1, 4, 4, 1]) conv2 = self.conv2d_layer(conv1, 4, 128, 'conv2', strides=[1, 2, 2, 1]) conv3 = self.conv2d_layer(conv2, 3, 128, 'conv3', strides=[1, 1, 1, 1]) pool1 = 
self.mpool_layer(conv3, 2, [1, 2, 2, 1], name='pool1') conv4 = self.conv2d_layer(pool1, pool1.get_shape()[1].value, 1024, 'conv4', strides=[1, 1, 1, 1], padding='VALID') streamAC, streamVC = tf.split(conv4, 2, 3) streamA = tf.contrib.layers.flatten(streamAC) streamV = tf.contrib.layers.flatten(streamVC) Advantage = self.fc_layer(streamA, self.action_dim, 'Advantage', func=None) Value = self.fc_layer(streamV, 1, 'Value', func=None) # AW = tf.Variable(tf.random_normal([streamA.get_shape()[1].value, self.action_dim]), name='AW') # VW = tf.Variable(tf.random_normal([streamV.get_shape()[1].value, 1]), name='VW') # Advantage = tf.matmul(streamA, AW, name='Advantage') # Value = tf.matmul(streamV, VW, name='Value') with tf.variable_scope('Qout'): Qout = Value + tf.subtract( Advantage, tf.reduce_mean(Advantage, reduction_indices=1, keep_dims=True)) with tf.variable_scope('Predict'): predict = tf.argmax(Qout, 1) return imageIn, conv1, conv2, conv3, pool1, conv4, Advantage, Value, Qout, predict # def __create_target_graph(self): # self.target_imageIn = tf.placeholder(tf.float32, [None, self.img_height, self.img_width, self.img_channels], # name='imgIn') # self.target_conv1 = self.conv2d_layer(self.target_imageIn, 8, 32, 'conv1', strides=[1, 4, 4, 1]) # self.target_conv2 = self.conv2d_layer(self.target_conv1, 4, 64, 'conv2', strides=[1, 2, 2, 1]) # self.target_conv3 = self.conv2d_layer(self.target_conv2, 3, 128, 'conv3', strides=[1, 1, 1, 1]) # self.target_conv4 = self.conv2d_layer(self.target_conv3, self.target_conv3.get_shape()[1].value, 512, 'conv4', strides=[1, 1, 1, 1]) # with tf.variable_scope('A_V'): # self.target_streamAC, self.target_streamVC = tf.split(self.target_conv4, 2, 3) # self.target_streamA = tf.contrib.layers.flatten(self.target_streamAC) # self.target_streamV = tf.contrib.layers.flatten(self.target_streamVC) # # self.target_AW = tf.Variable(tf.random_normal([self.target_streamA, self.action_dim]), name='AW') # self.target_VW = tf.Variable(tf.random_normal([self.target_streamV, 1]), name='VW') # self.target_Advantage = tf.matmul(self.target_streamA, self.target_AW, name='Advantage') # self.target_Value = tf.matmul(self.target_streamV, self.target_VW, name='Value') # # with tf.variable_scope('Qout'): # self.Qout = self.target_Value + tf.subtract( # self.target_Advantage, tf.reduce_mean(self.target_Advantage, reduction_indices=1, keep_dims=True)) def createTrainingMethod(self): self.global_step = tf.Variable(0, trainable=False, name='step') self.var_learning_rate = tf.placeholder(tf.float32, name='lr', shape=[]) self.targetQ = tf.placeholder(shape=[None], dtype=tf.float32, name='targetQ') self.actions = tf.placeholder(shape=[None], dtype=tf.int32, name='actions') self.actions_onehot = tf.one_hot(self.actions, self.action_dim, dtype=tf.float32, name='act_onehot') self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), reduction_indices=1, name='Q') self.td_error = tf.square(self.targetQ - self.Q, name='td_error') self.loss = tf.reduce_mean(self.td_error, name='loss') self.trainer = tf.train.AdamOptimizer( learning_rate=self.var_learning_rate) self.train_op = self.trainer.minimize(self.loss, global_step=self.global_step, name='train_update') def createupdateTargetNetOp(self): self.assign_op = {} for from_, to_ in zip(self.MainNet_vars, self.TargetNet_vars): self.assign_op[to_.name] = to_.assign(self.tau * from_ + (1 - self.tau) * to_) def updateTargetNet(self): for var in self.TargetNet_vars: self.sess.run(self.assign_op[var.name]) def conv2d_layer(self, input, filter_size, 
out_dim, name, strides, func=tf.nn.relu, padding='SAME'): in_dim = input.get_shape()[-1].value # in_dim = input.get_shape()[-1].value d = 1.0 / np.sqrt(filter_size * filter_size * in_dim) with tf.variable_scope(name): w_init = tf.random_uniform_initializer(-d, d) b_init = tf.random_uniform_initializer(-d, d) w = tf.get_variable( 'w', shape=[filter_size, filter_size, in_dim, out_dim], dtype=tf.float32, initializer=w_init) b = tf.get_variable('b', shape=[out_dim], initializer=b_init) output = tf.nn.conv2d(input, w, strides=strides, padding=padding) + b if func is not None: output = func(output) return output def mpool_layer(self, input_op, mpool_size, strides, name): with tf.variable_scope(name): output = tf.nn.max_pool(input_op, ksize=[1, mpool_size, mpool_size, 1], strides=strides, padding="SAME") return output def fc_layer(self, input, out_dim, name, func=tf.nn.relu): in_dim = input.get_shape()[-1].value d = 1.0 / np.sqrt(in_dim) with tf.variable_scope(name): w_init = tf.random_uniform_initializer(-d, d) b_init = tf.random_uniform_initializer(-d, d) w = tf.get_variable('w', dtype=tf.float32, shape=[in_dim, out_dim], initializer=w_init) b = tf.get_variable('b', dtype=tf.float32, shape=[out_dim], initializer=b_init) output = tf.matmul(input, w) + b if func is not None: output = func(output) return output def _create_tensor_board(self): summaries = tf.get_collection(tf.GraphKeys.SUMMARIES) summaries.append(tf.summary.scalar("Loss", self.loss)) for var in tf.trainable_variables(): summaries.append(tf.summary.histogram("W_%s" % var.name, var)) summaries.append(tf.summary.histogram("conv1", self.conv1)) summaries.append(tf.summary.histogram("conv2", self.conv2)) summaries.append(tf.summary.histogram("conv3", self.conv3)) summaries.append(tf.summary.histogram("pool1", self.pool1)) summaries.append(tf.summary.histogram("conv4", self.conv4)) summaries.append(tf.summary.histogram("Advantage", self.Advantage)) summaries.append(tf.summary.histogram("Value", self.Value)) summaries.append(tf.summary.histogram("Qout", self.Qout)) summaries.append(tf.summary.histogram("Q", self.Q)) self.summary_op = tf.summary.merge(summaries) self.log_writer = tf.summary.FileWriter("logs/%s" % self.model_name, self.sess.graph) def log(self, y_batch, action_batch, state_batch): feed_dict = { self.targetQ: y_batch, self.actions: action_batch, self.imageIn: state_batch, self.var_learning_rate: self.learning_rate } step, summary = self.sess.run([self.global_step, self.summary_op], feed_dict=feed_dict) self.log_writer.add_summary(summary, step) def trainQNetwork(self): minibatch = self.replaybuffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) action_batch = np.resize(action_batch, [BATCH_SIZE]) A = self.sess.run(self.predict, feed_dict={self.imageIn: next_state_batch}) Q = self.sess.run(self.QoutT, feed_dict={self.imageInT: next_state_batch}) doubleQ = Q[range(BATCH_SIZE), A] targetQ = [] for i in range(len(minibatch)): if done_batch[i]: targetQ.append(reward_batch[i]) else: targetQ.append(reward_batch[i] + GAMMA * doubleQ[i]) # targetQ = np.resize(targetQ, [BATCH_SIZE, 1]) self.sess.run(self.train_op, feed_dict={ self.imageIn: state_batch, self.targetQ: targetQ, self.actions: action_batch, self.var_learning_rate: self.learning_rate }) 
self.updateTargetNet() if self.episode % configure.SAVE_NET == 0 and self.episode != 0: self.saver.save(self.sess, self.model_name + '/network' + '-dqn', global_step=self.episode) if configure.TENSORBOARD and self.episode % configure.TENSORBOARD_UPDATE_FREQUENCY == 0 and self.episode != 0: self.log(targetQ, action_batch, state_batch) self.episode += 1 self.STARTtrain = True def setPerception(self, nextObservation, action, reward, terminal): newState = np.concatenate( (self.currentState[:, :, 4:], nextObservation), axis=2) self.replaybuffer.add(self.currentState, action, reward, newState, terminal) # self.replayMemory.append((self.currentState, action, reward, newState, terminal)) if self.episode <= OBSERVE: state = "observe" elif self.episode > OBSERVE and self.episode <= OBSERVE + EXPLORE: state = "explore" else: state = "train" if self.episode % 100 == 0 and self.STARTtrain: print "episode", self.episode , "/ STATE", state, \ "/ EPSILON", self.epsilon self.currentState = newState def Perce_Train(self): if self.replaybuffer.count() > configure.REPLAY_START_SIZE: self.trainQNetwork() def getAction(self): if np.random.rand(1) < self.epsilon: action_get = np.random.randint(0, self.action_dim) else: action_get = self.sess.run( self.predict, feed_dict={self.imageIn: [self.currentState]}) if self.epsilon > FINAL_EPSILON and self.episode > OBSERVE: self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE return action_get def setInitState_rgb(self, observation): self.currentState = observation for i in xrange(configure.STACKED_FRAMES - 1): self.currentState = np.concatenate( (self.currentState, observation), axis=2)
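# trainQNetwork above implements the Double DQN target: the online network
# picks the argmax action for the next state and the target network evaluates
# it. The same computation in isolation, as a small numpy reference sketch:
import numpy as np

def double_dqn_targets(reward_batch, done_batch, q_online_next, q_target_next, gamma):
    """reward_batch, done_batch: shape (B,); q_*_next: shape (B, num_actions)."""
    best_actions = np.argmax(q_online_next, axis=1)                        # selection by online net
    double_q = q_target_next[np.arange(len(best_actions)), best_actions]  # evaluation by target net
    return reward_batch + (1.0 - done_batch.astype(np.float32)) * gamma * double_q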
class DDPG(object): def __init__(self, a_dim, s_dim, a_bound, m_dim, pixel_meter, att_dim): self.time_step = 1 self.memory = ReplayBuffer(MEMORY_CAPACITY) self.exploration_noise = OUNoise(a_dim) self.pointer = 0 self.sess = tf.Session() writer = tf.summary.FileWriter("logs/", self.sess.graph) self.a_dim, self.s_dim, self.a_bound, self.m_dim, self.pixel_meter, self.att_dim = \ a_dim, s_dim, a_bound, m_dim, pixel_meter, att_dim self.S = tf.placeholder(tf.float32, [None, s_dim], 's') self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_') self.R = tf.placeholder(tf.float32, [None, 1], 'r') self.GM = tf.placeholder(tf.float32, [None, m_dim, m_dim, 1], 'gm') self.LM = tf.placeholder(tf.int32, [None, att_dim*2+1, att_dim*2+1, 4], 'lm') self.LM_ = tf.placeholder(tf.int32, [None, att_dim*2+1, att_dim*2+1, 4], 'lm_') self.a = self._build_a(self.S, self.GM, self.LM, ) q = self._build_c(self.S, self.GM, self.LM, self.a, ) a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor') c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Critic') ema = tf.train.ExponentialMovingAverage(decay=1 - TAU) # soft replacement def ema_getter(getter, name, *args, **kwargs): return ema.average(getter(name, *args, **kwargs)) target_update = [ema.apply(a_params), ema.apply(c_params)] # soft update operation a_ = self._build_a(self.S_, self.GM, self.LM_, reuse=True, custom_getter=ema_getter) # replaced target parameters q_ = self._build_c(self.S_, self.GM, self.LM_, a_, reuse=True, custom_getter=ema_getter) a_loss = - tf.reduce_mean(q) # maximize the q self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=a_params) with tf.control_dependencies(target_update): # soft replacement happened at here q_target = self.R + GAMMA * q_ td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q) self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=c_params) self.sess.run(tf.global_variables_initializer()) def noise_action(self, s1, gm1, loc1): locm = np.zeros([1, self.att_dim*2+1, self.att_dim*2+1, 4]) for j in range(self.att_dim * 2 + 1): for k in range(self.att_dim * 2 + 1): locm[0, j, k, :] = np.array([0, loc1[0] - self.att_dim + j, loc1[1] - self.att_dim + k, 0]) return self.sess.run(self.a, {self.S: s1[np.newaxis, :], self.GM: gm1[np.newaxis, :, :, np.newaxis], self.LM: locm})[0] + self.exploration_noise.noise() def action(self, s1, gm1, loc1): locm = np.zeros([1, self.att_dim * 2 + 1, self.att_dim * 2 + 1, 4]) for j in range(self.att_dim * 2 + 1): for k in range(self.att_dim * 2 + 1): locm[0, j, k, :] = np.array([0, loc1[0] - self.att_dim + j, loc1[1] - self.att_dim + k, 0]) return self.sess.run(self.a, {self.S: s1[np.newaxis, :], self.GM: gm1[np.newaxis, :, :, np.newaxis], self.LM: locm})[0] def perceive(self, sd, p, loc, s, a_store, r, s_, loc_, done): self.memory.add(sd, p, loc, s, a_store, r, s_, loc_, done) if self.memory.count() > REPLAY_START: self.learn() if self.time_step % 500000 == 0: self.save_network() def learn(self): self.time_step += 1 replay = self.memory.get_batch(BATCH_SIZE) bm_sd = np.asarray([data[0] for data in replay]) bp = np.asarray([data[1] for data in replay]) bloc = np.asarray([data[2] for data in replay]) bs = np.asarray([data[3] for data in replay]) ba = np.asarray([data[4] for data in replay]) br = np.reshape(np.asarray([data[5] for data in replay]), [-1, 1]) bs_ = np.asarray([data[6] for data in replay]) bloc_ = np.asarray([data[7] for data in replay]) bgm = np.zeros([BATCH_SIZE, self.m_dim, self.m_dim, 1]) for 
batch in range(BATCH_SIZE): sd1 = bm_sd[batch] terrian_map = grid_map(sd1, self.m_dim, self.pixel_meter, bp[batch]) bgm[batch, :, :, 0] = terrian_map.map_matrix blocm = np.zeros([BATCH_SIZE, self.att_dim*2+1, self.att_dim*2+1, 4]) blocm_ = np.zeros([BATCH_SIZE, self.att_dim * 2 + 1, self.att_dim * 2 + 1, 4]) for i in range(BATCH_SIZE): for j in range(self.att_dim*2+1): for k in range(self.att_dim*2+1): blocm[i, j, k, :] = np.array([i, bloc[i, 0]-self.att_dim+j, bloc[i, 1]-self.att_dim+k, 0]) blocm_[i, j, k, :] = np.array([i, bloc_[i, 0] - self.att_dim + j, bloc_[i, 1] - self.att_dim + k, 0]) self.sess.run(self.atrain, {self.S: bs, self.GM: bgm, self.LM: blocm}) self.sess.run(self.ctrain, {self.GM: bgm, self.S: bs, self.LM: blocm, self.a: ba, self.R: br, self.S_: bs_, self.LM_: blocm_}) def _build_a(self, s, gm, locm, reuse=None, custom_getter=None): def _conv2d_keep_size(x, y, kernel_size, name, use_bias=False, reuse_conv=None, trainable_conv=True): return tf.layers.conv2d(inputs=x, filters=y, kernel_size=kernel_size, padding="same", use_bias=use_bias, kernel_initializer=tf.truncated_normal_initializer(stddev=0.01), bias_initializer=tf.truncated_normal_initializer(stddev=0.01), reuse=reuse_conv, name=name, trainable=trainable_conv) def _build_vin(mat, name, reuse, trainable_vin): h1 = _conv2d_keep_size(mat, 150, 3, name+"_h1", use_bias=True, reuse_conv=reuse, trainable_conv=trainable_vin) r = _conv2d_keep_size(h1, 1, 1, name+"_r", reuse_conv=reuse, trainable_conv=trainable_vin) q0 = _conv2d_keep_size(r, 10, 9, name+"_q0", reuse_conv=reuse, trainable_conv=trainable_vin) v = tf.reduce_max(q0, axis=3, keep_dims=True, name=name+"_v") rv = tf.concat([r, v], axis=3) q = _conv2d_keep_size(rv, 10, 9, name + "_q", reuse_conv=False, trainable_conv=trainable_vin) v = tf.reduce_max(q, axis=3, keep_dims=True, name=name + "_v") for k in range(30): rv = tf.concat([r, v], axis=3) q = _conv2d_keep_size(rv, 10, 9, name+"_q", reuse_conv=True, trainable_conv=trainable_vin) v = tf.reduce_max(q, axis=3, keep_dims=True, name=name+"_v") return v trainable = True if reuse is None else False with tf.variable_scope('Actor', reuse=reuse, custom_getter=custom_getter): gv = _build_vin(gm, name="global_map_vin", reuse=reuse, trainable_vin=trainable) att = tf.reshape(tf.gather_nd(gv, locm), [-1, (self.att_dim*2+1)**2]) layer_1 = tf.layers.dense(s, 300, activation=tf.nn.relu, name='l1', trainable=trainable) layer_2a = tf.layers.dense(layer_1, 600, name='l2a', trainable=trainable) layer_2att = tf.layers.dense(att, 600, name='l2att', trainable=trainable) layer_2 = tf.add(layer_2a, layer_2att, name="l2") layer_3 = tf.layers.dense(layer_2, 600, activation=tf.nn.relu, name='l3', trainable=trainable) a = tf.layers.dense(layer_3, 7, activation=tf.nn.tanh, name='a1', trainable=trainable) return a def _build_c(self, s, gm, loc, a, reuse=None, custom_getter=None): trainable = True if reuse is None else False with tf.variable_scope('Critic', reuse=reuse, custom_getter=custom_getter): gm_flat = tf.reshape(gm, [-1, self.m_dim**2]) layer_gm = tf.layers.dense(gm_flat, self.s_dim, activation=tf.nn.relu, name='lgm', trainable=trainable) s_all = tf.concat([layer_gm, s], axis=1) layer_1 = tf.layers.dense(s_all, 300, activation=tf.nn.relu, name='l1', trainable=trainable) layer_2s = tf.layers.dense(layer_1, 600, activation=None, name='l2s', trainable=trainable) layer_2a = tf.layers.dense(a, 600, activation=None, name='l2a', trainable=trainable) layer_2 = tf.add(layer_2s, layer_2a, name="l2") layer_3 = tf.layers.dense(layer_2, 600, 
activation=tf.nn.relu, name='l3', trainable=trainable) return tf.layers.dense(layer_3, 1, trainable=trainable) # Q(s,a) def save_network(self): self.saver = tf.train.Saver() print("save ddpg-network...", self.time_step) self.saver.save(self.sess, 'saved_ddpg_networks/' + "ddpg-network", global_step=self.time_step) def load_network(self): self.saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state("saved_ddpg_networks") if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) print("Successfully loaded:", checkpoint.model_checkpoint_path) else: print("Could not find old network weights")
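# The nested Python loops above that build the gather_nd index grids
# (locm / blocm) can also be expressed with numpy broadcasting. A sketch of an
# equivalent helper for a single batch element; this is an alternative
# formulation, not code from the source.
import numpy as np

def build_att_indices(loc, att_dim, batch_index=0):
    """Return the (2*att_dim+1, 2*att_dim+1, 4) index grid
    [batch, row, col, channel] centered on `loc`, with channel fixed to 0."""
    size = att_dim * 2 + 1
    rows = loc[0] - att_dim + np.arange(size)
    cols = loc[1] - att_dim + np.arange(size)
    rr, cc = np.meshgrid(rows, cols, indexing='ij')
    out = np.zeros((size, size, 4), dtype=np.int64)
    out[..., 0] = batch_index
    out[..., 1] = rr
    out[..., 2] = cc
    return out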
class Algorithm: def __init__(self): self.replay_buffer = ReplayBuffer(buffer_size=BUFFER_MAX, past_frame_len=FRAME_SKIP, multi_step=N_STEP) # Intial def Initial(self): # Initail your session or somethingself.target_net # restore neural net parameters # self.buffer_size = 0 self.ctx = try_gpu(GPU_INDEX) self.frame_cnt = 0 self.train_count = 0 self.loss_sum = 0 self.q_count = 0 self.q_sum = 0 self.dtype = DTYPE INPUT_SAMPLE = nd.random_uniform(0,1,(1, FRAME_SKIP, 11), self.ctx, self.dtype) self.target_net = self.get_net(INPUT_SAMPLE) self.policy_net = self.get_net(INPUT_SAMPLE) if MODEL_FILE is not None: print('%s: read trained results from [%s]' % (tm.strftime("%Y-%m-%d %H:%M:%S"), MODEL_FILE)) self.policy_net.load_params(MODEL_FILE, ctx=self.ctx) self.update_target_net() # adagrad self.trainer = Trainer(self.policy_net.collect_params(), optimizer=mx.optimizer.RMSProp(LEARNING_RATE, 0.95, 0.95)) self.loss_func = loss.L2Loss() self.epsilon = EPSILON_START self.epsilon_min = EPSILON_MIN self.epsilon_rate = (EPSILON_START - EPSILON_MIN) / EPSILON_DECAY self.rng = np.random.RandomState(int(time() * 1000) % 100000000) def update_target_net(self): self.copy_params(self.policy_net, self.target_net) return def calculate_reward(self,end_of_video,cdn_flag,rebuf,end_delay,skip_frame_time_len,decision_flag,bitrate,last_bitrate,frame_time_len): if end_of_video <= 1.0: LANTENCY_PENALTY = 0.005 else: LANTENCY_PENALTY = 0.01 if not cdn_flag: reward_frame = frame_time_len * float(BIT_RATE[ bitrate]) / 1000 - REBUF_PENALTY * rebuf - LANTENCY_PENALTY * end_delay - SKIP_PENALTY * skip_frame_time_len else: reward_frame = -(REBUF_PENALTY * rebuf) if decision_flag or end_of_video: reward_frame += -1 * SMOOTH_PENALTY * (abs(BIT_RATE[bitrate] - BIT_RATE[last_bitrate]) / 1000) return reward_frame def run_frame(self,time, time_interval, send_data_size, chunk_len, \ rebuf, buffer_size, play_time_len, end_delay, \ cdn_newest_id, download_id, cdn_has_frame, skip_frame_time_len, decision_flag, \ buffer_flag, cdn_flag, skip_flag, end_of_video,action,last_action,frame_time_len): bitrate, target_buffer, latency = self.action_to_submit(action) last_bitrate,_,_ = self.action_to_submit(last_action) reward_frame = self.calculate_reward(end_of_video,cdn_flag,rebuf,end_delay,skip_frame_time_len,decision_flag,bitrate,last_bitrate,frame_time_len) self.replay_buffer.insert_sample(time_interval, send_data_size, chunk_len, rebuf, buffer_size, play_time_len,end_delay, cdn_newest_id, download_id, cdn_has_frame, skip_frame_time_len,decision_flag, buffer_flag, cdn_flag, skip_flag, end_of_video, reward_frame, action) st = self.replay_buffer.get_current_state() st = nd.array(st, ctx=self.ctx, dtype=self.dtype).reshape((1, FRAME_SKIP, -1)) action, max_q = self.choose_action(False, False, st) bit_rate, target_buffer, latency_limit = self.action_to_submit(action) self.frame_cnt += 1 if max_q is not None: self.q_count += 1 self.q_sum += max_q if self.frame_cnt % TRAIN_PER_STEP == 0: state, s_, actions, rewards = self.replay_buffer.get_batch(16) loss = self.train_policy_net(state, actions, rewards, s_) self.train_count += 1 self.loss_sum += loss # fixme 视频结束的时候是否需要清零 if end_of_video: average_loss = self.loss_sum / (self.train_count + 0.0001) average_q = self.q_sum / (self.q_count + 0.000001) self.loss_sum = 0 self.train_count = 0 self.q_count = 0 self.q_sum = 0 else: average_loss = 0 average_q = 0 return reward_frame,bit_rate, target_buffer, latency_limit,action,average_loss,average_q def 
train_policy_net(self,states,actions,rewards,next_states): batch_size = actions.shape[0] s = states.shape states = nd.array(states,ctx=self.ctx,dtype=self.dtype) actions = nd.array(actions[:,0],ctx=self.ctx) rewards = nd.array(rewards[:,0],ctx=self.ctx) next_states = nd.array(next_states,ctx=self.ctx,dtype=self.dtype) next_qs = self.target_net(next_states) next_q_out = nd.max(next_qs,axis=1) target = rewards + next_q_out * 0.99 ** MULTI_STEP with autograd.record(): current_qs = self.policy_net(states) current_q = nd.pick(current_qs,actions,1) loss = self.loss_func(target,current_q) loss.backward() self.trainer.step(16) total_loss = loss.mean().asscalar() return total_loss def save_params_to_file(self,model_path,mark): time_mark = tm.time() filename = model_path + '/net_' + str(mark) + '_' + str(time_mark) + '.model' self.policy_net.save_params(filename) print(tm.strftime(TIME_FORMAT), 'save results success:',filename) files = getNewestFile(model_path) if len(files) > 5: tmp = files[5:] for f in tmp: if os.path.exists(model_path + "/" + f): os.remove(model_path + "/" + f) print(f + "is deleted.") def get_net(self, input_sample): if IS_DUELING: net = dueling_dqn.DuelingDQN() net.initialize(init.Xavier(), ctx=self.ctx) else: net = dueling_dqn.OriginDQN() net.initialize(init.Xavier(), ctx=self.ctx) net(input_sample) return net def choose_action(self, random_action, testing, st): self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_rate) max_q = None random_num = self.rng.rand() if random_action or ((not testing) and random_num < self.epsilon): action = self.rng.randint(0,ACTION_NUM) else: out = self.policy_net(st) max_index = nd.argmax(out, axis=1) action = int(max_index.astype(np.int).asscalar()) max_q = out[0, action].asscalar() return action, max_q def action_to_submit(self,action): bit_rate = action % 4 target_buffer = action // 4 latency_limit = 4 return bit_rate, target_buffer, latency_limit #Define your al def run(self, time, S_time_interval, S_send_data_size, S_chunk_len, S_rebuf, S_buffer_size, S_play_time_len, S_end_delay, S_decision_flag, S_buffer_flag,S_cdn_flag,S_skip_time, end_of_video, cdn_newest_id,download_id,cdn_has_frame,IntialVars): # state = np.empty(shape=(len(S_time_interval),11),dtype=np.float32) S_end_of_video = [0] * FRAME_SKIP S_end_of_video[-1] = end_of_video state = [S_time_interval[-FRAME_SKIP:],S_send_data_size[-FRAME_SKIP:],S_chunk_len[-FRAME_SKIP:], S_buffer_size[-FRAME_SKIP:], S_rebuf[-FRAME_SKIP:], S_end_delay[-FRAME_SKIP:], S_play_time_len[-FRAME_SKIP:],S_decision_flag[-FRAME_SKIP:], S_cdn_flag[-FRAME_SKIP:],S_skip_time[-FRAME_SKIP:],S_end_of_video] state = nd.array(state,dtype=self.dtype).transpose((1,0)).reshape((1,FRAME_SKIP,-1)) # print(state.shape) action, max_q = self.choose_action(False,True,state) # print(action) bit_rate, target_buffer, latency_limit = self.action_to_submit(action) print(bit_rate, target_buffer, latency_limit) return bit_rate, target_buffer, latency_limit def run(self, time, S_time_interval, S_send_data_size, S_chunk_len, S_rebuf, S_buffer_size, S_play_time_len, S_end_delay, S_decision_flag, S_buffer_flag, S_cdn_flag, S_skip_time, end_of_video, cdn_newest_id, download_id, cdn_has_frame, IntialVars): # If you choose the marchine learning '''state = [] state[0] = ... state[1] = ... state[2] = ... state[3] = ... state[4] = ... decision = actor.predict(state).argmax() bit_rate, target_buffer = decison//4, decison % 4 ..... 
return bit_rate, target_buffer''' # If you choose BBA RESEVOIR = 0.5 CUSHION = 1.5 if S_buffer_size[-1] < RESEVOIR: bit_rate = 0 elif S_buffer_size[-1] >= RESEVOIR + CUSHION and S_buffer_size[-1] < CUSHION + CUSHION: bit_rate = 2 elif S_buffer_size[-1] >= CUSHION + CUSHION: bit_rate = 3 else: bit_rate = 1 target_buffer = 0 latency_limit = 4 return bit_rate, target_buffer, latency_limit def get_params(self): # get your params your_params = [] return your_params def copy_params(self, src_net, dst_net): ps_src = src_net.collect_params() ps_dst = dst_net.collect_params() prefix_length = len(src_net.prefix) for k, v in ps_src.items(): k = k[prefix_length:] v_dst = ps_dst.get(k) v_dst.set_data(v.data())
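# train_policy_net above discounts the bootstrapped value by 0.99 ** MULTI_STEP,
# which assumes the stored reward is already an n-step return accumulated by
# the replay buffer (multi_step=N_STEP). The buffer's implementation is not
# shown; a small numpy sketch of that accumulation for reference:
import numpy as np

def n_step_return(rewards, gamma=0.99):
    """Discounted sum of the next n rewards: r_0 + gamma*r_1 + ... + gamma^(n-1)*r_{n-1}."""
    rewards = np.asarray(rewards, dtype=np.float32)
    discounts = gamma ** np.arange(len(rewards))
    return float(np.sum(discounts * rewards))

# e.g. with MULTI_STEP = 3 the target becomes
#   n_step_return([r0, r1, r2]) + (gamma ** 3) * max_a Q_target(s_3, a)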
else: # Choose best action a = np.argmax(q_values) # Perform choosen action next_state, reward, done, _ = env.step(a) episode_reward += reward episode_steps += 1 # Insert into replay buffer repbuf.add_sample((state, a, reward, next_state, done)) state = next_state # Stats total_max_q += q_values.max() # Check if we need to train if step % STEPS_TO_TRAIN == 0: # Get a batch from replaybuffer batch = repbuf.get_batch(BATCH_SIZE) state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*batch) pred_nextQ = sess.run(target_dqn.logits, feed_dict={target_dqn.input: next_state_batch}) max_nextQ = np.max(pred_nextQ, axis=1) pred_values = np.array(reward_batch) + np.invert(done_batch).astype('float32') * GAMMA * max_nextQ cost = dqn.train(state_batch, action_batch, pred_values, sess) elif FLAGS.mode == 'test': # Testing mode epsilon = 0.05 rewards = [] for _ in trange(100): done = False obs = env.reset() reward = 0
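# Hedged sketch (assumption, the sync is not shown in the fragment above): the training
# step reads bootstrap targets from target_dqn, so its weights have to be copied from the
# online dqn every so often. A typical TF1-style hard sync; the 'dqn' / 'target_dqn'
# variable scopes and TARGET_UPDATE_STEPS are illustrative.
import tensorflow as tf

def make_target_sync_op(online_scope='dqn', target_scope='target_dqn'):
    online_vars = sorted(tf.trainable_variables(online_scope), key=lambda v: v.name)
    target_vars = sorted(tf.trainable_variables(target_scope), key=lambda v: v.name)
    return tf.group(*[tf.assign(t, o) for o, t in zip(online_vars, target_vars)])

# Typical usage inside the loop above (illustrative):
# if step % TARGET_UPDATE_STEPS == 0:
#     sess.run(sync_op)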
class DDPG: def __init__(self, env): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] # self.state_dim = env.observation_space.shape[0] * 2 self.action_dim = env.action_space.shape[0] self.time_step = 0 self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration # self.exploration_noise = OUNoise(self.action_dim) self.exploration_noise = OUNoise() # loading networks self.saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(MODEL_PATH) if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) my_config.logger.warn("Successfully loaded: %s" % (checkpoint.model_checkpoint_path)) else: my_config.logger.error("Could not find old network weights") def train(self): # my_config.logger.debug("......enter tain......") # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self, state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) noise = self.exploration_noise.noise(action) # if random.random() <= 0.5: # noise = self.exploration_noise.noise(action, # mu=[0, 0, 0, 1, 0, 0, 0.25, 0.75, 0.75, 0, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5]) # else: # noise = self.exploration_noise.noise(action, # mu=[0, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5, 0, 0, 0, 1, 0, 0, 0.25, 0.75, 0.75]) noise_action = action + noise clipped_noise_action = np.clip(noise_action, 0, 1) # if (self.time_step < 5): # my_config.logger.debug("action: %s, noise: %s, clip: %s" % (action, noise, clipped_noise_action)) return clipped_noise_action def action(self, state): action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, 
reward, next_state, done) self.time_step = self.time_step + 1 # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-initialize the random process when an episode ends # if done: # self.exploration_noise.reset() def saveNetwork(self): # my_config.logger.warn("time step: %s, save model" % (self.time_step)) ckpt_file = os.path.join(MODEL_PATH, 'ltr') self.saver.save(self.sess, ckpt_file, global_step=self.time_step)
class DDPG: def __init__(self, env, state_dim, action_dim): self.name = 'DDPG' self.environment = env self.time_step = 0 self.state_dim = state_dim self.action_dim = action_dim self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) def train(self): minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def action(self, state): action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) if self.replay_buffer.count() == REPLAY_START_SIZE: print('\n---------------Start training---------------') # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.time_step += 1 self.train() if self.time_step % 10000 == 0 and self.time_step > 0: self.actor_network.save_network(self.time_step) self.critic_network.save_network(self.time_step) return self.time_step
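# Hedged sketch (assumption): the update_target() methods called by the DDPG classes
# above are defined elsewhere; DDPG conventionally soft-updates the target parameters as
# theta_target <- TAU * theta + (1 - TAU) * theta_target. A minimal TF1-style version;
# the scope names and the tau value are illustrative.
import tensorflow as tf

def make_soft_update_op(source_scope, target_scope, tau=0.001):
    src = sorted(tf.trainable_variables(source_scope), key=lambda v: v.name)
    dst = sorted(tf.trainable_variables(target_scope), key=lambda v: v.name)
    return tf.group(*[tf.assign(d, tau * s + (1.0 - tau) * d) for s, d in zip(src, dst)])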
class DDPG: """docstring for DDPG""" def __init__(self): self.name = 'DDPG' # name for uploading results # self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = 12 self.action_dim = 10 self.has_kicked = False self.laststep_haskicked = False self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) self.saver = tf.train.Saver(max_to_keep=1) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) # print(minibatch) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) # print(q_value_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) with open('/home/ruizhao/Desktop/a.txt', 'a') as f: print("action_batch[0]", file=f) print(action_batch[0], file=f) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) with open('/home/ruizhao/Desktop/a.txt', 'a') as f: print("q_gradient_batch[0]", file=f) print(q_gradient_batch[0], file=f) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action2(self, state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) return action + self.exploration_noise.noise() def noise_action(self, state): action = self.actor_network.action(state) random_action = np.zeros(10, float) random_action[random.randint(0, 3)] = 1 random_action[4] = random.uniform(-100, 100) #DASH POWER random_action[5] = random.uniform(-180, 180) #DASH DEGREES random_action[6] = random.uniform(-180, 180) #TURN DEGREES random_action[7] = random.uniform(-180, 180) #TACKLE DEGREES random_action[8] = random.uniform(0, 100) #KICK POWER random_action[9] = random.uniform(-180, 180) #KICK DEGREES if np.random.uniform() < EPSILON: return action else: return random_action def action(self, state): action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # Store transitions to replay start size then start training if 
self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-initialize the random process when an episode ends if done: self.exploration_noise.reset()
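# Hedged sketch (assumption): OUNoise is instantiated by the agents above but not
# included in this listing. A common Ornstein-Uhlenbeck process used for DDPG
# exploration; the mu/theta/sigma defaults are illustrative.
import numpy as np

class OUNoiseSketch(object):
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(action_dim) * mu

    def reset(self):
        self.state = np.ones_like(self.state) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); mean-reverting, temporally correlated
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(*self.state.shape)
        self.state = self.state + dx
        return self.state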
class MaDDPG: def __init__(self, num_agents, state_dim, action_dim): # track training times self.time_step = 0 # use set session use GPU #self.sess = tf.InteractiveSession() self.sess = tf.Session(config=tf.ConfigProto( log_device_placement=True)) self.num_agents = num_agents self.state_dim = state_dim self.action_dim = action_dim self.agents = self.create_multi_agents(self.sess, num_agents, self.state_dim, self.action_dim) # make sure create Criticnetwork later, summarise mean Q value inside self.critic = CriticNetwork(self.sess, state_dim, action_dim) self.exploration_noise = OUNoise((self.num_agents, action_dim)) self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # for store checkpoint self.saver = tf.train.Saver() def train(self): minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.zeros((BATCH_SIZE, self.num_agents, self.state_dim)) action_batch = np.zeros((BATCH_SIZE, self.num_agents, self.action_dim)) reward_batch = np.zeros((BATCH_SIZE, self.num_agents)) next_state_batch = np.zeros( (BATCH_SIZE, self.num_agents, self.state_dim)) done_batch = np.zeros((BATCH_SIZE)) for ii in range(BATCH_SIZE): state_batch[ii, :, :] = minibatch[ii][0] action_batch[ii, :, :] = minibatch[ii][1] reward_batch[ii, :] = minibatch[ii][2] next_state_batch[ii, :, :] = minibatch[ii][3] done_batch[ii] = minibatch[ii][4] # calculate Gt batch next_action_batch = self.target_actions(next_state_batch) q_value_batch = self.critic.target_q(next_state_batch, next_action_batch) gt = np.zeros((BATCH_SIZE, self.num_agents)) for ii in range(BATCH_SIZE): if done_batch[ii]: gt[ii, :] = reward_batch[ii, :] else: gt[ii, :] = reward_batch[ii, :] + GAMMA * q_value_batch[ii, :] #update critic by minimizing the loss self.critic.train(gt, state_batch, action_batch) # update policy using the sampling gradients actions_for_grad = self.actions(state_batch) q_gradients_batch = self.critic.gradients(state_batch, actions_for_grad) self.train_agents(q_gradients_batch, state_batch) # update critic target network self.critic.update_target() # update actor target self.update_agents_target() def summary(self, record_num): if self.replay_buffer.count() > SUMMARY_BATCH_SIZE: mini_batch = self.replay_buffer.popn(SUMMARY_BATCH_SIZE) state_batch = np.zeros( (SUMMARY_BATCH_SIZE, self.num_agents, self.state_dim)) for ii in range(SUMMARY_BATCH_SIZE): state_batch[ii, :, :] = mini_batch[ii][0] actions_for_summary = self.actions(state_batch) self.critic.write_summaries(state_batch, actions_for_summary, record_num) def update_agents_target(self): for agent in self.agents: agent.update_target() def train_agents(self, gradients_batch, state_batch): # gradients_batch = [batchsize* agents* action_dim] # state_batch = [batchsize* agents * state_dim ] for ii in range(self.num_agents): grad = gradients_batch[:, ii, :] state = state_batch[:, ii, :] self.agents[ii].train(grad, state) def create_multi_agents(self, sess, num_agents, state_dim, action_dim): agents = [] nets = None for ii in range(num_agents): agent_name = 'agent' + str(ii) agents.append( ActorNetwork(sess, state_dim, action_dim, agent_name, nets)) nets = agents[-1].nets return agents def add_agents(self, add_num): for ii in range(add_num): #self.num_agents+=1 agent_name = 'agent' + str(self.num_agents) self.agents.append( ActorNetwork(self.sess, self.state_dim, self.action_dim, agent_name, self.agents[-1].nets)) # the agents' name is from 0-num_agents-1 self.num_agents += 1 # if add a new agent then reset the noise and replay buffer self.exploration_noise = 
OUNoise((self.num_agents, self.action_dim)) #self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) self.replay_buffer.erase() # re-create a saver # the new saver will contains all the savable variables. # otherwise only contains the initially created agents self.saver = tf.train.Saver() # reset the time step # self.time_step = 0 def action( self, state ): # here is action, for one state on agent, not batch_sized actions # state = [num_agents * state_dim] # actions = [num_agents * action_dim] action = np.zeros((self.num_agents, self.action_dim)) for ii in range(self.num_agents): action[ii, :] = self.agents[ii].action(state[ii, :]) return action def actions(self, state_batch): #state = batch_size*numOfagents*state_dim #actions = batch_size*numOfagents*action_dim batch_size = state_batch.shape[0] actions = np.zeros((batch_size, self.num_agents, self.action_dim)) for ii in range(self.num_agents): actions[:, ii, :] = self.agents[ii].actions(state_batch[:, ii, :]) return actions def target_actions(self, state_batch): # the state size is batch_size* num_agents * state_dimension actions = np.zeros( (state_batch.shape[0], self.num_agents, self.action_dim)) for ii in range(self.num_agents): actions[:, ii, :] = self.agents[ii].target_actions(state_batch[:, ii, :]) return actions def noise_action(self, state): action = self.action(state) # clip the action, action \in [-1,+1] return np.clip(action + self.exploration_noise.noise(), -1, 1) def close_session(self): self.sess.close() def perceive(self, state, action, reward, next_state, done): # store {st,at,Rt+1,st+1} self.replay_buffer.add(state, action, reward, next_state, done) if self.replay_buffer.count() > REPLAY_START_SIZE: self.time_step += 1 self.train() if self.time_step % SAVE_STEPS == 0: self.save_network() # if self.time_step % 10000 == 0: # self.actor_network.save_network(self.time_step) # self.critic_network.save_network(self.time_step) # Re-initialize the random process when an episode ends if done: self.exploration_noise.reset() def load_network(self): checkpoint = tf.train.get_checkpoint_state("saved_network") if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) print("Successfully loaded:", checkpoint.model_checkpoint_path) else: print('Could not find old network weights') def save_network(self): # do not processing under Dropbox # exit drop box then run print('save network...', self.time_step) self.saver.save(self.sess, 'saved_network/' + 'network', global_step=self.time_step)
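# Hedged sketch (assumption): every agent above depends on a ReplayBuffer with
# add / get_batch / count (plus erase, popn or size in some variants), but the class
# itself is not part of this listing. A minimal deque-backed version consistent with
# that interface:
import random
from collections import deque

class ReplayBufferSketch(object):
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def get_batch(self, batch_size):
        # uniform random minibatch of stored transitions
        return random.sample(list(self.buffer), batch_size)

    def count(self):
        return len(self.buffer)

    def size(self):
        return len(self.buffer)

    def erase(self):
        self.buffer.clear()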
class DDPG: """docstring for DDPG""" def __init__(self, env): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess,self.state_dim,self.action_dim) self.critic_network = CriticNetwork(self.sess,self.state_dim,self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch,[BATCH_SIZE,self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch,next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else : y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch,[BATCH_SIZE,1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch,state_batch,action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients) self.actor_network.train(q_gradient_batch,state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self,state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) return action+self.exploration_noise.noise() def action(self,state): action = self.actor_network.action(state) return action def perceive(self,state,action,reward,next_state,done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state,action,reward,next_state,done) # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
class DDPG(object): def __init__(self, a_dim, s_dim, a_bound, m_dim, att_dim): self.memory = ReplayBuffer(MEMORY_CAPACITY) self.pointer = 0 self.sess = tf.Session() self.a_dim, self.s_dim, self.a_bound, self.m_dim, self.att_dim = a_dim, s_dim, a_bound, m_dim, att_dim self.S = tf.placeholder(tf.float32, [None, s_dim], 's') self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_') self.R = tf.placeholder(tf.float32, [None, 1], 'r') self.GM = tf.placeholder(tf.float32, [None, m_dim, m_dim, 1], 'r') self.LM = tf.placeholder(tf.float32, [None, m_dim, m_dim, 1], 'l') self.LM_ = tf.placeholder(tf.float32, [None, m_dim, m_dim, 1], 'l') self.a = self._build_a( self.S, self.GM, self.LM, ) q = self._build_c( self.S, self.GM, self.LM, self.a, ) a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor') c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Critic') ema = tf.train.ExponentialMovingAverage(decay=1 - TAU) # soft replacement def ema_getter(getter, name, *args, **kwargs): return ema.average(getter(name, *args, **kwargs)) target_update = [ema.apply(a_params), ema.apply(c_params)] # soft update operation a_ = self._build_a( self.S_, self.GM, self.LM_, reuse=True, custom_getter=ema_getter) # replaced target parameters q_ = self._build_c(self.S_, self.GM, self.LM_, a_, reuse=True, custom_getter=ema_getter) a_loss = -tf.reduce_mean(q) # maximize the q self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=a_params) with tf.control_dependencies( target_update): # soft replacement happened at here q_target = self.R + GAMMA * q_ td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q) self.ctrain = tf.train.AdamOptimizer(LR_C).minimize( td_error, var_list=c_params) self.sess.run(tf.global_variables_initializer()) def choose_action(self, s1, gm1, lm1): return self.sess.run( self.a, { self.S: s1[np.newaxis, :], self.GM: gm1[np.newaxis, :, :, np.newaxis], self.LM: lm1[np.newaxis, :, :, np.newaxis] })[0] def learn(self): replay = self.memory.get_batch(BATCH_SIZE) bm_sd = np.asarray([data[0] for data in replay]) bs = np.asarray([data[1] for data in replay]) bloc = np.asarray([data[2] for data in replay]) ba = np.asarray([data[3] for data in replay]) br = np.asarray([data[4] for data in replay]) bs_ = np.asarray([data[5] for data in replay]) bloc_ = np.asarray([data[6] for data in replay]) bgm = np.zeros([BATCH_SIZE, self.m_dim, self.m_dim, 1]) blm = np.zeros([BATCH_SIZE, self.m_dim, self.m_dim, 1]) blm_ = np.zeros([BATCH_SIZE, self.m_dim, self.m_dim, 1]) for batch in range(BATCH_SIZE): sd1 = bm_sd[batch] terrian_map = TerrainMap(sd1, MAP_DIM, GLOBAL_PIXEL_METER) bgm[batch, :, :, 0] = terrian_map.map_matrix blm[batch, :, :, 0] = terrian_map.get_local_map(bloc[batch, :]) blm_[batch, :, :, 0] = terrian_map.get_local_map(bloc_[batch, :]) self.sess.run(self.atrain, {self.S: bs, self.GM: bgm, self.LM: blm}) self.sess.run(self.ctrain, { self.S: bs, self.a: ba, self.R: br, self.S_: bs_, self.LM_: blm_ }) def _build_a(self, s, gm, locm, reuse=None, custom_getter=None): def _conv2d_keep_size(x, y, kernel_size, name, use_bias=False, reuse_conv=None, trainable_conv=True): return tf.layers.conv2d( inputs=x, filters=y, kernel_size=kernel_size, padding="same", use_bias=use_bias, kernel_initializer=tf.truncated_normal_initializer( stddev=0.01), bias_initializer=tf.truncated_normal_initializer(stddev=0.01), reuse=reuse_conv, name=name, trainable=trainable_conv) def _build_vin(mat, name, trainable_vin): h1 = _conv2d_keep_size(mat, 150, 5, name + "_h1", 
use_bias=True, trainable_conv=trainable_vin) r = _conv2d_keep_size(h1, 1, 1, name + "_r", trainable_conv=trainable_vin) q0 = _conv2d_keep_size(r, 10, 5, name + "_q0", trainable_conv=trainable_vin) v = tf.reduce_max(q0, axis=3, keep_dims=True, name=name + "_v") for k in range(30): rv = tf.concat([r, v], axis=3) q = _conv2d_keep_size(rv, 10, 5, name + "_q", reuse_conv=tf.AUTO_REUSE, trainable_conv=trainable_vin) v = tf.reduce_max(q, axis=3, keep_dims=True, name=name + "_v") return v trainable = True if reuse is None else False with tf.variable_scope('Actor', reuse=reuse, custom_getter=custom_getter): gv = _build_vin(gm, name="global_map_vin", trainable_vin=trainable) loc_co = _conv2d_keep_size(locm, 1, 9, name="local_co", use_bias=False, trainable_conv=trainable) lv = tf.multiply(gv, loc_co) m_flat = tf.reshape(lv, [-1, self.m_dim**2]) att = tf.layers.dense(m_flat, self.att_dim, name='att_l1', trainable=trainable) layer_1 = tf.layers.dense(s, 300, activation=tf.nn.relu, name='l1', trainable=trainable) layer_2a = tf.layers.dense(layer_1, 600, name='l2a', trainable=trainable) layer_2att = tf.layers.dense(att, 600, name='l2att', trainable=trainable) layer_2 = tf.add(layer_2a, layer_2att, name="l2") layer_3 = tf.layers.dense(layer_2, 600, activation=tf.nn.relu, name='l3', trainable=trainable) a1 = tf.layers.dense(layer_3, 4, activation=tf.nn.tanh, name='a1', trainable=trainable) a1_norm = tf.nn.l2_normalize(a1, dim=-1) a2 = tf.layers.dense(layer_3, 3, activation=tf.nn.tanh, name='a2', trainable=trainable) a = tf.concat([a1_norm, a2], axis=-1) return tf.multiply(a, self.a_bound, name='scaled_a') def _build_c(self, s, gm, locm, a, reuse=None, custom_getter=None): trainable = True if reuse is None else False with tf.variable_scope('Critic', reuse=reuse, custom_getter=custom_getter): gm_flat = tf.reshape(gm, [-1, self.m_dim**2]) layer_gm = tf.layers.dense(gm_flat, self.s_dim, activation=tf.nn.relu, name='lgm', trainable=trainable) lm_flat = tf.reshape(locm, [-1, self.m_dim**2]) layer_lm = tf.layers.dense(lm_flat, self.s_dim, activation=tf.nn.relu, name='llm', trainable=trainable) s_all = tf.concat([layer_gm, layer_lm, s], axis=0) layer_1 = tf.layers.dense(s_all, 300, activation=tf.nn.relu, name='l1', trainable=trainable) layer_2s = tf.layers.dense(layer_1, 600, activation=None, name='l2s', trainable=trainable) layer_2a = tf.layers.dense(a, 600, activation=None, name='l2a', trainable=trainable) layer_2 = tf.add(layer_2s, layer_2a, name="l2") layer_3 = tf.layers.dense(layer_2, 600, activation=tf.nn.relu, name='l3', trainable=trainable) return tf.layers.dense(layer_3, 1, trainable=trainable) # Q(s,a)
class DDPG: def __init__(self, env): self.name = 'DDPG' self.environment = env self.episode = 0 self.epsilon = 0.98 self.one_number = 1 self.mean = [] self.state_dim = len(obs2state(env.reset().observation)) self.action_dim = env.action_spec().shape[0] self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) self.exploration_noise = OUNoise(self.action_dim) def train(self): minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) self.critic_network.train(y_batch, state_batch, action_batch) action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) self.actor_network.update_target() self.critic_network.update_target() def noise_action(self, state): action = self.actor_network.action(state) exp = self.exploration_noise.noise() t = action * exp return exp def action(self, state): if np.random.rand() <= self.epsilon: act = self.noise_action(state) z = array(act) else: action = self.actor_network.action(state) z = array(action) self.mean.append(z[0]) g = np.tanh(z) return g def perceive(self, state, action, reward, next_state, done): self.replay_buffer.add(state, action, reward, next_state, done) if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() if self.epsilon > 0.1: self.epsilon *= 0.99999 if done: self.exploration_noise.reset()
class DDPG: def __init__(self, state_dim, state_channel, action_dim): self.state_dim = state_dim self.state_channel = state_channel self.action_dim = action_dim self.sess = tf.InteractiveSession() self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel]) self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel]) self.action_input = tf.placeholder('float', [None, action_dim]) self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim) # create network self.actor_network.create_network(self.state_input) self.critic_network.create_q_network(self.state_input, self.actor_network.action_output) # create target network self.actor_network.create_target_network(self.target_state_input) self.critic_network.create_target_q_network(self.target_state_input, self.actor_network.target_action_output) # create training method self.actor_network.create_training_method(self.critic_network.q_value_output) self.critic_network.create_training_method() self.sess.run(tf.initialize_all_variables()) self.actor_network.update_target() self.critic_network.update_target() self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) self.exploration_noise = OUNoise(self.action_dim) self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg' if not os.path.exists(self.dir_path): os.mkdir(self.dir_path) # for log self.reward_input = tf.placeholder(tf.float32) tf.scalar_summary('reward', self.reward_input) self.time_input = tf.placeholder(tf.float32) tf.scalar_summary('living_time', self.time_input) self.summary_op = tf.merge_all_summaries() self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph) self.episode_reward = 0.0 self.episode_start_time = 0.0 self.time_step = 1 self.saver = tf.train.Saver(tf.all_variables()) self.load_time_step() self.load_network() return def train(self): action_dim = self.action_dim minibatch = self.replay_buffer.get_batch(BATCH_SIZE) # sample BATCH_SIZE from replay_buffer state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # if action_dim = 1, it's a number not a array action_batch = np.resize(action_batch, [BATCH_SIZE, action_dim]) # calculate y_batch via target network next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q_value(next_state_batch, next_action_batch) y_batch = [] for i in range(BATCH_SIZE): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # print np.shape(reward_batch), np.shape(y_batch) # train actor network self.actor_network.train(state_batch) # train critic network self.critic_network.train(y_batch, state_batch, action_batch) # update target network self.actor_network.update_target() self.critic_network.update_target() return def noise_action(self, state): action = self.actor_network.action(state) return action + self.exploration_noise.noise() def action(self, state): action = self.actor_network.action(state) return action def _record_log(self, reward, living_time): summary_str = self.sess.run(self.summary_op, feed_dict={ 
self.reward_input: reward, self.time_input: living_time }) self.summary_writer.add_summary(summary_str, self.time_step) return def perceive(self, state, action, reward, next_state, done): self.replay_buffer.add(state, action, reward, next_state, done) if self.episode_start_time == 0.0: self.episode_start_time = time.time() # for testing # self.time_step += 1 # if self.time_step == 100: # print '--------------------------------' # self.replay_buffer.save_to_pickle() # return self.episode_reward += reward living_time = time.time() - self.episode_start_time if self.time_step % 1000 == 0 or done: self._record_log(self.episode_reward, living_time) if self.replay_buffer.size() > REPLAY_START_SIZE: self.train() if self.time_step % 100000 == 0: self.save_network() if done: print '===============reset noise=========================' self.exploration_noise.reset() self.episode_reward = 0.0 self.episode_start_time = time.time() self.time_step += 1 return def load_time_step(self): if not os.path.exists(self.dir_path): return files = os.listdir(self.dir_path) step_list = [] for filename in files: if ('meta' in filename) or ('-' not in filename): continue step_list.append(int(filename.split('-')[-1])) step_list = sorted(step_list) if len(step_list) == 0: return self.time_step = step_list[-1] + 1 return def load_network(self): checkpoint = tf.train.get_checkpoint_state(self.dir_path) if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) print 'Successfully loaded:', checkpoint.model_checkpoint_path else: print 'Could not find old network weights' return def save_network(self): print 'save actor-critic network...', self.time_step self.saver.save(self.sess, self.dir_path + '/ddpg', global_step=self.time_step) return
class DDPG(object): def __init__(self, env): self.name = 'DDPG' # name for uploading results self.environment = env self.epsilon_expert_range = (1.0, 0.1) self.epsilon_expert = self.epsilon_expert_range[0] self.epsilon_random_range = (0.1, 0.01) self.epsilon_random = self.epsilon_random_range[0] # Randomly initialize actor network and critic network # with both their target networks # self.state_dim = env.observation_space.shape[0] self.state_dim = 16 # self.action_dim = env.action_space.shape[0] self.action_dim = 3 self.time_step = 0 self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration # self.exploration_noise = OUNoise(self.action_dim) # self.exploration_noise = OUNoise() self.OU = OU() # loading networks self.saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(MODEL_PATH) if checkpoint and checkpoint.model_checkpoint_path: path = checkpoint.model_checkpoint_path self.saver.restore(self.sess, path) self.time_step = int(path[path.rindex('-') + 1:]) self.epsilon_expert -= ( self.epsilon_expert_range[0] - self.epsilon_expert_range[1]) * self.time_step / EXPLORE_COUNT self.epsilon_expert = max(self.epsilon_expert, self.epsilon_expert_range[1]) self.epsilon_random -= ( self.epsilon_random_range[0] - self.epsilon_random_range[1]) * self.time_step / EXPLORE_COUNT self.epsilon_random = max(self.epsilon_random, self.epsilon_random_range[1]) logger.warn( "Successfully loaded: %s, step: %d, epsilon_expert: %s, epsilon_random: %s" % (path, self.time_step, self.epsilon_expert, self.epsilon_random)) else: logger.warn("Could not find old network weights") self.critic_cost = 0 def train(self): self.time_step = self.time_step + 1 self.epsilon_expert -= (self.epsilon_expert_range[0] - self.epsilon_expert_range[1]) / EXPLORE_COUNT self.epsilon_expert = max(self.epsilon_expert, self.epsilon_expert_range[1]) self.epsilon_random -= (self.epsilon_random_range[0] - self.epsilon_random_range[1]) / EXPLORE_COUNT self.epsilon_random = max(self.epsilon_random, self.epsilon_random_range[1]) logger.debug( "step: %d, epsilon_expert: %s, epsilon_random: %s" % (self.time_step, self.epsilon_expert, self.epsilon_random)) # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) # if done_batch[i]: # y_batch.append(reward_batch[i]) # else : # y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_cost = self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using 
the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() # def noise_action(self,state): # # Select action a_t according to the current policy and exploration noise # action = self.actor_network.action(state) # noise = self.exploration_noise.noise(action) # noise_action = action + noise # clipped_noise_action = np.clip(noise_action, 0, 1) # return clipped_noise_action # def noise_action(self,state): # # Select action a_t according to the current policy and exploration noise # action = self.actor_network.action(state) # noise = np.zeros(self.action_dim) # noise[0] = self.epsilon * self.OU.function(action[0], 0.5, 1.00, 0.10) # noise[1] = self.epsilon * self.OU.function(action[1], 0.5, 1.00, 0.10) # noise[2] = self.epsilon * self.OU.function(action[2], 0.5, 1.00, 0.10) # noise_action = action + noise # logger.debug("action: %s, noise: %s" % (action, noise)) # clipped_noise_action = np.clip(noise_action, 0, 1) # return clipped_noise_action def action(self, state): action = self.actor_network.action(state) logger.debug("action: %s" % (action)) return action def opposite_action(self, state): logger.debug("state: %s" % (state)) action = self.actor_network.action(state) logger.debug("action: %s" % (action)) action[0] = 1 - action[0] logger.debug("opposite action: %s" % (action)) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # self.time_step = self.time_step + 1 # Store transitions to replay start size then start training if self.replay_buffer.count() >= REPLAY_START_SIZE: # logger.debug("train...") self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends # if done: # self.exploration_noise.reset() def saveNetwork(self): logger.warn("time step: %s, save model" % (self.time_step)) ckpt_file = os.path.join(MODEL_PATH, 'DDPG') self.saver.save(self.sess, ckpt_file, global_step=self.time_step)
class DDPG: """docstring for DDPG""" def __init__(self, sess, data_fname, replay=False): self.name = 'DDPG' # Randomly initialize actor network and critic network # with both their target networks self.name = 'DDPG' # name for uploading results # Randomly initialize actor network and critic network # with both their target networks self.state_dim = Hp.state_dim self.action_dim = Hp.action_dim print(self.state_dim, self.action_dim) self.sess = sess self.state_input = [ tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord)) for _ in xrange(Hp.categories) ] #tf.placeholder("float",[None,self.state_dim]) self.target_state_input = [ tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord)) for _ in xrange(Hp.categories) ] #tf.placeholder("float",[None,self.state_dim]) self.state_network = StateEnc(self.sess, self.state_input, self.target_state_input) state_batch = self.state_network.encoding next_state_batch = self.state_network.target_encoding weights, biases, w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2 = self.state_network.get_parameters( ) state_network_params = weights + biases + [ w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2 ] self.actor_network = ActorNetwork(self.sess, Hp.n_hidden, self.action_dim, self.state_input, state_batch, next_state_batch, state_network_params) self.critic_network = CriticNetwork(self.sess, Hp.n_hidden, self.action_dim, state_batch, next_state_batch) # initialize replay buffer if replay: self.replay_buffer = ReplayBuffer(Hp.REPLAY_BUFFER_SIZE, data_fname) self.summary_str2 = None # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatches = self.replay_buffer.get_batch(Hp.batch_size * Hp.N_TRAIN) print("######## TRAINING #########") for k in range(Hp.N_TRAIN): minibatch = minibatches[k * Hp.batch_size:(k + 1) * Hp.batch_size] state_batch_r = np.asarray([data[0] for data in minibatch]) state_batch = [] for j in range(Hp.categories): new_cat = np.stack(state_batch_r[:, j], axis=0) state_batch.append(new_cat) #state_batch = [np.expand_dims(state_batch, axis=1)] action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch_r = np.asarray([data[3] for data in minibatch]) next_state_batch = [] for j in range(Hp.categories): new_cat = np.stack(next_state_batch_r[:, j], axis=0) next_state_batch.append(new_cat) #next_state_batch = [np.expand_dims(next_state_batch, axis=1)] done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [Hp.batch_size, self.action_dim]) next_action_batch = self.actor_network.target_actions( self.target_state_input, next_state_batch) q_value_batch = self.critic_network.target_q( self.target_state_input, next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + Hp.GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [Hp.batch_size, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, self.state_input, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions( self.state_input, state_batch) q_gradient_batch = self.critic_network.gradients( self.state_input, 
state_batch, action_batch_for_gradients) self.summary_str2 = self.actor_network.train( q_gradient_batch, self.state_input, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() self.state_network.update_target() def train_off(self, minibatch): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer state_batch_r = np.asarray([data[0] for data in minibatch]) state_batch = [] for j in range(Hp.categories): new_cat = np.stack(state_batch_r[:, j], axis=0) state_batch.append(new_cat) #state_batch = [np.expand_dims(state_batch, axis=1)] action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch_r = np.asarray([data[3] for data in minibatch]) next_state_batch = [] for j in range(Hp.categories): new_cat = np.stack(next_state_batch_r[:, j], axis=0) next_state_batch.append(new_cat) #next_state_batch = [np.expand_dims(next_state_batch, axis=1)] done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [Hp.batch_size, self.action_dim]) next_action_batch = self.actor_network.target_actions( self.target_state_input, next_state_batch) q_value_batch = self.critic_network.target_q(self.target_state_input, next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + Hp.GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [Hp.batch_size, 1]) # Update critic by minimizing the loss L cost, self.summary_str2 = self.critic_network.train_off( y_batch, self.state_input, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions( self.state_input, state_batch) q_gradient_batch = self.critic_network.gradients( self.state_input, state_batch, action_batch_for_gradients) summary_str3 = self.actor_network.train(q_gradient_batch, self.state_input, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() self.state_network.update_target() return cost def action(self, state): state = [np.expand_dims(el, axis=0) for el in state] action = self.actor_network.action(state) return np.multiply(action, np.array([-35.0, 35.0, 2000.0]))
class Worker:
    """docstring for DDPG"""

    def __init__(self, sess, number, model_path, global_episodes, explore,
                 decay, training):
        self.name = 'worker_' + str(number)  # name for uploading results
        self.number = number
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 41
        self.action_dim = 18
        self.model_path = model_path
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.sess = sess
        self.explore = explore
        self.decay = decay
        self.training = training
        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim, self.name + '/actor')
        self.actor_network.update_target(self.sess)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim, self.name + '/critic')
        self.critic_network.update_target(self.sess)
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
        self.update_local_ops_actor = update_target_graph(
            'global/actor', self.name + '/actor')
        self.update_local_ops_critic = update_target_graph(
            'global/critic', self.name + '/critic')

    def start(self, setting=0):
        self.env = RunEnv(visualize=True)
        self.setting = setting

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(
            self.sess, next_state_batch)
        q_value_batch = self.critic_network.target_q(self.sess, next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(
            self.sess, state_batch)
        q_gradient_batch = self.critic_network.gradients(
            self.sess, state_batch, action_batch_for_gradients)
        self.actor_network.train(self.sess, q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target(self.sess)
        self.critic_network.update_target(self.sess)

    def save_model(self, saver, episode):
        #if self.episode % 10 == 1:
        if self.name == 'worker_0':
            saver.save(self.sess,
                       self.model_path + "/model-" + str(episode) + ".ckpt")

    def noise_action(self, state, decay):
        # Select action a_t according to the current policy and exploration noise which gradually vanishes
        action = self.actor_network.action(self.sess, state)
        return action + self.exploration_noise.noise() * decay

    def action(self, state):
        action = self.actor_network.action(self.sess, state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE and self.training:
            self.train()

        #if self.time_step % 10000 == 0:
            #self.actor_network.save_network(self.time_step)
            #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def work(self, coord, saver):
        if self.training:
            episode_count = self.sess.run(self.global_episodes)
        else:
            episode_count = 0
        wining_episode_count = 0
        total_steps = 0
        print("Starting worker_" + str(self.number))

        with self.sess.as_default(), self.sess.graph.as_default():
            #not_start_training_yet = True
            while not coord.should_stop():
                returns = []
                rewards = []
                episode_reward = 0

                if np.random.rand() < 0.9:  # change Aug20 stochastic apply noise
                    noisy = True
                    self.decay -= 1. / self.explore
                else:
                    noisy = False

                self.sess.run(self.update_local_ops_actor)
                self.sess.run(self.update_local_ops_critic)

                state = self.env.reset(difficulty=self.setting)
                #print(observation)
                s = process_frame(state)

                print "episode:", episode_count
                # Train
                for step in xrange(self.env.spec.timestep_limit):
                    state = process_frame(state)
                    if noisy:
                        action = np.clip(
                            self.noise_action(state, np.maximum(self.decay, 0)),
                            0.0, 1.0)  # change Aug20, decay noise (no noise after ep>=self.explore)
                    else:
                        action = self.action(state)
                    next_state, reward, done, _ = self.env.step(action)
                    #print('state={}, action={}, reward={}, next_state={}, done={}'.format(state, action, reward, next_state, done))
                    next_state = process_frame(next_state)
                    self.perceive(state, action, reward * 100, next_state, done)
                    state = next_state
                    episode_reward += reward
                    if done:
                        break

                if episode_count % 5 == 0:
                    print "episode reward:", episode_reward

                # Testing:
                #if episode % 1 == 0:
                if self.name == 'worker_0' and episode_count % 50 == 0 and episode_count > 1:  # change Aug19
                    self.save_model(saver, episode_count)
                    total_return = 0
                    ave_reward = 0
                    for i in xrange(TEST):
                        state = self.env.reset()
                        reward_per_step = 0
                        for j in xrange(self.env.spec.timestep_limit):
                            action = self.action(process_frame(state))  # direct action for test
                            state, reward, done, _ = self.env.step(action)
                            total_return += reward
                            if done:
                                break
                            reward_per_step += (reward - reward_per_step) / (j + 1)
                        ave_reward += reward_per_step

                    ave_return = total_return / TEST
                    ave_reward = ave_reward / TEST
                    returns.append(ave_return)
                    rewards.append(ave_reward)

                    print 'episode: ', episode_count, 'Evaluation Average Return:', ave_return, ' Evaluation Average Reward: ', ave_reward

                if self.name == 'worker_0' and self.training:
                    self.sess.run(self.increment)
                episode_count += 1

        # All done, stop trial
        # Confirm exit
        print('Done ' + self.name)
class DDPG: """docstring for DDPG""" def __init__(self, env, results_file): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) results_file.write(ActorNetwork.get_settings()) def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self, state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) return action + self.exploration_noise.noise() def action(self, state): action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
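# Hedged usage sketch (assumption, not from the original file): the DDPG class above
# exposes noise_action / action / perceive, so a Gym-style training loop would drive it
# roughly like this. 'Pendulum-v0', MAX_STEPS and results.txt are illustrative;
# ActorNetwork, CriticNetwork and the module constants are assumed to be defined as in
# the file above.
import gym

MAX_STEPS = 200

def run_training(num_episodes=1000):
    env = gym.make('Pendulum-v0')
    with open('results.txt', 'w') as results_file:
        agent = DDPG(env, results_file)
        for _ in range(num_episodes):
            state = env.reset()
            for _ in range(MAX_STEPS):
                action = agent.noise_action(state)               # policy action + OU noise
                next_state, reward, done, _ = env.step(action)
                agent.perceive(state, action, reward, next_state, done)  # store + maybe train
                state = next_state
                if done:
                    break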
class DDPG: """docstring for DDPG""" def __init__(self, a_dim, s_dim): self.name = 'DDPG' # name for uploading results # self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = s_dim self.action_dim = a_dim self.time_step=0 self.max_bw = 0.0 self.max_cwnd = 0.0 self.min_rtt = 9999999.0 self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def learn(self): # print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self, state): self.time_step += 1 # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) noise = self.exploration_noise.noise() # print("noise:" + str(noise)) return action + noise def choose_action(self, state): self.time_step += 1 # print("_______________________choose_action_____________________") action = self.actor_network.action(state) return action def store_transition(self, s, a, r, s_,done,episode_count): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer # print("*********************************ADD****************************") self.replay_buffer.add(s, a, r, s_, done) # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: if((episode_count+1)%100!= 0): self.learn() # print("learn!") else: self.actor_network.save_network(self.time_step) self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset() def extract_observation(self,dataRecorder,subflow_index,state_before): # print("extracting...") value_dic = dataRecorder.get_latest_data() state_after=state_before.reshape(10,5) # observation = np.zeros((4)) observation = np.zeros((5)) t_cWnd=[0,0] t_thr=[0,0] t_rtt=[0,0] t_loss_rate=[0,0] t_unAck=[0,0] s0=[0,0,0,0,0] 
state=np.zeros(1) for i in range(value_dic["nbOfSubflows"]): name = "cWnd" + str(i) t_cWnd[i] = value_dic[name] name = "rtt"+str(i) t_rtt[i] = value_dic[name] name = "unAck" + str(i) t_unAck[i]=value_dic[name] name = "loss_rate" + str(i) t_loss_rate[i]=value_dic[name] name = "throughput" + str(i) t_thr[i]=value_dic[name] thr=t_thr[subflow_index] s0[0]=t_thr[subflow_index] rtt=t_rtt[subflow_index] s0[1]=t_rtt[subflow_index] cwnd=t_cWnd[subflow_index] s0[2]=t_cWnd[subflow_index] loss_rate=t_loss_rate[subflow_index] s0[3]=t_loss_rate[subflow_index] unAck=t_unAck[subflow_index] s0[4]=t_unAck[subflow_index] s0=np.array(s0) min_=s0-s0 thr_n=s0[0] thr_n_min=s0[0]-min_[0] rtt_min=s0[1]-min_[1] cwnd_n_min=s0[2]-min_[2] loss_rate_n_min=s0[3]-min_[3] unAck_n_min=s0[4]-min_[4] # loss_rate_n_min=s0[7]-min_[7] if self.max_bw<thr_n_min: self.max_bw=thr_n_min if self.max_cwnd<cwnd_n_min: self.max_cwnd=cwnd_n_min if self.max_cwnd<cwnd_n_min: self.max_cwnd=cwnd_n_min if self.min_rtt>rtt_min: self.min_rtt=rtt_min reward = thr_n_min-5*(rtt_min-self.min_rtt)-10*loss_rate_n_min print("reward:"+str(reward)+" thr_n_min:"+str(thr_n_min)+ " rtt_min:"+str(rtt_min)+" self.min_rtt :"+str(self.min_rtt)+" delta_rtt"+str(rtt_min-self.min_rtt)) # print("unAck:"+str(unAck_n_min)) if self.max_bw!=0: state[0]=thr_n_min/self.max_bw # tmp=pacing_rate_n_min/self.max_bw state=np.append(state,[5*loss_rate_n_min]) state=np.append(state,[unAck_n_min]) else: state[0]=0 state=np.append(state,[0]) state=np.append(state,[0]) state=np.append(state,[1400/cwnd]) state=np.append(state,[self.min_rtt/rtt_min]) state_after=np.delete(state_after,[0],axis = 0) state_after=np.append(state_after,state) return state_after,reward,thr_n_min,rtt_min
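# Hedged sketch (assumption): extract_observation above maintains a rolling history of
# the last 10 five-dimensional observations by reshaping to (10, 5), deleting the oldest
# row and appending the newest one. The same update isolated for clarity; the function
# name and arguments are illustrative.
import numpy as np

def roll_state_window(flat_history, newest, rows=10, cols=5):
    window = flat_history.reshape(rows, cols)
    window = np.delete(window, 0, axis=0)      # drop the oldest observation
    return np.append(window, newest)           # append the newest; result is flat, length rows*cols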