def __init__(self, env_name, state_dim, action_dim): self.name = 'DriverAgent' # name for uploading results self.env_name = env_name # Randomly initialize actor network and critic network # with both their target networks self.state_dim = state_dim self.action_dim = action_dim # Tensorflow Session config = tf.ConfigProto() config.gpu_options.allow_growth = True self.sess = tf.Session(config=config) # Actor & Critic Network self.actor = ActorNetwork(self.sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA) self.critic = CriticNetwork(self.sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA) # Replay Memory self.memory = ReplayMemory(MEMORY_SIZE) # Loss value self.loss = 0 # loading networks. modify as you want self.saver = tf.train.Saver() if not os.path.exists(ckp_dir): print("Could not find old network weights") else: self.saver.restore(self.sess, os.path.join(ckp_dir, ckp_name)) print("Successfully loaded:", ckp_name)
def __init__(self, trainable=1, load_model=1): super(Agent, self).__init__('Agent') np.random.seed(1337) self.step = 0 self.state_cache = dict() self.action_cache = dict() # Tensorflow GPU optimization config = tf.ConfigProto() config.gpu_options.allow_growth = True self.sess = tf.Session(config=config) self.trainable = trainable K.set_session(self.sess) self.actor = ActorNetwork(self.sess, globalConfig.TAU, globalConfig.LRA) self.critic = CriticNetwork(self.sess, globalConfig.TAU, globalConfig.LRC) self.buff = ReplayBuffer(globalConfig.BUFFER_SIZE) # Create replay buffer self.cnt = 0 if load_model == 1: # Now load the weight print("Now we load the weight") try: self.actor.model.load_weights("actormodel.h5") self.critic.model.load_weights("criticmodel.h5") self.actor.target_model.load_weights("actormodel.h5") self.critic.target_model.load_weights("criticmodel.h5") print("Weight load successfully") except: print("Cannot find the weight") self.graph = tf.get_default_graph()
def __init__(self): # Variable Definition self.ep = 0 self.replace_freq = cf.REPLACE_FREQ self.save_freq = cf.SAVE_FREQ self.WEIGHT_PATH = cf.WEIGHT_PATH # Tensorflow GPU optimization config = tf.ConfigProto() config.gpu_options.allow_growth = True # Session Setup & graph self.sess = tf.Session(config=config) from keras import backend as K K.set_session(self.sess) self.tf_graph = tf.get_default_graph() # Network self.actor = ActorNetwork(self.sess, self.tf_graph, cf.STATE_DIM, cf.ACTION_DIM, cf.TAU, cf.LRA) self.critic = CriticNetwork(self.sess, self.tf_graph, cf.STATE_DIM, cf.ACTION_DIM, cf.TAU, cf.LRC) self.memory = ReplayBuffer(cf.BUFFER_SIZE) # write graph self.timestamp = int(time.time()) self.sum_writer = tf.summary.FileWriter(cf.TMP_PATH + '/ddpg' + str(self.timestamp), self.tf_graph)
def __init__(self, input_shape, actions, discount_factor, replay_buffer, minibatch_size, logger, name="ppo"): self.input_shape = input_shape self.action_space = actions self.discount_factor = discount_factor self.minibatch_size = minibatch_size self.actor = ActorNetwork(self.input_shape, self.action_space, self.discount_factor, self.minibatch_size) self.critic = CriticNetwork(self.input_shape, self.action_space, self.discount_factor, self.minibatch_size) self.states = [] self.actions = [] self.values = [] self.masks = [] self.rewards = [] self.actions_probs = [] self.actions_onehot = [] super(PPOAgent, self).__init__(logger, replay_buffer, name=name)
class Main(): def __init__(self, board): self.TRAIN = False self.board = board self.ME = 1 self.OPPONENT = 2 self.action_network = ActionNetwork(objective=self.ME) self.critic_network = CriticNetwork( params=[len(board.features) * 5 + 2, 60, 1], pattern_finder=board.pattern_finder) # 神经网络结构 if os.path.exists(CRITIC_NETWORK_SAVEPATH): self.critic_network.layers = pickle.load( open(CRITIC_NETWORK_SAVEPATH, 'rb')) logDebug('Using existing model at ' + CRITIC_NETWORK_SAVEPATH) self.system_model = SystemModel(who=self.ME) def run_me(self): try: if board.whose_turn is None: board.whose_turn = self.ME actions, values = self.get_candidate_actions() action, value = self.action_network.forward( self.board, actions, values) board_now = deepcopy(self.board) board_next = self.system_model.forward(action) # pp.do_mymove here if self.TRAIN: reward = 1.0 if check_win( board_now, action[0], action[1], who=self.ME) else 0.0 self.critic_network.back_propagation(board_now, board_next, reward) except: logTraceBack() raise Exception('f**k') def run_opponent(self, x, y): try: if board.whose_turn is None: board.whose_turn = self.OPPONENT board_now = deepcopy(self.board) self.board[x][y] = 2 board_next = self.board reward = 0.0 self.critic_network.back_propagation(board_now, board_next, reward) except: logTraceBack raise Exception('f**k') def get_candidate_actions(self): actions = Adjacent(self.board) # 返回临近的点 values = [] for action in actions: board_next = self.system_model.forward_if(self.board, action) values.append(self.critic_network.forward(board_next)) return actions, values
def __init__(self, board): self.TRAIN = False self.board = board self.ME = 1 self.OPPONENT = 2 self.action_network = ActionNetwork(objective=self.ME) self.critic_network = CriticNetwork( params=[len(board.features) * 5 + 2, 60, 1], pattern_finder=board.pattern_finder) # 神经网络结构 if os.path.exists(CRITIC_NETWORK_SAVEPATH): self.critic_network.layers = pickle.load( open(CRITIC_NETWORK_SAVEPATH, 'rb')) logDebug('Using existing model at ' + CRITIC_NETWORK_SAVEPATH) self.system_model = SystemModel(who=self.ME)
def __init__(self, BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LRA, LRC, action_dim, state_dim, EXPLORE, epsilon, total_loss, total_reward, train_indicator, s_t, a_t, r_t, s_t1, done, speed_lmit, sensor_dis): self.BUFFER_SIZE = BUFFER_SIZE self.BATCH_SIZE = BATCH_SIZE self.GAMMA = GAMMA self.TAU = TAU self.LRA = LRA self.LRC = LRC self.action_dim = action_dim self.state_dim = state_dim self.EXPLORE = EXPLORE self.epsilon = epsilon self.total_loss = total_loss self.total_reward = total_reward self.train_indicator = train_indicator self.s_t = s_t self.a_t = a_t self.r_t = r_t self.s_t1 = s_t1 self.done = done self.speed_limit = speed_lmit self.sensor_dis = sensor_dis # Tensorflow GPU optimization config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) self.actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA) self.critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
def __init__(self): for i in range(args.num_actors): self.actor.append( ActorNetwork(self.sess, self.state_dim, self.action_dim, 0, 0, 0)) for i in range(args.num_critics): self.critic.append( CriticNetwork(self.sess, self.state_dim, self.action_dim, -1, 0, 0))
def __init__(self, env, outfile_name, hindsight): """Initialize the DDPG object. Args: env: an instance of gym.Env on which we aim to learn a policy. outfile_name: (str) name of the output filename. """ action_dim = len(env.action_space.low) state_dim = len(env.observation_space.low) np.random.seed(1337) self.env = env self.sess = tf.compat.v1.Session() tf.keras.backend.set_session(self.sess) self.batch_size = BATCH_SIZE self.buffer = ReplayBuffer(BUFFER_SIZE) self.burn_in_memory_size = BURN_IN_MEMORY self.Critic = CriticNetwork(self.sess, state_dim, action_dim, self.batch_size, tau=TAU, learning_rate=LEARNING_RATE_CRITIC) self.noise_mu = NOISE_MU self.Noise_sigma = NOISE_SIGMA * (env.action_space.high[0] - env.action_space.low[0]) self.Actor = ActorNetwork(sess=self.sess, state_size=state_dim, action_size=action_dim, batch_size=self.batch_size, tau=TAU, learning_rate=LEARNING_RATE_ACTOR) # Defining a custom name for the Tensorboard summary. timestr = time.strftime("%Y%m%d-%H%M%S") if hindsight: save_path = "runs/HER_DDPG_" + timestr + '/' else: save_path = "runs/DDPG_" + timestr + '/' self.writer = SummaryWriter(save_path) self.outfile = outfile_name self.action_range = 1
def __init__(self, state_processor): self.BUFFER_SIZE = 1000 self.BATCH_SIZE = 32 self.GAMMA = 0.99 self.TAU = 0.001 # Target Network HyperParameters self.LRA = 0.0001 # Learning rate for Actor self.LRC = 0.001 # Learning rate for Critic self.action_dim = 21 # Target/Action self.state_dim = 131055 # columns in input state np.random.seed(1337) self.total_reward = 0. self.loss = 0 self.EXPLORE = 100000. self.reward = 0 self.done = False self.epsilon = 1 self.indicator = 0 self.train_indicator = 1 # 1 means Train, 0 means simply Run # Tensorflow GPU optimization self.config = tf.ConfigProto() self.config.gpu_options.allow_growth = True self.sess = tf.Session(config=self.config) K.set_session(self.sess) self.actor = ActorNetwork(self.sess, self.state_dim, self.action_dim, self.BATCH_SIZE, self.TAU, self.LRA) self.critic = CriticNetwork(self.sess, self.state_dim, self.action_dim, self.BATCH_SIZE, self.TAU, self.LRC) self.buff = ReplayBuffer(self.BUFFER_SIZE) # Create replay buffer self.combat_buff = ReplayBuffer( self.BUFFER_SIZE) # Create combat replay buffer self.turn_buff = ReplayBuffer( self.BUFFER_SIZE) # Create turn replay buffer global graph graph = tf.get_default_graph() self.state_processor = state_processor
def __init__(self, outputs, memorySize, discountFactor, learningRate_Critic, learningRate_Actor, target_update_rate, img_rows, img_cols, img_channels): """ Parameters: - outputs: output size - memorySize: size of the memory that will store each state - discountFactor: the discount factor (gamma) - learningRate: learning rate - learnStart: steps to happen before for learning. Set to 128 """ self.action_size = outputs self.memory = memory.Memory(memorySize) self.discountFactor = discountFactor self.learningRateCritic = learningRate_Critic self.learningRateActor = learningRate_Actor self.img_rows = img_rows self.img_cols = img_cols self.img_channels = img_channels self.target_update_rate = target_update_rate self.img_shape = (self.img_channels, self.img_rows, self.img_cols) if K.image_dim_ordering() == 'tf': self.img_shape = (self.img_rows, self.img_cols, self.img_channels) with tf.device(TF_DEVICE): config = tf.ConfigProto() config.gpu_options.allow_growth = True self.sess = tf.Session(config=config) K.set_session(self.sess) print 'tf config!' self.actor = ActorNetwork(self.sess, self.img_shape, self.action_size, self.target_update_rate, self.learningRateActor) print 'actor' self.critic = CriticNetwork(self.sess, self.img_shape, self.action_size, self.target_update_rate, self.learningRateCritic) print 'critic'
def __init__(self, env, gamma=0.98, start_epsilon=1, end_epsilon=0.01, decay=500, lr=1e-4, n_batch=32, n_memory=50000, n_update_target=500, start_learning=1000, log_dir=None): self.env = env n_act = self.env.action_space.n n_obs = np.prod(self.env.observation_space.shape) self.critic = CriticNetwork(n_act=n_act, n_obs=n_obs) self.target_critic = CriticNetwork(n_act=n_act, n_obs=n_obs) self._update_target() self.memory = ReplayMemory(n_obs, n_memory) self.gamma = gamma self.s_epsilon = start_epsilon self.e_epsilon = end_epsilon self.decay = decay self.n_batch = n_batch self.n_update_target = n_update_target self.start_learning = start_learning self.criterion = nn.MSELoss() self.optimizer = optim.Adam(self.critic.parameters(), lr=lr)
def launch_train(self, train_indicator=1): # 1 means Train, 0 means simply Run print 'Launch Training Process' np.random.seed(1337) self.state_t = self.sim_inter.get_state() self.state_dim = self.sim_inter.state_dim self.actor = ActorNetwork(self.sess, self.state_dim, self.action_size, self.batch_size, self.tau, self.LRA) self.critic = CriticNetwork(self.sess, self.state_dim, self.action_size, self.batch_size, self.tau, self.LRC) self.buff = ReplayBuffer(self.buffer_size) self.load_weights() for e in range(self.episode_count): print("Episode : " + str(e) + " Replay Buffer " + str(self.buff.count())) for j in range(self.max_steps): self.loss = 0 self.total_reward = 0 self.action_t = self.action_noise(train_indicator) choose_action = np.argmax(self.action_t[0][0:4]) collision, if_pass = self.update_action(choose_action, train_indicator, e) if self.if_done: self.sim_inter = UpdateInter() self.state_t = self.sim_inter.get_state() self.if_done = False break if train_indicator: self.update_weights() self.total_correct += int(collision <= 0 and if_pass) self.total_wrong += int(collision > 0) accuracy = 0 if self.total_correct + self.total_wrong: accuracy = self.total_correct / (self.total_correct + self.total_wrong) if np.mod(e, 100) == 0: self.accuracy_all.append(accuracy) self.total_correct = 0 self.total_wrong = 0 print("TOTAL REWARD @ " + str(e) + "-th Episode : Reward " + str(self.total_reward) + " Collision " + str(collision > 0) + " Accuracy " + str(accuracy) + " All Accuracy " + str(self.accuracy_all)) print("") print("Finish.")
def playGame(DDPG_config, train_indicator=1): #1 means Train, 0 means simply Run # SETUP STARTS HERE if train_indicator > 0: folder = setup_run(DDPG_config) elif train_indicator == 0: folder = DDPG_config['EXPERIMENT'] if DDPG_config['RSEED'] == 0: DDPG_config['RSEED'] = None np.random.seed(DDPG_config['RSEED']) ACTIVE_NODES = DDPG_config['ACTIVE_NODES'] # Generate an environment if DDPG_config['ENV'] == 'balancing': env = OmnetBalancerEnv(DDPG_config, folder) elif DDPG_config['ENV'] == 'label': env = OmnetLinkweightEnv(DDPG_config, folder) action_dim, state_dim = env.a_dim, env.s_dim MU = DDPG_config['MU'] THETA = DDPG_config['THETA'] SIGMA = DDPG_config['SIGMA'] ou = OU(action_dim, MU, THETA, SIGMA) #Ornstein-Uhlenbeck Process BUFFER_SIZE = DDPG_config['BUFFER_SIZE'] BATCH_SIZE = DDPG_config['BATCH_SIZE'] GAMMA = DDPG_config['GAMMA'] EXPLORE = DDPG_config['EXPLORE'] EPISODE_COUNT = DDPG_config['EPISODE_COUNT'] MAX_STEPS = DDPG_config['MAX_STEPS'] if EXPLORE <= 1: EXPLORE = EPISODE_COUNT * MAX_STEPS * EXPLORE # SETUP ENDS HERE reward = 0 done = False wise = False step = 0 epsilon = 1 indicator = 0 #Tensorflow GPU optimization config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) actor = ActorNetwork(sess, state_dim, action_dim, DDPG_config) critic = CriticNetwork(sess, state_dim, action_dim, DDPG_config) buff = ReplayBuffer(BUFFER_SIZE) #Create replay buffer ltm = ['a_h0', 'a_h1', 'a_V', 'c_w1', 'c_a1', 'c_h1', 'c_h3', 'c_V'] layers_to_mind = {} L2 = {} for k in ltm: layers_to_mind[k] = 0 L2[k] = 0 vector_to_file(ltm, folder + 'weightsL2' + 'Log.csv', 'w') #Now load the weight try: actor.model.load_weights(folder + "actormodel.h5") critic.model.load_weights(folder + "criticmodel.h5") actor.target_model.load_weights(folder + "actormodel.h5") critic.target_model.load_weights(folder + "criticmodel.h5") print("Weight load successfully") except: print("Cannot find the weight") print("OMNeT++ Experiment Start.") # initial state of simulator s_t = env.reset() loss = 0 for i in range(EPISODE_COUNT): print("Episode : " + str(i) + " Replay Buffer " + str(buff.count())) total_reward = 0 for j in range(MAX_STEPS): print('step ', j) epsilon -= 1.0 / EXPLORE a_t = np.zeros([1, action_dim]) noise_t = np.zeros([1, action_dim]) a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0])) if train_indicator and epsilon > 0 and (step % 1000) // 100 != 9: noise_t[0] = epsilon * ou.evolve() a = a_t_original[0] n = noise_t[0] a_t[0] = np.where((a + n > 0) & (a + n < 1), a + n, a - n).clip(min=0, max=1) # execute action s_t1, r_t, done = env.step(a_t[0], j) # print(s_t1) print('reward ', r_t) buff.add(s_t, a_t[0], r_t, s_t1, done) #Add replay buffer scale = lambda x: x #Do the batch update batch = buff.getBatch(BATCH_SIZE) states = scale(np.asarray([e[0] for e in batch])) actions = scale(np.asarray([e[1] for e in batch])) rewards = scale(np.asarray([e[2] for e in batch])) new_states = scale(np.asarray([e[3] for e in batch])) dones = np.asarray([e[4] for e in batch]) y_t = np.zeros([len(batch), action_dim]) target_q_values = critic.target_model.predict( [new_states, actor.target_model.predict(new_states)]) for k in range(len(batch)): if dones[k]: y_t[k] = rewards[k] else: y_t[k] = rewards[k] + GAMMA * target_q_values[k] if train_indicator and len(batch) >= BATCH_SIZE: loss = critic.model.train_on_batch([states, actions], y_t) a_for_grad = actor.model.predict(states) grads = critic.gradients(states, a_for_grad) # 
does this give an output like train_on_batch above? NO actor.train(states, grads) actor.target_train() critic.target_train() with open(folder + 'lossLog.csv', 'a') as file: file.write(pretty(loss) + '\n') total_reward += r_t s_t = s_t1 for layer in actor.model.layers + critic.model.layers: if layer.name in layers_to_mind.keys(): L2[layer.name] = np.linalg.norm( np.ravel(layer.get_weights()[0]) - layers_to_mind[layer.name]) # vector_to_file(np.ravel(layer.get_weights()[0]), folder + 'weights_' + layer.name + 'Log.csv', 'a') layers_to_mind[layer.name] = np.ravel( layer.get_weights()[0]) # if max(L2.values()) <= 0.02: # wise = True if train_indicator and len(batch) >= BATCH_SIZE: vector_to_file([L2[x] for x in ltm], folder + 'weightsL2' + 'Log.csv', 'a') vector_to_file(a_t_original[0], folder + 'actionLog.csv', 'a') vector_to_file(noise_t[0], folder + 'noiseLog.csv', 'a') if 'PRINT' in DDPG_config.keys() and DDPG_config['PRINT']: print("Episode", "%5d" % i, "Step", "%5d" % step, "Reward", "%.6f" % r_t) print("Epsilon", "%.6f" % max(epsilon, 0)) att_ = np.split(a_t[0], ACTIVE_NODES) for _ in range(ACTIVE_NODES): att_[_] = np.insert(att_[_], _, -1) att_ = np.concatenate(att_) print("Action\n", att_.reshape(ACTIVE_NODES, ACTIVE_NODES)) print(max(L2, key=L2.get), pretty(max(L2.values()))) step += 1 if done or wise: break if step % 1000 == 0: # writes at every 1000 step if (train_indicator): actor.model.save_weights(folder + "actormodel.h5", overwrite=True) actor.model.save_weights(folder + "actormodel" + str(step) + ".h5") with open(folder + "actormodel.json", "w") as outfile: outfile.write(actor.model.to_json(indent=4) + '\n') critic.model.save_weights(folder + "criticmodel.h5", overwrite=True) critic.model.save_weights(folder + "criticmodel" + str(step) + ".h5") with open(folder + "criticmodel.json", "w") as outfile: outfile.write(critic.model.to_json(indent=4) + '\n') print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward)) print("Total Step: " + str(step)) print("") env.end() # This is for shutting down print("Finish.")
def playGame(train_indicator=0): #1 means Train, 0 means simply Run BUFFER_SIZE = 100000 BATCH_SIZE = 32 GAMMA = 0.99 TAU = 0.001 #Target Network HyperParameters LRA = 0.0001 #Learning rate for Actor LRC = 0.001 #Lerning rate for Critic action_dim = 3 #Steering/Acceleration/Brake state_dim = 24 #of sensors input np.random.seed(1337) vision = False EXPLORE = 300000. episode_count = 20000 max_steps = 100000 reward = 0 done = False step = 0 epsilon = 1.0 # epsilon = 1 indicator = 0 #Tensorflow GPU optimization config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA) critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC) buff = ReplayBuffer(BUFFER_SIZE) #Create replay buffer # Generate a Torcs environment env = TorcsEnv(vision=vision, throttle=True, gear_change=False) #Now load the weight load_name = "sample_v0_40" print("Now we load the weight") try: actor.model.load_weights("saved/actormodel_{}.h5".format(load_name)) critic.model.load_weights("saved/criticmodel_{}.h5".format(load_name)) actor.target_model.load_weights( "saved/actormodel_{}.h5".format(load_name)) critic.target_model.load_weights( "saved/criticmodel_{}.h5".format(load_name)) print("Weight load successfully") except: print("Cannot find the weight") plt.figure() overall_scores = [] model_name = "sample_v0" print("TORCS Experiment Start.") attacks = [] for i in range(-10, 0): val = i / 10.0 attacks.append([77, val]) # for i in range(45, 55): # attacks.append([i, -1.5]) # attacks.append([i, 1.5]) for i in range(episode_count): print("Episode : " + str(i) + " Replay Buffer " + str(buff.count())) # if np.mod(i, 3) == 0: # ob = env.reset(relaunch=True) #relaunch TORCS every 3 episode because of the memory leak error # else: # ob = env.reset() ob = env.reset() s_t = np.hstack( (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ)) total_reward = 0. 
cur_sample = [] for j in range(max_steps): # if j == 50: # time.sleep(0.099) # continue loss = 0 epsilon -= 1.0 / EXPLORE a_t = np.zeros([1, action_dim]) noise_t = np.zeros([1, action_dim]) a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0])) # if j > 120: noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function( a_t_original[0][0], 0.0, 0.60, 0.30) noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function( a_t_original[0][1], 0.5, 1.00, 0.10) noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function( a_t_original[0][2], -0.1, 1.00, 0.05) #The following code do the stochastic brake #if random.random() <= 0.1: # print("********Now we apply the brake***********") # noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2 , 1.00, 0.10) a_t[0][0] = a_t_original[0][0] + noise_t[0][0] a_t[0][1] = a_t_original[0][1] + noise_t[0][1] a_t[0][2] = a_t_original[0][2] + noise_t[0][2] if j < 20 and train_indicator: a_t[0][1] += 0.5 # if j == 71: # print("cp attack!") # if a_t[0][0] > 0: # a_t[0][0] = -0.3 # else: # a_t[0][0] = 0.3 # print("%.2f"%a_t[0][0]) # a_t[0][2] += 0.7 # if ob.speedX > 0.6: # a_t[0][1] = 0 if (j == attacks[i][0]): print('cp attack on {} with {}'.format(attacks[i][0], attacks[i][1])) a_t[0][0] = attacks[i][1] ob, r_t, done, info = env.step(a_t[0]) print "step: {} reward: {:.5f} action: {:.5f} {:.5f} {:.5f} ".format( j, r_t, a_t[0][0], a_t[0][1], a_t[0][2]) # print "{:.5f} {:.5f} {:.5f} {:.5f} {:.5f}".format(r_t, ob.speedX, ob.speedY, ob.speedZ, ob.rpm) # if(r_t < -50): # r_t -= 10000 # done = True if j > 20 and ob.rpm <= 0.09426: r_t -= 1000 done = True theta = 0.1 s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ)) # s_t1_new = np.array([val + np.abs(val)*random.uniform(-1,1)*theta for val in s_t1]) # print(np.linalg.norm(s_t1_new - s_t1)) # s_t1 = s_t1_new buff.add(s_t, a_t[0], r_t, s_t1, done) #Add replay buffer cur_step_sample = [ s_t.tolist(), a_t[0].tolist(), r_t, s_t1.tolist(), done ] cur_sample.append(cur_step_sample) # #Do the batch update # batch = buff.getBatch(BATCH_SIZE) # states = np.asarray([e[0] for e in batch]) # actions = np.asarray([e[1] for e in batch]) # rewards = np.asarray([e[2] for e in batch]) # new_states = np.asarray([e[3] for e in batch]) # dones = np.asarray([e[4] for e in batch]) # y_t = np.asarray([e[1] for e in batch]) # target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)]) # for k in range(len(batch)): # if dones[k]: # y_t[k] = rewards[k] # else: # y_t[k] = rewards[k] + GAMMA*target_q_values[k] # if (train_indicator): # loss += critic.model.train_on_batch([states,actions], y_t) # a_for_grad = actor.model.predict(states) # grads = critic.gradients(states, a_for_grad) # actor.train(states, grads) # actor.target_train() # critic.target_train() total_reward += r_t s_t = s_t1 # print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss) step += 1 if done: break if j > 200: break if np.mod(i, 3) == 0: if (train_indicator): print("Now we save model") actor.model.save_weights("saved/actormodel_{}_{}.h5".format( model_name, int(step / 10000)), overwrite=True) # with open("actormodel.json", "w") as outfile: # json.dump(actor.model.to_json(), outfile) critic.model.save_weights("saved/criticmodel_{}_{}.h5".format( model_name, int(step / 10000)), overwrite=True) # with open("criticmodel.json", "w") as outfile: # json.dump(critic.model.to_json(), outfile) print("TOTAL REWARD @ " + str(i) + "-th Episode : 
Reward " + str(total_reward)) print("Total Step: " + str(step)) print("") s = "{},{},{:.3f},{},{}\n".format(i, j, total_reward, attacks[i][0], attacks[i][1]) with open('logs/attack_{}.csv'.format(model_name), 'a') as the_file: the_file.write(s) # overall_scores.append(total_reward) # plt.clf() # plt.plot(overall_scores) # plt.savefig("train_plots/{}_{}.jpg".format(model_name, int(step/10000))) # with open('samples/{}_{:05d}.pk'.format(model_name, i), 'w') as outfile: # pickle.dump(cur_sample, outfile) env.end() # This is for shutting down TORCS print("Finish.")
def start_training(goal_position): debug = True env = Environment( debug, goal_position ) #Put here all teh function needed for the interaction with the env observ_dim = env.num_states actions_dim = env.num_actions #Define buffer size and dimension buffer_size = 5000 miniBatch_size = 32 #Define Hyperparameters values gamma = 0.98 #learning parameter --> discount factor: model the fact that future reward are worth less than immediate reward #MQ value factor, if settled near 1 means tha learning is quickly tau = 0.001 # neural networks updating #training parameters explore = 10000 max_episode = 5000 max_steps_in_ep = 10000 reward = 0 done = False epsilon = 0.9 #exploration exploitation value indicator = 0 plot_reward = False save_stats = True #Create Empty array for Plotting VAriables ep_reward = [] episode = [] distance = [] distance_step = [] step_reward = [] #Define goal pos only for print purpose distance_error = [] goal_position = [2.0, 3.0] episode_check = 0 desired_checking_episode = 10 #If running on RDS uncomment this part #Tensorflow GPU optimization # config = tf.ConfigProto() # config.gpu_options.allow_growth = True # sess = tf.Session(config=config) # from keras import backend as K # K.set_session(sess) # #Say to tensorflow to run on CPU config = tf.ConfigProto(device_count={'GPU': 0}) sess = tf.Session(config=config) K.set_session(sess) #Define the actor, critic Network and Buffer actor = ActorNetwork(env, sess) critic = CriticNetwork(env, sess) replay_buffer = ReplayBuffer() saved_path = '/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved' #/Model_Weights_saved' save_directory = os.path.join(os.getcwd(), saved_path) try: actor.model.load_weights( "/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved/Actor_weights/499_actor_weights.h5" ) actor.model_target.load_weights( "/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved/Actor_weights/499_actor_weights.h5" ) critic.model.load_weights( '/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved/Critic_weights/499_critic_model.h5' ) critic.model_target.load_weights( "/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved/Critic_weights/499_critic_model.h5" ) #critic.model_target.load_weights("/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved/Actor_weights/219_critic_weights.h5") print("WEIGHTS LOAD CORRECTLY") except: print("ERR: WEIGHTS LOAD UNCORRECTLY") if not os.path.isdir( save_directory): #return true if path is in an existing directory os.makedirs(save_directory) os.chdir(save_directory) #plot graphs settings if (plot_reward): plt.ion() #turn the interactive mode on plt.title('Training Curve') plt.xlabel('Episodes') plt.ylabel('Total Reward') plt.grid() plt.ion() plt.title('Distance Error') plt.xlabel('Episodes') plt.ylabel('Cartesian Error') plt.grid() #Principal Training LOOP for ep in range(500, max_episode): #receive initial observation state state_t = env._reset( ) #reset environment ---> waiting for take off -> give also the state information relative to the actual drone position ecc state_t = np.asarray( state_t ) #create an array that is the state at time t : errorX,errorY, Terminal total_reward = [0] #initialize reward terminal = [False] #flag relative to the training phase step = 0 #number of iteration inside eac episode episode_check = episode_check + 1 while (terminal[0] == False): if step > 200: #200: break # exit from the main loop step = step + 1 # if debug: # print('###############################') #print('step: {}'.format(step)) print( 
'############################################################') loss = 0 epsilon -= 1.0 / explore #define the expolre exploit probabilities action_t = np.zeros( [1, actions_dim] ) #create a zero array with the same dimesion of the number of actions noise_t = np.zeros([1, actions_dim]) #noise array #the current action is selected according to current policy and exploration noise #The action is predicted from the actor network without noise action_t_initial = actor.model.predict( state_t.reshape(1, state_t.shape[0]) ) #state_t.reshape(1, state_t.shape[0])) #make prediction given the state input,shape gives the dimension of the vector. #print('action_t_initial', action_t_initial) #adding noise to the action predicted noise_t[0][0] = OUhlenbeck_noise(epsilon, action_t_initial[0][0]) noise_t[0][1] = OUhlenbeck_noise(epsilon, action_t_initial[0][1]) #noise_t[0][2] = OUhlenbeck_noise(epsilon,action_t_initial[0][2]) action_t[0][0] = action_t_initial[0][0] + noise_t[0][0] action_t[0][1] = action_t_initial[0][1] + noise_t[0][1] #Step, Apply action in the environment and reach a new state state_t1, reward_t, terminal = env._step(action_t[0], step) #print('state_t1 : {}'.format(state_t1)) state_t1 = np.asarray(state_t1) #create array of the new state #Now the sequence state_t, actions, reward, state_t1 must be add to the replay buffer experience replay_buffer.add_experience(state_t, action_t[0], reward_t, state_t1, terminal) #Sample a new experience (set of sate, action, state1, reward, terminal) from batch mini_batch = replay_buffer.take_experience() states_buff = np.asarray([i[0] for i in mini_batch]) actions_buff = np.asarray([i[1] for i in mini_batch]) reward_buff = np.asarray([i[2] for i in mini_batch]) state_new_buff = np.asarray([i[3] for i in mini_batch]) terminal_buff = np.asarray([i[4] for i in mini_batch]) #istantiate a y_target vector which must be of the same dimesion of the length of the mini batch #y_target = np.asarray([i[1] for i in mini_batch]) #it is only to have the array of the desired dimension #Predic an action from Actor Network given the state_new_buff from mini_batch action_new_buff = actor.model_target.predict(state_new_buff) #Take the prediction from the Critic network about possible Q target relatives to the new_state and action taken from mini batch Q_target_predicted = critic.model_target.predict( [state_new_buff, action_new_buff]) # print('Q_target_predicted', Q_target_predicted) # print('reward_buff', reward_buff) #Update the target of the Q value evaluating the BElmann Equation y_target = [] for j in range(len(mini_batch)): if terminal_buff[j]: #y_target[j] = reward_buff[j] y_target.append(reward_buff[j]) else: y_target.append( reward_buff[j] + gamma * Q_target_predicted[j] ) #it append every time an array and create a sort of list #i resize all in order to obtain an array with 1 column and many rows as the dimension of the batch y_target = np.resize(y_target, [len(mini_batch), 1]) #Evaluate the loss error utilizing the model.train_on_batch and update the weights of the critic #having as target the y_target evaluated from the belmann equation loss = loss + critic.model.train_on_batch( [states_buff, actions_buff], y_target) # L = 1/N * sum(y_target - Q(si,ai|theta^Q)^2) #The actor policy is updated using the sampled policy gradient ############ see https://yanpanlau.github.io/2016/10/11/Torcs-Keras.html for the full expalantion #An action is predicted taking the states from the buffer. 
The action predicted will be used to evaluate the increasing of the critic_gradient action_for_grad = actor.model.predict(states_buff) #The actor network is trained computing the gradient of the critic network repect to actions. #This because the actor network must be trained to follow the maximum gradient increasing direction of the critic network that represent in fact the q network. #Like in Q learning, in the Q table, you follow tha action that increase the Q value. Sa me choose here, only different is that instead of having a value #to follow we have the gradient of the Critic NEtwork critic_gradient = critic.gradients(states_buff, action_for_grad) #The actor network is trained having as input the states from which the critic gradient is computed and as target the critic_gradient itself. #The goal of the actor networ is to output actions that goes in the direction of the gradient and every time maximize it actor.actor_train(states_buff, critic_gradient) #The last two rows are done in order to updates the target network #theta^Q = tau*theta^Q +(1- tau)*theta^Q' actor.target_net_train() critic.target_net_train() #Evaluate distance error fro print purpose error_x = (goal_position[0] - state_t[0]) error_y = (goal_position[1] - state_t[1]) distance_error = math.sqrt(error_x * error_x + error_y * error_y) #Update Total Reward #print('reward_t', reward_t) if not reward_t[0]: reward_t[0] = -100 * distance_error total_reward[0] = total_reward[0] + reward_t[0] #The new state becomes the actual state state_t = state_t1 #### Save distance and reward for each step only for pllotting purpose distance_step.append(distance_error) step_reward.append(reward_t[0]) if (terminal[0] == True or step == 200): distance_step_mat = np.asarray(distance_step) step_reward_mat = np.asarray(step_reward) distance_step_name = 'Statistics/Step_Statistics/%d_distance_step.csv' % ( ep) step_reward_name = 'Statistics/Step_Statistics/%d_step_reward.csv' % ( ep) np.savetxt( distance_step_name, distance_step_mat, delimiter="," ) #Nel post processing in matlab importare il vettore episode su asse x e fare plot con reward e distance su asse y np.savetxt(step_reward_name, step_reward_mat, delimiter=",") distance_step_mat = [] step_reward_mat = [] distance_step = [] step_reward = [] #Save Model and Weights every 50 episodes as a checkpoint print( 'episode: {}, steps: {}, tot_rewards: {}, terminal: {}'.format( ep, step, total_reward, terminal)) print('distance_error:{}, pos_x: {}, pos_y: {}'.format( distance_error, state_t[0], state_t[1])) #if ((step+1)%10 == 0): if (episode_check == desired_checking_episode): #save Model action_model_name = 'Actor_weights/%d_actor_model.h5' % (ep) critic_model_name = 'Critic_weights/%d_critic_model.h5' % (ep) save_path = os.path.join(save_directory, action_model_name) actor.model.save(action_model_name) #True if you want to overwrite critic.model.save(critic_model_name) print('Model Saved in path: %s' % save_directory) #Save Weights model_ext = ".h5" model_ext2 = ".json" action_save_weights_name = 'Actor_weights/%d_actor_weights' % (ep) actor.model.save_weights(action_save_weights_name + model_ext, overwrite=True) #Save Weights with open(action_save_weights_name + model_ext2, "w") as outfile: json.dump(actor.model.to_json(), outfile) #save Model Archutecture, not weights critic_save_weights_name = 'critic_weights/ %d_critic_weights' % ( ep) critic.model.save_weights(critic_save_weights_name + model_ext, overwrite=True) with open(critic_save_weights_name + model_ext2, "w") as outfile: 
json.dump(critic.model.to_json(), outfile) print('Weights Saved in path: %s' % save_directory) ####################### #Save Statistics if (save_stats): episode.append(ep) ep_reward.append(total_reward[0]) distance.append(distance_error) if (episode_check == desired_checking_episode): ep_reward_mat = np.asarray(ep_reward) episode_mat = np.asarray([episode]) distance_mat = np.asarray(distance) episode_mat = np.resize(episode_mat, [ep, 1]) episode_name = 'Statistics/%d_episode.csv' % (ep) episode_reward_name = 'Statistics/%d_reward.csv' % (ep) distance_name = 'Statistics/%d_distance.csv' % (ep) np.savetxt( episode_name, episode_mat, delimiter="," ) #Nel post processing in matlab importare il vettore episode su asse x e fare plot con reward e distance su asse y np.savetxt(episode_reward_name, ep_reward_mat, delimiter=",") np.savetxt(distance_name, distance_mat, delimiter=",") episode_check = 0
def playGame(checkpoints=None, train_indicator=1, eps=1.0): #1 means Train, 0 means simply Run BUFFER_SIZE = 40000 BATCH_SIZE = 16 GAMMA = 0.99 TAU = 0.001 #Target Network HyperParameters LRA = 0.01 #Learning rate for Actor LRC = 0.05 #Lerning rate for Critic vision = True action_dim = 3 #Steering/Acceleration/Brake if vision: state_dim = (64, 64, 3) #of sensors input else: state_dim = 29 np.random.seed(1337) EXPLORE = 1000000. episode_count = 2000 max_steps = 8000000 reward = 0 done = False step = 0 epsilon = eps indicator = 0 #Tensorflow GPU optimization config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) summary_writer = tf.train.SummaryWriter('logs', graph_def=sess.graph_def) actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA, vision, summary_writer) critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC, vision) buff = ReplayBuffer(BUFFER_SIZE) #Create replay buffer history = History() # Generate a Torcs environment env = TorcsEnv(vision=vision, throttle=True, gear_change=False) log_file = open('train_log.log', 'w') #Now load the weight print("Now we load the weight") try: actor.model.load_weights("actormodel_{}.h5".format(checkpoints)) critic.model.load_weights("criticmodel_{}.h5".foramt(checkpoints)) actor.target_model.load_weights("actormodel_{}.h5".format(checkpoints)) critic.target_model.load_weights( "criticmodel_{}.h5".format(checkpoints)) print("Weight load successfully") except: print("Cannot find the weight") print("TORCS Experiment Start.") max_reward = 0 min_reward = 0 for i in range(episode_count): print("Episode : " + str(i) + " Replay Buffer " + str(buff.count())) if np.mod(i, 3) == 0: ob = env.reset( relaunch=True ) #relaunch TORCS every 3 episode because of the memory leak error else: ob = env.reset() if vision: history.fill((ob.img)) s_t = history.get() else: s_t = np.hstack( (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm)) total_reward = 0. total_damage = 0. 
for j in range(max_steps): loss = 0 epsilon -= 1.0 / EXPLORE a_t = np.zeros([1, action_dim]) noise_t = np.zeros([1, action_dim]) if vision: a_t_original = actor.model.predict( s_t.reshape((-1, ) + state_dim)) else: a_t_original = actor.model.predict(s_t.reshape( 1, s_t.shape[0])) noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function( a_t_original[0][0], 0.0, 0.30, 0.30) noise_t[0][1] = 0.1 + train_indicator * max( epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10) noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function( a_t_original[0][2], -0.1, 1.00, 0.05) a_t[0][0] = a_t_original[0][0] + noise_t[0][0] a_t[0][1] = a_t_original[0][1] + noise_t[0][1] a_t[0][2] = a_t_original[0][2] + noise_t[0][2] ob, r_t, done, info = env.step(a_t[0]) damage = ob.damage if vision: last_s_t = history.get().copy() history.add((ob.img)) next_s_t = history.get().copy() if np.mod(step, 4) == 0: buff.add(last_s_t, a_t[0], r_t, next_s_t, done) #Add replay buffer s_t1 = history.get() else: s_t1 = np.hstack( (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm)) buff.add(s_t, a_t[0], r_t, s_t1, done) #Do the batch update batch = buff.getBatch(BATCH_SIZE) states = np.asarray([e[0] for e in batch]) actions = np.asarray([e[1] for e in batch]) rewards = np.asarray([e[2] for e in batch]) new_states = np.asarray([e[3] for e in batch]) dones = np.asarray([e[4] for e in batch]) y_t = np.asarray([e[1] for e in batch]) if vision: target_q_values = critic.target_model.predict([ new_states.reshape((-1, ) + state_dim), actor.target_model.predict(new_states).reshape( (-1, ) + (action_dim, )) ]) else: target_q_values = critic.target_model.predict( [new_states, actor.target_model.predict(new_states)]) for k in range(len(batch)): if dones[k]: y_t[k] = rewards[k] else: y_t[k] = rewards[k] + GAMMA * target_q_values[k] if train_indicator and buff.count() >= 1000: loss += critic.model.train_on_batch([states, actions], y_t) a_for_grad = actor.model.predict(states) grads = critic.gradients(states, a_for_grad) actor.train(states, grads) actor.target_train() critic.target_train() total_reward += r_t total_damage += damage s_t = s_t1 print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss) step += 1 if done: break if np.mod(i, 3) == 0: if (train_indicator): print("Now we save model") actor.model.save_weights("actormodel_{}.h5".format(i), overwrite=True) with open("actormodel.json", "w") as outfile: json.dump(actor.model.to_json(), outfile) critic.model.save_weights("criticmodel_{}.h5".format(i), overwrite=True) with open("criticmodel.json", "w") as outfile: json.dump(critic.model.to_json(), outfile) max_reward = max(max_reward, total_reward) min_reward = min(min_reward, total_reward) print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward) + " EPS " + str(epsilon)) print("Total Step: " + str(step) + ' Max: ' + str(max_reward) + ' Min: ' + str(min_reward)) print("") env.end() # This is for shutting down TORCS print("Finish.")
# L2 REGULARISATION L2C = 0.00 L2A = 0.0 env = gym.make(ENVIRONMENT_NAME) action_dim = env.action_space.shape[0] action_high = +1. action_low = -1. input_dim = env.observation_space.shape[0] sess = tf.InteractiveSession(config=tf.ConfigProto( intra_op_parallelism_threads=2)) actor = ActorNetwork(sess, input_dim, action_dim, BATCH_SIZE, TAU, LRA, L2A) critic = CriticNetwork(sess, input_dim, action_dim, BATCH_SIZE, TAU, LRC, L2C) buff = ReplayBuffer(BUFFER_SIZE) # exploration = OUNoise(action_dim) #env.monitor.start('experiments/' + 'cartPoli-v0',force=True) reward_vector = np.zeros(10000) for ep in range(10000): # open up a game state s_t, r_0, done = env.reset(), 0, False #s_t = s_t.reshape() REWARD = 0 # exploration.reset() for t in range(1000):
def playGame(train_indicator=1): #1 means Train, 0 means simply Run BUFFER_SIZE = 100000 BATCH_SIZE = 30 GAMMA = 0.99 TAU = 0.0001 #Target Network HyperParameters LRA = 0.00001 #Learning rate for Actor LRC = 0.0001 #Lerning rate for Critic action_dim = 1 #Steering/Acceleration/Brake state_dim = 15 #of sensors input np.random.seed(1337) vision = False EXPLORE = 1000000. episode_count = 3000 max_steps = 1000000 reward = 0 done = False step = 0 epsilon = 1 indicator = 0 t_dt = 0.0005 #TCP/IP communication for MATLAB - Python HOST = '0.0.0.0' PORT = 40000 s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF, 4096) s.bind((HOST, PORT)) #Matlab client waiting s.listen(1) print("waiting for response from client at port ", PORT) conn, addr = s.accept() #Tensorflow GPU optimization config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA) critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC) buff = ReplayBuffer(BUFFER_SIZE) #Create replay buffer #Now load the weight print("Now we load the weight") try: actor.model.load_weights("actormodel.h5") critic.model.load_weights("criticmodel.h5") actor.target_model.load_weights("actormodel.h5") critic.target_model.load_weights("criticmodel.h5") print("Weight load successfully") except: print("Cannot find the weight") print("TORCS Experiment Start.") for i in range(episode_count): print("Episode : " + str(i) + " Replay Buffer " + str(buff.count())) total_reward = 0. for j in range(max_steps): loss = 0 epsilon -= 1.0 / EXPLORE a_t = np.zeros([1, action_dim]) noise_t = np.zeros([1, action_dim]) Lateral = 0 #Carsim export(input factor) variable catch s_t try: ob_exports = conn.recv(4096) except KeyboardInterrupt: #conn.shutdown() conn.close() break ob_exports1 = json.loads(ob_exports.decode('utf-8')) print('export=', ob_exports1) if not ob_exports: #conn.shutdown() conn.close() break t_current = ob_exports1[0] T_bar_Tq = ob_exports1[1] / 10 LatG = ob_exports1[2] YawRate = ob_exports1[3] / 50 Yaw = ob_exports1[4] / 3.14 Lateral = ob_exports1[5] / 20 Steer_SW = ob_exports1[6] / 6000 StrAV_SW = ob_exports1[7] / 5000 Steer_L1 = ob_exports1[8] / 180 Steer_R1 = ob_exports1[9] / 180 Steer_L2 = ob_exports1[10] / 4 Steer_R2 = ob_exports1[11] / 4 Xcg_TM = ob_exports1[12] / 1000 Ycg_TM = ob_exports1[13] / 300 Zcg_TM = ob_exports1[14] / 45 curv = ob_exports1[15] # print('T_bar_Tq=',T_bar_Tq) # print('LatG=',LatG) s_t = np.hstack((T_bar_Tq, LatG, YawRate, Yaw, Lateral, Steer_SW, StrAV_SW, Steer_L1, Steer_R1, Steer_L2, Steer_R2, Xcg_TM, Ycg_TM, Zcg_TM, curv)) print('s_t=', s_t) a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0])) print('a_t_original=', a_t_original) print('a_t_original=', a_t_original) a_t_inv = a_t_original[0][0] print(a_t_inv.shape) critic_gradient = critic.gradients(s_t.reshape(1, s_t.shape[0]), a_t_original) noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function( a_t_original[0][0], 0.0, 0.00, 0.00) # noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.5 , 1.00, 0.10) # noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1 , 1.00, 0.05) #The following code do the stochastic brake #if random.random() <= 0.1: # print("********Now we apply the brake***********") # noise_t[0][2] = train_indicator * max(epsilon, 0) * 
OU.function(a_t_original[0][2], 0.2 , 1.00, 0.10) a_t[0][0] = a_t_original[0][0] + noise_t[0][0] # a_t[0][1] = a_t_original[0][1] + noise_t[0][1] # a_t[0][2] = a_t_original[0][2] + noise_t[0][2] a_t[0][0] = a_t[0][0] * 3500 t_current = t_current + t_dt print('t_next=', t_current) print(a_t[0]) at = np.array(a_t[0]) # print("at=",at) at1 = np.insert(at, 0, t_current) # print('at1=,',at1) at2 = list(at1) print('at2=,', at2) #provide action value to matlab try: at_json = json.dumps(at2) a = '\r\n' at_json1 = at_json + a # print('at_json1',at_json1) at_json2 = at_json1.encode('utf-8') # print('at_json2',at_json2) conn.sendall(at_json2) except KeyboardInterrupt: #conn.shutdown() conn.close() break #Carsim export(input factor) variable catch s_t1 try: ob_exports = conn.recv(4096) except KeyboardInterrupt: #conn.shutdown() conn.close() break ob_exports1 = json.loads(ob_exports.decode('utf-8')) print('s_t1=', ob_exports1) if not ob_exports: #conn.shutdown() conn.close() break T_bar_Tq1 = ob_exports1[0] / 10 LatG1 = ob_exports1[1] YawRate1 = ob_exports1[2] / 50 Yaw1 = ob_exports1[3] / 3.14 Lateral1 = ob_exports1[4] / 20 Steer_SW1 = ob_exports1[5] / 6000 StrAV_SW1 = ob_exports1[6] / 5000 Steer_L11 = ob_exports1[7] / 180 Steer_R11 = ob_exports1[8] / 180 Steer_L21 = ob_exports1[9] / 4 Steer_R21 = ob_exports1[10] / 4 Xcg_TM1 = ob_exports1[11] / 1000 Ycg_TM1 = ob_exports1[12] / 300 Zcg_TM1 = ob_exports1[13] / 45 curv = ob_exports1[14] r_t = ob_exports1[15] done = ob_exports1[16] # print('T_bar_Tq1=',T_bar_Tq1) print('r_t=', r_t) # if abs(Lateral1) > 1 or abs(Yaw1) > 1 : if t_current > 20 or abs(Yaw1) > 1: break s_t1 = np.hstack( (T_bar_Tq1, LatG1, YawRate1, Yaw1, Lateral1, Steer_SW1, StrAV_SW1, Steer_L11, Steer_R11, Steer_L21, Steer_R21, Xcg_TM1, Ycg_TM1, Zcg_TM1, curv)) buff.add(s_t, a_t[0], r_t, s_t1, done) #Add replay buffer #Do the batch update batch = buff.getBatch(BATCH_SIZE) states = np.asarray([e[0] for e in batch]) actions = np.asarray([e[1] for e in batch]) rewards = np.asarray([e[2] for e in batch]) new_states = np.asarray([e[3] for e in batch]) dones = np.asarray([e[4] for e in batch]) y_t = np.asarray([e[1] for e in batch]) # print ("Rewards=",rewards) # print ("Actions=",actions) # print ("states=",states) # print (states.shape) target_q_values = critic.target_model.predict( [new_states, actor.target_model.predict(new_states)]) # print("rt1=",target_q_values) # print(target_q_values.shape) for k in range(len(batch)): if dones[k]: y_t[k] = rewards[k] else: y_t[k] = rewards[k] + GAMMA * target_q_values[k] if (train_indicator): loss += critic.model.train_on_batch([states, actions], y_t) a_for_grad = actor.model.predict(states) # print("a_for_grad=",a_for_grad) # print(a_for_grad.shape) grads = critic.gradients(states, a_for_grad) # print("grads=",grads) # print(grads.shape) if step > 30: grads_factor = gradient_inverter(critic_gradient, a_t_inv, p_min=-1, p_max=1, BATCH_SIZE=30) else: grads_factor = 1 # print("grads_factor=",grads_factor) grads_factor1 = np.asarray(grads_factor) grads3 = grads * grads_factor1 # print("grads3=",grads3) actor.train(states, grads3) actor.target_train() critic.target_train() total_reward += r_t s_t = s_t1 print("Episode", i, "t_current", t_current, "Action", a_t, "Reward", r_t, "Loss", loss, "step", step) step += 1 if done: break #s.shutdown() if (train_indicator): print("Now we save model") actor.model.save_weights("actormodel.h5", overwrite=True) with open("actormodel.json", "w") as outfile: json.dump(actor.model.to_json(), outfile) 
critic.model.save_weights("criticmodel.h5", overwrite=True) with open("criticmodel.json", "w") as outfile: json.dump(critic.model.to_json(), outfile) print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward)) print("Total Step: " + str(step)) print("") # s.close() # TCP/IP socket close s.close() # TCP/IP socket close print("Finish.")
def RunSim(train_toggle): #1 means Train, 0 means simply Run BUFFER_SIZE = 1000000 BATCH_SIZE = 32 GAMMA = 0.99 TAU = 0.001 LRA = 0.0002 #Actor Learning LRC = 0.002 #Critic Learning #Sim options lqr_toggle = 0 action_dim = 1 state_dim = 2 hist_rt = [] hist_reward = [] d_exploration = 200000. num_max_episodes = 250 max_seconds = 15 reward = -100 done = False step = 0 epsilon = 1 Noise_magnitude = 15 #Tensorflow GPU optimization config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) # Initialize Actor and Critic Networks actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA) critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC) #Create replay buffer buff = ReplayBuffer(BUFFER_SIZE) #Load the ODE environment env = ODE(np.zeros(state_dim)) # Load network parameters from previous training try: actor.model.load_weights("actormodel.h5") critic.model.load_weights("criticmodel.h5") actor.target_model.load_weights("actormodel.h5") critic.target_model.load_weights("criticmodel.h5") except: print("Weight Error") print("Taining state is ", train_toggle) for i in range(num_max_episodes): print("Current episode : " + str(i)) # Before every episote completly reset the envrionment ob = env.reset() s_t = np.asarray(ob)[:, None].T total_reward = 0. max_steps = int(max_seconds / env.dt) # Run the episode for j in range(max_steps): loss = 0 epsilon -= 1.0 / d_exploration a_t = np.zeros([1, action_dim]) noise_t = np.zeros([1, action_dim]) a_t_original = actor.model.predict(s_t) noise_t[0] = noise_toggle * train_toggle * max( epsilon, 0.05) * Noise_magnitude * np.random.randn(action_dim) a_t[0] = a_t_original[0] + noise_t[0] #a_t[0][1] = a_t_original[0][1] + noise_t[0][1] if lqr_toggle == 1: a_t[0] = -klqr.dot(np.asarray(s_t[0])) ob, r_t, done, info = env.step(a_t[0]) s_t1 = np.asarray(ob)[:, None].T buff.add(s_t, a_t[0], r_t, s_t1, done) #Add replay buffer #Do the batch update batch = buff.getBatch(BATCH_SIZE) states = np.asarray([e[0][0] for e in batch]) actions = np.asarray([e[1] for e in batch]) rewards = np.asarray([e[2] for e in batch]) new_states = np.asarray([e[3][0] for e in batch]) dones = np.asarray([e[4] for e in batch]) y_t = np.asarray([e[1] for e in batch]) #Output from target Networks target_q_values = critic.target_model.predict( [new_states, actor.target_model.predict(new_states)]) #TD for k in range(len(batch)): if dones[k]: y_t[k] = rewards[k] else: y_t[k] = rewards[k] + GAMMA * target_q_values[k] if (train_toggle): loss += critic.model.train_on_batch([states, actions], y_t) a_for_grad = actor.model.predict(states) grads = critic.gradients(states, a_for_grad) actor.train(states, grads) actor.target_train() critic.target_train() total_reward += r_t hist_rt.append(r_t) s_t = s_t1 step += 1 if done: break hist_reward.append(np.mean(hist_rt)) hist_rt = [] if np.mod(i, 1) == 0: plt.close() hist = np.asarray(env.hist) u = np.asarray(env.u_hist) fig = plt.figure(figsize=(15, 5)) plt.suptitle('Episode ' + str(i)) ax1 = fig.add_subplot(121) rhist = np.asarray(env.ref_hist) time = np.linspace(0, j * env.dt, num=j + 1) ax1.plot(time, hist[:, 0], 'b', label='Output') ax1.plot(time, rhist[:, 0], 'b-.', label='Reference') ax1.set_ylabel('y(t)') ax1.set_xlabel('t') ax2 = fig.add_subplot(122) ax2.plot(time, u[:], 'g') ax2.set_ylabel('Control Signal u(t)') ax2.set_xlabel('t') #plt.show(block=False) fig.savefig('figures/results' + str(i) + '.pdf') fig2 = plt.figure(figsize=(5, 5)) 
plt.title('Mean Reward Per Espisode') plt.plot(hist_reward, 'go') ymax = 100 #plt.ylim(-100, 0) #plt.show(block=False) fig2.savefig('figures/reward.pdf') print 'reward', total_reward if np.mod(i, 10) == 0: if (train_toggle): print("Now we save model") actor.model.save_weights("actormodel.h5", overwrite=True) with open("actormodel.json", "w") as outfile: json.dump(actor.model.to_json(), outfile) critic.model.save_weights("criticmodel.h5", overwrite=True) with open("criticmodel.json", "w") as outfile: json.dump(critic.model.to_json(), outfile)
class Game(object): config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) mx = 0 LOG = 0 trainnum = 0 modelcnt = 0 noiselevel = 0.5 rpm = rpm(2000000) TAU = 0.001 lr_actor = 3e-4 lr_critic = 3e-4 train_interval = 1 train_times = 100 action_dim = 18 state_dim = 76 max_steps = 1000 // 4 cnt = 0 GAMMA = 0.96 BATCH_SIZE = 128 log_path = './logs' import threading as th lock = th.Lock() actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, lr_actor) critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, lr_critic) callback = TensorBoard(log_path) callback.set_model(critic.model) def write_log(self, callback, names, logs, batch_no): output = open('logs/data.txt', 'w') output.write(str(self.LOG) + ' ' + str(self.trainnum)) output.close() for name, value in zip(names, itertools.repeat(logs)): summary = tf.Summary() summary_value = summary.value.add() summary_value.simple_value = value summary_value.tag = name callback.writer.add_summary(summary, batch_no) callback.writer.flush() callback = TensorBoard(self.log_path) def play(self, env, cnt): episode_memory = [] step = 0 s_t = env.reset() total_reward = 0. sp = 0. noise_t = np.zeros([1, self.action_dim]) a_t = np.zeros([1, self.action_dim]) noise = self.noiselevel self.noiselevel = noise * 0.999 for j in range(self.max_steps): self.lock.acquire() global graph with graph.as_default(): a_t_original = self.actor.model.predict(np.array([s_t])) self.lock.release() noise = noise * 0.98 if cnt % 3 == 0: if j % 5 == 0: noise_t[0] = np.random.randn(self.action_dim) * noise elif cnt % 3 == 1: if j % 5 == 0: noise_t[0] = np.random.randn(self.action_dim) * noise * 2 else: noise_t = np.zeros([1, self.action_dim]) a_t = a_t_original + noise_t for i in range(self.action_dim): if (a_t[0][i] > 1): a_t[0][i] = 1 elif (a_t[0][i] < 0): a_t[0][i] = 0 ob, r_t, done, _, pen = env.step(a_t[0]) s_t1 = ob episode_memory.append([s_t, a_t[0], r_t - pen, done, s_t1]) total_reward += r_t sp += pen s_t = s_t1 step += 1 if done or step == 1000 / 4 - 1: if total_reward > self.mx: self.mx = total_reward print("Episode", cnt, "Step", step, "Reward", total_reward, "max", self.mx, "penalty", sp) train_names = ['reward'] self.lock.acquire() self.LOG = self.LOG + 1 self.write_log(self.callback, train_names, total_reward, self.LOG) self.lock.release() break self.lock.acquire() for i in range(step): self.rpm.add(episode_memory[i]) self.lock.release() def playonce(self, env, T): from multi import fastenv fenv = fastenv(env, 4) self.play(fenv, T) env.rel() del fenv def play_ignore(self, env, T): import threading as th try: t = th.Thread(target=self.playonce, args=( env, T, )) t.setDaemon(True) t.start() except: print("startfail") def playifavailable(self, T): while True: remote_env = farmer.acq_env() if remote_env == False: pass else: self.play_ignore(remote_env, T) break def train(self): memory = self.rpm if memory.size() < self.BATCH_SIZE: return global graph loss = 0 for T in range(self.train_times): [states, actions, rewards, dones, new_states] = memory.sample_batch(self.BATCH_SIZE) y_t = np.asarray([0.0] * self.BATCH_SIZE) rewards = np.concatenate(rewards) self.lock.acquire() with graph.as_default(): target_q_values = self.critic.target_model.predict( [new_states, self.actor.target_model.predict(new_states)]) target_q_values = target_q_values.reshape( [1, target_q_values.shape[0]])[0] for k in range(self.BATCH_SIZE): if dones[k]: y_t[k] = rewards[k] else: y_t[k] = 
rewards[k] + self.GAMMA * target_q_values[k] with graph.as_default(): self.critic.model.optimizer.learning_rate = self.lr_critic logs = self.critic.model.train_on_batch([states, actions], y_t) a_for_grad = self.actor.model.predict(states) grads = self.critic.gradients(states, a_for_grad) self.actor.train(states, grads, learning_rate=self.lr_actor) self.actor.target_train() self.critic.target_train() train_names = ['train_loss'] self.write_log(self.callback, train_names, logs, self.trainnum) self.trainnum = self.trainnum + 1 loss = loss + logs self.lock.release() print("train", memory.size(), loss) def save(self): self.modelcnt = self.modelcnt + 1 self.actor.target_model.save_weights("logs/actormodel.h5", overwrite=True) self.critic.target_model.save_weights("logs/criticmodel.h5", overwrite=True) self.actor.target_model.save_weights("logs/actormodel{}.h5".format( self.modelcnt)) self.critic.target_model.save_weights("logs/criticmodel{}.h5".format( self.modelcnt)) print("save") def pre(self): print("Now we load the weight") try: input = open('logs/data.txt', 'r') self.LOG, self.trainnum = map(int, input.read().split(' ')) print("LOG", self.LOG, "trainnum", self.trainnum) input.close() print("log found") self.critic.model.load_weights("logs/criticmodel.h5") self.critic.target_model.load_weights("logs/criticmodel.h5") self.actor.model.load_weights("logs/actormodel.h5") self.actor.target_model.load_weights("logs/actormodel.h5") print("Weight load successfully") self.rpm.load('logs/rpm.pickle') print("rmp success") except: if self.LOG > 0: print("Load fault") return False else: print("A new experiment") return True def run(self): np.random.seed(23333) episode_count = 10000 reward = 0 done = False LOSS = 0 for T in range(50): self.playifavailable(T) for T in range(episode_count): self.train() self.playifavailable(T) if np.mod(T, 100) == 0 and T >= 100: self.save() print("Finish.")
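# --- Added note (not part of the original class) ---
# actor.target_train() / critic.target_train() are called after every batch above
# but never defined in these snippets. In DDPG they normally perform a Polyak
# (soft) update of the target network toward the online network. A minimal Keras
# sketch, assuming `model` / `target_model` attributes and a mixing rate like TAU:
def soft_update(model, target_model, tau=0.001):
    weights = model.get_weights()
    target_weights = target_model.get_weights()
    mixed = [tau * w + (1.0 - tau) * tw for w, tw in zip(weights, target_weights)]
    target_model.set_weights(mixed)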
import gym
import tensorflow as tf
import numpy as np
from ActorNetwork import ActorNetwork
from CriticNetwork import CriticNetwork
from ddpg import DDPG

env_name = 'CartPole-v1'
env = gym.make(env_name)
env._max_episode_steps = 200
stop_train_score = 200  # stop training after reaching this score for 3 consecutive episodes

sess = tf.Session()
critic = CriticNetwork(sess, 4, 2, 0.01, 0.001)
actor = ActorNetwork(sess, 4, 2, 0.01, 0.001, activation='softmax')
ddpg = DDPG(sess, actor, critic, batch_size=32)


def train_game(max_steps=10000):
    state = env.reset()
    done = False
    r = 0
    step_count = 0
    while not done and step_count <= max_steps:
        a = ddpg.get_action_for_state(state, True, [0.6, 0.6], [0.5, 0.5], [0.2, 0.2])
        new_state, reward, done, _ = env.step(np.argmax(a))
        ddpg.step(state, a, reward, new_state, done)
        r += reward
        step_count += 1
        state = new_state
    return r, ddpg.mean_loss
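# --- Added note (not part of the original script) ---
# train_game() and stop_train_score are defined above, but the driver loop is not
# shown. One plausible outer loop, purely illustrative: stop once the score stays
# at or above the threshold for three consecutive episodes.
scores = []
for episode in range(1000):
    score, mean_loss = train_game()
    scores.append(score)
    print("episode", episode, "score", score, "mean loss", mean_loss)
    if len(scores) >= 3 and all(s >= stop_train_score for s in scores[-3:]):
        print("solved after", episode + 1, "episodes")
        break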
def train(train_indicator=1): env = Env() BUFFER_SIZE = 200000 BATCH_SIZE = 128 GAMMA = 0.99 TAU = 0.001 # Target Network HyperParameters LRA = 0.0001 # Learning rate for Actor LRC = 0.001 # Lerning rate for Critic action_dim = env.action_dim state_dim = env.observation_space() np.random.seed(1337) EXPLORE = 100000. episode_count = 100 max_steps = 10000 reward = 0 done = False step = 0 epsilon = 1 indicator = 0 config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA) critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC) buff = ReplayBuffer(BUFFER_SIZE) print("load model weight") try: actor.model.load_weights("model/actormodel.h5") critic.model.load_weights("model/criticmodel.h5") actor.target_model.load_weights("model/actormodel.h5") critic.target_model.load_weights("model/criticmodel.h5") print("load successfully") except: print("Cannot find the model weight") s_t = env.reset() for i in range(episode_count): print("Episode : " + str(i) + " Replay Buffer " + str(buff.count())) total_reward = 0. for j in range(max_steps): loss = 0 epsilon -= 1.0 / EXPLORE a_t = np.zeros([1, action_dim]) noise_t = np.zeros([1, action_dim]) a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0])) noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function( a_t_original[0][0], 10.0, 1, 7) noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function( a_t_original[0][1], 0, 1, 3) a_t[0][0] = a_t_original[0][0] + noise_t[0][0] a_t[0][1] = a_t_original[0][1] + noise_t[0][1] s_t1, r_t, _ = env.step(a_t[0]) buff.add(s_t, a_t[0], r_t, s_t1, done) # env.get_memory(buff) batch = buff.getBatch(BATCH_SIZE) states = np.asarray([e[0] for e in batch]) actions = np.asarray([e[1] for e in batch]) rewards = np.asarray([e[2] for e in batch]) new_states = np.asarray([e[3] for e in batch]) dones = np.asarray([e[4] for e in batch]) y_t = np.asarray([e[1] for e in batch]) target_q_values = critic.target_model.predict( [new_states, actor.target_model.predict(new_states)]) for k in range(len(batch)): if dones[k]: y_t[k] = rewards[k] else: y_t[k] = rewards[k] + GAMMA * target_q_values[k] if train_indicator: loss += critic.model.train_on_batch([states, actions], y_t) a_for_grad = actor.model.predict(states) grads = critic.gradients(states, a_for_grad) actor.train(states, grads) actor.target_train() critic.target_train() total_reward += r_t s_t = s_t1 print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss) step += 1 if done: break if np.mod(i, 3) == 0: if train_indicator: print("save model model") actor.model.save_weights("model/actormodel.h5", overwrite=True) # actor.model.save_weights("model/actormodel.h5", overwrite=True) # with open("model/actormodel.json", "wb") as outfile: # json.dump(actor.model.to_json(), outfile) critic.model.save_weights("model/criticmodel.h5", overwrite=True) # critic.model.save_weights("model/criticmodel.h5", overwrite=True) # with open("model/criticmodel.json", "wb") as outfile: # json.dump(critic.model.to_json(), outfile) print("TOTAL REWARD " + str(i) + "-th Episode : Reward " + str(total_reward)) print("Total Step: " + str(step)) print("") print("Finish.") return actor
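# --- Added note (not part of the original script) ---
# OU.function(...) is used for exploration noise throughout these scripts but its
# implementation is not included. A common Ornstein-Uhlenbeck form is sketched
# below; the class actually used by the authors may differ.
import numpy as np

class OU(object):
    @staticmethod
    def function(x, mu, theta, sigma):
        # mean-reverting drift toward mu plus Gaussian diffusion
        return theta * (mu - x) + sigma * np.random.randn(1)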
env = gym.make(ENVIRONMENT_NAME)
if ENVIRONMENT_NAME == 'Hockey-v2':
    env_left = gym.make(TEST_ENV_LEFT)
    env_middle = gym.make(TEST_ENV_MIDDLE)
    env_right = gym.make(TEST_ENV_RIGHT)
action_dim = env.action_space.shape[0]
action_high = env.action_space.high
action_low = env.action_space.low
input_dim = env.observation_space.shape[0]

sess = tf.InteractiveSession()
logger = tf.train.SummaryWriter(OUT_DIR, sess.graph)
actor = ActorNetwork(sess, input_dim, action_dim, BATCH_SIZE, TAU, LRA, L2A)
critic = CriticNetwork(sess, input_dim, action_dim, BATCH_SIZE, TAU, LRC, L2C)
buff = ReplayBuffer(BUFFER_SIZE)
summary = tf.merge_all_summaries()
n = OUnoise(action_dim, 0.15, NOISE)
# n = OUnoise(action_dim)

saver = tf.train.Saver()
ckpt = tf.train.get_checkpoint_state(OUT_DIR)
if ckpt and ckpt.model_checkpoint_path:
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Model loaded from disk")

# initialize logger
L = Logger()
log_not_empty = L.Load(OUT_DIR + LOG_FILE)
def playGame(train_indicator=1): #1 means Train, 0 means simply Run BUFFER_SIZE = 100000 BATCH_SIZE = 32 GAMMA = 0.99 TAU = 0.001 #Target Network HyperParameters LRA = 0.0001 #Learning rate for Actor LRC = 0.001 #Lerning rate for Critic action_dim = 3 #Steering/Acceleration/Brake state_dim = 29 #of sensors input np.random.seed(1337) vision = False EXPLORE = 100000. episode_count = 2000 max_steps = 100000 reward = 0 done = False step = 0 epsilon = 1 indicator = 0 #Tensorflow GPU optimization config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA) critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC) buff = ReplayBuffer(BUFFER_SIZE) #Create replay buffer # Generate a Torcs environment env = TorcsEnv(vision=vision, throttle=True,gear_change=False) #Now load the weight print("Now we load the weight") try: actor.model.load_weights("actormodel.h5") critic.model.load_weights("criticmodel.h5") actor.target_model.load_weights("actormodel.h5") critic.target_model.load_weights("criticmodel.h5") print("Weight load successfully") except: print("Cannot find the weight") print("TORCS Experiment Start.") for i in range(episode_count): print("Episode : " + str(i) + " Replay Buffer " + str(buff.count())) if np.mod(i, 3) == 0: ob = env.reset(relaunch=True) #relaunch TORCS every 3 episode because of the memory leak error else: ob = env.reset() s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm)) total_reward = 0. for j in range(max_steps): loss = 0 epsilon -= 1.0 / EXPLORE a_t = np.zeros([1,action_dim]) noise_t = np.zeros([1,action_dim]) a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0])) noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0], 0.0 , 0.60, 0.30) noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.5 , 1.00, 0.10) noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1 , 1.00, 0.05) #The following code do the stochastic brake #if random.random() <= 0.1: # print("********Now we apply the brake***********") # noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2 , 1.00, 0.10) a_t[0][0] = a_t_original[0][0] + noise_t[0][0] a_t[0][1] = a_t_original[0][1] + noise_t[0][1] a_t[0][2] = a_t_original[0][2] + noise_t[0][2] ob, r_t, done, info = env.step(a_t[0]) s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm)) buff.add(s_t, a_t[0], r_t, s_t1, done) #Add replay buffer #Do the batch update batch = buff.getBatch(BATCH_SIZE) states = np.asarray([e[0] for e in batch]) actions = np.asarray([e[1] for e in batch]) rewards = np.asarray([e[2] for e in batch]) new_states = np.asarray([e[3] for e in batch]) dones = np.asarray([e[4] for e in batch]) y_t = np.asarray([e[1] for e in batch]) target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)]) for k in range(len(batch)): if dones[k]: y_t[k] = rewards[k] else: y_t[k] = rewards[k] + GAMMA*target_q_values[k] if (train_indicator): loss += critic.model.train_on_batch([states,actions], y_t) a_for_grad = actor.model.predict(states) grads = critic.gradients(states, a_for_grad) actor.train(states, grads) actor.target_train() critic.target_train() total_reward += r_t s_t = s_t1 if 
np.mod(step, 30) == 0: print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss) step += 1 if done: break if np.mod(i, 3) == 0: if (train_indicator): print("Now we save model") actor.model.save_weights("actormodel.h5", overwrite=True) with open("actormodel.json", "w") as outfile: json.dump(actor.model.to_json(), outfile) critic.model.save_weights("criticmodel.h5", overwrite=True) with open("criticmodel.json", "w") as outfile: json.dump(critic.model.to_json(), outfile) print("TOTAL REWARD @ " + str(i) +"-th Episode : Reward " + str(total_reward)) print("Total Step: " + str(step)) print("") env.end() # This is for shutting down TORCS print("Finish.")
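# --- Added note (not part of the original script) ---
# ReplayBuffer is used via add/getBatch/count everywhere in this collection but is
# not shown. A minimal deque-backed sketch consistent with that interface; the
# real class may differ in eviction or sampling details.
import random
from collections import deque

class ReplayBuffer(object):
    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, new_state, done):
        self.buffer.append((state, action, reward, new_state, done))

    def getBatch(self, batch_size):
        # return the whole buffer while it is still smaller than a batch
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))

    def count(self):
        return len(self.buffer)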
def playGame(train_indicator = 1): BUFFER_SIZE = 10000 BATCH_SIZE = 128 GAMMA = 0.9 TAU = 0.01 lr_actor = 1e-3 lr_critic = 1e-3 train_interval = 1 train_times = 20 action_dim = 3 state_dim = 2 np.random.seed(2333) EXPLORE = 5000. episode_count = 2000 max_steps = 100000 reward = 0 done = False step = 0 epsilon = 1 indicator = 0 LOSS = 0 #Tensorflow GPU optimization config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, lr_actor) critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, lr_critic) buff = ReplayBuffer(BUFFER_SIZE) env = gym.make('MountainCarContinuous-v0') # env = wrappers.Monitor(env, '/tmp/cartpole-experiment-1') # Now load the weight print("Now we load the weight") try: # actor.model.load_weights("actormodel.h5") # critic.model.load_weights("criticmodel.h5") # actor.target_model.load_weights("actormodel.h5") # critic.target_model.load_weights("criticmodel.h5") print("Weight load successfully") except: print("Cannot find the weight") loss = 0 for i in range(episode_count): # print("Episode : " + str(i) + " Replay Buffer " + str(buff.count())) ob = env.reset() s_t = ob total_reward = 0. for j in range(max_steps): epsilon -= 1.0 / EXPLORE a_t = np.zeros([1, action_dim]) noise_t = np.zeros([1, action_dim]) a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0])) for k in range(action_dim): noise_t[0][k] = train_indicator * max(epsilon, 0) * OU().function(a_t_original[0][k], 0, 1.0, 0.3) action = a_t_original[0] env.render() ob, r_t, done, _ = env.step(action) s_t1 = ob # print(ob) buff.add(s_t, a_t_original[0], r_t, s_t1, done) total_reward += r_t s_t = s_t1 step += 1 if done: print("Episode", i, "Step", step, "Reward", total_reward) break if (train_indicator) and i % train_interval == 0: loss = 0 for T in range(train_times): batch = buff.getBatch(BATCH_SIZE) states = np.asarray([e[0] for e in batch]) actions = np.asarray([e[1] for e in batch]) rewards = np.asarray([e[2] for e in batch]) new_states = np.asarray([e[3] for e in batch]) dones = np.asarray([e[4] for e in batch]) y_t = np.asarray([0.0 for e in batch]) target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)]) for k in range(len(batch)): if dones[k]: y_t[k] = rewards[k] else: y_t[k] = rewards[k] + GAMMA * target_q_values[k] loss = critic.model.train_on_batch([states,actions], y_t) a_for_grad = actor.model.predict(states) grads = critic.gradients(states, a_for_grad) actor.train(states, grads) actor.target_train() critic.target_train() print("Episode", i, "Step", step, "Loss", loss) if np.mod(i, 3) == 0: if (train_indicator): # print("Now we save model") actor.model.save_weights("actormodel.h5", overwrite=True) with open("actormodel.json", "w") as outfile: json.dump(actor.model.to_json(), outfile) critic.model.save_weights("criticmodel.h5", overwrite=True) with open("criticmodel.json", "w") as outfile: json.dump(critic.model.to_json(), outfile) #print("TOTAL REWARD @ " + str(i) +"-th Episode : Reward " + str(total_reward), "loss: " + str(LOSS), "epsilon" + str(epsilon)) # print("Total Step: " + str(step)) # print("") print("Finish.") env.close()
def playGame(train_indicator=0): #1 means Train, 0 means simply Run BUFFER_SIZE = 100000 BATCH_SIZE = 32 GAMMA = 0.99 TAU = 0.001 #Target Network HyperParameters LRA = 0.0001 #Learning rate for Actor LRC = 0.001 #Lerning rate for Critic action_dim = 3 #Steering/Acceleration/Brake state_dim = 29 #of sensors input np.random.seed(1337) vision = False EXPLORE = 100000. episode_count = 2000 max_steps = 100000 reward = 0 done = False step = 0 epsilon = 1 indicator = 0 #Tensorflow GPU optimization config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA) critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC) buff = ReplayBuffer(BUFFER_SIZE) #Create replay buffer # Generate a Torcs environment env = TorcsEnv(vision=vision, throttle=True, gear_change=False) # Now load the weight # print("Now we load the weight") # try: # actor.model.load_weights("actormodel.h5") # critic.model.load_weights("criticmodel.h5") # actor.target_model.load_weights("actormodel.h5") # critic.target_model.load_weights("criticmodel.h5") # print("Weight load successfully") # except: # print("Cannot find the weight") print("TORCS Experiment Start.") for i in range(episode_count): print("Episode : " + str(i) + " Replay Buffer " + str(buff.count())) if np.mod(i, 3) == 0: ob = env.reset( relaunch=True ) #relaunch TORCS every 3 episode because of the memory leak error else: ob = env.reset() s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm)) print ob.track total_reward = 0. stucked = 0 for j in range(max_steps): loss = 0 epsilon -= 1.0 / EXPLORE a_t = np.zeros([1, action_dim]) noise_t = np.zeros([1, action_dim]) a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0])) noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function( a_t_original[0][0], 0.0, 0.60, 0.30) noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function( a_t_original[0][1], 0.5, 1.00, 0.10) noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function( a_t_original[0][2], -0.1, 1.00, 0.05) #The following code do the stochastic brake if random.random() <= 0.1: print("********Now we apply the brake***********") noise_t[0][2] = train_indicator * max( epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10) a_t[0][0] = a_t_original[0][0] + noise_t[0][0] a_t[0][1] = a_t_original[0][1] + noise_t[0][1] a_t[0][2] = a_t_original[0][2] + noise_t[0][2] ob, r_t, done, info = env.step(a_t[0]) s_t1 = np.hstack( (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm)) buff.add(s_t, a_t[0], r_t, s_t1, done) #Add replay buffer #Do the batch update batch = buff.getBatch(BATCH_SIZE) states = np.asarray([e[0] for e in batch]) actions = np.asarray([e[1] for e in batch]) rewards = np.asarray([e[2] for e in batch]) new_states = np.asarray([e[3] for e in batch]) dones = np.asarray([e[4] for e in batch]) y_t = np.asarray([e[1] for e in batch]) target_q_values = critic.target_model.predict( [new_states, actor.target_model.predict(new_states)]) for k in range(len(batch)): if dones[k]: y_t[k] = rewards[k] else: y_t[k] = rewards[k] + GAMMA * target_q_values[k] if (train_indicator): loss += critic.model.train_on_batch([states, actions], y_t) a_for_grad = actor.model.predict(states) grads = critic.gradients(states, a_for_grad) actor.train(states, grads) actor.target_train() 
critic.target_train() total_reward += r_t s_t = s_t1 print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss) step += 1 if done: break if np.mod(i, 3) == 0: if (train_indicator): print("Now we save model") actor.model.save_weights("actormodel.h5", overwrite=True) with open("actormodel.json", "w") as outfile: json.dump(actor.model.to_json(), outfile) critic.model.save_weights("criticmodel.h5", overwrite=True) with open("criticmodel.json", "w") as outfile: json.dump(critic.model.to_json(), outfile) print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward)) print("Total Step: " + str(step)) print("") env.end() # This is for shutting down TORCS print("Finish.")
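# --- Added note (not part of the original script) ---
# critic.gradients(states, actions) and actor.train(states, grads) implement the
# deterministic policy gradient: dQ/da from the critic is pushed through the actor.
# A hedged TF1-style sketch of that plumbing; every name below is an assumption,
# not the authors' ActorNetwork/CriticNetwork code.
import tensorflow as tf

def make_critic_action_grads(sess, critic_model, state_ph, action_ph):
    """Return a callable (states, actions) -> dQ/da."""
    grads_op = tf.gradients(critic_model.output, action_ph)
    def gradients(states, actions):
        return sess.run(grads_op, feed_dict={state_ph: states, action_ph: actions})[0]
    return gradients

def make_actor_train(sess, actor_model, state_ph, action_dim, learning_rate):
    """Return a callable (states, action_grads) -> one gradient-ascent step on Q."""
    action_grad_ph = tf.placeholder(tf.float32, [None, action_dim])
    params_grad = tf.gradients(actor_model.output, actor_model.trainable_weights,
                               -action_grad_ph)  # negate so the step ascends Q
    optimize = tf.train.AdamOptimizer(learning_rate).apply_gradients(
        zip(params_grad, actor_model.trainable_weights))
    def train(states, action_grads):
        # requires sess.run(tf.global_variables_initializer()) once beforehand
        sess.run(optimize, feed_dict={state_ph: states, action_grad_ph: action_grads})
    return train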
class Agent(EventTask): # ENV -> AGENT OBSERVE = 0 # AGENT -> ENV ACT = 0 def __init__(self, trainable=1, load_model=1): super(Agent, self).__init__('Agent') np.random.seed(1337) self.step = 0 self.state_cache = dict() self.action_cache = dict() # Tensorflow GPU optimization config = tf.ConfigProto() config.gpu_options.allow_growth = True self.sess = tf.Session(config=config) self.trainable = trainable K.set_session(self.sess) self.actor = ActorNetwork(self.sess, globalConfig.TAU, globalConfig.LRA) self.critic = CriticNetwork(self.sess, globalConfig.TAU, globalConfig.LRC) self.buff = ReplayBuffer(globalConfig.BUFFER_SIZE) # Create replay buffer self.cnt = 0 if load_model == 1: # Now load the weight print("Now we load the weight") try: self.actor.model.load_weights("actormodel.h5") self.critic.model.load_weights("criticmodel.h5") self.actor.target_model.load_weights("actormodel.h5") self.critic.target_model.load_weights("criticmodel.h5") print("Weight load successfully") except: print("Cannot find the weight") self.graph = tf.get_default_graph() def do_job(self, job_obj): (job_id, data, env_proxy) = job_obj if self.verbose: print(str.format("Processing Job id:{}..", job_id)) with self.graph.as_default(): if job_id == self.OBSERVE: # observation is_first_shot, done, n_pigs, n_stones, n_woods, n_ices, n_tnts, bird_type, im, r_t, current_level = data print(str.format("----> observation from {}, level = {}", env_proxy.get_client_ip(), current_level)) print(str.format("first shot:{}, reward:{}, episode done:{}", is_first_shot, r_t, done)) print(str.format("# pigs={}, # stones={}, # woods={}, # ices={}, n_tnts={}, bird={}" , n_pigs, n_stones, n_woods, n_ices, n_tnts, bird_type)) # print('im shape=', im.shape) s_t1 = [np.array(im), np.array([n_pigs, n_stones, n_woods, n_ices, n_tnts]), np.array([bird_type])] # store transition into replay buffer try: self.buff.add(self.state_cache[env_proxy], self.action_cache[env_proxy], r_t, s_t1, done) print("store transition into replay buffer") except Exception as e: print("first shot of the game") pass if self.buff.count() > 0: print("Do the batch update...") # Do the batch update batch = self.buff.getBatch(globalConfig.BATCH_SIZE) # states = np.asarray([e[0] for e in batch]) images = [e[0][0] for e in batch] num_objects = [e[0][1] for e in batch] birds = [e[0][2] for e in batch] states = [np.array(images), np.array(num_objects), np.array(birds)] actions = np.asarray([e[1] for e in batch]) rewards = np.asarray([e[2] for e in batch]) new_images = [e[3][0] for e in batch] new_num_objects = [e[3][1] for e in batch] new_birds = [e[3][2] for e in batch] new_states = [np.array(new_images), np.array(new_num_objects), np.array(new_birds)] dones = np.asarray([e[4] for e in batch]) y_t = np.asarray([e[1] for e in batch]) # print('batch update shape, size =', len(batch)) # print(np.array(images).shape) # print(np.array(num_objects).shape) # print(np.array(birds).shape) new_a = self.actor.target_model.predict(states) # print('new_a=\n', new_a) target_q_values = self.critic.target_model.predict(new_states + [new_a]) # print('q values =\n', target_q_values) for k in range(len(batch)): if dones[k]: y_t[k] = rewards[k] else: y_t[k] = rewards[k] + globalConfig.GAMMA * target_q_values[k] if self.trainable: print('loss =', self.critic.model.train_on_batch(states + [actions], y_t)) a_for_grad = self.actor.model.predict(states) grads = self.critic.gradients(states, a_for_grad) self.actor.train(states, grads) self.actor.target_train() self.critic.target_train() # select action 
a_t s_t = s_t1 pixels = np.reshape(s_t[0], tuple([1]) + s_t[0].shape) num_objects = np.reshape(s_t[1], tuple([1]) + s_t[1].shape) input_bird = np.reshape(s_t[2], tuple([1]) + s_t[2].shape) # print(pixels.shape, num_objects.shape, input_bird.shape) a_t = self.actor.model.predict([pixels, num_objects, input_bird]) target = math.floor(a_t[0][0] * np.sum(s_t[1] - 0.00001)) # avoid index out of range error high_shot = 1 if a_t[0][1] > 0.5 else 0 tap_time = math.floor(65 + a_t[0][2] * 25) print('raw a_t w/o noise=', a_t) print(str.format("next action w/o noise: target({}), high_shot({}), tap time({})", target, high_shot, tap_time)) if self.trainable == 1: # random noise noise1 = np.random.randn(1) * 0.2 noise2 = np.random.randn(1) * 0.3 noise3 = np.random.randn(1) * 0.2 print('random noise=', noise1, noise2, noise3) a_t[0][0] += noise1 a_t[0][1] += noise2 a_t[0][2] += noise3 a_t[0][0] = min(1, a_t[0][0]) a_t[0][0] = max(0, a_t[0][0]) a_t[0][1] = min(1, a_t[0][1]) a_t[0][1] = max(0, a_t[0][1]) a_t[0][2] = min(1, a_t[0][2]) a_t[0][2] = max(0, a_t[0][2]) target = math.floor(a_t[0][0] * np.sum(s_t[1] - 0.00001)) # avoid index out of range error high_shot = 1 if a_t[0][1] > 0.5 else 0 tap_time = math.floor(65 + a_t[0][2] * 25) print('raw a_t w/ noise =', a_t) print(str.format("next action w/ noise: target({}), high_shot({}), tap time({})", target, high_shot, tap_time)) # cache self.state_cache[env_proxy] = s_t self.action_cache[env_proxy] = a_t[0] # execute an action env_proxy.execute(target, high_shot, tap_time) self.cnt += 1 if self.cnt % globalConfig.model_save_interval == 0: if self.trainable == 1: print("Saving model....") self.actor.model.save_weights("actormodel.h5", overwrite=True) with open("actormodel.json", "w") as outfile: json.dump(self.actor.model.to_json(), outfile) self.critic.model.save_weights("criticmodel.h5", overwrite=True) with open("criticmodel.json", "w") as outfile: json.dump(self.critic.model.to_json(), outfile)
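# --- Added note (not part of the original agent) ---
# The clamp-and-decode logic above maps the actor's three [0, 1] outputs to a
# concrete shot (target index, high/low trajectory, tap time). A slightly
# simplified helper, shown only for readability; the name is illustrative.
import math
import numpy as np

def decode_action(a, num_objects):
    a = np.clip(a, 0.0, 1.0)
    target = int(math.floor(a[0] * (np.sum(num_objects) - 1e-5)))  # avoid index out of range
    high_shot = 1 if a[1] > 0.5 else 0
    tap_time = int(math.floor(65 + a[2] * 25))  # tap between 65 and 90
    return target, high_shot, tap_time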
def playGame(train_indicator=0): # 1 means Train, 0 means simply Run BUFFER_SIZE = 100000 BATCH_SIZE = 32 GAMMA = 0.99 TAU = 0.001 # Target Network HyperParameters LRA = 0.00001 # Learning rate for Actor LRC = 0.0001 # Lerning rate for Critic server_number = 5 # node_number = 18 hot_node_number = 150 action_dim = hot_node_number # Number of servers state_dim = hot_node_number * (server_number + 1 + 10 ) # 1000 node * 10 features # baseline = 4e-05 #load&locality of baselines np.random.seed(500) # vision = False EXPLORE = 100000. episode_count = 100 max_steps = 100000 line_number = 1000 step_number = 35 # reward = 0 done = False step = 0 epsilon = 1 # indicator = 0 # Tensorflow GPU optimization config = tf.ConfigProto() # config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA) critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC) buff = ReplayBuffer(BUFFER_SIZE) # Create replay buffer # Generate a MDS environment env = MetaEnvironment(server_number) # Now load the weight print("Now we load the weight") try: actor.model.load_weights("model/actormodel-" + str(server_number) + ".h5") critic.model.load_weights("model/criticmodel-" + str(server_number) + ".h5") actor.target_model.load_weights("model/actormodel-" + str(server_number) + ".h5") critic.target_model.load_weights("model/criticmodel-" + str(server_number) + ".h5") print("Weight load successfully") except: print("Cannot find the weight") print("Experiment Start.") f = open("query.txt") queryList = [] for line in f.readlines(): line = line.strip() queryList.append(line) f.close() sumLoc = 0 sumLod = 0 lossList = [] mdsLoadList = [[] for x in range(server_number)] for i in range(episode_count): print("Episode : " + str(i) + " Replay Buffer " + str(buff.count())) # if np.mod(i, 3) == 0: # ob = env.reset(relaunch=True) #relaunch every 3 episode because of the memory leak error # else: # ob = env.reset() traceList = queryList[0:line_number] # Reset s_t = env.state(traceList) # Get State from env localityList = [] loadList = [] total_reward = 0. 
for j in range(max_steps): loss = 0 epsilon -= 1.0 / EXPLORE a_t = np.zeros([1, action_dim]) noise_t = np.zeros([1, action_dim]) # add noise a_t_original = actor.model.predict(s_t) for k in range(action_dim): noise_t[0][k] = train_indicator * max( epsilon, 0) * OU.function(a_t_original[0][k], 0.0, 0.60, 0.30) for m in range(action_dim): a_t[0][m] = a_t_original[0][m] # + noise_t[0][m] migration = env.take_actions(a_t[0]) print("migration", migration) tracelist = queryList[(j + 1) * line_number:(j + 2) * line_number] s_t1 = env.state(tracelist) # Update state from env # r_t = 0.5*env.locality() + 50*env.load() - baseline # print("gagaga", 1e5*env.locality() + 1e7*env.load()) # 1.5, 3, 2 x = 1e5 * env.locality() + 1e7 * env.load() - 1.5 * migration # x = 1e5*env.locality() + 1.5 * 1e7*env.load() # r_t = 1.0 / (1.0 + np.exp(-(x/50))) r_t = x if j == step_number: done = True else: done = False buff.add(s_t, a_t[0], r_t, s_t1, done) # Add replay buffer # Do the batch update batch = buff.getBatch(BATCH_SIZE) states = np.asarray([e[0] for e in batch]) actions = np.asarray([e[1] for e in batch]) rewards = np.asarray([e[2] for e in batch]) new_states = np.asarray([e[3] for e in batch]) dones = np.asarray([e[4] for e in batch]) y_t = np.asarray([e[1] for e in batch]) states = states.reshape(len(batch), -1) new_states = new_states.reshape(len(batch), -1) actions = actions.reshape(len(batch), -1) target_q_values = critic.target_model.predict( [new_states, actor.target_model.predict(new_states)]) for k in range(len(batch)): if dones[k]: y_t[k] = rewards[k] else: y_t[k] = rewards[k] + GAMMA * target_q_values[k] if (train_indicator): loss += critic.model.train_on_batch([states, actions], y_t) a_for_grad = actor.model.predict(states) grads = critic.gradients(states, a_for_grad) actor.train(states, grads) actor.target_train() critic.target_train() total_reward += r_t s_t = s_t1 # print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss, "Locality", env.locality(), "Load", env.load()) print("Episode", i, "Step", step, "Reward", r_t, "Loss", loss, "Locality", env.locality(), "Load", env.load()) lossList.append(loss) localityList.append(env.locality()) loadList.append(env.load()) for index in range(server_number): mdsLoadList[index].append(env.loadList[index]) step += 1 if done: break curLocalitySum = sum(localityList) curLoadSum = sum(loadList) # f = open('' + str(server_number) + '.txt', 'w') # f.write(','.join(map(str, lossList))) # f.close() # f = open('anglecut-mdsload-' + str(server_number) + '.txt', 'w') # for i in range(server_number): # f.write(','.join(map(str, mdsLoadList[i]))) # f.write('\n') # f.close() # print("写入成功") if np.mod(i, 3) == 0: if (train_indicator): print("Now we save model") actor.model.save_weights("model/actormodel-" + str(server_number) + ".h5", overwrite=True) with open("model/actormodel-" + str(server_number) + ".json", "w") as outfile: json.dump(actor.model.to_json(), outfile) critic.model.save_weights("model/criticmodel-" + str(server_number) + ".h5", overwrite=True) with open("model/criticmodel-" + str(server_number) + ".json", "w") as outfile: json.dump(critic.model.to_json(), outfile) print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward)) print("Total Step: " + str(step)) # print("Final Locality:", env.final_locality(), "Final Load Balancing:", env.final_load()) # env.clear() print("") # env.end() print("Finish.")
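# --- Added note (not part of the original script) ---
# Every script in this collection rebuilds the same DDPG target in a per-sample
# loop: y_k = r_k + GAMMA * Q'(s'_k, mu'(s'_k)) for non-terminal transitions and
# y_k = r_k at terminals. An equivalent vectorized form, shown only for clarity:
import numpy as np

def ddpg_targets(rewards, dones, target_q_values, gamma):
    q_next = np.asarray(target_q_values).reshape(-1)
    not_done = 1.0 - np.asarray(dones, dtype=np.float32)
    return np.asarray(rewards, dtype=np.float32) + gamma * q_next * not_done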
class DDPG(object): """A class for running the DDPG algorithm.""" def __init__(self, env, outfile_name, hindsight): """Initialize the DDPG object. Args: env: an instance of gym.Env on which we aim to learn a policy. outfile_name: (str) name of the output filename. """ action_dim = len(env.action_space.low) state_dim = len(env.observation_space.low) np.random.seed(1337) self.env = env self.sess = tf.compat.v1.Session() tf.keras.backend.set_session(self.sess) self.batch_size = BATCH_SIZE self.buffer = ReplayBuffer(BUFFER_SIZE) self.burn_in_memory_size = BURN_IN_MEMORY self.Critic = CriticNetwork(self.sess, state_dim, action_dim, self.batch_size, tau=TAU, learning_rate=LEARNING_RATE_CRITIC) self.noise_mu = NOISE_MU self.Noise_sigma = NOISE_SIGMA * (env.action_space.high[0] - env.action_space.low[0]) self.Actor = ActorNetwork(sess=self.sess, state_size=state_dim, action_size=action_dim, batch_size=self.batch_size, tau=TAU, learning_rate=LEARNING_RATE_ACTOR) # Defining a custom name for the Tensorboard summary. timestr = time.strftime("%Y%m%d-%H%M%S") if hindsight: save_path = "runs/HER_DDPG_" + timestr + '/' else: save_path = "runs/DDPG_" + timestr + '/' self.writer = SummaryWriter(save_path) self.outfile = outfile_name self.action_range = 1 def generate_burn_in(self): num_actions = self.env.action_space.shape[0] state = self.env.reset() state = np.array(state) done = False for i in range(self.burn_in_memory_size): action = np.random.uniform( -1.0, 1.0, size=num_actions ) #Randomly generating actions for the buffer burn_in new_state, reward, done, info = self.env.step(action) new_state = np.array(new_state) self.buffer.add(state, action, reward, new_state, done) state = new_state if (done): state = self.env.reset() state = np.array(state) done = False def evaluate(self, num_episodes, num_iteration): """Evaluate the policy. Noise is not added during evaluation. Args: num_episodes: (int) number of evaluation episodes. Returns: success_rate: (float) fraction of episodes that were successful. average_return: (float) Average cumulative return. """ test_rewards = [] success_vec = [] plt.figure(figsize=(12, 12)) for i in range(num_episodes): s_vec = [] state = self.env.reset() s_t = np.array(state) total_reward = 0.0 done = False step = 0 success = False while not done: s_vec.append(s_t) a_t = self.Actor.actor_network.predict(s_t[None])[0] # import pdb; pdb.set_trace() new_s, r_t, done, info = self.env.step(a_t) if done and "goal" in info["done"]: success = True new_s = np.array(new_s) total_reward += r_t s_t = new_s step += 1 success_vec.append(success) test_rewards.append(total_reward) if i < 9: plt.subplot(3, 3, i + 1) s_vec = np.array(s_vec) pusher_vec = s_vec[:, :2] puck_vec = s_vec[:, 2:4] goal_vec = s_vec[:, 4:] plt.plot(pusher_vec[:, 0], pusher_vec[:, 1], '-o', label='pusher') plt.plot(puck_vec[:, 0], puck_vec[:, 1], '-o', label='puck') plt.plot(goal_vec[:, 0], goal_vec[:, 1], '*', label='goal', markersize=10) plt.plot([0, 5, 5, 0, 0], [0, 0, 5, 5, 0], 'k-', linewidth=3) plt.fill_between([-1, 6], [-1, -1], [6, 6], alpha=0.1, color='g' if success else 'r') plt.xlim([-1, 6]) plt.ylim([-1, 6]) if i == 0: plt.legend(loc='lower left', fontsize=28, ncol=3, bbox_to_anchor=(0.1, 1.0)) if i == 8: # Comment out the line below to disable plotting. # plt.show() buf = io.BytesIO() plt.savefig(buf, format='png') buf.seek(0) return np.mean(success_vec), np.mean(test_rewards), np.std( test_rewards), buf def train(self, num_episodes, hindsight=False): """Runs the DDPG algorithm. 
Args: num_episodes: (int) Number of training episodes. hindsight: (bool) Whether to use HER. """ self.generate_burn_in() for i in range(num_episodes): state = self.env.reset() s_t = np.array(state) total_reward = 0.0 done = False step = 0 loss = 0 store_current_states = [] store_actions = [] store_dones = [] self.ActionNoise = EpsilonNormalActionNoise( self.noise_mu, self.Noise_sigma, EPSILON) while not done: # Collect one episode of experience, saving the states and actions # to store_states and store_actions, respectively. action = np.clip( self.ActionNoise( self.Actor.actor_network.predict(s_t[None])[0]), -self.action_range, self.action_range) # import pdb; pdb.set_trace() new_state, reward, done, info = self.env.step(action) new_state = np.array(new_state) store_current_states.append(s_t) store_actions.append(action) store_dones.append(done) self.buffer.add(s_t, action, reward, new_state, done) transition_minibatch = np.asarray( self.buffer.get_batch(self.batch_size)) target_actions = self.Actor.target_actor_network.predict( np.stack(transition_minibatch[:, 3])) target_Qs = self.Critic.target_critic_network.predict( [np.stack(transition_minibatch[:, 3]), target_actions]) target_values = np.stack( transition_minibatch[:, 2]) + GAMMA * target_Qs.reshape(-1) reward_indices = np.where(transition_minibatch[:, 4] == True)[0] target_values[ reward_indices] = target_values[reward_indices] - GAMMA * ( target_Qs.reshape(-1)[reward_indices]) # present_values = self.Critic.critic_network.predict([transition_minibatch[:,0][0][None],transition_minibatch[:,1][0][None]]) history = self.Critic.critic_network.fit( [ np.stack(transition_minibatch[:, 0]), np.stack(transition_minibatch[:, 1]) ], target_values, batch_size=self.batch_size, epochs=1, verbose=0) #Update Actor Policy actor_actions = self.Actor.actor_network.predict( np.stack(transition_minibatch[:, 0])) action_grads = self.Critic.gradients( np.stack(transition_minibatch[:, 0]), actor_actions)[0] gradient_update = self.Actor.train( np.stack(transition_minibatch[:, 0]), action_grads) # import pdb; pdb.set_trace() self.Critic.update_target() self.Actor.update_target() loss += history.history['loss'][-1] s_t = new_state step += 1 total_reward += reward if hindsight: # For HER, we also want to save the final next_state. 
store_current_states.append(new_state) store_current_states_copy = copy.deepcopy(store_current_states) her_states, her_rewards = self.env.apply_hindsight( store_current_states_copy) # her_states, her_rewards, her_actions = self.add_hindsight_replay_experience(store_current_states, store_actions) for k in range(len(her_states) - 1): if her_rewards[k] == 0: self.buffer.add(her_states[k], store_actions[k], her_rewards[k], her_states[k + 1], True) break else: self.buffer.add(her_states[k], store_actions[k], her_rewards[k], her_states[k + 1], store_dones[k]) del store_current_states, store_actions, store_dones store_states, store_actions, store_dones = [], [], [] loss = loss / step self.writer.add_scalar('train/loss', loss, i) self.writer.add_scalar("Training Reward VS Episode", total_reward, i) # Logging print("Episode %d: Total reward = %d" % (i, total_reward)) print("\tTD loss = %.2f" % (loss, )) print("\tSteps = %d; Info = %s" % (step, info['done'])) if i % 100 == 0: successes, mean_rewards, std_rewards, buf = self.evaluate( 10, i) image = tf.image.decode_png(buf.getvalue(), channels=3) image = image.eval(session=self.sess) self.writer.add_image('Performance', image, i, dataformats='HWC') self.writer.add_scalar('mean_reward', mean_rewards, i) self.writer.add_scalar('std_reward', std_rewards, i) print('Evaluation: success = %.2f; return = %.2f' % (successes, mean_rewards)) with open(self.outfile, "a") as f: f.write("%.2f, %.2f,\n" % (successes, mean_rewards)) def add_hindsight_replay_experience(self, states, actions): """Relabels a trajectory using HER. Args: states: a list of states. actions: a list of states. """ her_states, her_rewards = self.env.apply_hindsight(states) her_actions = actions # print('her_states size: ', len(her_states)) # print('her_rewardssize: ', len(her_rewards)) # print('her_states: ', her_states) # print('her_rewards: ', her_rewards) return her_states, her_rewards, her_actions
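# --- Added note (not part of the original class) ---
# env.apply_hindsight(states) is provided by the pusher environment and is not
# shown here. Judging from evaluate() above (pusher = state[:2], puck = state[2:4],
# goal = state[4:]), it relabels the goal with the finally achieved puck position.
# A rough sketch under that assumption only; the real reward and threshold may differ.
import numpy as np

def apply_hindsight(states, achieved=slice(2, 4), goal=slice(4, 6), threshold=0.5):
    states = [np.array(s, copy=True) for s in states]
    new_goal = states[-1][achieved]  # pretend the final puck position was the goal
    her_states, her_rewards = [], []
    for s in states:
        s[goal] = new_goal
        reached = np.linalg.norm(s[achieved] - new_goal) < threshold
        her_states.append(s)
        her_rewards.append(0.0 if reached else -1.0)  # 0 on success, as train() expects
    return her_states, her_rewards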
def playGame(train_indicator=1):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000    # replay buffer capacity
    BATCH_SIZE = 32         # number of samples per update
    GAMMA = 0.99            # discount factor
    TAU = 0.001             # Target Network HyperParameters (soft-update rate)
    LRA = 0.0001            # Learning rate for Actor
    LRC = 0.001             # Learning rate for Critic
    action_dim = 3          # Steering/Acceleration/Brake
    state_dim = 29          # number of sensor inputs
    np.random.seed(1337)    # fixed seed so every run draws the same random numbers
    vision = False
    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # Tensorflow GPU policy: grow memory allocation on demand
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # Hard-limit GPU usage to 40% instead:
    # config.gpu_options.per_process_gpu_memory_fraction = 0.4
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    theTime = datetime.datetime.now()                  # current system time
    theTime = theTime.strftime('%y-%m-%d_%H:%M:%S')    # string form used in the CSV file names
    folder_path = "practise_progress/" + theTime + "/" # Linux-style path only
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print("folder created")
    else:
        print("folder existed")

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))
        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)  # relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()
        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                         ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
        total_reward = 0.
        csvfileHeader = "practise_progress/" + theTime + "/" + " Episode " + str(i) + ".csv"
        fileHeader = ["Step", "TrackPos", "SpeedX", "SpeedY", "SpeedZ",
                      "Action_Steering", "Action_Acceleration", "Action_Brake",
                      "Reward", "Loss"]
        csvFile = open(csvfileHeader, "w")
        writer = csv.writer(csvFile)
        writer.writerow(fileHeader)

        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code does the stochastic brake
            # if random.random() <= 0.1:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                              ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            # Logged columns: step count, track position, X/Y/Z speed,
            # steering/acceleration/brake outputs, reward, loss
            csvData = [step, ob.trackPos, ob.speedX * 300, ob.speedY * 300, ob.speedZ * 300,
                       a_t[0, 0], a_t[0, 1], a_t[0, 2], r_t, loss]
            writer.writerow(csvData)

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
            step += 1
            if done:
                csvFile.close()
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
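# --- Added note (not part of the original script) ---
# Optional helper for inspecting one of the per-episode CSV files written above,
# assuming the column names defined in fileHeader. Purely illustrative.
import pandas as pd

def summarize_episode(csv_path):
    df = pd.read_csv(csv_path)
    print("steps:", len(df))
    print("mean reward:", df["Reward"].mean())
    print("final loss:", df["Loss"].iloc[-1])
    return df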
def playGame(train_indicator=1): #1 means Train, 0 means simply Run BUFFER_SIZE = 100000 BATCH_SIZE = 32 GAMMA = 0.99 TAU = 0.001 #Target Network HyperParameters LRA = 0.00005 #Learning rate for Actor LRC = 0.0005 #Lerning rate for Critic action_dim = 3 #Steering/Acceleration/Brake state_dim = 29 #of sensors input np.random.seed(1337) vision = False EXPLORE = 200000. if train_indicator: episode_count = 1000 else: episode_count = 20 max_steps = 4000 step = 0 if train_indicator: epsilon = 1 else: epsilon = 0 min_laptime = 10000000 #Tensorflow GPU optimization config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA) critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC) buff = ReplayBuffer(BUFFER_SIZE) #Create replay buffer # Generate a Torcs environment env = TorcsEnv(vision=vision, throttle=True, gear_change=False) #Now load the weight # loading networks print("Now we load the weight") saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state("saved_networks/") if checkpoint and checkpoint.model_checkpoint_path: saver.restore(sess, checkpoint.model_checkpoint_path) print("Successfully loaded:", checkpoint.model_checkpoint_path) else: print("Could not find old network weights") print("TORCS Experiment Start.") for i in range(episode_count): print("Episode : " + str(i) + " Replay Buffer " + str(buff.count())) if np.mod(i, 3) == 0: ob = env.reset(relaunch=True) #relaunch TORCS every 3 episode because of the memory leak error else: ob = env.reset() s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm)) total_reward = 0. # totalLaptime = 0. for j in range(max_steps): loss = 0 if train_indicator: epsilon -= 1.0 / EXPLORE epsilon = max(epsilon, 0.10) a_t = np.zeros([1, action_dim]) noise_t = np.zeros([1, action_dim]) a_t_original = actor.predict(s_t.reshape(1, s_t.shape[0])) noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0], 0.0, 0.60, 0.30) noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10) noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05) #The following code do the stochastic brake #if random.random() <= 0.1: # print("********Now we apply the brake***********") # noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2 , 1.00, 0.10) a_t[0][0] = a_t_original[0][0] + noise_t[0][0] a_t[0][1] = a_t_original[0][1] + noise_t[0][1] a_t[0][2] = a_t_original[0][2] + noise_t[0][2] ob, r_t, done, info = env.step(a_t[0], train_indicator) s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm)) buff.add(s_t, a_t[0], r_t, s_t1, done) #Add replay buffer #Do the batch update batch = buff.getBatch(BATCH_SIZE) states = np.asarray([e[0] for e in batch]) actions = np.asarray([e[1] for e in batch]) rewards = np.asarray([e[2] for e in batch]) new_states = np.asarray([e[3] for e in batch]) dones = np.asarray([e[4] for e in batch]) y_t = np.asarray([e[1] for e in batch]) target_q_values = critic.target_predict(new_states, actor.target_predict(new_states)) for k in range(len(batch)): if dones[k]: y_t[k] = rewards[k] else: y_t[k] = rewards[k] + GAMMA*target_q_values[k] if (train_indicator): loss += critic.train_on_batch(states, actions, y_t) a_for_grad = actor.predict(states) grads = 
critic.gradients(states, a_for_grad) actor.train(states, grads) actor.target_train() critic.target_train() total_reward += r_t s_t = s_t1 if np.mod(step, 100) == 0: print("Episode", i, "Step", step, "Epsilon", epsilon, "Action", a_t, "Reward", r_t, "Loss", loss) #, "curLapTime", ob.curLapTime) step += 1 if i == 0: break if done: break # if np.mod(i, 3) == 0: if (train_indicator) and i > 0: if env.lapTime < min_laptime and env.num_lap == 10: min_laptime = env.lapTime print("Now we save model") saver.save(sess, 'saved_networks/' + 'network' + '-ddpg-{}'.format(i)) print("TOTAL REWARD @ " + str(i) +"-th Episode : Reward " + str(total_reward)) print("Total Step: " + str(step)) print("") env.end() # This is for shutting down TORCS print("Finish.")
def playGame(train_indicator=0): #1 means Train, 0 means simply Run time.sleep(1) BUFFER_SIZE = 100000 BATCH_SIZE = 32 GAMMA = 0.99 TAU = 0.001 #Target Network HyperParameters LRA = 0.0001 #Learning rate for Actor LRC = 0.001 #Lerning rate for Critic action_dim = 3 #Steering/Acceleration/Brake state_dim = 24 #of sensors input np.random.seed(1337) vision = False EXPLORE = 300000. episode_count = 20000 max_steps = 100000 reward = 0 done = False step = 0 epsilon = 1.0 # epsilon = 1 indicator = 0 #Tensorflow GPU optimization config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA) critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC) buff = ReplayBuffer(BUFFER_SIZE) #Create replay buffer # Generate a Torcs environment env = TorcsEnv(vision=vision, throttle=True, gear_change=False) pre_model = load_model("weights_rescale_all-0000.hdf5") # x = np.array([ 4.82767379e-01, 5.92105016e-02, 3.61700505e-01, 2.74807483e-01, # 2.31401995e-01, 2.07236990e-01, 1.95800006e-01, 1.89892501e-01, # 1.84837490e-01, 1.81293502e-01, 1.77807003e-01, 1.74377009e-01, # 1.71005994e-01, 1.66384503e-01, 1.61247000e-01, 1.52030498e-01, # 1.35238498e-01, 1.11962005e-01, 8.79574940e-02, 4.76383008e-02, # 4.78339800e-01, 6.97819047e-01, 4.60800716e-01, 5.00754069e-01, # -1.00000000e+00, 9.99979496e-01, 8.71338917e-13]) # x_s = np.array([x, x]) # pre_y = pre_model.predict(x_s) # print(x_s[0]) # print(pre_y[0]) #Now load the weight load_name = "sample_v0_40" print("Now we load the weight") try: actor.model.load_weights("saved/actormodel_{}.h5".format(load_name)) critic.model.load_weights("saved/criticmodel_{}.h5".format(load_name)) actor.target_model.load_weights( "saved/actormodel_{}.h5".format(load_name)) critic.target_model.load_weights( "saved/criticmodel_{}.h5".format(load_name)) print("Weight load successfully") except: print("Cannot find the weight") plt.figure() overall_scores = [] model_name = "sample_v0" print("TORCS Experiment Start.") for i in range(episode_count): print("Episode : " + str(i) + " Replay Buffer " + str(buff.count())) if np.mod(i, 3) == 0: ob = env.reset( relaunch=True ) #relaunch TORCS every 3 episode because of the memory leak error else: ob = env.reset() s_t = np.hstack( (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ)) total_reward = 0. 
cur_sample = [] attack_valid = 1 gap = (i / 10) / 100.0 attack_step = -1 attack_target = 0 for j in range(max_steps): # if j == 50: # time.sleep(0.099) # continue loss = 0 epsilon -= 1.0 / EXPLORE a_t = np.zeros([1, action_dim]) noise_t = np.zeros([1, action_dim]) a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0])) # if j > 120: noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function( a_t_original[0][0], 0.0, 0.60, 0.30) noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function( a_t_original[0][1], 0.5, 1.00, 0.10) noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function( a_t_original[0][2], -0.1, 1.00, 0.05) #The following code do the stochastic brake #if random.random() <= 0.1: # print("********Now we apply the brake***********") # noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2 , 1.00, 0.10) a_t[0][0] = a_t_original[0][0] + noise_t[0][0] a_t[0][1] = a_t_original[0][1] + noise_t[0][1] a_t[0][2] = a_t_original[0][2] + noise_t[0][2] if j < 20 and train_indicator: a_t[0][1] += 0.5 # os.system("scrot saved_pic/{}.png".format(j)) if j == 80: print("cp attack!") a_t[0][0] = -1.0 if j == 83: os.system("scrot saved_pic/{}.png".format(j)) # if a_t[0][0] > 0: # a_t[0][0] = -0.3 # else: # a_t[0][0] = 0.3 # print("%.2f"%a_t[0][0]) # a_t[0][2] += 0.7 # if ob.speedX > 0.6: # a_t[0][1] = 0 # if(step == 60): # a_t[0][0] = 1.0 # s_t_scaled = rescale_state(s_t) # # print(s_t[0]) # s_t_0 = restore_state(s_t_scaled) # # print(s_t_0[0]) # new_a_t = actor.model.predict(s_t_0.reshape(1, s_t_0.shape[0])) # s_t_scaled_list = np.array([np.copy(s_t_scaled) for val in range(21)]) # actions = np.array([np.copy(a_t[0]) for val in range(21)]) # for val in range(21): # actions[val][0] = -1.0 + val/10.0 # # print(actions) # x_0 = np.hstack((s_t_scaled_list, actions)) # # print(x_0.shape, s_t_scaled_list.shape, actions.shape) # pre_y = pre_model.predict(x_0) # # print(x_0[0]) # # print(pre_y[0]) # steer_index = int(a_t[0][0]*10.0 + 10.0) # for pre_step in range(2): # restore_new_Y = restore_states(pre_y) # actions = actor.model.predict(restore_new_Y) # x_step1 = np.hstack((pre_y, actions)) # pre_y = pre_model.predict(x_step1) # for index in range(21): # diff = calsulate_d(pre_y[index]) - calsulate_d(pre_y[steer_index]) # pro = np.random.random() # if diff > gap and attack_valid == 1 and pro > 0.8 and j > 50: # a_t[0][0] = -1.0 + index/10.0 # print("adv!", diff, "pro:", pro) # attack_step = j # attack_target = a_t[0][0] # attack_valid -= 1 # dis_list = np.array([(calsulate_d(st) - calsulate_d(pre_y[steer_index])) for st in pre_y]) # print("{:.2f}".format(max(dis_list)*100000)) # print("{}".format(max(dis_list)*100000)) # s_t_scaled = np.copy(s_t1) # s_t_scaled[0] = rescale_data(s_t_scaled[0], 0.5) # s_t_scaled[20] = rescale_data(s_t_scaled[20], 2.5) # s_t_scaled[21] = rescale_data(s_t_scaled[21], 0.7) # s_t_scaled[22] = rescale_data(s_t_scaled[22], 0.7) # s_t_scaled[23] = rescale_data(s_t_scaled[23], 0.7) # actions = actor.model.predict(s_t_scaled.reshape(1, s_t_scaled.shape[0])) # print(actions[0][0]) # ob, r_t, done, info = env.step(new_a_t[0]) ob, r_t, done, info = env.step(a_t[0]) print "step: {} reward: {:.5f} action: {:.5f} {:.5f} {:.5f} ".format( j, r_t, a_t[0][0], a_t[0][1], a_t[0][2]) # print(a_t[0][0]) # print "{:.5f} {:.5f} {:.5f} {:.5f} {:.5f}".format(r_t, ob.speedX, ob.speedY, ob.speedZ, ob.rpm) # if(r_t < -50): # r_t -= 10000 # done = True if j > 20 and ob.rpm <= 0.09426: r_t -= 1000 done = True theta = 0.1 s_t1 = np.hstack((ob.angle, 
ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ)) # action_states = [] # for i in range(-5, 6): # s_t1_new = np.array([val + np.abs(val)*random.uniform(-1,1)*theta for val in s_t1]) # print(np.linalg.norm(s_t1_new - s_t1)) # s_t1 = s_t1_new buff.add(s_t, a_t[0], r_t, s_t1, done) #Add replay buffer # cur_step_sample = [s_t.tolist(), a_t[0].tolist(), r_t, s_t1.tolist(), done] # cur_sample.append(cur_step_sample) #Do the batch update batch = buff.getBatch(BATCH_SIZE) states = np.asarray([e[0] for e in batch]) actions = np.asarray([e[1] for e in batch]) rewards = np.asarray([e[2] for e in batch]) new_states = np.asarray([e[3] for e in batch]) dones = np.asarray([e[4] for e in batch]) y_t = np.asarray([e[1] for e in batch]) target_q_values = critic.target_model.predict( [new_states, actor.target_model.predict(new_states)]) for k in range(len(batch)): if dones[k]: y_t[k] = rewards[k] else: y_t[k] = rewards[k] + GAMMA * target_q_values[k] if (train_indicator): loss += critic.model.train_on_batch([states, actions], y_t) a_for_grad = actor.model.predict(states) grads = critic.gradients(states, a_for_grad) actor.train(states, grads) actor.target_train() critic.target_train() total_reward += r_t s_t = s_t1 # print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss) step += 1 if done: break if j > 500: break if np.mod(i, 3) == 0: if (train_indicator): print("Now we save model") actor.model.save_weights("saved/actormodel_{}_{}.h5".format( model_name, int(step / 10000)), overwrite=True) # with open("actormodel.json", "w") as outfile: # json.dump(actor.model.to_json(), outfile) critic.model.save_weights("saved/criticmodel_{}_{}.h5".format( model_name, int(step / 10000)), overwrite=True) # with open("criticmodel.json", "w") as outfile: # json.dump(critic.model.to_json(), outfile) print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward)) print("Total Step: " + str(step)) print("") s = "{},{},{},{},{},{:.3f}\n".format(gap, attack_step, attack_target, i, j, total_reward) attack_valid = 1 attack_step = -1 attack_target = 0 with open('logs/pm_adv_test.csv'.format(model_name), 'a') as the_file: the_file.write(s) overall_scores.append(total_reward) plt.clf() plt.plot(overall_scores) plt.savefig("train_plots/{}_{}.jpg".format(model_name, int(step / 10000))) # with open('samples/{}_{:05d}.pk'.format(model_name, i), 'w') as outfile: # pickle.dump(cur_sample, outfile) env.end() # This is for shutting down TORCS print("Finish.")