def __init__(
        self,
        action_space,
        discount_factor=.99,  # gamma
):
    self.rpm = rpm(100000)  # 100k history
    self.plotter = plotter(num_lines=2)
    self.render = True
    self.training = True

    num_of_actions = 8
    self.outputdims = num_of_actions
    self.discount_factor = discount_factor

    ids, ods = None, num_of_actions
    self.actor = self.create_actor_network(ids, ods)
    self.critic = self.create_critic_network(ids, ods)
    self.actor_target = self.create_actor_network(ids, ods)
    self.critic_target = self.create_critic_network(ids, ods)

    self.feed, self.joint_inference, sync_target = self.train_step_gen()

    sess = ct.get_session()
    sess.run(tf.global_variables_initializer())
    sync_target()
def __init__(self,
             observation_space_dims,
             discount_factor,
             nb_actions=19,
             rpm_size=1000000,
             train_mult=5):
    self.training = True
    self.discount_factor = discount_factor
    # self.noise_source = one_fsq_noise()
    # self.train_counter = 0
    self.train_multiplier = train_mult
    self.rpm = rpm(rpm_size)

    # Deal only with the continuous space for now...
    self.inputdims = observation_space_dims
    self.outputdims = nb_actions

    def clamper(actions):
        return np.clip(actions, a_max=1.0, a_min=0.0)

    self.clamper = clamper

    ids, ods = self.inputdims, self.outputdims
def __init__(self, **kwargs):
    self.lr_act = 0.0038
    print(self.lr_act)
    self.lr_crit = 0
    self.batch_size = 64
    self.atoms = 80
    self.actions = 3
    self.channels = 9
    self.gamma = 0.65
    self.lambdaEntrop = 0.99
    print(self.lambdaEntrop)
    self.lambdaCrit = 0.41667
    self.weightDecay = False
    self.actor = CNNBase(self.channels, self.actions, self.atoms)
    self.optimizer_actor = optim.RMSprop(
        self.actor.parameters(), lr=self.lr_act, alpha=0.88, eps=1e-5
    )  # , alpha=0.99, eps=1e-5)#, weight_decay=self.weightDecay)
    self.memory = rpm(250000)
    self.maxReward = 0
    self.minFrame = 0
    self.AveRew = 0
    self.bestEps = 0
    self.ModUpdate = 0
    self.Good = False
    self.maxSteps = 360
def train():
    env = gym.make(GAME).unwrapped
    all_ep_r = []
    memory = rpm(1000000)
    agent = PPO(state_space=S_DIM,
                action_space=A_DIM,
                max_episode_num=EP_MAX,
                episode_lens=EP_LEN,
                discount_factor=GAMMA,
                actor_learning_rate=A_LR,
                critic_learning_rate=C_LR,
                mini_batch_size=MINI_BATCH_SIZE,
                epochs=EPOCHS)
    # load weights
    # agent.load_weights(SAVE_INDEX)
    # run(env, agent)
    for i in range(EP_MAX):
        [steps, episode_r, c_time, aloss, closs] = execute_one_episode(env, agent, memory)
        print('Ep: %4d' % i,
              "|Ep_r: %i" % episode_r,
              '|aloss: %8.4f' % aloss,
              '|closs: %8.4f' % closs,
              '|steps: %4d' % steps,
              '|time: %6.4f' % c_time)
        if i == 0:
            all_ep_r.append(episode_r)
        else:
            all_ep_r.append(all_ep_r[-1] * 0.9 + episode_r * 0.1)
    # create_path('weights/' + SAVE_INDEX)
    agent.save_weights(SAVE_INDEX)
    plt.plot(np.arange(len(all_ep_r)), all_ep_r)
    plt.xlabel('Episode')
    plt.ylabel('Moving averaged episode reward')
    create_path('weights/' + SAVE_INDEX + '/figure')
    plt.savefig('weights/' + SAVE_INDEX + '/figure/fig.png')
    plt.show()
def __init__(self,
             observation_space_dims,
             discount_factor,
             nb_actions=19,
             rpm_size=1500000,
             train_mult=10):
    self.training = True
    self.discount_factor = discount_factor
    # self.noise_source = one_fsq_noise()
    # self.train_counter = 0
    self.train_multiplier = train_mult
    self.rpm = rpm(rpm_size)

    # Deal only with the continuous space for now...
    self.inputdims = observation_space_dims
    self.outputdims = nb_actions

    def clamper(actions):
        return np.clip(actions, a_max=1.0, a_min=0.0)

    self.clamper = clamper

    ids, ods = self.inputdims, self.outputdims
    # with tf.device('/device:GPU:0'):
    self.actor = self.create_actor_network(ids, ods)
    self.critic = self.create_critic_network(ids, ods)
    self.actor_target = self.create_actor_network(ids, ods)
    self.critic_target = self.create_critic_network(ids, ods)

    self.feed, self.joint_inference, sync_target = self.train_step_gen()

    sess = ct.get_session()
    sess.run(tf.global_variables_initializer())
    sync_target()

    import threading as th
    self.lock = th.Lock()

    self.reward_plotter = plt.figure()
    self.reward_collector = []
    self.learn_reward_collector = []
    self.phased_noise_anneal_duration = 100
def __init__(self, nb_status, nb_actions, args):
    self.num_actor = 3

    self.nb_status = nb_status * args.window_length
    self.nb_actions = nb_actions
    self.discrete = args.discrete

    # Create Actor and Critic Network
    net_cfg = {
        'hidden1': args.hidden1,
        'hidden2': args.hidden2,
        'use_bn': args.bn
    }
    self.actors = [Actor(self.nb_status, self.nb_actions)
                   for _ in range(self.num_actor)]
    self.actor_targets = [Actor(self.nb_status, self.nb_actions)
                          for _ in range(self.num_actor)]
    self.actor_optims = [Adam(self.actors[i].parameters(), lr=args.prate)
                         for i in range(self.num_actor)]

    self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
    self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

    for i in range(self.num_actor):
        hard_update(self.actor_targets[i], self.actors[i])  # Make sure target starts with the same weights
    hard_update(self.critic_target, self.critic)

    # Create replay buffer
    self.memory = rpm(args.rmsize)  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
    self.random_process = Myrandom(size=nb_actions)

    # Hyper-parameters
    self.batch_size = args.batch_size
    self.tau = args.tau
    self.discount = args.discount
    self.depsilon = 1.0 / args.epsilon

    #
    self.epsilon = 1.0
    self.s_t = None  # Most recent state
    self.a_t = None  # Most recent action
    self.use_cuda = args.cuda

    #
    if self.use_cuda:
        self.cuda()
def __init__(self, nb_status, nb_actions, args, writer):
    self.clip_actor_grad = args.clip_actor_grad
    self.nb_status = nb_status * args.window_length
    self.nb_actions = nb_actions
    self.discrete = args.discrete
    self.pic = args.pic
    self.writer = writer
    self.select_time = 0
    if self.pic:
        self.nb_status = args.pic_status

    # Create Actor and Critic Network
    net_cfg = {
        'hidden1': args.hidden1,
        'hidden2': args.hidden2,
        'use_bn': args.bn,
        'init_method': args.init_method
    }
    if args.pic:
        self.cnn = CNN(1, args.pic_status)
        self.cnn_target = CNN(1, args.pic_status)
        self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
    self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
    self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
    self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

    self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
    self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

    hard_update(self.actor_target, self.actor)  # Make sure target starts with the same weights
    hard_update(self.critic_target, self.critic)
    if args.pic:
        hard_update(self.cnn_target, self.cnn)

    # Create replay buffer
    self.memory = rpm(args.rmsize)  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
    self.random_process = Myrandom(size=nb_actions)

    # Hyper-parameters
    self.batch_size = args.batch_size
    self.tau = args.tau
    self.discount = args.discount
    self.depsilon = 1.0 / args.epsilon

    #
    self.epsilon = 1.0
    self.s_t = None  # Most recent state
    self.a_t = None  # Most recent action
    self.use_cuda = args.cuda

    #
    if self.use_cuda:
        self.cuda()
def __init__(self, nb_states, nb_actions, args, discrete, use_cuda=False):
    if args.seed > 0:
        self.seed(args.seed)

    self.nb_states = nb_states
    self.nb_actions = nb_actions
    self.discrete = discrete

    # Create Actor and Critic Network
    net_cfg = {
        'hidden1': args.hidden1,
        'hidden2': args.hidden2,
        'init_w': args.init_w
    }
    self.actor = Actor(self.nb_states * args.window_length, self.nb_actions, **net_cfg)
    self.actor_target = Actor(self.nb_states * args.window_length, self.nb_actions, **net_cfg)
    self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

    self.critic = Critic(self.nb_states * args.window_length, self.nb_actions, **net_cfg)
    self.critic_target = Critic(self.nb_states * args.window_length, self.nb_actions, **net_cfg)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

    hard_update(self.actor_target, self.actor)  # Make sure target starts with the same weights
    hard_update(self.critic_target, self.critic)

    # Create replay buffer
    self.memory = rpm(args.rmsize)  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
    self.random_process = Myrandom(size=nb_actions)

    # Hyper-parameters
    self.batch_size = args.bsize
    self.tau = args.tau
    self.discount = args.discount
    self.depsilon = 1.0 / args.epsilon

    #
    self.epsilon = 1.0
    self.s_t = None  # Most recent state
    self.a_t = None  # Most recent action
    self.is_training = True

    self.use_cuda = use_cuda
    #
    if self.use_cuda:
        self.cuda()
def __init__(self, nb_status, nb_actions, args):
    self.num_actor = 3

    self.nb_status = nb_status * args.window_length
    self.nb_actions = nb_actions
    self.discrete = args.discrete
    self.pic = args.pic
    if self.pic:
        self.nb_status = args.pic_status

    # Create Actor and Critic Network
    net_cfg = {
        'hidden1': args.hidden1,
        'hidden2': args.hidden2,
        'use_bn': args.bn
    }
    if args.pic:
        self.cnn = CNN(3, args.pic_status)
        self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
    self.actors = [Actor(self.nb_status, self.nb_actions)
                   for _ in range(self.num_actor)]
    self.actor_targets = [Actor(self.nb_status, self.nb_actions)
                          for _ in range(self.num_actor)]
    self.actor_optims = [Adam(self.actors[i].parameters(), lr=args.prate)
                         for i in range(self.num_actor)]

    self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
    self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

    for i in range(self.num_actor):
        hard_update(self.actor_targets[i], self.actors[i])  # Make sure target starts with the same weights
    hard_update(self.critic_target, self.critic)

    # Create replay buffer
    self.memory = rpm(args.rmsize)  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
    self.random_process = Myrandom(size=nb_actions)

    # Hyper-parameters
    self.batch_size = args.batch_size
    self.tau = args.tau
    self.discount = args.discount
    self.depsilon = 1.0 / args.epsilon

    #
    self.epsilon = 1.0
    self.s_t = None  # Most recent state
    self.a_t = None  # Most recent action
    self.use_cuda = args.cuda

    #
    if self.use_cuda:
        self.cuda()
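# The PyTorch DDPG constructors above call hard_update(target, source) at
# construction time and store self.tau for target-network tracking during
# training. A minimal sketch of those helpers, assuming the usual DDPG
# conventions (not taken from the original repository):
import torch


def hard_update(target, source):
    # Copy source parameters into target exactly (used once at construction).
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)


def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(t_param.data * (1.0 - tau) + s_param.data * tau)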
def __init__(self, **kwargs):
    self.lr = 3e-4
    self.updata_time = 0
    self.batch_size = 64
    self.gamma = 0.999
    self.epsilon = 1.0
    self.Vmin = -160
    self.Vmax = 160
    self.atoms = 51
    self.actions = 7
    self.policy = DQN(10, self.actions, self.atoms)
    self.target = DQN(10, self.actions, self.atoms)
    self.reward = []
    self.memory = rpm(500000)
    self.target.load_state_dict(self.policy.state_dict())
    self.optimizer_policy = optim.Adam(self.policy.parameters(), lr=self.lr)
    self.support = torch.linspace(self.Vmin, self.Vmax, self.atoms).to(device)
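# The constructor above sets up a categorical (C51-style) value head: the DQN
# outputs `atoms` probabilities per action, and `support` holds the fixed atom
# values in [Vmin, Vmax]. A hedged sketch of how such an output is typically
# turned into a greedy action; the (1, actions, atoms) output shape is an
# assumption, not taken from the original code:
import torch


def select_action(policy, state, support):
    # state: a (1, 10) input batch; policy(state) assumed to return
    # per-action probability distributions of shape (1, actions, atoms).
    with torch.no_grad():
        dist = policy(state)                    # (1, actions, atoms)
        q_values = (dist * support).sum(dim=2)  # expected value per action
        return q_values.argmax(dim=1).item()    # greedy action index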
def __init__(self, args):
    self.rpm = rpm(1000000)
    self.render = True
    self.training = True
    self.noise_source = one_fsq_noise()
    self.train_multiplier = args.train_multiplier
    self.inputdims = args.observation_space_dims

    low = 0.0
    high = 1.0
    num_of_actions = args.action_space
    self.action_bias = high / 2.0 + low / 2.0
    self.action_multiplier = high - self.action_bias

    def clamper(actions):
        return np.clip(actions, a_max=high, a_min=low)

    self.clamper = clamper

    self.outputdims = args.action_space
    self.discount_factor = args.gamma

    ids, ods = self.inputdims, self.outputdims
    print('inputdims:{}, outputdims:{}'.format(ids, ods))

    self.actor = models.create_actor_network(ids, ods).cuda()
    self.critic = models.create_critic_network(ids, ods).cuda()
    self.actor_target = models.create_actor_network(ids, ods).cuda()
    self.critic_target = models.create_critic_network(ids, ods).cuda()

    self.critic_criterion = nn.MSELoss().cuda()
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4)
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=3e-4)

    self.plot_epoch = [0]
    self.plot_reward = [0]

    import threading as th
    self.lock = th.Lock()
button_value = pbThread.get()
lcd_clean()
if button_value == 0:  # E0
    a = ' {0:8s} '.format(time.strftime("%H:%M:%S", time.gmtime()))
    b = '{}'.format(time.strftime("%a, %d %b %Y", time.gmtime()))
    lcd_string = '{}\n{}'.format(a, b)
elif button_value == 1:  # E1
    lcd_string = '{3:16s}\n{0:1d} {1:4.0f} {2:3.0f}'.format(
        s_struct.s_TelemInfo['mGear'],
        s_struct.s_TelemInfo['mEngineRPM'],
        s_struct.s_TelemInfo['mVelocity'],
        rpm.rpm(s_struct.s_TelemInfo['mEngineRPM'])
    )
elif button_value == 2:  # E4
    lcd_string = 'T {0:2.3f} B {1:2.3f}\nS {2:2.3f} C {3:2.3f}'.format(
        s_struct.s_TelemInfo['mUnfilteredThrottle'],
        s_struct.s_TelemInfo['mUnfilteredBrake'],
        s_struct.s_TelemInfo['mUnfilteredSteering'],
        s_struct.s_TelemInfo['mUnfilteredClutch']
    )
elif button_value == 3:
    lcd_string = '{0:16s}\n{1:16s}'.format(
        string_shift(s_struct.s_TelemInfo['mVehicleName']),
data, addr = sc.recvfrom(512)
unpacked_data = unpack('64s64s2l12f', data)
s_TelemInfo['mVehicleName'] = unpacked_data[0].split(b'\x00')[0].decode()
s_TelemInfo['mTrackName'] = unpacked_data[1].split(b'\x00')[0].decode()
s_TelemInfo['mLapNumber'] = unpacked_data[2]
s_TelemInfo['mGear'] = unpacked_data[3]
s_TelemInfo['mDeltaTime'] = unpacked_data[4]
s_TelemInfo['mLapStartET'] = unpacked_data[5]
s_TelemInfo['mEngineRPM'] = unpacked_data[6]
s_TelemInfo['mEngineWaterTemp'] = unpacked_data[7]
s_TelemInfo['mEngineOilTemp'] = unpacked_data[8]
s_TelemInfo['mClutchRPM'] = unpacked_data[9]
s_TelemInfo['mUnfilteredThrottle'] = unpacked_data[10]
s_TelemInfo['mUnfilteredBrake'] = unpacked_data[11]
s_TelemInfo['mUnfilteredSteering'] = unpacked_data[12]
s_TelemInfo['mUnfilteredClutch'] = unpacked_data[13]
s_TelemInfo['mSteeringArmForce'] = unpacked_data[14]
s_TelemInfo['mVelocity'] = unpacked_data[15]

lcd_string = '{3:16s}\n{0:1d} {1:4.0f} {2:3.0f}'.format(
    s_TelemInfo['mGear'],
    s_TelemInfo['mEngineRPM'],
    s_TelemInfo['mVelocity'],
    rpm.rpm(s_TelemInfo['mEngineRPM'])
)
lcd.message(lcd_string)
lcd.cmd(0x2)
print(lcd_string)
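# In the two telemetry snippets above, rpm.rpm() takes the engine RPM and is
# formatted with '{3:16s}', so it must return a string of up to 16 characters
# for the top row of the LCD. A purely hypothetical sketch of such a helper
# (the real rpm module and its redline value are not shown in this collection):
def rpm(engine_rpm, max_rpm=9000.0, width=16):
    """Render engine RPM as a simple character bar for a 16x2 LCD."""
    clamped = min(max(engine_rpm, 0.0), max_rpm)
    filled = int(round(clamped / max_rpm * width))
    return ('#' * filled).ljust(width)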
def __init__(
        self,
        observation_space_dims,
        action_space,
        stack_factor=1,
        discount_factor=.99,  # gamma
        # train_skip_every=1,
        train_multiplier=1,
):
    self.rpm = rpm(1000000)  # 1M history
    self.plotter = plotter(num_lines=3)
    self.render = True
    self.training = True
    self.noise_source = one_fsq_noise()
    self.train_counter = 0
    # self.train_skip_every = train_skip_every
    self.train_multiplier = train_multiplier
    self.observation_stack_factor = stack_factor

    self.inputdims = observation_space_dims * self.observation_stack_factor
    # assume observation_space is continuous

    self.is_continuous = True if isinstance(action_space, Box) else False

    if self.is_continuous:  # if action space is continuous
        low = action_space.low
        high = action_space.high
        num_of_actions = action_space.shape[0]

        self.action_bias = high / 2. + low / 2.
        self.action_multiplier = high - self.action_bias

        # say high,low -> [2,7], then bias -> 4.5
        # mult = 2.5. then [-1,1] multiplies 2.5 + bias 4.5 -> [2,7]

        def clamper(actions):
            return np.clip(actions, a_max=action_space.high, a_min=action_space.low)

        self.clamper = clamper
    else:
        num_of_actions = action_space.n
        self.action_bias = .5
        self.action_multiplier = .5  # map (-1,1) into (0,1)

        def clamper(actions):
            return np.clip(actions, a_max=1., a_min=0.)

        self.clamper = clamper

    self.outputdims = num_of_actions
    self.discount_factor = discount_factor
    ids, ods = self.inputdims, self.outputdims
    print('inputdims:{}, outputdims:{}'.format(ids, ods))

    self.actor = self.create_actor_network(ids, ods)
    self.critic = self.create_critic_network(ids, ods)
    self.actor_target = self.create_actor_network(ids, ods)
    self.critic_target = self.create_critic_network(ids, ods)

    # print(self.actor.get_weights())
    # print(self.critic.get_weights())

    self.feed, self.joint_inference, sync_target = self.train_step_gen()

    sess = ct.get_session()
    sess.run(tf.global_variables_initializer())
    sync_target()

    import threading as th
    self.lock = th.Lock()

    if not hasattr(self, 'wavegraph'):
        num_waves = self.outputdims * 2 + 1

        def rn():
            r = np.random.uniform()
            return 0.2 + r * 0.4

        colors = []
        for i in range(num_waves - 1):
            color = [rn(), rn(), rn()]
            colors.append(color)
        colors.append([0.2, 0.5, 0.9])
        self.wavegraph = wavegraph(num_waves, 'actions/noises/Q', np.array(colors))
def __init__(
        self,
        observation_space,
        action_space,
        stack_factor=1,
        discount_factor=.99,  # gamma
        train_skip_every=1,
):
    self.rpm = rpm(1000000)  # 1M history
    self.render = True
    self.noise_source = one_fsq_noise()
    self.train_counter = 0
    self.train_skip_every = train_skip_every
    self.observation_stack_factor = stack_factor

    self.inputdims = observation_space.shape[0] * self.observation_stack_factor
    # assume observation_space is continuous

    self.is_continuous = True if isinstance(action_space, Box) else False

    if self.is_continuous:  # if action space is continuous
        low = action_space.low
        high = action_space.high
        num_of_actions = action_space.shape[0]

        self.action_bias = high / 2. + low / 2.
        self.action_multiplier = high - self.action_bias

        # say high,low -> [2,7], then bias -> 4.5
        # mult = 2.5. then [-1,1] multiplies 2.5 + bias 4.5 -> [2,7]

        def clamper(actions):
            return np.clip(actions, a_max=action_space.high, a_min=action_space.low)

        self.clamper = clamper
    else:
        num_of_actions = action_space.n
        self.action_bias = .5
        self.action_multiplier = .5  # map (-1,1) into (0,1)

        def clamper(actions):
            return np.clip(actions, a_max=1., a_min=0.)

        self.clamper = clamper

    self.outputdims = num_of_actions
    self.discount_factor = discount_factor
    ids, ods = self.inputdims, self.outputdims
    print('inputdims:{}, outputdims:{}'.format(ids, ods))

    self.actor = self.create_actor_network(ids, ods)
    self.critic = self.create_critic_network(ids, ods)
    self.actor_target = self.create_actor_network(ids, ods)
    self.critic_target = self.create_critic_network(ids, ods)

    # print(self.actor.get_weights())
    # print(self.critic.get_weights())

    self.feed, self.joint_inference, sync_target = self.train_step_gen()

    sess = ct.get_session()
    sess.run(tf.global_variables_initializer())
    sync_target()
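# The two constructors above derive action_bias and action_multiplier so that
# an actor output in [-1, 1] can be rescaled to the environment's action range
# (the "[2, 7] -> bias 4.5, mult 2.5" comment). A small numeric sketch of that
# mapping, using the same example values; this demo is not part of the
# original code:
import numpy as np

low, high = np.array([2.0]), np.array([7.0])
action_bias = high / 2. + low / 2.        # 4.5
action_multiplier = high - action_bias    # 2.5

raw = np.array([-1.0, 0.0, 1.0])          # actor outputs in [-1, 1]
env_actions = raw * action_multiplier + action_bias
print(env_actions)                        # [2.  4.5 7. ]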
class Game(object):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    mx = 0
    LOG = 0
    trainnum = 0
    modelcnt = 0
    noiselevel = 0.5
    rpm = rpm(2000000)
    TAU = 0.001
    lr_actor = 3e-4
    lr_critic = 3e-4
    train_interval = 1
    train_times = 100
    action_dim = 18
    state_dim = 76
    max_steps = 1000 // 4
    cnt = 0
    GAMMA = 0.96
    BATCH_SIZE = 128
    log_path = './logs'

    import threading as th
    lock = th.Lock()

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, lr_actor)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, lr_critic)
    callback = TensorBoard(log_path)
    callback.set_model(critic.model)

    def write_log(self, callback, names, logs, batch_no):
        output = open('logs/data.txt', 'w')
        output.write(str(self.LOG) + ' ' + str(self.trainnum))
        output.close()
        for name, value in zip(names, itertools.repeat(logs)):
            summary = tf.Summary()
            summary_value = summary.value.add()
            summary_value.simple_value = value
            summary_value.tag = name
            callback.writer.add_summary(summary, batch_no)
            callback.writer.flush()
        callback = TensorBoard(self.log_path)

    def play(self, env, cnt):
        episode_memory = []
        step = 0
        s_t = env.reset()
        total_reward = 0.
        sp = 0.
        noise_t = np.zeros([1, self.action_dim])
        a_t = np.zeros([1, self.action_dim])
        noise = self.noiselevel
        self.noiselevel = noise * 0.999
        for j in range(self.max_steps):
            self.lock.acquire()
            global graph
            with graph.as_default():
                a_t_original = self.actor.model.predict(np.array([s_t]))
            self.lock.release()
            noise = noise * 0.98
            if cnt % 3 == 0:
                if j % 5 == 0:
                    noise_t[0] = np.random.randn(self.action_dim) * noise
            elif cnt % 3 == 1:
                if j % 5 == 0:
                    noise_t[0] = np.random.randn(self.action_dim) * noise * 2
            else:
                noise_t = np.zeros([1, self.action_dim])
            a_t = a_t_original + noise_t
            for i in range(self.action_dim):
                if a_t[0][i] > 1:
                    a_t[0][i] = 1
                elif a_t[0][i] < 0:
                    a_t[0][i] = 0
            ob, r_t, done, _, pen = env.step(a_t[0])
            s_t1 = ob
            episode_memory.append([s_t, a_t[0], r_t - pen, done, s_t1])
            total_reward += r_t
            sp += pen
            s_t = s_t1
            step += 1
            if done or step == 1000 / 4 - 1:
                if total_reward > self.mx:
                    self.mx = total_reward
                print("Episode", cnt, "Step", step, "Reward", total_reward,
                      "max", self.mx, "penalty", sp)
                train_names = ['reward']
                self.lock.acquire()
                self.LOG = self.LOG + 1
                self.write_log(self.callback, train_names, total_reward, self.LOG)
                self.lock.release()
                break
        self.lock.acquire()
        for i in range(step):
            self.rpm.add(episode_memory[i])
        self.lock.release()

    def playonce(self, env, T):
        from multi import fastenv
        fenv = fastenv(env, 4)
        self.play(fenv, T)
        env.rel()
        del fenv

    def play_ignore(self, env, T):
        import threading as th
        try:
            t = th.Thread(target=self.playonce, args=(env, T,))
            t.setDaemon(True)
            t.start()
        except:
            print("startfail")

    def playifavailable(self, T):
        while True:
            remote_env = farmer.acq_env()
            if remote_env == False:
                pass
            else:
                self.play_ignore(remote_env, T)
                break

    def train(self):
        memory = self.rpm
        if memory.size() < self.BATCH_SIZE:
            return
        global graph
        loss = 0
        for T in range(self.train_times):
            [states, actions, rewards, dones, new_states] = memory.sample_batch(self.BATCH_SIZE)
            y_t = np.asarray([0.0] * self.BATCH_SIZE)
            rewards = np.concatenate(rewards)
            self.lock.acquire()
            with graph.as_default():
                target_q_values = self.critic.target_model.predict(
                    [new_states, self.actor.target_model.predict(new_states)])
                target_q_values = target_q_values.reshape([1, target_q_values.shape[0]])[0]
            for k in range(self.BATCH_SIZE):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + self.GAMMA * target_q_values[k]
            with graph.as_default():
                self.critic.model.optimizer.learning_rate = self.lr_critic
                logs = self.critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = self.actor.model.predict(states)
                grads = self.critic.gradients(states, a_for_grad)
                self.actor.train(states, grads, learning_rate=self.lr_actor)
                self.actor.target_train()
                self.critic.target_train()
            train_names = ['train_loss']
            self.write_log(self.callback, train_names, logs, self.trainnum)
            self.trainnum = self.trainnum + 1
            loss = loss + logs
            self.lock.release()
        print("train", memory.size(), loss)

    def save(self):
        self.modelcnt = self.modelcnt + 1
        self.actor.target_model.save_weights("logs/actormodel.h5", overwrite=True)
        self.critic.target_model.save_weights("logs/criticmodel.h5", overwrite=True)
        self.actor.target_model.save_weights("logs/actormodel{}.h5".format(self.modelcnt))
        self.critic.target_model.save_weights("logs/criticmodel{}.h5".format(self.modelcnt))
        print("save")

    def pre(self):
        print("Now we load the weight")
        try:
            input = open('logs/data.txt', 'r')
            self.LOG, self.trainnum = map(int, input.read().split(' '))
            print("LOG", self.LOG, "trainnum", self.trainnum)
            input.close()
            print("log found")
            self.critic.model.load_weights("logs/criticmodel.h5")
            self.critic.target_model.load_weights("logs/criticmodel.h5")
            self.actor.model.load_weights("logs/actormodel.h5")
            self.actor.target_model.load_weights("logs/actormodel.h5")
            print("Weight load successfully")
            self.rpm.load('logs/rpm.pickle')
            print("rmp success")
        except:
            if self.LOG > 0:
                print("Load fault")
                return False
            else:
                print("A new experiment")
                return True

    def run(self):
        np.random.seed(23333)
        episode_count = 10000
        reward = 0
        done = False
        LOSS = 0
        for T in range(50):
            self.playifavailable(T)
        for T in range(episode_count):
            self.train()
            self.playifavailable(T)
            if np.mod(T, 100) == 0 and T >= 100:
                self.save()
        print("Finish.")
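# Most snippets in this collection construct rpm(capacity) and rely on add(),
# size(), sample_batch() and load() (plus an implied save counterpart). A
# minimal sketch of a replay memory with that interface; this is an assumed
# stand-in, not the original rpm implementation:
import pickle
import random
import numpy as np


class rpm(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []

    def add(self, transition):
        # transition: [state, action, reward, done, next_state]
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)  # drop the oldest transition
        self.buffer.append(transition)

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        # Transpose into [states, actions, rewards, dones, next_states]
        return [np.array(field) for field in zip(*batch)]

    def save(self, path):
        with open(path, 'wb') as f:
            pickle.dump(self.buffer, f)

    def load(self, path):
        with open(path, 'rb') as f:
            self.buffer = pickle.load(f)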