def test_param_sharing_with_create_parameter(self):
    """Parameters made via create_parameter are shared across programs."""
    net = MyNetWork()

    # Two independent programs that both add the same shared parameter.
    prog_a = fluid.Program()
    with fluid.program_guard(prog_a):
        x = layers.data(name='x', shape=[100], dtype="float32")
        out1 = x + net.created_param()

    prog_b = fluid.Program()
    with fluid.program_guard(prog_b):
        x = layers.data(name='x', shape=[100], dtype="float32")
        out2 = x + net.created_param()

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    feed_data = np.random.uniform(0, 1, [1, 100]).astype("float32")
    result_a = exe.run(prog_a, feed={"x": feed_data}, fetch_list=[out1])[0]
    result_b = exe.run(prog_b, feed={"x": feed_data}, fetch_list=[out2])[0]
    # Identical input + shared parameter => identical outputs.
    self.assertEqual(np.sum(result_a.flatten()), np.sum(result_b.flatten()))
def test_param_sharing_with_batch_norm(self):
    """Parameters of fc + batch_norm layers are shared across programs."""
    net = MyNetWork()

    prog_a = fluid.Program()
    with fluid.program_guard(prog_a):
        x = layers.data(name='x', shape=[32, 128, 128], dtype="float32")
        out1 = net.batch_norm(net.fc1(x))

    prog_b = fluid.Program()
    with fluid.program_guard(prog_b):
        x = layers.data(name='x', shape=[32, 128, 128], dtype="float32")
        out2 = net.batch_norm(net.fc1(x))

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    feed_data = np.random.uniform(0, 1, [1, 32, 128, 128]).astype("float32")
    result_a = exe.run(prog_a, feed={"x": feed_data}, fetch_list=[out1])[0]
    result_b = exe.run(prog_b, feed={"x": feed_data}, fetch_list=[out2])[0]
    # Shared weights must give identical outputs for the same input.
    self.assertEqual(np.sum(result_a.flatten()), np.sum(result_b.flatten()))
def build_program(self):
    """Construct the prediction and learning programs for this agent."""
    self.predict_program = fluid.Program()
    self.learn_program = fluid.Program()

    with fluid.program_guard(self.predict_program):
        obs = layers.data(name='obs', shape=[10], dtype='float32')
        self.predict_output = [self.alg.predict(obs)]

    with fluid.program_guard(self.learn_program):
        obs = layers.data(name='obs', shape=[10], dtype='float32')
        label = layers.data(name='label', shape=[1], dtype='float32')
        cost = self.alg.learn(obs, label)
def test_sync_weights_with_batch_norm(self):
    """Test sync_weights_to for a model containing batch_norm.

    Optimizes the source model so its parameters drift from the copy,
    then syncs and checks both models produce identical outputs.
    """
    model = TestModel3()
    target_model = deepcopy(model)
    program1 = fluid.Program()
    program2 = fluid.Program()
    # program1: optimize `model` so its parameters change over time.
    with fluid.program_guard(program1):
        obs = layers.data(name='obs', shape=[32, 128, 128], dtype="float32")
        model_output = model.predict(obs)
        loss = layers.reduce_mean(model_output)
        optimizer = fluid.optimizer.AdamOptimizer(1e-3)
        optimizer.minimize(loss)
    # program2: evaluate both models on the same input for comparison.
    with fluid.program_guard(program2):
        obs = layers.data(name='obs', shape=[32, 128, 128], dtype="float32")
        model_output = model.predict(obs)
        target_model_output = target_model.predict(obs)
    self.executor.run(fluid.default_startup_program())

    N = 10
    random_obs = np.random.random(size=(N, 32, 128, 128)).astype('float32')
    for i in range(N):
        x = np.expand_dims(random_obs[i], axis=0)
        outputs = self.executor.run(
            program2,
            feed={'obs': x},
            fetch_list=[model_output, target_model_output])
        # Before syncing the two models are expected to disagree
        # (presumably each gets its own random init — confirm).
        self.assertNotEqual(np.sum(outputs[0].flatten()),
                            np.sum(outputs[1].flatten()))

    # Run optimization so the parameters of batch_norm (and the rest)
    # in model and target_model become different.
    N = 100
    random_obs = np.random.random(size=(N, 32, 128, 128)).astype('float32')
    for i in range(N):
        x = np.expand_dims(random_obs[i], axis=0)
        self.executor.run(program1, feed={'obs': x})

    model.sync_weights_to(target_model)

    # After syncing, both models must produce identical outputs.
    random_obs = np.random.random(size=(N, 32, 128, 128)).astype('float32')
    for i in range(N):
        x = np.expand_dims(random_obs[i], axis=0)
        outputs = self.executor.run(
            program2,
            feed={'obs': x},
            fetch_list=[model_output, target_model_output])
        self.assertEqual(np.sum(outputs[0].flatten()),
                         np.sum(outputs[1].flatten()))
def build_program(self):
    """Build the prediction and learning programs (V-trace loss)."""
    self.predict_program = fluid.Program()
    self.learn_program = fluid.Program()

    with fluid.program_guard(self.predict_program):
        obs = layers.data(name='obs', shape=[4], dtype='float32')
        self.predict_actions = self.alg.predict(obs)

    with fluid.program_guard(self.learn_program):
        obs = layers.data(name='obs', shape=[4], dtype='float32')
        actions = layers.data(name='actions', shape=[], dtype='int64')
        behaviour_logits = layers.data(
            name='behaviour_logits', shape=[2], dtype='float32')
        rewards = layers.data(name='rewards', shape=[], dtype='float32')
        dones = layers.data(name='dones', shape=[], dtype='float32')
        # lr and entropy_coeff are fed as plain scalars, so no implicit
        # batch dimension is appended.
        lr = layers.data(
            name='lr', shape=[1], dtype='float32', append_batch_size=False)
        entropy_coeff = layers.data(
            name='entropy_coeff',
            shape=[1],
            dtype='float32',
            append_batch_size=False)
        vtrace_loss, kl = self.alg.learn(obs, actions, behaviour_logits,
                                         rewards, dones, lr, entropy_coeff)
        self.learn_outputs = [
            vtrace_loss.total_loss,
            vtrace_loss.pi_loss,
            vtrace_loss.vf_loss,
            vtrace_loss.entropy,
            kl,
        ]
def build_program(self):
    """Build the prediction, value, and learning programs."""
    self.predict_program = fluid.Program()
    self.value_program = fluid.Program()
    self.learn_program = fluid.Program()

    with fluid.program_guard(self.predict_program):
        obs = layers.data(name='obs', shape=[4], dtype='float32')
        self.predict_actions = self.alg.predict(obs)

    with fluid.program_guard(self.value_program):
        obs = layers.data(name='obs', shape=[4], dtype='float32')
        self.values = self.alg.value(obs)

    with fluid.program_guard(self.learn_program):
        obs = layers.data(name='obs', shape=[4], dtype='float32')
        actions = layers.data(name='actions', shape=[], dtype='int64')
        advantages = layers.data(name='advantages', shape=[], dtype='float32')
        target_values = layers.data(
            name='target_values', shape=[], dtype='float32')
        # Scalars fed without an implicit batch dimension.
        lr = layers.data(
            name='lr', shape=[1], dtype='float32', append_batch_size=False)
        entropy_coeff = layers.data(
            name='entropy_coeff',
            shape=[1],
            dtype='float32',
            append_batch_size=False)
        total_loss, pi_loss, vf_loss, entropy = self.alg.learn(
            obs, actions, advantages, target_values, lr, entropy_coeff)
        self.learn_outputs = [total_loss, pi_loss, vf_loss, entropy]
def build_program(self):
    """Build the five programs: policy sample/predict/learn and value
    predict/learn.

    Fix: removed a stray debug ``print(obs)`` left in the policy sample
    branch — it printed the Variable repr at graph-construction time and
    had no effect on the built program.
    """
    self.policy_predict_program = fluid.Program()
    self.policy_sample_program = fluid.Program()
    self.policy_learn_program = fluid.Program()
    self.value_predict_program = fluid.Program()
    self.value_learn_program = fluid.Program()

    with fluid.program_guard(self.policy_sample_program):
        obs = layers.data(name='obs', shape=[self.obs_dim], dtype='float32')
        sampled_act = self.alg.sample(obs)
        self.policy_sample_output = [sampled_act]

    with fluid.program_guard(self.policy_predict_program):
        obs = layers.data(name='obs', shape=[self.obs_dim], dtype='float32')
        means = self.alg.predict(obs)
        self.policy_predict_output = [means]

    with fluid.program_guard(self.policy_learn_program):
        obs = layers.data(name='obs', shape=[self.obs_dim], dtype='float32')
        actions = layers.data(
            name='actions', shape=[self.act_dim], dtype='float32')
        advantages = layers.data(
            name='advantages', shape=[1], dtype='float32')
        if self.loss_type == 'KLPEN':
            # KL-penalty loss needs the adaptive beta coefficient as input.
            beta = layers.data(name='beta', shape=[], dtype='float32')
            loss, kl = self.alg.policy_learn(obs, actions, advantages, beta)
        else:
            loss, kl = self.alg.policy_learn(obs, actions, advantages)
        self.policy_learn_output = [loss, kl]

    with fluid.program_guard(self.value_predict_program):
        obs = layers.data(name='obs', shape=[self.obs_dim], dtype='float32')
        value = self.alg.value_predict(obs)
        self.value_predict_output = [value]

    with fluid.program_guard(self.value_learn_program):
        obs = layers.data(name='obs', shape=[self.obs_dim], dtype='float32')
        val = layers.data(name='val', shape=[], dtype='float32')
        value_loss = self.alg.value_learn(obs, val)
        self.value_learn_output = [value_loss]
def build_program(self):
    """Build the prediction and policy-update programs."""
    self.pred_program = fluid.Program()
    self.learn_program = fluid.Program()

    # Graph for predicting actions: declare inputs/outputs.
    with fluid.program_guard(self.pred_program):
        obs = layers.data(name='obs', shape=self.obs_dim, dtype='float32')
        self.act_prob = self.alg.predict(obs)

    # Graph for updating the policy network: declare inputs/outputs.
    with fluid.program_guard(self.learn_program):
        obs = layers.data(name='obs', shape=self.obs_dim, dtype='float32')
        act = layers.data(name='act', shape=[1], dtype='int64')
        reward = layers.data(name='reward', shape=[], dtype='float32')
        self.cost = self.alg.learn(obs, act, reward)
def build_program(self):
    """Build the prediction and Q-network update programs."""
    self.pred_program = fluid.Program()
    self.learn_program = fluid.Program()

    # Graph for predicting values: declare inputs/outputs.
    with fluid.program_guard(self.pred_program):
        last_obs = layers.data(
            name='last_obs', shape=[self.obs_dim], dtype='float32')
        obs = layers.data(name='obs', shape=[self.obs_dim], dtype='float32')
        self.value = self.alg.predict(last_obs, obs)

    # Graph for updating the Q network: declare inputs/outputs.
    with fluid.program_guard(self.learn_program):
        last_obs = layers.data(
            name='last_obs', shape=[self.obs_dim], dtype='float32')
        obs = layers.data(name='obs', shape=[self.obs_dim], dtype='float32')
        action = layers.data(name='act', shape=[1], dtype='int32')
        reward = layers.data(name='reward', shape=[], dtype='float32')
        next_obs = layers.data(
            name='next_obs', shape=[self.obs_dim], dtype='float32')
        terminal = layers.data(name='terminal', shape=[], dtype='bool')
        self.cost = self.alg.learn(last_obs, obs, action, reward, next_obs,
                                   terminal)
def build_program(self):
    """Build sample/predict/learn programs; learn is fed via a py_reader."""
    self.sample_program = fluid.Program()
    self.predict_program = fluid.Program()
    self.learn_program = fluid.Program()

    with fluid.program_guard(self.sample_program):
        obs = layers.data(name='obs', shape=self.obs_shape, dtype='float32')
        self.sample_actions, self.behaviour_logits = self.alg.sample(obs)

    with fluid.program_guard(self.predict_program):
        obs = layers.data(name='obs', shape=self.obs_shape, dtype='float32')
        self.predict_actions = self.alg.predict(obs)

    with fluid.program_guard(self.learn_program):
        obs = layers.data(name='obs', shape=self.obs_shape, dtype='float32')
        actions = layers.data(name='actions', shape=[], dtype='int64')
        behaviour_logits = layers.data(
            name='behaviour_logits', shape=[self.act_dim], dtype='float32')
        rewards = layers.data(name='rewards', shape=[], dtype='float32')
        dones = layers.data(name='dones', shape=[], dtype='float32')
        # Scalars fed without an implicit batch dimension.
        lr = layers.data(
            name='lr', shape=[1], dtype='float32', append_batch_size=False)
        entropy_coeff = layers.data(
            name='entropy_coeff',
            shape=[1],
            dtype='float32',
            append_batch_size=False)
        # The data layers above define the feed schema; actual inputs come
        # from the py_reader, so rebind the names to its outputs below.
        self.learn_reader = fluid.layers.create_py_reader_by_data(
            capacity=32,
            feed_list=[
                obs, actions, behaviour_logits, rewards, dones, lr,
                entropy_coeff
            ])
        obs, actions, behaviour_logits, rewards, dones, lr, entropy_coeff = fluid.layers.read_file(
            self.learn_reader)
        vtrace_loss, kl = self.alg.learn(obs, actions, behaviour_logits,
                                         rewards, dones, lr, entropy_coeff)
        self.learn_outputs = [
            vtrace_loss.total_loss, vtrace_loss.pi_loss, vtrace_loss.vf_loss,
            vtrace_loss.entropy, kl
        ]
    # Compile the learn program for optimized execution.
    self.learn_program = parl.compile(self.learn_program,
                                      vtrace_loss.total_loss)
def build_program(self):
    """Build the prediction and training programs."""
    self.pred_program = fluid.Program()
    self.learn_program = fluid.Program()  # train_program

    with fluid.program_guard(self.pred_program):
        obs = layers.data(name='obs', shape=[self.obs_dim], dtype='float32')
        self.act_prob = self.alg.predict(obs)

    with fluid.program_guard(self.learn_program):
        obs = layers.data(name='obs', shape=[self.obs_dim], dtype='float32')
        act = layers.data(name='act', shape=[1], dtype='int64')
        reward = layers.data(name='reward', shape=[], dtype='float32')
        self.cost = self.alg.learn(obs, act, reward)
def test_sync_weights_with_decay(self):
    """Test sync_weights_to with a decay factor (soft target update)."""
    pred_program = fluid.Program()
    with fluid.program_guard(pred_program):
        obs = layers.data(name='obs', shape=[4], dtype='float32')
        model_output = self.model.predict(obs)
        target_model_output = self.target_model.predict(obs)
    self.executor.run(fluid.default_startup_program())

    decay = 0.9
    # Compute the expected post-sync target weights in numpy
    # (update semantics are defined by the _numpy_update helper).
    (target_model_fc1_w, target_model_fc1_b, target_model_fc2_w,
     target_model_fc2_b, target_model_fc3_w,
     target_model_fc3_b) = self._numpy_update(self.target_model, decay)

    self.model.sync_weights_to(self.target_model, decay=decay)

    N = 10
    random_obs = np.random.random(size=(N, 4)).astype('float32')
    for i in range(N):
        x = np.expand_dims(random_obs[i], axis=0)
        real_target_outputs = self.executor.run(
            pred_program, feed={'obs': x},
            fetch_list=[target_model_output])[0]

        # Ideal target output: three linear layers applied in numpy.
        # NOTE(review): no activations between the matmuls — presumably
        # the test model is purely linear; confirm against its definition.
        out_np = np.dot(x, target_model_fc1_w) + target_model_fc1_b
        out_np = np.dot(out_np, target_model_fc2_w) + target_model_fc2_b
        out_np = np.dot(out_np, target_model_fc3_w) + target_model_fc3_b

        # NOTE(review): float() requires a single-element array — confirm
        # the final layer outputs one unit.
        self.assertLess(float(np.abs(real_target_outputs - out_np)), 1e-5)
def test_sync_weights_with_create_parameter(self):
    """sync_weights_to also copies parameters made via create_parameter."""
    model = TestModel2()
    target_model = deepcopy(model)

    pred_program = fluid.Program()
    with fluid.program_guard(pred_program):
        obs = layers.data(name='obs', shape=[100], dtype='float32')
        model_output = model.predict(obs)
        target_model_output = target_model.predict(obs)
    self.executor.run(fluid.default_startup_program())

    N = 10
    random_obs = np.random.random(size=(N, 100)).astype('float32')
    for idx in range(N):
        sample = np.expand_dims(random_obs[idx], axis=0)
        fetched = self.executor.run(
            pred_program,
            feed={'obs': sample},
            fetch_list=[model_output, target_model_output])
        # Before syncing, the two models are expected to disagree.
        self.assertNotEqual(np.sum(fetched[0].flatten()),
                            np.sum(fetched[1].flatten()))

    model.sync_weights_to(target_model)

    random_obs = np.random.random(size=(N, 100)).astype('float32')
    for idx in range(N):
        sample = np.expand_dims(random_obs[idx], axis=0)
        fetched = self.executor.run(
            pred_program,
            feed={'obs': sample},
            fetch_list=[model_output, target_model_output])
        # After syncing, both models must agree.
        self.assertEqual(np.sum(fetched[0].flatten()),
                         np.sum(fetched[1].flatten()))
def create_inputs(self, mode):
    """Create the layers.data input variables for the given mode."""
    inputs = OrderedDict()
    # Work on a copy so the instance attribute is never mutated.
    data_attributes = copy.deepcopy(self.data_attributes)
    # 'click_id' is the label, only fed during train/test.
    data_attributes['click_id'] = {
        'shape': (-1, 1),
        'dtype': 'int64',
        'lod_level': 1
    }

    if mode in ['train', 'test']:
        list_names = self.item_slot_names + self.user_slot_names + [
            'click_id'
        ]
    elif mode in ['inference']:
        list_names = self.item_slot_names + self.user_slot_names
    else:
        raise NotImplementedError(mode)

    for name in list_names:
        attr = data_attributes[name]
        inputs[name] = layers.data(
            name=name,
            shape=attr['shape'],
            dtype=attr['dtype'],
            lod_level=attr['lod_level'])
    return inputs
def test_set_weights_between_different_models(self):
    """get_weights/set_weights copy parameters across model instances."""
    model1 = TestModel4()
    model2 = TestModel4()
    pred_program = fluid.Program()
    with fluid.program_guard(pred_program):
        obs = layers.data(name='obs', shape=[4], dtype='float32')
        model1_output = model1.predict(obs)
        model2_output = model2.predict(obs)
    self.executor.run(fluid.default_startup_program())

    N = 10
    random_obs = np.random.random(size=(N, 4)).astype('float32')
    for i in range(N):
        x = np.expand_dims(random_obs[i], axis=0)
        outputs = self.executor.run(
            pred_program,
            feed={'obs': x},
            fetch_list=[model1_output, model2_output])
        # NOTE(review): assertNotEqual on numpy arrays only behaves as
        # intended for single-element outputs — confirm TestModel4's
        # output size is 1.
        self.assertNotEqual(outputs[0].flatten(), outputs[1].flatten())

    # Pass parameters of model1 to model2.
    params = model1.get_weights()
    model2.set_weights(params)

    random_obs = np.random.random(size=(N, 4)).astype('float32')
    for i in range(N):
        x = np.expand_dims(random_obs[i], axis=0)
        outputs = self.executor.run(
            pred_program,
            feed={'obs': x},
            fetch_list=[model1_output, model2_output])
        # After copying, both models must produce identical outputs.
        self.assertEqual(outputs[0].flatten(), outputs[1].flatten())
def test_sync_weights_in_one_program(self):
    """sync_weights_to makes both models produce identical outputs."""
    pred_program = fluid.Program()
    with fluid.program_guard(pred_program):
        obs = layers.data(name='obs', shape=[4], dtype='float32')
        model_output = self.model.predict(obs)
        target_model_output = self.target_model.predict(obs)
    self.executor.run(fluid.default_startup_program())

    N = 10
    random_obs = np.random.random(size=(N, 4)).astype('float32')
    for i in range(N):
        x = np.expand_dims(random_obs[i], axis=0)
        outputs = self.executor.run(
            pred_program,
            feed={'obs': x},
            fetch_list=[model_output, target_model_output])
        # NOTE(review): assertNotEqual on numpy arrays only behaves as
        # intended for single-element outputs — confirm output size is 1.
        self.assertNotEqual(outputs[0].flatten(), outputs[1].flatten())

    self.model.sync_weights_to(self.target_model)

    random_obs = np.random.random(size=(N, 4)).astype('float32')
    for i in range(N):
        x = np.expand_dims(random_obs[i], axis=0)
        outputs = self.executor.run(
            pred_program,
            feed={'obs': x},
            fetch_list=[model_output, target_model_output])
        # After syncing, both models must agree exactly.
        self.assertEqual(outputs[0].flatten(), outputs[1].flatten())
def build_program(self):
    """Build per-member predict/learn programs plus an ensemble predictor."""
    self.predict_programs = []
    self.predict_outputs = []
    self.learn_programs = []
    self.learn_programs_output = []
    for i in range(self.ensemble_num):
        # One prediction program per ensemble member.
        predict_program = fluid.Program()
        with fluid.program_guard(predict_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            act = self.alg.predict(obs, model_id=i)
        self.predict_programs.append(predict_program)
        # Outputs are exported by variable name for later fetching.
        self.predict_outputs.append([act.name])

        # One learning program per ensemble member.
        learn_program = fluid.Program()
        with fluid.program_guard(learn_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            act = layers.data(
                name='act', shape=[self.act_dim], dtype='float32')
            reward = layers.data(name='reward', shape=[], dtype='float32')
            next_obs = layers.data(
                name='next_obs', shape=[self.obs_dim], dtype='float32')
            terminal = layers.data(name='terminal', shape=[], dtype='bool')
            # Learning rates fed as scalars without a batch dimension.
            actor_lr = layers.data(
                name='actor_lr',
                shape=[1],
                dtype='float32',
                append_batch_size=False)
            critic_lr = layers.data(
                name='critic_lr',
                shape=[1],
                dtype='float32',
                append_batch_size=False)
            actor_loss, critic_loss = self.alg.learn(
                obs,
                act,
                reward,
                next_obs,
                terminal,
                actor_lr,
                critic_lr,
                model_id=i)
        self.learn_programs.append(learn_program)
        self.learn_programs_output.append([critic_loss.name])

    # Program that predicts with the whole ensemble at once.
    self.ensemble_predict_program = fluid.Program()
    with fluid.program_guard(self.ensemble_predict_program):
        obs = layers.data(name='obs', shape=[self.obs_dim], dtype='float32')
        act = self.alg.ensemble_predict(obs)
    self.ensemble_predict_output = [act.name]
def build_program(self):
    """Build sample/learn programs (continuous actions) fed via a py_reader."""
    self.sample_program = fluid.Program()
    self.predict_program = fluid.Program()
    self.learn_program = fluid.Program()

    with fluid.program_guard(self.sample_program):
        obs = layers.data(name='obs', shape=self.obs_dim, dtype='float32')
        self.sample_actions, self.action_mean, self.action_std = self.alg.sample(
            obs)

    # with fluid.program_guard(self.predict_program):
    #     obs = layers.data(
    #         name='obs', shape=[self.obs_dim], dtype='float32')
    #     self.predict_actions = self.alg.predict(obs)

    with fluid.program_guard(self.learn_program):
        obs = layers.data(name='obs', shape=self.obs_dim, dtype='float32')
        actions = layers.data(
            name='actions', shape=[self.act_dim], dtype='float32')
        mean = layers.data(name='mean', shape=[self.act_dim], dtype='float32')
        std = layers.data(name='std', shape=[self.act_dim], dtype='float32')
        rewards = layers.data(name='rewards', shape=[], dtype='float32')
        dones = layers.data(name='dones', shape=[], dtype='float32')
        # Learning rate fed as a scalar without a batch dimension.
        lr = layers.data(
            name='lr', shape=[1], dtype='float32', append_batch_size=False)
        entropy_coeff = layers.data(
            name='entropy_coeff', shape=[], dtype='float32')
        # Inputs are supplied through the py_reader, so rebind the names
        # to the reader's outputs before building the loss.
        self.learn_reader = fluid.layers.create_py_reader_by_data(
            capacity=32,
            feed_list=[
                obs, actions, mean, std, rewards, dones, lr, entropy_coeff
            ])
        obs, actions, mean, std, rewards, dones, lr, entropy_coeff = fluid.layers.read_file(
            self.learn_reader)
        vtrace_loss, kl = self.alg.learn(obs, actions, mean, std, rewards,
                                         dones, lr, entropy_coeff)
        self.learn_outputs = [
            vtrace_loss.total_loss, vtrace_loss.pi_loss, vtrace_loss.vf_loss,
            vtrace_loss.entropy, kl
        ]
def _define_program(self):
    """Define the ensemble prediction program with its own startup program."""
    self.ensemble_predict_program = fluid.Program()
    self.startup_program = fluid.Program()
    with fluid.program_guard(self.ensemble_predict_program,
                             self.startup_program):
        obs = layers.data(name='obs', shape=[OBS_DIM], dtype='float32')
        self.ensemble_predict_output = [self._ensemble_predict(obs)]
def build_program(self):
    """Build the prediction and learning programs (with sample weights)."""
    self.pred_program = fluid.Program()
    self.learn_program = fluid.Program()

    with fluid.program_guard(self.pred_program):
        obs = layers.data(name='obs', shape=[self.obs_dim], dtype='float32')
        self.value = self.alg.predict(obs)

    with fluid.program_guard(self.learn_program):
        obs = layers.data(name='obs', shape=[self.obs_dim], dtype='float32')
        action = layers.data(name='act', shape=[1], dtype='int32')
        reward = layers.data(name='reward', shape=[], dtype='float32')
        next_obs = layers.data(
            name='next_obs', shape=[self.obs_dim], dtype='float32')
        terminal = layers.data(name='terminal', shape=[], dtype='bool')
        # Per-sample weight applied inside the learn op.
        sample_weight = layers.data(
            name='sample_weight', shape=[1], dtype='float32')
        self.cost, self.delta = self.alg.learn(obs, action, reward, next_obs,
                                               terminal, sample_weight)
def build_program(self):
    """Build prediction/learning programs and collect them into lists."""
    self.learn_programs = []
    self.predict_programs = []
    self.pred_program = fluid.Program()
    self.learn_program = fluid.Program()

    with fluid.program_guard(self.pred_program):
        obs = layers.data(
            name='obs',
            shape=[CONTEXT_LEN, IMAGE_SIZE[0], IMAGE_SIZE[1]],
            dtype='float32')
        self.value = self.alg.define_predict(obs)

    with fluid.program_guard(self.learn_program):
        obs = layers.data(
            name='obs',
            shape=[CONTEXT_LEN, IMAGE_SIZE[0], IMAGE_SIZE[1]],
            dtype='float32')
        action = layers.data(name='act', shape=[1], dtype='int32')
        reward = layers.data(name='reward', shape=[], dtype='float32')
        next_obs = layers.data(
            name='next_obs',
            shape=[CONTEXT_LEN, IMAGE_SIZE[0], IMAGE_SIZE[1]],
            dtype='float32')
        terminal = layers.data(name='terminal', shape=[], dtype='bool')
        weight = layers.data(name='weight', shape=[], dtype='float32')
        self.cost, self.newTd = self.alg.define_learn(
            obs, action, reward, next_obs, terminal, weight)

    self.learn_programs.append(self.learn_program)
    self.predict_programs.append(self.pred_program)
def build_program(self):
    """Build prediction and learning programs for stacked-image inputs."""
    self.pred_program = fluid.Program()
    self.learn_program = fluid.Program()

    with fluid.program_guard(self.pred_program):
        obs = layers.data(
            name='obs',
            shape=[CONTEXT_LEN, IMAGE_SIZE[0], IMAGE_SIZE[1]],
            dtype='float32')
        self.value = self.alg.predict(obs)

    with fluid.program_guard(self.learn_program):
        obs = layers.data(
            name='obs',
            shape=[CONTEXT_LEN, IMAGE_SIZE[0], IMAGE_SIZE[1]],
            dtype='float32')
        action = layers.data(name='act', shape=[1], dtype='int32')
        reward = layers.data(name='reward', shape=[], dtype='float32')
        next_obs = layers.data(
            name='next_obs',
            shape=[CONTEXT_LEN, IMAGE_SIZE[0], IMAGE_SIZE[1]],
            dtype='float32')
        # Learning rate fed as a scalar without a batch dimension.
        lr = layers.data(
            name='lr', shape=[1], dtype='float32', append_batch_size=False)
        terminal = layers.data(name='terminal', shape=[], dtype='bool')
        self.cost = self.alg.learn(obs, action, reward, next_obs, terminal,
                                   lr)
def build_program(self):
    """Build the predict, sample, and learn programs."""
    self.pred_program = fluid.Program()
    self.sample_program = fluid.Program()
    self.learn_program = fluid.Program()

    with fluid.program_guard(self.pred_program):
        obs = layers.data(name='obs', shape=[self.obs_dim], dtype='float32')
        self.pred_act = self.alg.predict(obs)

    with fluid.program_guard(self.sample_program):
        obs = layers.data(name='obs', shape=[self.obs_dim], dtype='float32')
        self.sample_act, _ = self.alg.sample(obs)

    with fluid.program_guard(self.learn_program):
        obs = layers.data(name='obs', shape=[self.obs_dim], dtype='float32')
        act = layers.data(name='act', shape=[self.act_dim], dtype='float32')
        reward = layers.data(name='reward', shape=[], dtype='float32')
        next_obs = layers.data(
            name='next_obs', shape=[self.obs_dim], dtype='float32')
        terminal = layers.data(name='terminal', shape=[], dtype='bool')
        self.critic_cost, self.actor_cost = self.alg.learn(
            obs, act, reward, next_obs, terminal)
def build_program(self):
    """Build and compile the prediction program."""
    self.predict_program = fluid.Program()
    with fluid.program_guard(self.predict_program):
        obs = layers.data(
            name='obs', shape=[self.config['obs_dim']], dtype='float32')
        self.predict_action = self.alg.predict(obs)
    # Compile for optimized execution.
    self.predict_program = parl.compile(self.predict_program)
def build_program(self):
    """Build the four programs: predict, critic-learn, next-Q, next-action."""
    self.pred_program = fluid.Program()
    self.learn_program = fluid.Program()
    self.next_q_program = fluid.Program()
    self.next_a_program = fluid.Program()

    with fluid.program_guard(self.pred_program):
        obs = layers.data(
            name='obs',
            shape=[self.obs_dim_n[self.agent_index]],
            dtype='float32')
        self.pred_act = self.alg.predict(obs)

    with fluid.program_guard(self.learn_program):
        # Observations and actions of all n agents, one data layer each,
        # named 'obs0'..'obs{n-1}' / 'act0'..'act{n-1}'.
        obs_n = [
            layers.data(
                name='obs' + str(i),
                shape=[self.obs_dim_n[i]],
                dtype='float32') for i in range(self.n)
        ]
        act_n = [
            layers.data(
                name='act' + str(i),
                shape=[self.act_dim_n[i]],
                dtype='float32') for i in range(self.n)
        ]
        target_q = layers.data(name='target_q', shape=[], dtype='float32')
        self.critic_cost = self.alg.learn(obs_n, act_n, target_q)

    with fluid.program_guard(self.next_q_program):
        obs_n = [
            layers.data(
                name='obs' + str(i),
                shape=[self.obs_dim_n[i]],
                dtype='float32') for i in range(self.n)
        ]
        act_n = [
            layers.data(
                name='act' + str(i),
                shape=[self.act_dim_n[i]],
                dtype='float32') for i in range(self.n)
        ]
        self.next_Q = self.alg.Q_next(obs_n, act_n)

    with fluid.program_guard(self.next_a_program):
        obs = layers.data(
            name='obs',
            shape=[self.obs_dim_n[self.agent_index]],
            dtype='float32')
        self.next_action = self.alg.predict_next(obs)

    if self.speedup:
        # Optionally compile the programs for optimized execution.
        self.pred_program = parl.compile(self.pred_program)
        self.learn_program = parl.compile(self.learn_program,
                                          self.critic_cost)
        self.next_q_program = parl.compile(self.next_q_program)
        self.next_a_program = parl.compile(self.next_a_program)
def build_program(self):
    """Build prediction/training programs with fixed random seeds."""
    self.pred_program = fluid.Program()
    self.train_program = fluid.Program()
    # Seed the startup and train programs for reproducibility.
    fluid.default_startup_program().random_seed = self.seed
    self.train_program.random_seed = self.seed

    with fluid.program_guard(self.pred_program):
        obs = layers.data(name='obs', shape=[self.obs_dim], dtype='float32')
        self.act_prob = self.alg.define_predict(obs)

    with fluid.program_guard(self.train_program):
        obs = layers.data(name='obs', shape=[self.obs_dim], dtype='float32')
        act = layers.data(name='act', shape=[1], dtype='int64')
        reward = layers.data(name='reward', shape=[], dtype='float32')
        self.cost = self.alg.define_learn(obs, act, reward)
def test_set_weights_with_wrong_params_num(self):
    """set_weights must reject a parameter list of the wrong length."""
    pred_program = fluid.Program()
    with fluid.program_guard(pred_program):
        obs = layers.data(name='obs', shape=[4], dtype='float32')
        model_output = self.model.predict(obs)
    self.executor.run(fluid.default_startup_program())

    weights = self.model.get_weights()
    # Dropping one entry should trigger the length assertion.
    with self.assertRaises(AssertionError):
        self.model.set_weights(weights[1:])
def _define_program(self):
    """Define per-member actor-predict and learn programs for the ensemble."""
    self.actor_predict_programs = []
    self.actor_predict_outputs = []
    self.learn_programs = []
    self.learn_programs_output = []
    for i in range(self.ensemble_num):
        # One actor-prediction program per ensemble member.
        actor_predict_program = fluid.Program()
        with fluid.program_guard(actor_predict_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            action = self.alg.actor_predict(obs, model_id=i)
        self.actor_predict_programs.append(actor_predict_program)
        self.actor_predict_outputs.append([action])

        # One learning program per ensemble member.
        learn_program = fluid.Program()
        with fluid.program_guard(learn_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            action = layers.data(
                name='action', shape=[self.act_dim], dtype='float32')
            reward = layers.data(name='reward', shape=[], dtype='float32')
            next_obs = layers.data(
                name='next_obs', shape=[self.obs_dim], dtype='float32')
            terminal = layers.data(name='terminal', shape=[], dtype='bool')
            # Learning rates fed as scalars without a batch dimension.
            actor_lr = layers.data(
                name='actor_lr',
                shape=[1],
                dtype='float32',
                append_batch_size=False)
            critic_lr = layers.data(
                name='critic_lr',
                shape=[1],
                dtype='float32',
                append_batch_size=False)
            critic_loss = self.alg.learn(
                obs,
                action,
                reward,
                next_obs,
                terminal,
                actor_lr,
                critic_lr,
                model_id=i)
        if self._no_mem_allocation:
            # Mark every non-data variable persistable so its storage is
            # kept between runs — presumably to avoid re-allocation
            # overhead; verify against the executor configuration.
            for var in learn_program.blocks[0].vars:
                if not learn_program.blocks[0].var(var).is_data:
                    learn_program.blocks[0].var(var).persistable = True
        self.learn_programs.append(learn_program)
        self.learn_programs_output.append([critic_loss.name])
def build_program(self):
    """Build the prediction and learning programs."""
    self._pred_program = fluid.Program()
    self._learn_program = fluid.Program()

    with fluid.program_guard(self._pred_program):
        obs = layers.data(name='obs', shape=[self._obs_dim], dtype='float32')
        self._value = self.alg.define_predict(obs)

    with fluid.program_guard(self._learn_program):
        obs = layers.data(name='obs', shape=[self._obs_dim], dtype='float32')
        action = layers.data(name='act', shape=[1], dtype='int32')
        reward = layers.data(name='reward', shape=[], dtype='float32')
        next_obs = layers.data(
            name='next_obs', shape=[self._obs_dim], dtype='float32')
        terminal = layers.data(name='terminal', shape=[], dtype='bool')
        self._cost = self.alg.define_learn(obs, action, reward, next_obs,
                                           terminal)
def build_program(self):
    """Build sample/predict/learn programs; learn is fed via a py_reader."""
    self.sample_program = fluid.Program()
    self.predict_program = fluid.Program()
    self.learn_program = fluid.Program()

    with fluid.program_guard(self.sample_program):
        obs = layers.data(name='obs', shape=self.obs_shape, dtype='float32')
        sample_actions, values = self.alg.sample(obs)
        # Outputs are exported by variable name for later fetching.
        self.sample_outputs = [sample_actions.name, values.name]

    with fluid.program_guard(self.predict_program):
        obs = layers.data(name='obs', shape=self.obs_shape, dtype='float32')
        self.predict_actions = self.alg.predict(obs)

    with fluid.program_guard(self.learn_program):
        obs = layers.data(name='obs', shape=self.obs_shape, dtype='float32')
        actions = layers.data(name='actions', shape=[], dtype='int64')
        advantages = layers.data(
            name='advantages', shape=[], dtype='float32')
        target_values = layers.data(
            name='target_values', shape=[], dtype='float32')
        # Learning rate fed as a scalar without a batch dimension.
        lr = layers.data(
            name='lr', shape=[1], dtype='float32', append_batch_size=False)
        entropy_coeff = layers.data(
            name='entropy_coeff', shape=[], dtype='float32')
        # Inputs are supplied through the py_reader, so rebind the names
        # to the reader's outputs before building the loss.
        self.learn_reader = fluid.layers.create_py_reader_by_data(
            capacity=32,
            feed_list=[
                obs, actions, advantages, target_values, lr, entropy_coeff
            ])
        obs, actions, advantages, target_values, lr, entropy_coeff = fluid.layers.read_file(
            self.learn_reader)
        total_loss, pi_loss, vf_loss, entropy = self.alg.learn(
            obs, actions, advantages, target_values, lr, entropy_coeff)
        self.learn_outputs = [
            total_loss.name, pi_loss.name, vf_loss.name, entropy.name
        ]