def __init__(self, name):
    self.name = name
    self.calibrated = rospy.get_param("~" + self.name + "/calibrated")
    self.signEncoder = rospy.get_param("~" + self.name + "/signEncoder")
    self.signJoint = rospy.get_param("~" + self.name + "/signJoint")
    self.name = rospy.get_param("~" + self.name + "/name")
    self.nameEncoder = rospy.get_param("~" + self.name + "/nameEncoder")
    minAngle = rospy.get_param("~" + self.name + "/minAngle")
    maxAngle = rospy.get_param("~" + self.name + "/maxAngle")
    self.pGain = rospy.get_param("~" + self.name + "/gains/P")
    self.iGain = rospy.get_param("~" + self.name + "/gains/I")
    self.vGain = rospy.get_param("~" + self.name + "/gains/D")
    self.maxAbsForwardError = rospy.get_param("~" + self.name + "/maxAbsForwardError")
    # Resting pose angle of the antagonist-pair motor joint,
    # taken from the joint yaml file in the gummi_base and gummi_ee pkgs.
    self.restingPoseAngle = rospy.get_param("~" + self.name + "/restingPoseAngle")

    self.range = maxAngle - minAngle
    self.angle = JointAngle(self.nameEncoder, self.signEncoder, minAngle, maxAngle, True)

    self.eqModel = EquilibriumModel(self.name)
    self.inverseModel = InverseModel(self.name)
    self.inverseModelCollision = InverseModel(self.name)
    self.forwardModel = ForwardModel(self.name)

    if self.calibrated == 1:
        self.inverseModel.loadCalibration()
        self.inverseModelCollision.loadCalibration()
        self.forwardModel.loadCalibration()

    self.cocontractionReflex = Reflex(2.0, 0.0045, 0.0)
    self.feedbackReflex = Reflex(1.0, 0.0075, 0.0)
    self.collisionReflex = Reflex(1.0, 0.0075, 0.0)

    self.initPublishers()
    self.initVariables()
    self.disableEncoderTorque()

    jointRange = self.angle.getMax() - self.angle.getMin()
    self.eqModel.calculateEqVelCalibration(jointRange)
class Antagonist:
    def __init__(self, name):
        self.name = name
        self.calibrated = rospy.get_param("~" + self.name + "/calibrated")
        self.signEncoder = rospy.get_param("~" + self.name + "/signEncoder")
        self.signJoint = rospy.get_param("~" + self.name + "/signJoint")
        self.name = rospy.get_param("~" + self.name + "/name")
        self.nameEncoder = rospy.get_param("~" + self.name + "/nameEncoder")
        minAngle = rospy.get_param("~" + self.name + "/minAngle")
        maxAngle = rospy.get_param("~" + self.name + "/maxAngle")
        self.pGain = rospy.get_param("~" + self.name + "/gains/P")
        self.iGain = rospy.get_param("~" + self.name + "/gains/I")
        self.vGain = rospy.get_param("~" + self.name + "/gains/D")
        self.maxAbsForwardError = rospy.get_param("~" + self.name + "/maxAbsForwardError")

        self.range = maxAngle - minAngle
        self.angle = JointAngle(self.nameEncoder, self.signEncoder, minAngle, maxAngle, True)

        self.eqModel = EquilibriumModel(self.name)
        self.inverseModel = InverseModel(self.name)
        self.inverseModelCollision = InverseModel(self.name)
        self.forwardModel = ForwardModel(self.name)

        if self.calibrated == 1:
            self.inverseModel.loadCalibration()
            self.inverseModelCollision.loadCalibration()
            self.forwardModel.loadCalibration()

        self.cocontractionReflex = Reflex(2.0, 0.0015, 0.0)
        self.feedbackReflex = Reflex(1.0, 0.0075, 0.0)
        self.collisionReflex = Reflex(1.0, 0.0075, 0.0)

        self.initPublishers()
        self.initVariables()
        self.disableEncoderTorque()

        jointRange = self.angle.getMax() - self.angle.getMin()
        self.eqModel.calculateEqVelCalibration(jointRange)

    def initVariables(self):
        self.errors = deque()
        self.velocity = False
        self.closedLoop = False
        self.feedForward = False
        self.collisionResponse = False
        self.errorLast = 0.0
        self.ballistic = 0.0
        self.deltaAngleBallistic = 0.0
        self.deltaEqFeedback = 0.0
        self.lastForwardError = 0.0
        self.forwardError = 0.0
        self.ballisticRatio = 0.85
        self.feedbackRatio = 0.5

    def disableEncoderTorque(self):
        service_name = self.nameEncoder + "_controller/torque_enable"
        rospy.wait_for_service(service_name)
        try:
            te = rospy.ServiceProxy(service_name, TorqueEnable)
            te(torque_enable=False)
        except rospy.ServiceException as e:
            print("Service call failed: %s" % e)
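# For reference, a sketch of the per-joint private parameters the constructor reads via
# rospy.get_param("~<joint>/<key>"); the keys come from the code above, while the joint
# name and numeric values are purely illustrative assumptions.
example_joint_params = {
    "calibrated": 1,
    "signEncoder": 1,
    "signJoint": -1,
    "name": "shoulder",            # kept equal to the joint key, since __init__ reuses it for later lookups
    "nameEncoder": "shoulder_encoder",
    "minAngle": -1.57,
    "maxAngle": 1.57,
    "gains": {"P": 0.04, "I": 0.0, "D": 0.0},
    "maxAbsForwardError": 0.3,
}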
def __init__(self, environment):
    self.env = environment

    # Create placeholders for all the inputs
    self.states_ = tf.placeholder("float", shape=(None,) + self.env.state_size, name='states_')  # Batch x State, previous state
    self.states = tf.placeholder("float", shape=(None,) + self.env.state_size, name='states')  # Batch x State, current_state
    self.actions = tf.placeholder("float", shape=(None, self.env.action_size), name='action')  # Batch x Action
    self.label = tf.placeholder("float", shape=(None, 1), name='label')
    self.gamma = tf.placeholder("float", shape=(), name='gamma')
    self.temp = tf.placeholder("float", shape=(), name='temperature')
    self.noise = tf.placeholder("float", shape=(), name='noise_flag')
    self.do_keep_prob = tf.placeholder("float", shape=(), name='do_keep_prob')
    if self.env.use_airl:
        self.done_ph = tf.placeholder(name="dones", shape=(None,), dtype=tf.float32)

    # Create MGAIL blocks
    self.forward_model = ForwardModel(
        state_size=self.env.state_size[0] if self.env.obs_mode == 'state' else self.env.encoder_feat_size,
        action_size=self.env.action_size,
        encoding_size=self.env.fm_size,
        lr=self.env.fm_lr,
        forward_model_type=self.env.forward_model_type,
        obs_mode=self.env.obs_mode,
        use_scale_dot_product=self.env.use_scale_dot_product,
        use_skip_connection=self.env.use_skip_connection,
        use_dropout=self.env.use_dropout)

    if self.env.obs_mode == 'pixel':
        if self.env.state_only:
            feat_in_dim = 1024  # self.env.encoder_feat_size[0]
            policy_input_feat = 1024
        else:
            feat_in_dim = 1024 + self.env.action_size  # self.env.encoder_feat_size[0]
            policy_input_feat = 1024
    else:
        if self.env.state_only:
            feat_in_dim = self.env.state_size[0]
            policy_input_feat = self.env.state_size[0]
        else:
            feat_in_dim = self.env.state_size[0] + self.env.action_size
            policy_input_feat = self.env.state_size[0]

    self.discriminator = Discriminator(
        in_dim=feat_in_dim,
        out_dim=self.env.disc_out_dim,
        size=self.env.d_size,
        lr=self.env.d_lr,
        do_keep_prob=self.do_keep_prob,
        weight_decay=self.env.weight_decay,
        use_airl=self.env.use_airl,
        phi_hidden_size=self.env.phi_size,
        state_only=self.env.state_only,
    )

    self.policy = Policy(in_dim=policy_input_feat,
                         out_dim=self.env.action_size,
                         size=self.env.p_size,
                         lr=self.env.p_lr,
                         do_keep_prob=self.do_keep_prob,
                         n_accum_steps=self.env.policy_accum_steps,
                         weight_decay=self.env.weight_decay)

    # Create experience buffers
    self.er_agent = ER(memory_size=self.env.er_agent_size,
                       state_dim=self.env.state_size,
                       action_dim=self.env.action_size,
                       reward_dim=1,  # stub connection
                       qpos_dim=self.env.qpos_size,
                       qvel_dim=self.env.qvel_size,
                       batch_size=self.env.batch_size,
                       history_length=1)
    self.er_expert = common.load_er(fname=os.path.join(self.env.run_dir, self.env.expert_data),
                                    batch_size=self.env.batch_size,
                                    history_length=1,
                                    traj_length=2)

    self.env.sigma = self.er_expert.actions_std / self.env.noise_intensity

    if self.env.obs_mode == 'pixel':
        current_states = ops.preprocess(self.states, bits=8)
        current_states_feat = ops.encoder(current_states, reuse=tf.AUTO_REUSE)
        prev_states = ops.preprocess(self.states_, bits=8)
        prev_states_feat = ops.encoder(prev_states, reuse=tf.AUTO_REUSE)
    else:
        # Normalize the inputs
        prev_states = common.normalize(self.states_, self.er_expert.states_mean, self.er_expert.states_std)
        current_states = common.normalize(self.states, self.er_expert.states_mean, self.er_expert.states_std)
        prev_states_feat = prev_states
        current_states_feat = current_states

    if self.env.continuous_actions:
        actions = common.normalize(self.actions, self.er_expert.actions_mean, self.er_expert.actions_std)
    else:
        actions = self.actions

    # 1. Forward Model
    initial_gru_state = np.ones((1, self.forward_model.encoding_size))
    forward_model_prediction, _, divergence_loss = self.forward_model.forward(
        [prev_states_feat, actions, initial_gru_state])
    if self.env.obs_mode == 'pixel':
        forward_model_prediction = ops.decoder(forward_model_prediction,
                                               data_shape=self.env.state_size,
                                               reuse=tf.AUTO_REUSE)
        self.forward_model_prediction = ops.postprocess(forward_model_prediction, bits=8, dtype=tf.uint8)
    else:
        self.forward_model_prediction = forward_model_prediction
    forward_model_loss = tf.reduce_mean(tf.square(current_states - forward_model_prediction)) \
        + self.env.forward_model_lambda * tf.reduce_mean(divergence_loss)
    self.forward_model.train(objective=forward_model_loss)

    if self.env.use_airl:
        # 1.1 action log prob
        logits = self.policy.forward(current_states_feat)
        if self.env.continuous_actions:
            mean, logstd = logits, tf.log(tf.ones_like(logits))
            std = tf.exp(logstd)
            n_elts = tf.cast(tf.reduce_prod(mean.shape[1:]), tf.float32)  # first dimension is batch size
            log_normalizer = n_elts / 2. * (np.log(2 * np.pi).astype(np.float32)) \
                + 1 / 2 * tf.reduce_sum(logstd, axis=1)
            # Diagonal Gaussian action probability, for every action
            action_logprob = -tf.reduce_sum(tf.square(actions - mean) / (2 * std), axis=1) - log_normalizer
        else:
            # Override since the implementation of tfp.RelaxedOneHotCategorical
            # yields positive values.
            if actions.shape[1:] != logits.shape[1:]:
                actions = tf.cast(actions, tf.int8)
                values = tf.one_hot(actions, logits.shape.as_list()[-1], dtype=tf.float32)
                assert values.shape == logits.shape, (values.shape, logits.shape)
            else:
                values = actions
            # [0]'s implementation (see line below) seems to be an approximation
            # to the actual Gumbel Softmax density.
            # TODO: to confirm 'action' or 'value'
            action_logprob = -tf.reduce_sum(-values * tf.nn.log_softmax(logits, axis=-1), axis=-1)
            # prob = logit[np.arange(self.action_test.shape[0]), self.action_test]
            # action_logprob = tf.log(prob)

        # 2. Discriminator
        self.discriminator.airl_entropy_weight = self.env.airl_entropy_weight
        # labels = tf.concat([1 - self.label, self.label], 1)
        # labels = 1 - self.label  # 0 for expert, 1 for policy
        labels = self.label  # 1 for expert, 0 for policy
        d, self.disc_shaped_reward_output, self.disc_reward = self.discriminator.forward(
            state=current_states_feat,
            action=actions,
            prev_state=prev_states_feat,
            done_inp=self.done_ph,
            log_policy_act_prob=action_logprob,
        )

        # 2.1 0-1 accuracy
        correct_predictions = tf.equal(tf.argmax(d, 1), tf.argmax(labels, 1))
        self.discriminator.acc = tf.reduce_mean(tf.cast(correct_predictions, "float"))
        # 2.2 prediction
        d_cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=d, name="disc_loss")
        # Construct generator reward:
        # \[\hat{r}(s,a) = \log(D_{\theta}(s,a)) - \log(1 - D_{\theta}(s,a)).\]
        # This simplifies to:
        # \[\hat{r}(s,a) = f_{\theta}(s,a) - \log \pi(a \mid s).\]
        # This is just an entropy-regularized objective
        # ent_bonus = -self.env.airl_entropy_weight * self.discriminator.log_policy_act_prob_ph
        # policy_train_reward = self.discriminator.reward_net.reward_output_train + ent_bonus
    else:
        # 2. Discriminator
        labels = tf.concat([1 - self.label, self.label], 1)
        d, _, _ = self.discriminator.forward(state=current_states_feat, action=actions)
        # 2.1 0-1 accuracy
        correct_predictions = tf.equal(tf.argmax(d, 1), tf.argmax(labels, 1))
        self.discriminator.acc = tf.reduce_mean(tf.cast(correct_predictions, "float"))
        # 2.2 prediction
        d_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=d, labels=labels)

    # cost sensitive weighting (weight true=expert, predict=agent mistakes)
    d_loss_weighted = self.env.cost_sensitive_weight * tf.multiply(tf.to_float(tf.equal(tf.squeeze(self.label), 1.)), d_cross_entropy) + \
        tf.multiply(tf.to_float(tf.equal(tf.squeeze(self.label), 0.)), d_cross_entropy)
    discriminator_loss = tf.reduce_mean(d_loss_weighted)
    self.discriminator.train(objective=discriminator_loss)

    # 3. Collect experience
    mu = self.policy.forward(current_states_feat)
    if self.env.continuous_actions:
        a = common.denormalize(mu, self.er_expert.actions_mean, self.er_expert.actions_std)
        eta = tf.random_normal(shape=tf.shape(a), stddev=self.env.sigma)
        self.action_test = tf.squeeze(a + self.noise * eta)
    else:
        a = common.gumbel_softmax(logits=mu, temperature=self.temp)
        self.action_test = tf.argmax(a, dimension=1)

    # 4.3 AL
    def policy_loop(current_state_policy_update, t, total_cost, total_trans_err, env_term_sig, prev_state):
        if self.env.obs_mode == 'pixel':
            current_state_feat_policy_update = ops.encoder(current_state_policy_update, reuse=True)
            prev_state_feat_policy_update = ops.encoder(prev_state, reuse=True)
        else:
            current_state_feat_policy_update = current_state_policy_update
            prev_state_feat_policy_update = prev_state

        mu = self.policy.forward(current_state_feat_policy_update, reuse=True)

        if self.env.continuous_actions:
            eta = self.env.sigma * tf.random_normal(shape=tf.shape(mu))
            action = mu + eta
            if self.env.use_airl:
                mean, logstd = mu, tf.log(tf.ones_like(mu) * self.env.sigma)
                std = tf.exp(logstd)
                n_elts = tf.cast(tf.reduce_prod(mean.shape[1:]), tf.float32)  # first dimension is batch size
                log_normalizer = n_elts / 2. * (np.log(2 * np.pi).astype(np.float32)) \
                    + 1 / 2 * tf.reduce_sum(logstd, axis=1)
                # Diagonal Gaussian action probability, for every action
                action_logprob = -tf.reduce_sum(tf.square(action - mean) / (2 * std), axis=1) - log_normalizer
        else:
            action = common.gumbel_softmax_sample(logits=mu, temperature=self.temp)
            if self.env.use_airl:
                # Override since the implementation of tfp.RelaxedOneHotCategorical
                # yields positive values.
                if action.shape[1:] != logits.shape[1:]:
                    actions = tf.cast(action, tf.int8)
                    values = tf.one_hot(actions, logits.shape.as_list()[-1], dtype=tf.float32)
                    assert values.shape == logits.shape, (values.shape, logits.shape)
                else:
                    values = action
                # [0]'s implementation (see line below) seems to be an approximation
                # to the actual Gumbel Softmax density.
                # TODO: to confirm 'action' or 'value'
                action_logprob = -tf.reduce_sum(-values * tf.nn.log_softmax(logits, axis=-1), axis=-1)

        # minimize the gap between agent logit (d[:,0]) and expert logit (d[:,1])
        if self.env.use_airl:
            d, shaped_reward_output, reward = self.discriminator.forward(
                state=current_state_feat_policy_update,
                action=action,
                prev_state=prev_state_feat_policy_update,
                done_inp=tf.cast(env_term_sig, tf.float32),
                log_policy_act_prob=action_logprob,
                reuse=True)
            if self.env.alg in ['mairlTransfer', 'mairlImit4Transfer']:
                reward_for_updating_policy = reward
            else:  # 'mairlImit'
                reward_for_updating_policy = shaped_reward_output
            if self.env.train_mode and self.env.alg not in ['mairlTransfer', 'mairlImit4Transfer']:
                ent_bonus = -self.env.airl_entropy_weight * tf.stop_gradient(action_logprob)
                policy_reward = reward_for_updating_policy + ent_bonus
            else:
                policy_reward = reward_for_updating_policy
            cost = tf.reduce_mean(-policy_reward) * self.env.policy_al_w
        else:
            d, _, _ = self.discriminator.forward(state=current_state_feat_policy_update,
                                                 action=action,
                                                 reuse=True)
            cost = self.al_loss(d)

        # add step cost
        total_cost += tf.multiply(tf.pow(self.gamma, t), cost)

        # get action
        if self.env.continuous_actions:
            a_sim = common.denormalize(action, self.er_expert.actions_mean, self.er_expert.actions_std)
        else:
            a_sim = tf.argmax(action, dimension=1)

        # get next state
        state_env, _, env_term_sig, = self.env.step(a_sim, mode='tensorflow')[:3]
        state_e = common.normalize(state_env, self.er_expert.states_mean, self.er_expert.states_std)
        state_e = tf.stop_gradient(state_e)

        state_a, _, divergence_loss_a = self.forward_model.forward(
            [current_state_feat_policy_update, action, initial_gru_state], reuse=True)
        if self.env.obs_mode == 'pixel':
            state_a = ops.decoder(state_a, data_shape=self.env.state_size, reuse=True)

        if True:  # self.env.alg in ['mgail']:
            state, nu = common.re_parametrization(state_e=state_e, state_a=state_a)
        else:
            _, nu = common.re_parametrization(state_e=state_e, state_a=state_a)
            state = state_a
        total_trans_err += tf.reduce_mean(abs(nu))
        t += 1
        if self.env.obs_mode == 'pixel':
            state = tf.slice(state, [0, 0, 0, 0], [1, -1, -1, -1])

        return state, t, total_cost, total_trans_err, env_term_sig, current_state_policy_update

    def policy_stop_condition(current_state_policy_update, t, cost, trans_err, env_term_sig, prev_state):
        cond = tf.logical_not(env_term_sig)  # not done: env_term_sig = False
        cond = tf.logical_and(cond, t < self.env.n_steps_train)
        cond = tf.logical_and(cond, trans_err < self.env.total_trans_err_allowed)
        return cond

    if self.env.obs_mode == 'pixel':
        state_0 = tf.slice(current_states, [0, 0, 0, 0], [1, -1, -1, -1])
    else:
        state_0 = tf.slice(current_states, [0, 0], [1, -1])
    # prev_state_0 = tf.slice(states_, [0, 0], [1, -1])
    loop_outputs = tf.while_loop(policy_stop_condition, policy_loop,
                                 [state_0, 0., 0., 0., False, state_0])
    self.policy.train(objective=loop_outputs[2])
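# The diagonal Gaussian action_logprob used in the AIRL branches above is the standard
# log-density of a factorised Normal. A standalone NumPy sketch of that formula
# (hypothetical helper, not part of the graph code). Note that the graph code divides by
# 2 * std rather than 2 * std**2; the two only coincide when std == 1, as in the branch
# where logstd = log(1) = 0.
import numpy as np

def diag_gaussian_logprob(actions, mean, logstd):
    """log N(actions | mean, diag(exp(logstd))**2), summed over action dimensions."""
    std = np.exp(logstd)
    n_elts = mean.shape[-1]
    log_normalizer = n_elts / 2.0 * np.log(2.0 * np.pi) + np.sum(logstd, axis=-1)
    return -np.sum(np.square(actions - mean) / (2.0 * np.square(std)), axis=-1) - log_normalizer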
class Agent():
    def __init__(self):
        self._client_fwd = ForwardModel(fwd_model_uri)
        self._client = GameState(uri)

        self._client.set_game_tick_callback(self._on_game_tick)
        self._client_fwd.set_next_state_callback(self._on_next_game_state)
        self.connect()

    def connect(self):
        loop = asyncio.get_event_loop()
        client_connection = loop.run_until_complete(self._client.connect())
        client_fwd_connection = None
        client_fwd_connection = loop.run_until_complete(self._client_fwd.connect())

        loop = asyncio.get_event_loop()
        loop.create_task(self._client._handle_messages(client_connection))
        loop.create_task(self._client_fwd._handle_messages(client_fwd_connection))
        loop.run_forever()

    def _get_bomb_to_detonate(self, game_state) -> [int, int] or None:
        agent_number = game_state.get("connection").get("agent_number")
        entities = self._client._state.get("entities")
        bombs = list(
            filter(
                lambda entity: entity.get("owner") == agent_number and entity.get("type") == "b",
                entities))
        bomb = next(iter(bombs or []), None)
        if bomb is not None:
            return [bomb.get("x"), bomb.get("y")]
        else:
            return None

    async def _on_game_tick(self, tick_number, game_state):
        random_action = self.generate_random_action()
        if random_action in ["up", "left", "right", "down"]:
            await self._client.send_move(random_action)
        elif random_action == "bomb":
            await self._client.send_bomb()
        elif random_action == "detonate":
            bomb_coordinates = self._get_bomb_to_detonate(game_state)
            if bomb_coordinates is not None:
                x, y = bomb_coordinates
                await self._client.send_detonate(x, y)
        else:
            print(f"Unhandled action: {random_action}")

    async def _on_next_game_state(self, state):
        # print(state)
        pass

    def generate_random_action(self):
        actions_length = len(actions)
        return actions[random.randint(0, actions_length - 1)]
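# The Agent class assumes several module-level names (uri, fwd_model_uri, actions, and the
# GameState / ForwardModel clients). A hedged sketch of what that preamble and entry point
# could look like; the import paths and environment-variable names are assumptions, only
# the action strings are taken from _on_game_tick above.
import asyncio
import os
import random

from game_state import GameState        # assumed client classes from the game's starter kit
from forward_model import ForwardModel

uri = os.environ.get("GAME_CONNECTION_STRING",
                     "ws://127.0.0.1:3000/?role=agent&agentId=agentA&name=python3")
fwd_model_uri = os.environ.get("FWD_MODEL_CONNECTION_STRING",
                               "ws://127.0.0.1:6969/?role=admin")
actions = ["up", "down", "left", "right", "bomb", "detonate"]


def main():
    Agent()


if __name__ == "__main__":
    main()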
def __init__(self, environment, use_irl=False):
    self.use_irl = use_irl
    self.env = environment

    # Create placeholders for all the inputs
    self.states_ = tf.compat.v1.placeholder("float", shape=(None, self.env.state_size), name='states_')  # Batch x State
    self.states = tf.compat.v1.placeholder("float", shape=(None, self.env.state_size), name='states')  # Batch x State
    self.actions = tf.compat.v1.placeholder("float", shape=(None, self.env.action_size), name='action')  # Batch x Action
    self.label = tf.compat.v1.placeholder("float", shape=(None, 1), name='label')
    self.gamma = tf.compat.v1.placeholder("float", shape=(), name='gamma')
    self.temp = tf.compat.v1.placeholder("float", shape=(), name='temperature')
    self.noise = tf.compat.v1.placeholder("float", shape=(), name='noise_flag')
    self.do_keep_prob = tf.compat.v1.placeholder("float", shape=(), name='do_keep_prob')
    self.lprobs = tf.compat.v1.placeholder('float', shape=(None, 1), name='log_probs')

    # Create MGAIL blocks
    self.forward_model = ForwardModel(state_size=self.env.state_size,
                                      action_size=self.env.action_size,
                                      encoding_size=self.env.fm_size,
                                      lr=self.env.fm_lr)

    # MODIFYING THE NEW DISCRIMINATOR:
    if self.use_irl:
        self.discriminator = DiscriminatorIRL(in_dim=self.env.state_size + self.env.action_size,
                                              out_dim=1,
                                              size=self.env.d_size,
                                              lr=self.env.d_lr,
                                              do_keep_prob=self.do_keep_prob,
                                              weight_decay=self.env.weight_decay,
                                              state_only=True,
                                              gamma=self.gamma,
                                              state_size=self.env.state_size,
                                              action_size=self.env.action_size)
    # END MODIFYING THE NEW DISCRIMINATOR
    else:
        self.discriminator = Discriminator(in_dim=self.env.state_size + self.env.action_size,
                                           out_dim=2,
                                           size=self.env.d_size,
                                           lr=self.env.d_lr,
                                           do_keep_prob=self.do_keep_prob,
                                           weight_decay=self.env.weight_decay)

    self.policy = Policy(in_dim=self.env.state_size,
                         out_dim=self.env.action_size,
                         size=self.env.p_size,
                         lr=self.env.p_lr,
                         do_keep_prob=self.do_keep_prob,
                         n_accum_steps=self.env.policy_accum_steps,
                         weight_decay=self.env.weight_decay)

    # Create experience buffers
    self.er_agent = ER(memory_size=self.env.er_agent_size,
                       state_dim=self.env.state_size,
                       action_dim=self.env.action_size,
                       batch_size=self.env.batch_size,
                       history_length=1)
    self.er_expert = common.load_d4rl_er(h5path=os.path.join(self.env.run_dir, self.env.expert_data),
                                         batch_size=self.env.batch_size,
                                         history_length=1,
                                         traj_length=2)

    self.env.sigma = self.er_expert.actions_std / self.env.noise_intensity

    # Normalize the inputs
    states_ = common.normalize(self.states_, self.er_expert.states_mean, self.er_expert.states_std)
    states = common.normalize(self.states, self.er_expert.states_mean, self.er_expert.states_std)
    if self.env.continuous_actions:
        actions = common.normalize(self.actions, self.er_expert.actions_mean, self.er_expert.actions_std)
    else:
        actions = self.actions

    # 1. Forward Model
    initial_gru_state = np.ones((1, self.forward_model.encoding_size))
    forward_model_prediction, _ = self.forward_model.forward([states_, actions, initial_gru_state])
    forward_model_loss = tf.reduce_mean(tf.square(states - forward_model_prediction))
    self.forward_model.train(objective=forward_model_loss)

    # 2. Discriminator
    labels = tf.concat([1 - self.label, self.label], 1)
    lprobs = self.lprobs

    # MODIFIED DISCRIMINATOR SECTION
    if self.use_irl:
        self.discrim_output, log_p_tau, log_q_tau, log_pq = self.discriminator.forward(states_, actions, states, lprobs)
        correct_predictions = tf.equal(tf.cast(tf.round(self.discrim_output), tf.int64), tf.argmax(labels, 1))
        self.discriminator.acc = tf.reduce_mean(tf.cast(correct_predictions, "float"))
        d_cross_entropy = self.label * (log_p_tau - log_pq) + (1 - self.label) * (log_q_tau - log_pq)
        d_loss_weighted = self.env.cost_sensitive_weight * tf.multiply(tf.compat.v1.to_float(tf.equal(tf.squeeze(self.label), 1.)), d_cross_entropy) + \
            tf.multiply(tf.compat.v1.to_float(tf.equal(tf.squeeze(self.label), 0.)), d_cross_entropy)
        discriminator_loss = -tf.reduce_mean(d_loss_weighted)
        self.discriminator.train(objective=discriminator_loss)
    # END MODIFIED DISCRIMINATOR SECTION
    else:
        d = self.discriminator.forward(states, actions)
        # 2.1 0-1 accuracy
        correct_predictions = tf.equal(tf.argmax(d, 1), tf.argmax(labels, 1))
        self.discriminator.acc = tf.reduce_mean(tf.cast(correct_predictions, "float"))
        # 2.2 prediction
        d_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=d, labels=labels)
        # cost sensitive weighting (weight true=expert, predict=agent mistakes)
        d_loss_weighted = self.env.cost_sensitive_weight * tf.multiply(tf.compat.v1.to_float(tf.equal(tf.squeeze(self.label), 1.)), d_cross_entropy) + \
            tf.multiply(tf.compat.v1.to_float(tf.equal(tf.squeeze(self.label), 0.)), d_cross_entropy)
        discriminator_loss = tf.reduce_mean(d_loss_weighted)
        self.discriminator.train(objective=discriminator_loss)

    # 3. Collect experience
    mu = self.policy.forward(states)
    if self.env.continuous_actions:
        a = common.denormalize(mu, self.er_expert.actions_mean, self.er_expert.actions_std)
        eta = tf.random.normal(shape=tf.shape(a), stddev=self.env.sigma)
        self.action_test = a + self.noise * eta
        # self.action_means = mu
        N = tf.shape(self.action_test)[0]
        expanded_sigma = tf.repeat(tf.expand_dims(tf.cast(self.env.sigma, dtype=tf.float32), 0), N, axis=0)
        self.action_probs_test = common.compute_action_probs_tf(self.action_test, mu, expanded_sigma)
    else:
        a = common.gumbel_softmax(logits=mu, temperature=self.temp)
        self.action_test = tf.compat.v1.argmax(a, dimension=1)
        self.action_means = tf.squeeze(mu)

    # 4.3 AL
    def policy_loop(state_, t, total_cost, total_trans_err, _):
        mu = self.policy.forward(state_, reuse=True)

        if self.env.continuous_actions:
            eta = self.env.sigma * tf.random.normal(shape=tf.shape(mu))
            action = mu + eta
            N = tf.shape(action)[0]
            expanded_sigma = tf.repeat(tf.expand_dims(tf.cast(self.env.sigma, dtype=tf.float32), 0), N, axis=0)
            a_prob = common.compute_action_probs_tf(action, mu, expanded_sigma)
        else:
            action = common.gumbel_softmax_sample(logits=mu, temperature=self.temp)
            a_prob = 0.5

        # get action
        if self.env.continuous_actions:
            a_sim = common.denormalize(action, self.er_expert.actions_mean, self.er_expert.actions_std)
        else:
            a_sim = tf.compat.v1.argmax(action, dimension=1)

        # get next state
        state_env, _, env_term_sig, = self.env.step(a_sim, mode='tensorflow')[:3]
        state_e = common.normalize(state_env, self.er_expert.states_mean, self.er_expert.states_std)
        state_e = tf.stop_gradient(state_e)

        state_a, _ = self.forward_model.forward([state_, action, initial_gru_state], reuse=True)

        state, nu = common.re_parametrization(state_e=state_e, state_a=state_a)
        total_trans_err += tf.reduce_mean(abs(nu))
        t += 1

        # minimize the gap between agent logit (d[:,0]) and expert logit (d[:,1])
        # MODIFIED DISCRIMINATOR SECTION:
        if self.use_irl:
            self.discrim_output, log_p_tau, log_q_tau, log_pq = self.discriminator.forward(state_, action, state, a_prob, reuse=True)
            cost = self.al_loss(log_p=log_p_tau, log_q=log_q_tau, log_pq=log_pq)
        else:
            d = self.discriminator.forward(state_, action, reuse=True)
            cost = self.al_loss(d=d)
        # END MODIFIED DISCRIMINATOR SECTION

        # add step cost
        total_cost += tf.multiply(tf.pow(self.gamma, t), cost)

        return state, t, total_cost, total_trans_err, env_term_sig

    def policy_stop_condition(state_, t, cost, trans_err, env_term_sig):
        cond = tf.logical_not(env_term_sig)
        cond = tf.logical_and(cond, t < self.env.n_steps_train)
        cond = tf.logical_and(cond, trans_err < self.env.total_trans_err_allowed)
        return cond

    state_0 = tf.slice(states, [0, 0], [1, -1])
    loop_outputs = tf.while_loop(policy_stop_condition, policy_loop, [state_0, 0., 0., 0., False])
    self.policy.train(objective=loop_outputs[2])
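# In the use_irl branches above, log_p_tau, log_q_tau and log_pq follow the usual AIRL
# construction: D = exp(f) / (exp(f) + pi), so log D = log_p_tau - log_pq and
# log(1 - D) = log_q_tau - log_pq. A small NumPy sketch of that relationship
# (hypothetical helper; the actual DiscriminatorIRL.forward may differ in details).
import numpy as np

def airl_discriminator_terms(f, log_pi):
    """Return (discrim_output, log_p_tau, log_q_tau, log_pq) for reward f and policy log-prob log_pi."""
    log_p_tau = f                                  # log-numerator for the "expert" hypothesis
    log_q_tau = log_pi                             # log-numerator for the "policy" hypothesis
    log_pq = np.logaddexp(log_p_tau, log_q_tau)    # log(exp(f) + pi), shared denominator
    discrim_output = np.exp(log_p_tau - log_pq)    # D in (0, 1)
    return discrim_output, log_p_tau, log_q_tau, log_pq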
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np

from push_env import PushingEnv
from forward_model import ForwardModel
from inverse_model import InverseModel

if __name__ == "__main__":
    # train inverse model
    inverse_model = InverseModel()
    num_epochs = 30
    train_losses, valid_losses = inverse_model.train(num_epochs=num_epochs)

    # train forward model
    forward_model = ForwardModel()
    num_epochs = 30
    train_losses, valid_losses = forward_model.train(num_epochs=num_epochs)

    env = PushingEnv(ifRender=False)
    num_trials = 10

    # two pushes, inverse model
    errors = np.zeros(num_trials)
    # save one push
    errors[0] = env.plan_inverse_model_extrapolate(inverse_model,
                                                   img_save_name="inverse_twopush",
                                                   seed=0)
    print("test loss:", errors[0])
    # try 10 random seeds
    for seed in range(1, 10):
        errors[seed] = env.plan_inverse_model_extrapolate(inverse_model,
def main():
    parser = argparse.ArgumentParser()
    # general & dataset & training settings
    parser.add_argument('--k_max', type=int, default=5, help='Max reconstruction iterations')
    parser.add_argument('--save_figs', type=lambda x: bool(strtobool(x)), default=True, help='save pics in reconstruction')
    parser.add_argument('--img_mode', type=str, default='SimpleCT', help='image-modality reconstruction: SimpleCT')
    parser.add_argument('--train_size', type=int, default=4000, help='dataset size')
    parser.add_argument('--pseudo_inverse_init', type=lambda x: bool(strtobool(x)), default=True, help='initialise with pseudoinverse')
    parser.add_argument('--brain', type=lambda x: bool(strtobool(x)), default=False, help='test set of brain images')
    parser.add_argument('--epochs', type=int, default=150, help='number of epochs to train')
    parser.add_argument('--batch_size', type=int, default=128, help='input batch size for training')
    parser.add_argument('--initial_lr', type=float, default=1e-3, help='initial_lr')
    parser.add_argument('--val_batch_size', type=int, default=128, help='input batch size for validation')
    # forward models setting
    parser.add_argument('--size', type=int, default=128, help='image size')
    parser.add_argument('--beam_num_angle', type=int, default=30, help='number of angles / projections')
    # options
    parser.add_argument('--no_cuda', type=lambda x: bool(strtobool(x)), default=False, help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=222, help='random seed')
    args = parser.parse_args()

    layer_utils.set_gpu_mode(True)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')

    if args.img_mode is not None:
        forward_model = ForwardModel()
        half_size = args.size / 2
        space = odl.uniform_discr([-half_size, -half_size], [half_size, half_size],
                                  [args.size, args.size], dtype='float32')
        forward_model.space = space
        geometry = odl.tomo.parallel_beam_geometry(space, num_angles=args.beam_num_angle)
        forward_model.geometry = geometry
        operator = odl.tomo.RayTransform(space, geometry)
        opnorm = odl.power_method_opnorm(operator)
        forward_model.operator = odl_torch.OperatorModule((1 / opnorm) * operator)
        forward_model.adjoint = odl_torch.OperatorModule(operator.adjoint)
        pseudoinverse = odl.tomo.fbp_op(operator)
        pseudoinverse = odl_torch.OperatorModule(pseudoinverse * opnorm)
        forward_model.pseudoinverse = pseudoinverse

        geometry_specs = 'full_view_sparse_' + str(args.beam_num_angle)
        dataset_name = 'dataset' + '_' + args.img_mode + '_' + str(args.size) \
            + '_' + str(args.train_size) + '_' + geometry_specs + '_' \
            + 'brain' + '_' + str(args.brain)

    if args.img_mode == SimpleCT.__name__:
        img_mode = SimpleCT(forward_model)
        data_constructor = DatasetConstructor(img_mode,
                                              train_size=args.train_size,
                                              brain=args.brain,
                                              dataset_name=dataset_name)
        data = data_constructor.data()
    else:
        raise NotImplementedError
    dataset = DataSet(data, img_mode, args.pseudo_inverse_init)

    optim_parms = {'epochs': args.epochs, 'initial_lr': args.initial_lr, 'batch_size': args.batch_size}
    from hybrid_model import HybridModel as NeuralLearner

    # results directory
    path = os.path.dirname(__file__)
    dir_path = os.path.join(path, 'results', args.img_mode, 'MFVI',
                            str(args.train_size), geometry_specs, str(args.seed))
    if not os.path.isdir(dir_path):
        os.makedirs(dir_path)

    # all config
    print('===========================\n', flush=True)
    for key, val in vars(args).items():
        print('{}: {}'.format(key, val), flush=True)
    print('===========================\n', flush=True)

    blocks_history = {'model': [], 'optimizer': []}
    arch_args = {'arch': {'up': [[1, 16, 3, 1, 1], [16, 32, 3, 1, 1]],
                          'low': [[1, 16, 3, 1, 1], [16, 32, 3, 1, 1]],
                          'cm': [[64, 32, 3, 1, 1], [32, 16, 3, 1, 1]]}}

    # saving training procedures
    filename = 'train_phase'
    filepath = os.path.join(dir_path, filename)
    vis = TrainVisualiser(filepath)

    start_time = time.time()
    # looping through architecture blocks
    for idx in range(1, args.k_max + 1):
        print('============== training block number: {} ============= \n'.format(idx), flush=True)

        train_tensor = dataset.construct(flag='train')
        val_tensor = dataset.construct(flag='validation')
        train_loader = DataLoader(train_tensor, batch_size=args.batch_size, shuffle=True)
        val_loader = DataLoader(val_tensor, batch_size=args.val_batch_size, shuffle=True)

        model = NeuralLearner(arch_args)
        model = model.to(device)

        model_path = os.path.join(dir_path, str(idx) + '.pt')
        if os.path.exists(model_path):
            model_loaded = True
            model.load_state_dict(torch.load(model_path))
            print('idx: {} model loaded!\npath to model:\n{}'.format(idx, model_path), flush=True)
        else:
            model_loaded = False
            model.optimise(train_loader, **optim_parms)
            save_net(model, os.path.join(dir_path, str(idx) + '.pt'))
            print('idx: {} optimisation finished!'.format(idx), flush=True)

        start = time.time()
        info = next_step_update(dataset, train_tensor, model, device, flag='train')
        end = time.time()
        print('============= {} {:.4f} ============= \n'.format('training reconstruction', end - start), flush=True)
        for key in info.keys():
            print('{}: {} \n'.format(key, info[key]), flush=True)

        start = time.time()
        info = next_step_update(dataset, val_tensor, model, device, flag='validation')
        end = time.time()
        print('============= {} {:.4f} ============= \n'.format('validation reconstruction', end - start), flush=True)
        for key in info.keys():
            print('{}: {} \n'.format(key, info[key]), flush=True)

        vis.update(dataset, flag='validation')
        blocks_history['model'].append(model)

        # reconstruction
        reconstruction_dir_path = os.path.join(dir_path, str(idx))
        if model_loaded:
            reconstruction_dir_path = os.path.join(dir_path, str(idx), 're-loaded')
        if not os.path.isdir(reconstruction_dir_path):
            os.makedirs(reconstruction_dir_path)
        get_stats(dataset, blocks_history, device, reconstruction_dir_path)

    print('--- training time: %s seconds ---' % (time.time() - start_time), flush=True)
    vis.generate()
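# The ray transform above is scaled by 1/opnorm and the FBP operator by opnorm, so the two
# scalings cancel and pseudoinverse(operator(x)) remains a plain FBP reconstruction. A
# standalone sanity-check sketch of that pairing (assumptions: odl and its torch bindings
# are importable as in the script above, and OperatorModule accepts a tensor with a single
# leading batch axis over the operator domain).
import numpy as np
import odl
import torch
from odl.contrib import torch as odl_torch

size, num_angles = 128, 30
half = size / 2
space = odl.uniform_discr([-half, -half], [half, half], [size, size], dtype='float32')
geometry = odl.tomo.parallel_beam_geometry(space, num_angles=num_angles)
ray_trafo = odl.tomo.RayTransform(space, geometry)
opnorm = odl.power_method_opnorm(ray_trafo)

operator = odl_torch.OperatorModule((1 / opnorm) * ray_trafo)                   # normalised projection
pseudoinverse = odl_torch.OperatorModule(odl.tomo.fbp_op(ray_trafo) * opnorm)   # matching FBP

phantom = torch.from_numpy(np.asarray(odl.phantom.shepp_logan(space, modified=True)))[None, ...]
recon = pseudoinverse(operator(phantom))   # approximately the FBP reconstruction of phantom
print(phantom.shape, recon.shape)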
def __init__(self, environment, reweight, ensemble):
    self.env = environment
    self.reweight = reweight
    self.ensemble = ensemble

    # Create placeholders for all the inputs
    self.states_ = tf.placeholder("float", shape=(None, self.env.state_size), name='states_')  # Batch x State
    self.states = tf.placeholder("float", shape=(None, self.env.state_size), name='states')  # Batch x State
    self.actions = tf.placeholder("float", shape=(None, self.env.action_size), name='action')  # Batch x Action
    self.label = tf.placeholder("float", shape=(None, 1), name='label')
    self.gamma = tf.placeholder("float", shape=(), name='gamma')
    self.temp = tf.placeholder("float", shape=(), name='temperature')
    self.noise = tf.placeholder("float", shape=(), name='noise_flag')
    self.do_keep_prob = tf.placeholder("float", shape=(), name='do_keep_prob')
    self.states_e_ = tf.placeholder("float", shape=(None, self.env.state_size), name='states_e_')
    self.states_e = tf.placeholder("float", shape=(None, self.env.state_size), name='states_e')
    self.actions_e = tf.placeholder("float", shape=(None, self.env.action_size), name='action_e')
    self.ex_wts_ = tf.placeholder("float", shape=(self.ensemble, None), name='ex_wts')

    # Create MGAIL blocks
    self.forward_model = ForwardModel(state_size=self.env.state_size,
                                      action_size=self.env.action_size,
                                      encoding_size=self.env.fm_size,
                                      lr=self.env.fm_lr,
                                      ensemble=self.ensemble)
    self.discriminator = Discriminator(in_dim=self.env.state_size + self.env.action_size,
                                       out_dim=2,
                                       size=self.env.d_size,
                                       lr=self.env.d_lr,
                                       do_keep_prob=self.do_keep_prob,
                                       weight_decay=self.env.weight_decay)
    self.policy = Policy(in_dim=self.env.state_size,
                         out_dim=self.env.action_size,
                         size=self.env.p_size,
                         lr=self.env.p_lr,
                         do_keep_prob=self.do_keep_prob,
                         n_accum_steps=self.env.policy_accum_steps,
                         weight_decay=self.env.weight_decay)

    # Create experience buffers
    self.er_agent = ER(memory_size=self.env.er_agent_size,
                       state_dim=self.env.state_size,
                       action_dim=self.env.action_size,
                       reward_dim=1,  # stub connection
                       qpos_dim=self.env.qpos_size,
                       qvel_dim=self.env.qvel_size,
                       batch_size=self.env.batch_size,
                       history_length=1)
    self.er_expert = common.load_er(fname=os.path.join(self.env.run_dir, self.env.expert_data),
                                    batch_size=self.env.batch_size,
                                    history_length=1,
                                    traj_length=2)
    self.env.sigma = self.er_expert.actions_std / self.env.noise_intensity

    # Normalize the inputs
    states_ = common.normalize(self.states_, self.er_expert.states_mean, self.er_expert.states_std)
    states = common.normalize(self.states, self.er_expert.states_mean, self.er_expert.states_std)
    if self.env.continuous_actions:
        actions = common.normalize(self.actions, self.er_expert.actions_mean, self.er_expert.actions_std)
    else:
        actions = self.actions

    states_e_ = common.normalize(self.states_e_, self.er_expert.states_mean, self.er_expert.states_std)
    states_e = common.normalize(self.states_e, self.er_expert.states_mean, self.er_expert.states_std)
    if self.env.continuous_actions:
        actions_e = common.normalize(self.actions_e, self.er_expert.actions_mean, self.er_expert.actions_std)
    else:
        actions_e = self.actions_e

    # 1. Forward Model
    if self.reweight:
        initial_gru_state = np.ones((1, self.forward_model.encoding_size))
        self.forward_model.train(x_=[states_, actions, initial_gru_state], y_=states, ex_wts=self.ex_wts_)
        initial_gru_state_rw = np.ones((1, self.forward_model.encoding_size))
        initial_gru_state_val = np.ones((1, self.forward_model.encoding_size))
        self.forward_model.reweight(x_=[states_, actions, initial_gru_state_rw],
                                    y_=states,
                                    x_val_=[states_e_, actions_e, initial_gru_state_val],
                                    y_val_=states_e,
                                    bsize_a=self.env.batch_size,
                                    bsize_b=self.env.batch_size)
    else:
        initial_gru_state = np.ones((1, self.forward_model.encoding_size))
        self.forward_model.train(x_=[states_, actions, initial_gru_state], y_=states, ex_wts=None)
    # 1.1 prediction (for development)
    # self.forward_model.predict(x_=[states_, actions, initial_gru_state], y_=states)

    # 2. Discriminator
    labels = tf.concat([1 - self.label, self.label], 1)
    d = self.discriminator.forward(states, actions)
    # 2.1 0-1 accuracy
    correct_predictions = tf.equal(tf.argmax(d, 1), tf.argmax(labels, 1))
    self.discriminator.acc = tf.reduce_mean(tf.cast(correct_predictions, "float"))
    # 2.2 prediction
    d_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=d, labels=labels)
    # cost sensitive weighting (weight true=expert, predict=agent mistakes)
    d_loss_weighted = self.env.cost_sensitive_weight * tf.multiply(tf.to_float(tf.equal(tf.squeeze(self.label), 1.)), d_cross_entropy) + \
        tf.multiply(tf.to_float(tf.equal(tf.squeeze(self.label), 0.)), d_cross_entropy)
    discriminator_loss = tf.reduce_mean(d_loss_weighted)
    self.discriminator.train(objective=discriminator_loss)

    # 3. Collect experience
    mu = self.policy.forward(states)
    if self.env.continuous_actions:
        a = common.denormalize(mu, self.er_expert.actions_mean, self.er_expert.actions_std)
        eta = tf.random_normal(shape=tf.shape(a), stddev=self.env.sigma)
        self.action_test = tf.squeeze(a + self.noise * eta)
    else:
        a = common.gumbel_softmax(logits=mu, temperature=self.temp)
        self.action_test = tf.argmax(a, dimension=1)

    # 4.3 AL
    def policy_loop(state_, t, total_cost, total_trans_err, _):
        mu = self.policy.forward(state_, reuse=True)

        if self.env.continuous_actions:
            eta = self.env.sigma * tf.random_normal(shape=tf.shape(mu))
            action = mu + eta
        else:
            action = common.gumbel_softmax_sample(logits=mu, temperature=self.temp)

        # minimize the gap between agent logit (d[:,0]) and expert logit (d[:,1])
        d = self.discriminator.forward(state_, action, reuse=True)
        cost = self.al_loss(d)

        # add step cost
        total_cost += tf.multiply(tf.pow(self.gamma, t), cost)

        # get action
        if self.env.continuous_actions:
            a_sim = common.denormalize(action, self.er_expert.actions_mean, self.er_expert.actions_std)
        else:
            a_sim = tf.argmax(action, dimension=1)

        # get next state
        state_env, _, env_term_sig, = self.env.step(a_sim, mode='tensorflow')[:3]
        state_e = common.normalize(state_env, self.er_expert.states_mean, self.er_expert.states_std)
        state_e = tf.stop_gradient(state_e)
        # state_a, _ = self.forward_model.forward([state_, action, initial_gru_state], reuse=True)
        state_a, _ = self.forward_model.forward(inputs=[state_, action, initial_gru_state],
                                                is_training=False,
                                                dtype=tf.float32,
                                                w_dict=None,
                                                ex_wts=None,
                                                reuse=True)
        state, nu = common.re_parametrization(state_e=state_e, state_a=state_a)
        total_trans_err += tf.reduce_mean(abs(nu))
        t += 1

        return state, t, total_cost, total_trans_err, env_term_sig

    def policy_stop_condition(state_, t, cost, trans_err, env_term_sig):
        cond = tf.logical_not(env_term_sig)
        cond = tf.logical_and(cond, t < self.env.n_steps_train)
        cond = tf.logical_and(cond, trans_err < self.env.total_trans_err_allowed)
        return cond

    state_0 = tf.slice(states, [0, 0], [1, -1])
    loop_outputs = tf.while_loop(policy_stop_condition, policy_loop, [state_0, 0., 0., 0., False])
    self.policy.train(objective=loop_outputs[2])
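# In all three graph variants above, the policy objective accumulated inside policy_loop is a
# discounted sum of per-step adversarial costs, total_cost += gamma**t * cost, truncated when
# the episode ends, t reaches n_steps_train, or the forward-model transition error exceeds
# total_trans_err_allowed. A plain-Python illustration of that accumulation (cost values
# purely illustrative):

def discounted_rollout_cost(step_costs, gamma):
    """Discounted sum of per-step costs: sum_t gamma**t * cost_t."""
    return sum(gamma ** t * c for t, c in enumerate(step_costs))

print(discounted_rollout_cost([0.9, 0.7, 0.4], gamma=0.99))   # 0.9 + 0.99 * 0.7 + 0.99**2 * 0.4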