def post_connect(self, agent_id, entities, config):
    self.entity_id = agent_id
    self.world_model = WorldModel()
    self.world_model.add_entities(entities)
    # TODO: check whether we need to merge the agent config and the config
    # coming from the kernel
    self.config = config
    print('post_connect agent_id:' + str(agent_id.get_value()))
def run_simulated_mission(model, display=None, use_delays=False):
    print("Simulated mission running.")
    world_model = WorldModel(BLUEPRINT, CONFIG_FILE, simulated=True)
    ticks_left = 5 * MAX_EPISODE_TIME
    total_reward = 0
    current_r = 0

    while ticks_left > 0 and world_model.is_mission_running():
        ticks_left -= 1
        current_r = world_model.reward()
        action = model.act(current_r, world_model.get_observation())
        if display is not None:
            display.update(world_model)
        total_reward += current_r
        world_model.simulate(action)
        if use_delays:
            print(action)
            time.sleep(ACTION_DELAY)

    # Collect last reward, and give to model, then end the mission
    current_r = world_model.reward()
    model.act(current_r, world_model.get_observation())
    total_reward += current_r
    model.mission_ended()
    print("Simulated mission ended")
    return total_reward, (MAX_EPISODE_TIME - (ticks_left / 5))
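# A minimal driver for run_simulated_mission above. This is a sketch: the
# RandomModel class and the action strings are illustrative assumptions, not
# part of the original code; it relies on the module-level BLUEPRINT,
# CONFIG_FILE, MAX_EPISODE_TIME and ACTION_DELAY constants already used above.
import random


class RandomModel:
    """Toy model that picks a uniformly random action and ignores rewards."""

    def __init__(self, actions):
        self.actions = actions

    def act(self, reward, observation):
        return random.choice(self.actions)

    def mission_ended(self):
        pass


if __name__ == '__main__':
    total, length = run_simulated_mission(RandomModel(['move 1', 'turn 1']))
    print("total reward: {}, mission length: {}".format(total, length))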
def connect(self, host, port, teamname, version='15.5'):
    """
    Gives us a connection to the server as one player on a team.

    This immediately connects the agent to the server and starts receiving
    and parsing the information it sends.
    """
    # if already connected, raise an error since the user may have wanted
    # to connect again to a different server.
    if self.__connected:
        msg = "Cannot connect while already connected, disconnect first."
        raise sp_exceptions.AgentConnectionStateError(msg)

    # the pipe through which all of our communication takes place
    self.__sock = sock.Socket(host, port)

    # our models of the world and our body
    self.wm = WorldModel(handler.ActionHandler(self.__sock, teamname))

    # set the team name of the world model to the given name
    self.wm.teamname = teamname

    # handles all messages received from the server
    self.msg_handler = handler.MessageHandler(self.wm, teamname)
    self.action_handler = handler.ActionHandler(self.__sock, teamname)

    # set up our threaded message receiving system
    self.__parsing = True  # tell the thread that we're currently running
    self.__msg_thread = threading.Thread(target=self.__message_loop,
                                         name="message_loop")
    self.__msg_thread.daemon = True  # dies when the parent thread dies

    # start processing received messages. this will catch the initial
    # server response and all subsequent communication.
    self.__msg_thread.start()

    # send the init message and allow the message handler to handle further
    # responses.
    init_address = self.__sock.address
    init_msg = "( init %s ( version %s ) )" % (teamname, version)
    self.__sock.send(init_msg)
    print(init_msg)

    # wait until the socket receives a response from the server and gets
    # its assigned port.
    while self.__sock.address == init_address:
        time.sleep(0.0001)

    # create our thinking thread. this will perform the actions necessary
    # to play a game of robo-soccer.
    self.__thinking = False
    self.__think_thread = threading.Thread(target=self.__think_loop,
                                           name="think_loop")
    self.__think_thread.daemon = True

    # set the connected state. done last to prevent state inconsistency
    # if something goes wrong beforehand.
    self.__connected = True
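# Hypothetical usage of connect() (a sketch; the Agent class name, host and
# port below are assumptions, not taken from the original code). connect()
# only creates the think thread, so a separate call is presumably needed to
# start playing:
agent = Agent()
agent.connect("localhost", 6000, "my_team")
# agent.play()  # assumed entry point that starts the think loop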
async def setup(self):
    print(f"--- arm_agent: PeriodicSenderAgent started at {datetime.datetime.now().time()}")
    init_world_model = WorldModel()
    start_at = datetime.datetime.now() + datetime.timedelta(
        seconds=init_world_model.current_world_model.init_delay_seconds["arm"])
    bdi_behaviour = self.BDIBehaviour(
        period=init_world_model.current_world_model.real_time_clock_period_seconds["arm"],
        start_at=start_at)
    self.add_behaviour(bdi_behaviour)
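# BDIBehaviour above is added with period/start_at keywords, which matches
# SPADE's PeriodicBehaviour interface. A minimal sketch of the shape such a
# behaviour takes (illustrative, not the project's actual class):
import datetime

from spade.behaviour import PeriodicBehaviour


class TickBehaviour(PeriodicBehaviour):
    async def run(self):
        # one deliberation tick per period
        print("tick at", datetime.datetime.now().time())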
async def on_start(self):
    self.terminate = False
    self.SUCCESS = False
    self.verbose = False

    # Initialization
    self.beliefs = WorldModel()  # B := B0; Initial Beliefs
    self.goals = self.beliefs.current_world_model.goals
    self.intentions = self.beliefs.current_world_model.goals  # I := I0; Initial Intentions
    self.htn_planner = HierarchicalTaskNetworkPlanner(self.beliefs)
    self.perception = Perception(self.beliefs)
    self.coordination = Coordination(self.beliefs)
    self.monitoring = Monitoring()

    self.what, self.why, self.how_well, self.what_else, self.why_failed = "", "", "", "", ""
    self.plans = []
    self.selected_plan = []
    self.percept = {}
    self.action = ""
    self.start_time = datetime.datetime.now()
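# The fields initialized above support the textbook BDI control loop
# (B := B0, I := I0, then a perceive/revise/plan/act cycle). A minimal
# self-contained sketch of that cycle; the callables here are stand-ins,
# not the project's API:
def bdi_loop(beliefs, intentions, plan_fn, perceive_fn, execute_fn,
             max_ticks=10):
    """Run the deliberation cycle: perceive, revise beliefs, plan, act."""
    plan = []
    for _ in range(max_ticks):
        percept = perceive_fn()                  # p := get_percept()
        beliefs.update(percept)                  # B := brf(B, p)
        if not plan:
            plan = plan_fn(beliefs, intentions)  # π := plan(B, I)
        if not plan:
            break                                # no means to achieve I
        execute_fn(plan.pop(0))                  # execute head(π)
    return beliefs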
def run_simulated_mission(model, mission, cfg, demo=False):
    print("Simulated mission running.")
    world_model = WorldModel(mission.blueprint, cfg, simulated=True,
                             agent_pos=mission.start_position)
    ticks_left = 5 * mission.max_episode_time
    total_reward = 0
    current_r = 0
    use_delays = mission.action_delay > 0

    while ticks_left > 0 and world_model.is_mission_running():
        ticks_left -= 1
        current_r = world_model.reward()
        if demo:
            action = model.demo_act(world_model.get_observation())
        else:
            action = model.act(current_r, world_model.get_observation())
        if mission.display is not None:
            mission.display.update(world_model)
        total_reward += current_r
        world_model.simulate(action)
        if use_delays:
            print(action)
            time.sleep(mission.action_delay)

    # Collect last reward, and give to model, then end the mission
    if mission.display is not None:
        mission.display.update(world_model)
    current_r = world_model.reward()
    if not demo:
        model.act(current_r, world_model.get_observation())
    total_reward += current_r
    model.mission_ended()
    print("Simulated mission ended")
    return MissionStats(reward=total_reward,
                        length=(mission.max_episode_time - (ticks_left / 5)))
async def on_start(self):
    self.terminate = False
    self.SUCCESS = False
    self.verbose = False

    # Initialization
    self.htn_planner = HierarchicalTaskNetworkPlanner()
    self.goal = [('transfer_target_object_to_container', 'arm',
                  'target_object', 'table', 'container')]
    self.intentions = self.goal  # I := I0; Initial Intentions
    self.beliefs = WorldModel()  # B := B0; Initial Beliefs
    self.monitoring = Monitoring()
    self.perception = Perception()
    self.coordination = Coordination()

    # Coordination switches for testing: only center_init is disabled here
    self.coordination.control.send_requests = True
    self.coordination.control.center_init = False
    self.coordination.control.detect_last_position = True

    self.what, self.why, self.how_well, self.what_else, self.why_failed = "", "", "", "", ""
    self.plans = []
    self.start_time = datetime.datetime.now()
def run_mission(model, display=None):
    # Create default Malmo objects:
    my_mission = MalmoPython.MissionSpec(MISSION_XML, True)
    my_mission_record = MalmoPython.MissionRecordSpec()
    world_model = WorldModel(BLUEPRINT, CONFIG_FILE, simulated=False)

    # Attempt to start a mission:
    for retry in range(MAX_RETRIES):
        try:
            AGENT_HOST.startMission(my_mission, my_mission_record)
            break
        except RuntimeError as e:
            if retry == MAX_RETRIES - 1:
                print("Error starting mission:", e)
                exit(1)
            else:
                time.sleep(2**retry)

    # Loop until mission starts:
    print("Waiting for the mission to start ", end=' ')
    world_state = AGENT_HOST.getWorldState()
    while not world_state.has_mission_begun:
        print(".", end="")
        time.sleep(0.1)
        world_state = AGENT_HOST.getWorldState()
        for error in world_state.errors:
            print("Error:", error.text)

    print("\nMission running.")
    total_reward = 0
    current_r = 0
    start = time.time()

    # Loop until mission ends
    while world_state.is_mission_running and world_model.is_mission_running():
        world_state = AGENT_HOST.getWorldState()
        for error in world_state.errors:
            print("Error:", error.text)
        current_r += sum(r.getValue() for r in world_state.rewards)
        if len(world_state.observations) > 0:
            raw_obs = json.loads(world_state.observations[-1].text)
            world_model.update(raw_obs)
            current_r += world_model.reward()
            action = model.act(current_r, world_model.get_observation())
            if display is not None:
                display.update(world_model)
            total_reward += current_r
            current_r = 0
            if world_model.mission_complete() or not world_model.agent_in_arena():
                AGENT_HOST.sendCommand('quit')
            elif world_state.is_mission_running:
                AGENT_HOST.sendCommand(action)
        time.sleep(ACTION_DELAY)
    end = time.time()

    model.mission_ended()
    print()
    print("Mission ended")
    return total_reward, end - start
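# The mission-start loop above retries with exponential backoff (2**retry
# seconds between attempts). The same pattern as a reusable helper; a sketch,
# not part of the original code:
import time


def start_with_retries(start_fn, max_retries=5):
    """Call start_fn(), retrying with exponential backoff on RuntimeError."""
    for retry in range(max_retries):
        try:
            start_fn()
            return
        except RuntimeError:
            if retry == max_retries - 1:
                raise  # give up after the final attempt
            time.sleep(2 ** retry)  # wait 1s, 2s, 4s, ...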
                                      goal,
                                      verbose=self.verbose,
                                      all_plans=True,
                                      sort_asc=True)
        else:
            return ""


if __name__ == '__main__':
    htn_planner = HierarchicalTaskNetworkPlanner()
    end_goal = [('transfer_target_object_to_container', 'arm',
                 'target_object', 'table', 'container')]
    intentions = end_goal  # I := I0; Initial Intentions

    from world_model import WorldModel
    beliefs = WorldModel()  # B := B0; Initial Beliefs
    print()

    beliefs.current_world_model.xyz["target_object"] = [-10, -10, 0]
    htn_plans = htn_planner.get_plans(
        beliefs.current_world_model,
        intentions)  # π := plan(B, I); MEANS_END REASONING
    if not htn_plans:
        print("-- No valid plan. Failure_reason: {}".format(
            htn_planner.failure_reason))
    else:
        beliefs.current_world_model.plans = htn_plans
        print("== Best current_world_model.plan: ",
              beliefs.current_world_model.plans[0])
        print()
        elif action == ('close_hand',):
            action_successful = self.control.close_hand(
                world_model.size["object_side_length"])
        elif action == ('move_arm_above', 'container'):
            action_successful = self.control.move_arm_above_xyz(
                world_model.xyz["container"],
                world_model.location["servo_values"], 14)
        return action_successful


if __name__ == '__main__':
    # Sequence for testing
    from world_model import WorldModel

    current_world_model = WorldModel()
    coordination = Coordination(current_world_model)
    coordination.control.control_world_model["send_requests"] = False
    coordination.control.control_world_model["center_init"] = False
    coordination.control.control_world_model["detect_last_position"] = False

    coordination.execute_action(('initialize', 'arm'),
                                current_world_model.current_world_model)
    coordination.execute_action(('open_hand',),
                                current_world_model.current_world_model)
    coordination.execute_action(('move_arm_above', 'target_object'),
                                current_world_model.current_world_model)
    coordination.execute_action(('move_arm', 'target_object'),
                                current_world_model.current_world_model)
    coordination.execute_action(('close_hand',),
                                current_world_model.current_world_model)
    coordination.execute_action(('move_arm_up', 'target_object'),
def run_malmo_mission(model, mission, mission_xml, cfg, agent_host,
                      max_retries=5, demo=False):
    # Create default Malmo objects:
    my_mission = MalmoPython.MissionSpec(mission_xml, True)
    my_mission_record = MalmoPython.MissionRecordSpec()
    world_model = WorldModel(mission.blueprint, cfg, simulated=False)

    # Attempt to start a mission:
    for retry in range(max_retries):
        try:
            agent_host.startMission(my_mission, my_mission_record)
            break
        except RuntimeError as e:
            if retry == max_retries - 1:
                print("Error starting mission:", e)
                exit(1)
            else:
                time.sleep(2**retry)

    # Loop until mission starts:
    print("Waiting for the mission to start ", end=' ')
    world_state = agent_host.getWorldState()
    while not world_state.has_mission_begun:
        print(".", end="")
        time.sleep(0.1)
        world_state = agent_host.getWorldState()
    time.sleep(1)
    for error in world_state.errors:
        print("Error:", error.text)

    print("\nMission running.")
    total_reward = 0
    current_r = 0
    start = time.time()

    # Loop until mission ends
    while world_state.is_mission_running and world_model.is_mission_running():
        world_state = agent_host.getWorldState()
        for error in world_state.errors:
            print("Error:", error.text)
        current_r += sum(r.getValue() for r in world_state.rewards)
        if len(world_state.observations) > 0:
            raw_obs = json.loads(world_state.observations[-1].text)
            world_model.update(raw_obs)
            current_r += world_model.reward()
            if demo:
                action = model.demo_act(world_model.get_observation())
            else:
                action = model.act(current_r, world_model.get_observation())
            if mission.display is not None:
                mission.display.update(world_model)
            total_reward += current_r
            current_r = 0
            if world_model.mission_complete() or not world_model.agent_in_arena():
                agent_host.sendCommand('quit')
            elif world_state.is_mission_running:
                agent_host.sendCommand(action)
                if demo:
                    print(action)
        time.sleep(mission.action_delay)
    end = time.time()

    model.mission_ended()
    print()
    print("Mission ended")
    return MissionStats(reward=total_reward, length=end - start)
            input_world_model.current_world_model.distance = percept["distance"]
        elif key == "location":
            for key2 in percept["location"]:
                if key2 == "servo_values":
                    input_world_model.current_world_model.location["servo_values"] = \
                        percept["location"]["servo_values"]
    return input_world_model


if __name__ == '__main__':
    # Sequence for testing
    import time
    from world_model import WorldModel

    world_model = WorldModel()
    perception = Perception(world_model)
    perception.perception_world_model["write_video"] = False

    beliefs = WorldModel()
    time.sleep(0.1)
    current_percept = {"xyz": {'target_object': [15, 15, 0]}}
    beliefs.update_tick()
    beliefs = perception.belief_revision(beliefs, current_percept)

    time.sleep(0.1)
    current_percept = {"xyz": {'target_object': [14, 16, 0]}}
    beliefs.update_tick()
    beliefs = perception.belief_revision(beliefs, current_percept)
NUM_HALLUCINATIONS = 200
NUM_ROLLOUTS = 10
MAX_ENV_STEPS = 100_000

env = gym.make('Breakout-v0')
writer = SummaryWriter()
resize = torchvision.transforms.Resize((42, 32))
agent = CSPN_A2C(breakout_test.LATENT_DIM, breakout_test.ACTION_DIM,
                 breakout_test.PolicyNetwork(), breakout_test.ValueNetwork(),
                 continuous=breakout_test.CONTINOUS)
autoencoder = breakout_test.AutoEncoder(42, 32)
world_model_cspn = breakout_test.ForwardModelCSPN()
world_model = WorldModel(breakout_test.LATENT_DIM, 1, autoencoder.encode,
                         autoencoder.decode, autoencoder, world_model_cspn)

for epoch in range(NUM_EPOCHS):
    print("AT EPOCH:", epoch)
    starting_obs = []
    buffer = ExperienceReplayBuffer()
    for rollout_idx in range(NUM_ROLLOUTS):
        print("EPISODE #", rollout_idx)
        obs = env.reset()
        obs = np.rollaxis(obs, 2, 0)
        obs = resize(torch.Tensor(obs))
        obs = obs.unsqueeze(0) / 255.0
        episode = []
        starting_obs.append(obs)
elif key == "location": # TODO: if xyz of object within limits -> on table else -> not on table for key2 in percept["location"]: if key2 == "target_object": world_model.current_world_model.location["target_object"] = percept["location"][ "target_object"] elif key == "initialized": # TODO: if servos at 1500-ish -> initialized world_model.current_world_model.initialized = percept["initialized"] return world_model if __name__ == '__main__': # Sequence for testing from world_model import WorldModel beliefs = WorldModel() monitoring = Monitoring() time.sleep(0.1) current_percept = {"distance": {'distance_to_gripper': 8.2}} beliefs.update_tick() beliefs = monitoring.fire_events(beliefs, current_percept) time.sleep(0.1) current_percept = {"distance": {'distance_to_gripper': 5.2}} beliefs.update_tick() beliefs = monitoring.fire_events(beliefs, current_percept) time.sleep(0.1) current_percept = {"distance": {'distance_to_gripper': 2.2}} beliefs.update_tick()
def main():
    with tf.Session() as sess:
        output_filename = "log.csv"

        model_learning_rate = 1e-2
        model_hidden_size = 256
        model_training_episodes_per_batch = 5
        model_training_batches_per_training = 100

        policy_learning_rate = 1e-2
        policy_hidden_size = 8
        policy_training_episodes_per_batch = 5
        policy_training_batches_per_training = 10
        policy_evaluation_episodes = 20

        evaluation_episodes = 10
        num_rounds = 100

        env = gym.make('CartPole-v0')
        state_space_size = env.observation_space.shape[0]
        action_space_size = env.action_space.n

        world_model = WorldModel(state_space_size, action_space_size,
                                 model_hidden_size)
        policy = Policy(sess, state_space_size, action_space_size,
                        policy_hidden_size)

        start_state_buffer = CircularBuffer(20)
        state_initializer = lambda: start_state_buffer.get()

        sess.run(tf.global_variables_initializer())

        def make_episode_batch(env, policy, batch_size, max_length=None):
            """
            Uses a black-box policy to generate episodes for training the model.
            """
            states_in = []
            states_out = []
            actions = []
            rewards = []
            dones = []
            for b in range(batch_size):
                states_in_this_ep = []
                states_out_this_ep = []
                actions_this_ep = []
                rewards_this_ep = []
                dones_this_ep = []
                s = env.reset()
                done = False
                length = 0
                while (not done) and (max_length is None or length < max_length):
                    length += 1
                    a = policy(s)
                    s1, reward, done, _ = env.step(a)
                    states_in_this_ep.append(s)
                    states_out_this_ep.append(s1)
                    actions_this_ep.append(a)
                    rewards_this_ep.append([reward])
                    dones_this_ep.append([1.0 if done else 0.0])
                    s = s1
                states_in_this_ep = np.stack(states_in_this_ep, axis=0)
                states_out_this_ep = np.stack(states_out_this_ep, axis=0)
                actions_this_ep = np.stack(actions_this_ep, axis=0)
                rewards_this_ep = np.stack(rewards_this_ep, axis=0)
                dones_this_ep = np.stack(dones_this_ep, axis=0)
                states_in.append(states_in_this_ep)
                states_out.append(states_out_this_ep)
                actions.append(actions_this_ep)
                rewards.append(rewards_this_ep)
                dones.append(dones_this_ep)
            return states_in, states_out, actions, rewards, dones

        output_logfile = open(output_filename, 'wt')
        output_logfile.write("epoch,model_state_mse,model_reward_mse,"
                             "model_done_ce,policy_model_reward,"
                             "policy_env_reward\n")

        for r in range(1, num_rounds + 1):
            # Train the world model on episodes generated using the policy
            model_loss = [0.0, 0.0, 0.0, 0.0]
            for b in range(model_training_batches_per_training):
                states_in, states_out, actions, rewards, dones = \
                    make_episode_batch(env, policy.apply,
                                       model_training_episodes_per_batch)
                for start_state in [x[0] for x in states_in]:
                    start_state_buffer.put(start_state)
                this_loss = world_model.train_on_episodes(
                    np.concatenate(states_in, axis=0),
                    np.concatenate(actions, axis=0),
                    np.concatenate(states_out, axis=0),
                    np.concatenate(rewards, axis=0),
                    np.concatenate(dones, axis=0),
                    learning_rate=1e-4,
                    sess=sess)
                model_loss = [x + this_loss[i] for (i, x) in enumerate(model_loss)]
            model_loss = [x / model_training_batches_per_training for x in model_loss]
            print("Model MSE: {}".format(model_loss))

            # Train the policy on the world model
            total_reward = 0.0
            for b in range(policy_training_batches_per_training):
                for ep in range(policy_training_episodes_per_batch):
                    total_reward += policy.run_episode_and_accumulate_gradients(
                        world_model.env_analogue(
                            sess, state_initializer=state_initializer))
                policy.apply_accumulated_gradients(policy_learning_rate)
            total_reward /= (policy_training_batches_per_training *
                             policy_training_episodes_per_batch)
            print("Policy reward in model: {}".format(total_reward))

            # Evaluate the policy on the real environment
            evaluation_reward = 0.0
            for ep in range(policy_evaluation_episodes):
                evaluation_reward += policy.run_episode_and_accumulate_gradients(env)
                policy.clear_grad_buffers()
            evaluation_reward /= policy_evaluation_episodes
            print("Policy reward in real env: {}".format(evaluation_reward))

            output_logfile.write("{},{},{},{},{},{}\n".format(
                r, model_loss[1], model_loss[2], model_loss[3],
                total_reward, evaluation_reward))
            output_logfile.flush()

        output_logfile.close()
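# The round structure in main() is the standard model-based RL alternation:
# (1) roll out the policy in the real env and fit the world model to
# (s, a) -> (s', r, done) transitions, banking real episode start states;
# (2) train the policy entirely inside world_model.env_analogue(...), which
# resets imagined episodes to those banked start states; (3) evaluate the
# policy in the real env and log both reward estimates.
#
# Entry point (assumed; not shown in the original excerpt):
if __name__ == '__main__':
    main()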