def play(agent1: Agent, agent2: Agent, n_holes=7, n_stones=7, max_game_length=200):
    game = Mancala(n_holes, n_stones)
    player = random.choice(['north', 'south'])
    game_length = 0
    finished = False
    while not finished:
        if player == 'north':
            move = agent1.get_move(game, 'north')
        else:
            move = agent2.get_move(game, 'south')
        game.step(player, move)
        player = game.next_player
        game_length += 1
        if game.game_over or game_length > max_game_length:
            finished = True
    if game.winner == 'north':
        winner = agent1
    elif game.winner == 'south':
        winner = agent2
    else:  # tie
        winner = None
    return winner
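# Usage sketch (an assumption, not part of the original code): play many games
# between two agents and tally the winners. Any Agent subclasses work here; the
# results dict is keyed by the agent objects themselves, with None counting ties.
def tournament(agent1: Agent, agent2: Agent, n_games=100):
    results = {agent1: 0, agent2: 0, None: 0}
    for _ in range(n_games):
        winner = play(agent1, agent2)
        results[winner] += 1
    return results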
def __init__(self, environment: dict, verbose=False):
    """
    Initializer for the simulator helper
    Args:
        environment (dict): dictionary housing the obj map (bitmap) and more
        verbose (bool): flag to print debug prints
    """
    self.environment = environment
    self.params = create_simulator_params(verbose)
    self.episode_params = None
    self.algo_name = "lite"  # by default there is no robot (or algorithm)
    self.obstacle_map = None
    # keep track of all agents in a dictionary with names as the key
    self.agents = {}
    # keep track of all robots in a dictionary with names as the key
    self.robots = {}
    # keep track of all prerecorded humans in a dictionary like the others
    self.backstage_prerecs = {}
    self.prerecs = {}
    # keep a single (important) robot as a value
    self.robot = None
    self.sim_states = {}
    self.wall_clock_time: float = 0
    self.sim_t: float = 0.0
    self.dt: float = 0  # will be updated in simulator based off dt
    # metadata of agents
    self.total_agents: int = 0
    self.num_collided_agents: int = 0
    self.num_completed_agents: int = 0
    self.num_timeout_agents: int = 0  # updated with (non-robot) add_agent
    # restart agent coloring on every instance of the simulator
    # to be consistent across episodes
    Agent.restart_coloring()
def reset(self):
    Agent.reset(self)
    self.Q = self.model_lambda()
    self.target_Q = self.model_lambda()
    self.target_Q.set_weights(self.Q.get_weights())
    self.buffer.reset()
    self.updates_since_target_updated = 0
def __init__(self, action_space, observation_space, params):
    # Use super init
    Agent.__init__(self, action_space, observation_space, params)
    # Initialize table with all zeros
    self.Q = np.zeros([observation_space.n, action_space.n])
    # Set learning parameters
    self.episode_count = self.params[0]  # Number of episodes
def test_agent(self):
    _, image = Loader.get_action('cylinder-cube-1', '2019-03-26-09-08-16-480', 'ed-v')
    if TEST_WITH_GPU:
        agent = Agent()
        result = agent.infer([image], SelectionMethod.Max)
        self.assertEqual(result.safe, True)
        self.assertEqual(result.method, SelectionMethod.Max)
def spawn_agent(agent_def=None, test_run_name=None):
    '''Spawn a new creature and give it an agent.'''
    mod_str, cls_str, arg_str = agent_def.split("/")
    import importlib
    Agent = getattr(importlib.import_module(mod_str), cls_str)
    kwargs = eval(arg_str)
    if len(kwargs) > 0:
        return Agent(observ_space, action_space, test_run_name=test_run_name, **kwargs)
    return Agent(observ_space, action_space)
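# Usage sketch (assumed, not from the original code): `agent_def` is a
# "module/class/kwargs" string whose last field is a Python dict literal that
# is eval()'d into keyword arguments. The module and class names below are
# hypothetical, purely for illustration:
#   spawn_agent("agents.random_agent/RandomAgent/{}")
#   spawn_agent("agents.q_agent/QAgent/{'discreet': True}", test_run_name="run-01")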
def test_no_transfer_if_no_bet(self):
    with mock.patch('agents.agent.PredictionMarketAdapter', autospec=True) \
            as MockPredictionMarket:
        mock_prediction_market = MockPredictionMarket.return_value
        account = '42'
        agent = Agent(account, logging=False)
        agent.prediction_history = [None, None, None]
        agent.collect_reward()
        mock_prediction_market.transfer_reward.assert_not_called()
def __init__(self, action_space, observation_space, params, discreet=False):
    # Use super init
    Agent.__init__(self, action_space, observation_space, params)
    self.discreet = discreet
    if discreet:
        self.inputN = self.observation_space.n
    else:
        self.inputN = self.observation_space.shape[0]
    self.actionN = self.action_space.n
    # Set learning parameters
    self.episode_count = self.params[0]  # Number of episodes
    self.learnRate = self.params[1]      # Learning rate
    self.dicount = self.params[2]        # Discount factor for reward
    self.epsi = self.params[3]           # Epsilon for greedy picking
    self.epsi_decay = self.params[4]
    self.epsi_min = 0.001
    self._timeTot = 200
    # define TF graph
    tf.reset_default_graph()
    # These lines establish the feed-forward part of the network used to choose actions
    n_hidden_1 = 64
    n_hidden_2 = 32
    self.inputs1 = tf.placeholder(shape=[1, self.inputN], dtype=tf.float32)
    W1 = tf.Variable(tf.random_normal([self.inputN, n_hidden_1]))
    W2 = tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2]))
    W3 = tf.Variable(tf.random_normal([n_hidden_2, self.actionN]))
    layer_1 = tf.nn.relu(tf.matmul(self.inputs1, W1))
    layer_2 = tf.nn.relu(tf.matmul(layer_1, W2))
    self.Qout = tf.matmul(layer_2, W3)
    self.predict = tf.argmax(self.Qout, 1)
    self.time = 0
    self.currEpisode = 0  # Current training stage episode
    self.currQs = None    # Current prediction for the Q values using current observation
    # Obtain the loss as the sum of squared differences between the target and predicted Q values
    self.nextQ = tf.placeholder(shape=[1, self.actionN], dtype=tf.float32)
    loss = tf.reduce_sum(tf.square(self.nextQ - self.Qout))
    trainer = tf.train.AdamOptimizer(learning_rate=self.learnRate)
    self.updateModel = trainer.minimize(loss)
    init = tf.global_variables_initializer()
    self.session = tf.Session()
    self.session.run(init)
def __init__(self, action_space, observation_space, params):
    # Use super init
    Agent.__init__(self, action_space, observation_space, params)
    # Initialize table with all zeros
    self.Q = np.zeros([observation_space.n, action_space.n])
    # Set learning parameters
    self.episode_count = self.params[0]  # Number of episodes
    self.lr = self.params[1]             # Learning rate (e.g. 0.5)
    self.y = self.params[2]              # Discount factor (e.g. 0.8)
    self.currEpisode = 0                 # Current training stage episode
def __init__(self, obs):
    Agent.__init__(self)
    self.capacity = 5
    self.occupation = 0
    self.type = "Taxi"
    self.body.mass = 1000
    self.stat = 0
    self.clients = []
    self.body.fustrum.radius = 200
    self.body.vitesseMax = 15
    self.observerM = obs
    self.observer = None
    self.policy = TaxisPolicy.NONE
def run(env_name='Ant-v2', num_steps=1000):
    env = gym.make(env_name)
    agent = Agent(env.observation_space, env.action_space)
    state = env.reset()
    reward = None
    done = False
    for _ in range(num_steps):
        env.render()
        action, _ = agent.act(state, reward, done)
        state, reward, done, info = env.step(action)
        print(reward)
        if done:
            state = env.reset()
def __init__(self, max_sims=50):
    # Takes an instance of a Board and optionally some keyword arguments.
    # Initializes the list of game states and the statistics tables.
    Agent.__init__(self)
    self.total_simulations = 0
    self.root_node = None
    self.if_debug = False
    self.loglevel = 0
    # parameter to change how deep the search goes
    self.max_sims = max_sims
def interact(env: Env, agent: Agent, start_obs: Arrayable) -> Tuple[array, array, array]:
    """One step interaction between env and agent.

    :args env: environment
    :args agent: agent
    :args start_obs: initial observation
    :return: (next observation, reward, terminal?)
    """
    action = agent.step(start_obs)
    next_obs, reward, done, information = env.step(action)
    time_limit = information['time_limit'] if 'time_limit' in information else None
    agent.observe(next_obs, reward, done, time_limit)
    return next_obs, reward, done
def __init__(self, f):
    Agent.__init__(self)
    self.body = BoidsBody()
    self.type = "StandardAgent"
    self.famille = f
    self.body.mass = 80
    self.body.fustrum.radius = 100
    self.body.vitesseMax = 150.0
    self.body.vitesseMin = 20.0
    self.velocity = [
        random.uniform(-50.0, 50.0),
        random.uniform(-50.0, 50.0)
    ]
    self.avoidanceFactor = 7.5
    self.obstacleFactor = 500
    self.target = Vector2D(0, 0)
def test_invalid(self):
    dummy_agent = Agent()
    with self.assertRaises(AssertionError):
        Player(dummy_agent, INVALID_PLAYER_ID_1, DUMMY_NAME, DUMMY_COLOUR_NAME, YELLOW)
    with self.assertRaises(AssertionError):
        Player(dummy_agent, INVALID_PLAYER_ID_2, DUMMY_NAME, DUMMY_COLOUR_NAME, YELLOW)
    with self.assertRaises(AssertionError):
        Player(dummy_agent, PLAYER2_ID, DUMMY_NAME, DUMMY_COLOUR_NAME, INVALID_COLOUR_RGB)
def __init__(self, memory_length=5):
    """Constructor; sets up the agent's colour, move and piece memories."""
    Agent.__init__(self)
    self.memoryLength = 1
    # previous colors
    self.color_memory = [''] * memory_length
    # previous move locations [piece, i, j]; built with a comprehension so the
    # sub-lists are independent rather than aliases of a single list
    self.move_memory = [[] for _ in range(memory_length)]
    # previously played piece structures
    self.piece_memory = [[] for _ in range(memory_length)]
    self._colors: List[str] = ['_', 'P', 'G', 'B', 'Y', 'O', 'V']  # Piece colors
    self._to_update = 0
    self._update_limit = memory_length - 1
def __init__(self, action_space, observation_space, params, discreet=False):
    # Use super init
    Agent.__init__(self, action_space, observation_space, params)
    self.discreet = discreet
    if discreet:
        self.inputN = self.observation_space.n
    else:
        self.inputN = self.observation_space.shape[0]
    self.actionN = self.action_space.n
    # Set learning parameters
    self.episode_count = self.params[0]  # Number of episodes
    self.learnRate = self.params[1]      # Learning rate
    self.discount = self.params[2]       # Discount factor for reward
    self.epsi = self.params[3]           # Epsilon for greedy picking
    self.epsi_decay = self.params[4]
    self.pretrainEpi = 250    # Number of steps before first train
    self.batch_size = 200     # Size of training batch
    self.trainPadding = 5     # Every xth step a training occurs
    self.tau = 0.01           # Amount to update target network at each step
    self.method = self.selectMethod("e-greedy")
    self.epsi_min = 0.001
    self.currEpisode = 0      # Current training stage episode
    self.time = 0             # Current frame within one episode
    self._timeTot = 200       # Maximal time in one episode
    self.currQs = None        # Current prediction for the Q values using current observation
    tf.reset_default_graph()
    self.qNet = Q_Network([[self.inputN, 128, self.actionN], self.learnRate])
    self.targetQNet = Q_Network([[self.inputN, 128, self.actionN], self.learnRate])
    self.myBuffer = ExperienceBuffer()
    init = tf.global_variables_initializer()
    trainables = tf.trainable_variables()
    self.targetOps = Q_Network.updateTargetGraph(trainables, self.tau)
    self.session = tf.Session()
    self.session.run(init)
def train(nb_steps: int, env: Env, agent: Agent, start_obs: Arrayable):
    """Trains for one epoch.

    :args nb_steps: number of interaction steps
    :args env: environment
    :args agent: interacting agent
    :args start_obs: starting observation
    :return: final observation
    """
    agent.train()
    agent.reset()
    obs = start_obs
    for _ in range(nb_steps):
        # interact
        obs, _, _ = interact(env, agent, obs)
    return obs
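# Minimal usage sketch (assumed, not part of the original module): run several
# epochs back-to-back, threading the final observation of one epoch into the
# next. `make_env` and `make_agent` are hypothetical factory helpers.
def run_training(nb_epochs: int = 10, nb_steps: int = 1000):
    env = make_env()
    agent = make_agent(env)
    obs = env.reset()
    for _ in range(nb_epochs):
        obs = train(nb_steps, env, agent, obs)
    return agent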
def __init__(self):
    """Initializes random DQN model"""
    Agent.__init__(self)
    # Initialize DQN
    dqn_input_dim = len(SquareStackerGame().get_state_vector())
    dqn_output_dim = len(move_to_vector([0, 0, 0]))
    self._dqn = Sequential([
        Dense(128, input_dim=dqn_input_dim),
        Activation('relu'),
        Dense(128),
        Activation('relu'),
        Dense(dqn_output_dim),
    ])
    self._dqn.compile(optimizer=Adam(), loss='mse', metrics=['accuracy'])
def test_valid(self):
    dummy_agent = Agent()
    subject = Player(dummy_agent, PLAYER1_ID, DUMMY_NAME, DUMMY_COLOUR_NAME, YELLOW)
    self.assertEqual(subject.agent, dummy_agent)
    self.assertEqual(subject.piece_id, PLAYER1_ID)
    self.assertEqual(subject.name, DUMMY_NAME)
    self.assertEqual(subject.colour_name, DUMMY_COLOUR_NAME)
    self.assertEqual(subject.colour_rgb, YELLOW)
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0,
             create_target_network=True):
    Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
    self.main_network = NetworkWrapper(tuning_parameters, create_target_network,
                                       self.has_global, 'main',
                                       self.replicated_device, self.worker_device)
    self.networks.append(self.main_network)
    self.q_values = Signal("Q")
    self.signals.append(self.q_values)
    self.reset_game(do_not_reset_env=True)
def __init__(self):
    Agent.__init__(self)
    self.timeout = 600
    self.destination = Destination(0, 0)
    self.onboard = -1
    self.type = "Client"
    self.body.mass = 80
    self.body.vitesseMax = 1
    self.body.fustrum.radius = 100
    self.policy = ClientsPolicy.NONE
    self.observer = ClientObserver(self.id, time.time(), self.body.location)
    self.cohesionFactor = 0.03
    self.velocity = [
        random.uniform(-50.0, 50.0),
        random.uniform(-50.0, 50.0)
    ]
    self.allignFactor = 0.045
def main(game_name, game_length):
    # Game description
    reward_mode = 'base'
    reward_scale = 1.0
    elite_prob = 0
    env = Env(game_name, game_length, {
        'reward_mode': reward_mode,
        'reward_scale': reward_scale,
        'elite_prob': elite_prob
    })

    # Network
    latent_shape = (512,)
    dropout = 0
    lr = .0001
    gen = Generator(latent_shape, env, 'nearest', dropout, lr)

    # Agent
    num_processes = 1
    experiment = "Experiments"
    lr = .00025
    model = 'base'
    dropout = .3
    reconstruct = None
    r_weight = .05
    Agent.num_steps = 5
    Agent.entropy_coef = .01
    Agent.value_loss_coef = .1
    agent = Agent(env, num_processes, experiment, 0, lr, model, dropout,
                  reconstruct, r_weight)

    # Training
    gen_updates = 1e4
    gen_batch = 32
    gen_batches = 1
    diversity_batches = 0
    rl_batch = 1e4
    pretrain = 0
    elite_persist = False
    elite_mode = 'mean'
    load_version = 0
    notes = ''
    agent.writer.add_hparams({
        'Experiment': experiment,
        'RL_LR': lr,
        'Minibatch': gen_batch,
        'RL_Steps': rl_batch,
        'Notes': notes
    }, {})

    t = Trainer(gen, agent, experiment, load_version, elite_mode, elite_persist)
    t.loss = lambda x, y: x.mean().pow(2)
    t.train(gen_updates, gen_batch, gen_batches, diversity_batches, rl_batch, pretrain)
def main(game_name, game_length):
    # Game description
    reward_mode = 'time'
    reward_scale = 1.0
    elite_prob = .5
    env = Env(game_name, game_length, {
        'reward_mode': reward_mode,
        'reward_scale': reward_scale,
        'elite_prob': elite_prob
    })

    # Network
    latent_shape = (512,)
    dropout = .2
    lr = .0001
    gen = Generator(latent_shape, env, 'pixel', dropout, lr)

    # Agent
    num_processes = 16
    experiment = "Experiment_Paper"
    lr = .00025
    model = 'resnet'
    dropout = 0
    reconstruct = gen
    r_weight = .05
    Agent.num_steps = 5
    Agent.entropy_coef = .01
    Agent.value_loss_coef = .1
    agent = Agent(env, num_processes, experiment, 0, lr, model, dropout,
                  reconstruct, r_weight)

    # Training
    gen_updates = 100
    gen_batch = 128
    gen_batches = 10
    diversity_batches = 90
    rl_batch = 1e6
    pretrain = 2e7
    elite_persist = True
    elite_mode = 'max'
    load_version = 0
    notes = 'Configured to match paper results'
    agent.writer.add_hparams({
        'Experiment': experiment,
        'RL_LR': lr,
        'Minibatch': gen_batch,
        'RL_Steps': rl_batch,
        'Notes': notes
    }, {})

    t = Trainer(gen, agent, experiment, load_version, elite_mode, elite_persist)
    t.train(gen_updates, gen_batch, gen_batches, diversity_batches, rl_batch, pretrain)
def simulate(self):
    """
    Simulate an entire episode. The gen_agents are updated with simultaneous
    threads running their update() functions, and the robot is updated with
    commands from the external joystick process.
    """
    # initialize pre-simulation metadata
    self.init_sim_data()
    # keep track of wall-time in the simulator
    start_time = time.time()
    # get initial state
    current_state = self.save_state()
    # initialize robot update thread
    r_t = self.init_robot_listener_thread(current_state)
    # start iteration
    iteration = 0
    self.print_sim_progress(iteration)
    # run simulation
    while self.sim_t <= self.episode_params.max_time and self.loop_condition():
        wall_t = time.time()
        # update the time for all agents
        Agent.set_sim_t(self.sim_t)
        # initiate thread operations
        self.pedestrians_update(current_state)
        if self.robot is not None:
            # calls a single iteration of the robot update
            self.robot.update()
        # update simulator time
        self.sim_t += self.dt
        # capture the new simulation state after all the gen_agents have updated
        current_state = self.save_state(wall_t - start_time)
        if self.robot:
            self.robot.update_world(current_state)
        # update iteration count
        iteration += 1
        # print simulation progress
        self.print_sim_progress(iteration)
        # synchronize time with the real world if running in asynchronous mode
        self.synchronize(wall_t)
    # finish the simulation
    self.conclude_simulation(start_time, iteration, r_t)
def init_sim_data(self, verbose: bool = True):
    # Create pre-simulation metadata
    self.total_agents = len(self.agents) + len(self.backstage_prerecs)
    if verbose:
        print("Running simulation on", self.total_agents, "agents")
    # scale the simulator time
    self.dt = self.params.delta_t_scale * self.params.dt
    # update the baseline agents' simulation refresh rate
    Agent.set_sim_dt(self.dt)
    Agent.set_sim_t(self.sim_t)
    # add the first (when t=0) agents to the self.prerecs dict
    self.init_prerec_agent_threads(current_state=None)
    # save initial state before the simulator is spawned
    self.sim_t = 0.0
    if self.dt < self.params.dt:
        print("%sSimulation dt is too small; either lower the gen_agents' dt's" % color_red,
              self.params.dt,
              "or increase simulation delta_t%s" % color_reset)
        exit(1)
def __init__(self):
    Agent.__init__(self)
    self.body = BoidsBody()
    self.collisionDVel = 1
    self.type = "Boid"
    self.famille = 1
    self.body.mass = 80
    self.body.fustrum.radius = 100
    self.body.vitesseMax = 150.0
    self.body.vitesseMin = 20.0
    self.repultion = 150
    self.cohesionFactor = 0.03
    self.collisionDistance = 10
    self.velocity = [
        random.uniform(-50.0, 50.0),
        random.uniform(-50.0, 50.0)
    ]
    self.allignFactor = 0.045
    self.avoidanceFactor = 7.5
    self.attractorFactor = 0.35
    self.obstacleFactor = 500
def __init__(self, action_space, observation_space, params):
    # Use super init
    Agent.__init__(self, action_space, observation_space, params)
    # Set learning parameters
    self.episode_count = params[0]  # Number of episodes
    self.lr = params[1]             # Learning rate (e.g. 0.5)
    self.y = params[2]              # Discount factor (e.g. 0.8)
    self.binsize = params[3]        # Should be odd so -epsi and epsi fall in different bins
    self.currEpisode = 0            # Current training stage episode
    # Initialize table with all zeros
    self.Q = np.zeros([
        np.power(self.binsize, observation_space.shape[0]),
        action_space.n
    ])
    # Determine bins
    self.low = [-0.5, -2, -0.25, -2]  # self.observation_space.low
    self.high = [0.5, 2, 0.25, 2]     # self.observation_space.high
    self.createBins()
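# Sketch (an assumption; the actual createBins()/state-indexing code is not shown
# here): with `binsize` bins per dimension, a continuous observation can be mapped
# to a single row of self.Q by treating the per-dimension bin indices as digits of
# a base-`binsize` number, matching the binsize**obs_dim rows allocated above.
def observation_to_index(obs, low, high, binsize):
    index = 0
    for x, lo, hi in zip(obs, low, high):
        edges = np.linspace(lo, hi, binsize + 1)[1:-1]  # interior bin edges
        digit = int(np.digitize(x, edges))              # bin index in [0, binsize - 1]
        index = index * binsize + digit
    return index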
def init_control_pipeline(self):
    # NOTE: this is like an init() run *after* obtaining episode metadata
    # robot start and goal to satisfy the old Agent.planner
    self.start_config = generate_config_from_pos_3(self.get_robot_start())
    self.goal_config = generate_config_from_pos_3(self.get_robot_goal())
    # rest of the 'Agent' params used for the joystick planner
    self.agent_params = create_agent_params(with_planner=True, with_obstacle_map=True)
    # update generic 'Agent params' with joystick-specific params
    self.agent_params.episode_horizon_s = self.joystick_params.episode_horizon_s
    self.agent_params.control_horizon_s = self.joystick_params.control_horizon_s
    # init obstacle map
    self.obstacle_map = self.init_obstacle_map()
    self.obj_fn = Agent._init_obj_fn(self, params=self.agent_params)
    psc_obj = Agent._init_psc_objective(params=self.agent_params)
    self.obj_fn.add_objective(psc_obj)
    # Initialize Fast-Marching-Method map for agent's pathfinding
    Agent._init_fmm_map(self, params=self.agent_params)
    # Initialize system dynamics and planner fields
    self.planner = Agent._init_planner(self, params=self.agent_params)
    self.vehicle_data = self.planner.empty_data_dict()
    self.system_dynamics = Agent._init_system_dynamics(self, params=self.agent_params)
    # init robot current config from the starting position
    self.robot_current = self.current_ep.get_robot_start().copy()
    # init a list of commands that will be sent to the robot
    self.commands = None
class TestAgentController(TestCase):
    controller = LastChanceAgentController(Agent())

    def test_is_betting_period(self):
        self.assertTrue(only_once_during_first_half_day(self.controller.is_betting_period))

    def test_is_ranking_period(self):
        self.assertTrue(only_once_during_first_half_day(self.controller.is_ranking_period))

    def test_is_collecting_period(self):
        self.assertTrue(only_once_during_second_half_day(self.controller.is_collecting_period))

    def test_bet_before_rank(self):
        self.assertTrue(first_this_then_that(self.controller.is_betting_period,
                                             self.controller.is_ranking_period))
def __init__(self, *args, **kwargs):
    Agent.__init__(self, *args, **kwargs)
    if self.knowledge is None:
        self.knowledge = set()
    self.knowledge = self._convert_to_set(self.knowledge)
    assert isinstance(self.knowledge, set)