def construct_curriculum_env(curr_id, tensor_state=True):
    """Build one of four curriculum environments, selected by curr_id (0-3)."""
    one_eight_config = {'agent_speed_range': default_speed_range,
                        'width': default_width,
                        'lanes': default_lanes,
                        'finish_position': Point(43, 8)}
    quarter_config = {'agent_speed_range': default_speed_range,
                      'width': default_width,
                      'lanes': default_lanes,
                      'finish_position': Point(36, 6)}
    half_config = {'agent_speed_range': default_speed_range,
                   'width': default_width,
                   'lanes': default_lanes,
                   'finish_position': Point(24, 4)}
    full_config = {'agent_speed_range': default_speed_range,
                   'width': default_width,
                   'lanes': default_lanes,
                   'finish_position': Point(0, 0),
                   'agent_pos_init': Point(3, 0)}

    if curr_id == 0:
        config = one_eight_config
    elif curr_id == 1:
        config = quarter_config
    elif curr_id == 2:
        config = half_config
    elif curr_id == 3:
        config = full_config
    else:
        raise ValueError("No curriculum of ID: {}".format(curr_id))

    if tensor_state:
        config['observation_type'] = 'tensor'
    return gym.make('GridDriving-v0', **config)
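# Usage sketch (an assumption, not from the original source): step through the
# four curriculum stages in order and confirm each environment builds. Assumes
# `default_speed_range`, `default_width`, and `default_lanes` are defined at
# module scope, as they are used above.
for demo_id in range(4):
    demo_env = construct_curriculum_env(demo_id)
    demo_state = demo_env.reset()
    print("curriculum", demo_id, "initial state shape:",
          getattr(demo_state, 'shape', None))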
def construct_training_env():
    config = {
        'observation_type': 'tensor',
        'agent_speed_range': [-3, -1],
        'finish_position': Point(0, 0),
        'random_seed': 15,
        'stochasticity': 1.,
        'lanes': [
            LaneSpec(cars=7, speed_range=[-2, -1]),
            LaneSpec(cars=8, speed_range=[-2, -1]),
            LaneSpec(cars=6, speed_range=[-1, -1]),
            LaneSpec(cars=6, speed_range=[-3, -1]),
            LaneSpec(cars=7, speed_range=[-2, -1]),
            LaneSpec(cars=8, speed_range=[-2, -1]),
            LaneSpec(cars=6, speed_range=[-3, -2]),
            LaneSpec(cars=7, speed_range=[-1, -1]),
            LaneSpec(cars=6, speed_range=[-2, -1]),
            LaneSpec(cars=8, speed_range=[-2, -2])
        ],
        'width': 50,
        'tensor_state': True,
        'flicker_rate': 0.,
        'mask': None
    }
    return gym.make('GridDriving-v0', **config)
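# Smoke test (an assumption about intended use, not from the original source):
# reset the fixed training environment and take a single action.
smoke_env = construct_training_env()
smoke_state = smoke_env.reset()
_, smoke_reward, smoke_done, _ = smoke_env.step(smoke_env.actions[0])
print("first-step reward:", smoke_reward, "done:", smoke_done)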
def construct_task2_env(tensor_state=True):
    # Alternative configurations are kept for debugging on smaller grids;
    # only `large_config` is actually used below.
    large_config = {'agent_speed_range': [-3, -1],
                    'width': 50,
                    'lanes': default_lanes}
    small_config = {'observation_type': 'tensor',
                    'agent_speed_range': [-2, -1],
                    'stochasticity': 0.0,
                    'width': 10,
                    'lanes': [LaneSpec(cars=3, speed_range=[-2, -1]),
                              LaneSpec(cars=4, speed_range=[-2, -1]),
                              LaneSpec(cars=2, speed_range=[-1, -1]),
                              LaneSpec(cars=2, speed_range=[-3, -1])]}
    medium_config = {'observation_type': 'tensor',
                     'agent_speed_range': [-3, -1],
                     'width': 15,
                     'lanes': [LaneSpec(cars=3, speed_range=[-2, -1]),
                               LaneSpec(cars=4, speed_range=[-2, -1]),
                               LaneSpec(cars=2, speed_range=[-1, -1]),
                               LaneSpec(cars=2, speed_range=[-3, -1]),
                               LaneSpec(cars=3, speed_range=[-2, -1]),
                               LaneSpec(cars=4, speed_range=[-2, -1])]}
    medium_large_config = {'agent_speed_range': [-3, -1],
                           'width': 40,
                           'lanes': [LaneSpec(cars=6, speed_range=[-2, -1]),
                                     LaneSpec(cars=7, speed_range=[-2, -1]),
                                     LaneSpec(cars=5, speed_range=[-1, -1]),
                                     LaneSpec(cars=5, speed_range=[-3, -1]),
                                     LaneSpec(cars=6, speed_range=[-2, -1]),
                                     LaneSpec(cars=7, speed_range=[-2, -1])]}
    curri_large_config = {'agent_speed_range': [-3, -1],
                          'width': 50,
                          'lanes': default_lanes,
                          'finish_position': Point(43, 6)}

    config = large_config
    if tensor_state:
        config['observation_type'] = 'tensor'
    return gym.make('GridDriving-v0', **config)
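# Usage sketch (an assumption, not from the original source): build the task 2
# environment and inspect the spaces the DQN below is constructed from.
task2_demo = construct_task2_env()
print("observation shape:", task2_demo.observation_space.shape,
      "| num actions:", task2_demo.action_space.n)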
def train(model, env, train_type=0, model_class=ConvDQN):
    # Initialize model and target network
    f = open('record.txt', 'a')
    if not model:
        model = model_class(env.observation_space.shape,
                            env.action_space.n).to(device)
    target = model_class(env.observation_space.shape,
                         env.action_space.n).to(device)
    target.load_state_dict(model.state_dict())
    target.eval()

    # Initialize replay buffer
    memory = ReplayBuffer(buffer_limit)

    # Initialize rewards, losses, and optimizer
    rewards = []
    losses = []
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for episode in range(max_episodes):
        # Either anneal epsilon per episode or pin it to a fixed value.
        if not use_epsilon:
            epsilon = compute_epsilon(episode)
        else:
            epsilon = use_epsilon

        # Curriculum-dependent start position: type 0 samples anywhere in the
        # current curriculum region, type 1 samples near its far corner,
        # type 2 pins the far corner, and type 3 keeps the default start.
        if train_type == 0:
            env.agent_pos_init = Point(
                random.randint(1, 5 * curriculum_num - 1),
                random.randint(0, curriculum_num - 1))
        elif train_type == 1:
            env.agent_pos_init = Point(
                random.randint(5 * curriculum_num - 6, 5 * curriculum_num - 1),
                random.randint(curriculum_num - 2, curriculum_num - 1))
        elif train_type == 2:
            env.agent_pos_init = Point(5 * curriculum_num - 1,
                                       curriculum_num - 1)
        elif train_type == 3:
            pass

        state = env.reset()
        episode_rewards = 0.0
        for t in range(t_max):
            # Model takes action
            action = model.act(state, epsilon)

            # Apply the action to the environment
            next_state, reward, done, info = env.step(action)
            # Penalize crashing into another car.
            if env.world.agent_state == AgentState.crashed:
                reward = -2

            # Save transition to replay buffer
            memory.push(Transition(state, [action], [reward],
                                   next_state, [done]))

            state = next_state
            episode_rewards += reward
            if done or (train_type == 4 and t > 40):
                break
        rewards.append(episode_rewards)

        # Train the model if memory is sufficient
        if len(memory) > min_buffer:
            # (Disabled) restart guard for bad initialization:
            # if np.mean(rewards[-print_interval:]) < -60:
            #     print('Bad initialization. Please restart the training.')
            #     exit()
            for i in range(train_steps):
                loss = optimize(model, target, memory, optimizer)
                losses.append(loss.item())

        # Update target network every once in a while
        if episode % target_update == 0:
            target.load_state_dict(model.state_dict())

        if episode % print_interval == 0 and episode > 0:
            # Average over the most recent episodes/steps (negative slices),
            # so the log reflects current rather than lifetime performance.
            log_line = ("[Curriculum {} Type {} Episode {}]\tavg rewards : {:.3f},"
                        "\tavg loss : {:.6f},\tbuffer size : {},\tepsilon : {:.1f}%"
                        .format(curriculum_num, train_type, episode,
                                np.mean(rewards[-print_interval:]),
                                np.mean(losses[-print_interval * 10:]),
                                len(memory), epsilon * 100))
            print(log_line)
            f.write(log_line + "\n")
            f.flush()

        if episode % 1000 == 0:
            save_model_with_path(
                model, './model_{}_{}_{}.pt'.format(curriculum_num,
                                                    train_type, episode))
    f.close()
    return model
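# `compute_epsilon` is referenced above but defined elsewhere in this repo; a
# minimal sketch of the exponential-decay schedule it plausibly implements.
# The names `epsilon_max`, `epsilon_min`, and `epsilon_decay` are assumed
# hyperparameters, given here as defaults.
def compute_epsilon_sketch(episode, epsilon_max=1.0, epsilon_min=0.01,
                           epsilon_decay=0.99):
    # Decay exploration from epsilon_max toward epsilon_min over episodes.
    return max(epsilon_min, epsilon_max * (epsilon_decay ** episode))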
def heuristic_reward():
    # Build a reward-shaping matrix by running a PDDL planner from every cell
    # of an empty grid and recording the plan length to the goal at (0, 0).
    task2_env = construct_task2_env()
    n_lanes = len(task2_env.lanes)
    n_width = task2_env.width
    agent_speed_range = task2_env.agent_speed_range
    # pos_memo[x][y] holds the number of steps from (x, y) to the goal; the
    # goal cell itself is 0, every other cell starts unknown.
    pos_memo = [[0 if (x == 0 and y == 0) else None for y in range(n_lanes)]
                for x in range(n_width)]
    # Plan on an empty road (no cars), so the planner measures pure distance.
    lanes = [LaneSpec(0, [0, 0])] * n_lanes

    for start_x in list(range(n_width))[::-1]:
        for start_y in list(range(n_lanes))[::-1]:
            if pos_memo[start_x][start_y] is None:
                print("Start x: {} start y: {}".format(start_x, start_y))
                env = gym.make('GridDriving-v0',
                               lanes=lanes,
                               width=n_width,
                               random_seed=42,
                               agent_speed_range=(-3, -1),
                               agent_pos_init=Point(x=start_x, y=start_y))
                gen = initializeSystem(env)
                generateDomainPDDLFile(gen)
                generateProblemPDDLFile(gen)
                runPDDLSolver(gen)
                # Fill in plan lengths for the states along the found plan.
                parse_sas_plan(pos_memo, start_x, start_y)

    print("pos_memo:")
    print(pos_memo)

    # Convert step counts to rewards: the goal gets a large positive reward,
    # unreachable cells (marked -1) a large negative one, and every other
    # cell the inverse of its distance to the goal.
    pos_inf_reward, neg_inf_reward = 10, -10
    for x in range(n_width):
        for y in range(n_lanes):
            if pos_memo[x][y] == 0:
                pos_memo[x][y] = pos_inf_reward
            elif pos_memo[x][y] == -1:
                pos_memo[x][y] = neg_inf_reward
            else:
                pos_memo[x][y] = 1. / pos_memo[x][y]

    print("Final reward matrix")
    pos_memo = np.array(pos_memo)
    print(pos_memo)
    save_to_pickle(pos_memo, "reward_shaping.p")
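# Sketch of how the pickled matrix might be consumed during training (an
# assumption, not the original code): `load_from_pickle` is the assumed
# counterpart of `save_to_pickle` above, and the env is assumed to expose the
# agent's cell as `env.agent.position`.
def shaped_reward(env, reward, shaping=None):
    if shaping is None:
        shaping = load_from_pickle("reward_shaping.p")
    pos = env.agent.position
    # Add the heuristic bonus for the agent's current cell to the raw reward.
    return reward + shaping[pos.x][pos.y]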
### Sample test cases.
test_config = [{'lanes': [LaneSpec(1, [-1, -1])] * 3, 'width': 5,
                'seed': 10, 'iters': 300},
               {'lanes': [LaneSpec(2, [-2, -1])] * 3, 'width': 7,
                'seed': 15, 'iters': 100},
               {'lanes': [LaneSpec(2, [-2, -1])] * 4, 'width': 8,
                'seed': 125, 'iters': 500},
               {'lanes': [LaneSpec(2, [-3, -2])] * 4, 'width': 10,
                'seed': 44, 'iters': 300},
               {'lanes': [LaneSpec(2, [-3, -1])] * 4, 'width': 10,
                'seed': 125, 'iters': 400},
               {'lanes': [LaneSpec(2, [-3, -1])] * 4, 'width': 10,
                'seed': 25, 'iters': 300}]

test_case_number = 5  # Change the index for a different test case
LANES = test_config[test_case_number]['lanes']
WIDTH = test_config[test_case_number]['width']
RANDOM_SEED = test_config[test_case_number]['seed']
numiters = test_config[test_case_number]['iters']
stochasticity = 1.

env = gym.make('GridDriving-v0', lanes=LANES, width=WIDTH,
               agent_speed_range=(-3, -1), finish_position=Point(0, 0),
               # agent_pos_init=Point(4, 2),
               stochasticity=stochasticity, tensor_state=False,
               flicker_rate=0., mask=None, random_seed=RANDOM_SEED)
actions = env.actions

env.render()
done = False
mcts = MonteCarloTreeSearch(env=env, numiters=numiters,
                            explorationParam=1., random_seed=RANDOM_SEED)
# Re-plan with MCTS at every step until the episode terminates.
while not env.done:
    state = GridWorldState(env.state, is_done=done)
    action = mcts.buildTreeAndReturnBestAction(initialState=state)
    print(action)
    done = env.step(state=deepcopy(state.state), action=action)[2]
    env.render()
    if done:
        break
print("simulation done")
def getStateTuple(env):
    '''
    Helper function to convert an env state to a state feature vector.
    '''
    cars = env.cars
    done = env.done
    state = []
    # Flatten every car's (x, y) position into the feature vector, then
    # append a terminal-state flag.
    for car in cars:
        state += [car.position.x, car.position.y]
    state.append(int(done))
    return state


if not SUBMISSION:
    ### Sample test cases.
    test_config = [{'lanes': [LaneSpec(0, [-2, -1])] * 5, 'width': 9,
                    'gamma': 0.9, 'seed': 15, 'fin_pos': Point(0, 0),
                    'agent_pos': Point(8, 4), 'stochasticity': 1.},
                   {'lanes': [LaneSpec(1, [-2, -1])] * 2, 'width': 4,
                    'gamma': 0.9, 'seed': 15, 'fin_pos': Point(0, 1),
                    'agent_pos': Point(3, 1), 'stochasticity': 1.},
                   {'lanes': [LaneSpec(1, [-3, -1])] * 2 + [LaneSpec(0, [0, 0])],
                    'width': 4, 'gamma': 0.9, 'seed': 100,
                    'fin_pos': Point(0, 0), 'agent_pos': Point(3, 2),
                    'stochasticity': .5},
                   {'lanes': [LaneSpec(0, [0, 0])] + [LaneSpec(1, [-3, -1])] * 2,
                    'width': 4, 'gamma': 0.5, 'seed': 128,
                    'fin_pos': Point(0, 0), 'agent_pos': Point(3, 2),
                    'stochasticity': 0.75},
                   {'lanes': [LaneSpec(1, [-3, -1])] * 2 + [LaneSpec(0, [0, 0])],
                    'width': 4, 'gamma': 0.99, 'seed': 111,
                    'fin_pos': Point(0, 0), 'agent_pos': Point(3, 2),
                    'stochasticity': .5},
                   {'lanes': [LaneSpec(1, [-3, -1]), LaneSpec(0, [0, 0]),
                              LaneSpec(1, [-3, -1])],
                    'width': 4, 'gamma': 0.999, 'seed': 125,
                    'fin_pos': Point(0, 0), 'agent_pos': Point(3, 2),
                    'stochasticity': 0.9}]

    test_case_number = 0  # Change the index for a different test case
    LANES = test_config[test_case_number]['lanes']
    WIDTH = test_config[test_case_number]['width']
    RANDOM_SEED = test_config[test_case_number]['seed']
    GAMMA = test_config[test_case_number]['gamma']
    FIN_POS = test_config[test_case_number]['fin_pos']
    AGENT_POS = test_config[test_case_number]['agent_pos']
    stochasticity = test_config[test_case_number]['stochasticity']

    env = gym.make('GridDriving-v0', lanes=LANES, width=WIDTH,
                   # The original snippet is truncated here; the remaining
                   # keyword arguments follow the pattern of the MCTS driver
                   # above and the keys defined in test_config.
                   agent_speed_range=(-3, -1), finish_position=FIN_POS,
                   agent_pos_init=AGENT_POS, stochasticity=stochasticity,
                   random_seed=RANDOM_SEED)
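    # Usage sketch (an assumption, not from the original source): reset the
    # environment and encode its state with the helper above.
    env.reset()
    print("state feature vector:", getStateTuple(env))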