Example #1
    def __init__(self,
                 env,
                 lr=0.8,
                 y=0.95,
                 step_cost=.0,
                 living_cost=.0,
                 episode_length=100,
                 memory_capacity=100,
                 batch_size=25,
                 eps=0.5,
                 eps_decay=0.999):
        AbstractAgent.__init__(self, eps, eps_decay)
        self.env = env
        self.lr = lr
        self.y = y
        self.step_cost = step_cost
        self.living_cost = living_cost
        self.s0 = env.field.index('s')
        self.episode_length = episode_length
        self.rewards = []
        self.losses = []
        self.state_len = env.width * env.height

        self.nn = Model(in_features=2,
                        hidden=[self.state_len, self.state_len],
                        out_features=len(Agent.actions))

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.nn.parameters(), lr=0.01)
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size
Example #2
    def __init__(self,
                 env,
                 model,
                 lr=0.8,
                 y=0.95,
                 step_cost=.0,
                 living_cost=.0,
                 episode_length=100,
                 memory_capacity=100,
                 batch_size=10,
                 eps=0.5,
                 eps_decay=0.999):
        AbstractAgent.__init__(self, eps, eps_decay)
        self.env = env
        self.model = model
        self.lr = lr
        self.y = y
        self.step_cost = step_cost
        self.living_cost = living_cost
        self.s0 = env.field.index('s')
        self.episode_length = episode_length
        self.rewards = []
        self.losses = []
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size
Example #3
    def __init__(self, inputs, n_actions):
        self.brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
        self.target_brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
        self.target_brain.load_state_dict(self.brain.state_dict())
        self.target_brain.eval()

        self.set_params()
        self.optimizer = torch.optim.Adam(self.brain.parameters())
        self.memory = ReplayMemory(50000)
        self.action_space = [0, 1]
Example #4
    def __init__(self, env, input_size, output_size, hidden_size, mix_hidden=32,
                 batch_size=128, lr=0.001, gamma=.999, eps_start=0.9, eps_end=0.05,
                 eps_decay=750, replay_capacity=10000, num_save=200, num_episodes=10000,
                 mode="random", training=False, load_file=None):
        self.env = env
        self.orig_env = copy.deepcopy(env)
        self.grid_map = env.grid_map
        self.cars = env.grid_map.cars
        self.num_cars = len(self.cars)
        self.passengers = env.grid_map.passengers
        self.num_passengers = len(self.passengers)
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay
        self.replay_capacity = replay_capacity
        self.num_episodes = num_episodes
        self.steps_done = 0
        self.lr = lr
        self.mode = mode
        self.num_save = num_save
        self.training = training
        self.algorithm = PairAlgorithm()
        self.episode_durations = []
        self.loss_history = []
        
        self.memory = ReplayMemory(self.replay_capacity)
        
        self.device = torch.device("cpu")  # "cuda:0" if torch.cuda.is_available() else "cpu"
        print("Device being used:", self.device)
        self.policy_net = DQN(self.input_size, self.output_size, self.hidden_size).to(self.device)
        
        self.params = list(self.policy_net.parameters())

        
        if self.mode == "qmix":
            self.mixer = QMixer(self.input_size, self.num_passengers, mix_hidden).to(self.device)
            self.params += list(self.mixer.parameters())
            
        
        if load_file:
            self.policy_net.load_state_dict(torch.load(load_file))
            self.policy_net.eval()
            if self.mode == "qmix":
                self.mixer.load_state_dict(torch.load("mixer_" + load_file))
                self.mixer.eval()
            self.load_file = "Trained_" + load_file
            print("Checkpoint loaded")
        else:         
            self.load_file = self.mode + "_model_num_cars_" + str(self.num_cars) + "_num_passengers_" + str(self.num_passengers) + \
                    "_num_episodes_" + str(self.num_episodes) + "_hidden_size_" + str(self.hidden_size) + ".pth"
            
        self.optimizer = optim.RMSprop(self.params, lr=self.lr)
Example #5
    def __init__(self):
        # self.config = config
        self.gamma = 0.4

        # self.logger = logging.getLogger("DQNAgent")

        self.screen_width = 600

        # define models (policy and target)
        self.policy_model = DQN()
        self.target_model = DQN()

        # define memory
        self.memory = ReplayMemory()

        # define loss
        self.loss = HuberLoss()

        # define optimizer
        self.optim = torch.optim.Adam(self.policy_model.parameters(), lr=0.01)

        # define environment
        self.env = PyCar()  #TODO
        # self.cartpole = PyCar(self.screen_width)

        # initialize counter
        self.current_episode = 0
        self.current_iteration = 0
        self.episode_durations = []

        self.batch_size = 1700

        # set cuda flag
        self.is_cuda = torch.cuda.is_available()

        self.cuda = self.is_cuda

        if self.cuda:
            # print_cuda_statistics()
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.policy_model = self.policy_model.to(self.device)
        self.target_model = self.target_model.to(self.device)
        self.loss = self.loss.to(self.device)

        # Initialize Target model with policy model state dict
        self.target_model.load_state_dict(self.policy_model.state_dict())
        self.target_model.eval()

        self.savepath = "/home/sk002/Desktop/model/"
Example #6
    def testReplayMemory(self):
        od = [84, 84, 4]
        ad = [8, 10]
        rd = [5]
        s = int(10000)
        b = 32

        rm = ReplayMemory(obs_dim=od, act_dim=ad, r_dim=rd, size=s)
        o = self.get_rand(od)
        a = self.get_rand(ad)
        r = self.get_rand(rd)
        d = 0
        for _ in range(1000):
            rm.store(o, a, r, o, d)

        o_s, a_s, r_s, on_s, d_s = rm.sample(b)

        self.assertEqual(o_s.shape, combined_shape(b, od))
        self.assertEqual(a_s.shape, combined_shape(b, ad))
        self.assertEqual(r_s.shape, combined_shape(b, rd))
        self.assertEqual(on_s.shape, combined_shape(b, od))
        self.assertEqual(d_s.shape, combined_shape(b))
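
The `ReplayMemory` and `combined_shape` helpers exercised by this test are not shown in the snippet. Below is a minimal sketch, assuming a NumPy ring buffer keyed by `obs_dim`/`act_dim`/`r_dim`/`size`, that would satisfy the assertions above; it is an illustration, not the tested implementation.

import numpy as np


def combined_shape(length, shape=None):
    # (length,) when no per-item shape is given, else (length, *shape).
    if shape is None:
        return (length,)
    return (length, shape) if np.isscalar(shape) else (length, *shape)


class ReplayMemory:
    """Fixed-size ring buffer backed by NumPy arrays (hypothetical sketch)."""

    def __init__(self, obs_dim, act_dim, r_dim, size):
        self.obs = np.zeros(combined_shape(size, obs_dim), dtype=np.float32)
        self.next_obs = np.zeros(combined_shape(size, obs_dim), dtype=np.float32)
        self.act = np.zeros(combined_shape(size, act_dim), dtype=np.float32)
        self.rew = np.zeros(combined_shape(size, r_dim), dtype=np.float32)
        self.done = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        # Overwrite the oldest entry once the buffer wraps around.
        self.obs[self.ptr] = obs
        self.act[self.ptr] = act
        self.rew[self.ptr] = rew
        self.next_obs[self.ptr] = next_obs
        self.done[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        # Uniform random minibatch of stored transitions.
        idx = np.random.randint(0, self.size, size=batch_size)
        return (self.obs[idx], self.act[idx], self.rew[idx],
                self.next_obs[idx], self.done[idx])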
Example #7
    def __init__(self, env, p=1.0, lr=0.8, y=0.95, step_cost=.0, living_cost=.0, episode_length=100,
                 memory_capacity=100, batch_size=10, target_update=10, eps=0.5, eps_decay=0.999):
        AbstractAgent.__init__(self, eps, eps_decay)
        self.env = env
        self.lr = lr
        self.y = y
        self.step_cost = step_cost
        self.living_cost = living_cost
        q = (1.0 - p) / 2
        self.stochastic_actions = {
            '←': [[0, 2, 3], [p, q, q]],
            '→': [[1, 2, 3], [p, q, q]],
            '↑': [[2, 0, 1], [p, q, q]],
            '↓': [[3, 0, 1], [p, q, q]]
        }
        self.s0 = env.field.index('s')
        self.episode_length = episode_length
        self.rewards = []
        self.losses = []
        self.state_len = env.width * env.height
        self.nn = Model(
            in_features=self.state_len,
            hidden=[],
            out_features=len(Agent.actions))
        self.target_nn = Model(
            in_features=self.state_len,
            hidden=[],
            out_features=len(Agent.actions))
        self.target_nn.load_state_dict(self.nn.state_dict())
        self.target_nn.eval()

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.nn.parameters(), lr=0.05)
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size
        self.target_update = target_update
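
The `stochastic_actions` table above encodes action slippage: the intended move is taken with probability p and each perpendicular move with probability q = (1 - p) / 2. The agent's step logic is not shown; a minimal sketch of how such a table could be sampled, assuming the keys are the arrow symbols used above and the values are `[outcome_indices, probabilities]`:

import numpy as np


def sample_outcome(stochastic_actions, intended):
    # Hypothetical helper: return the action index actually executed
    # for an intended symbolic action ('←', '→', '↑', '↓').
    outcomes, probs = stochastic_actions[intended]
    return int(np.random.choice(outcomes, p=probs))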
Example #8
    def __init__(self,
                 name,
                 others=None,
                 last_n=10,
                 load_path=None,
                 checkpoint=5000,
                 fixed_strategy=False,
                 eps_decay=0.00005):
        if others is None:
            others = [1, 2]
        self.others = others
        self.last_n = last_n
        self.prev_points = 0
        self.batch_size = 32
        self.gamma = 0.9
        self.eps_start = 1
        self.eps_end = 0.01
        self.eps_decay = eps_decay
        self.target_update = 100
        self.plot_at = 1000
        self.q_max = []
        self.q_list = []
        self.checkpoint = checkpoint
        self.memory_size = 1000
        self.lr = 0.00001
        self.train = True

        self.input_dim = len(others) * 6
        self.output_dim = 3
        self.current_step = 1
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.memory = ReplayMemory(self.memory_size)

        # Initialize the policy and target networks
        self.policy_net = DQN(self.input_dim, self.output_dim).to(self.device)
        self.target_net = DQN(self.input_dim, self.output_dim).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        if load_path is not None:
            checkpoint = torch.load(load_path)
            self.policy_net.load_state_dict(checkpoint['model_state_dict'])
            self.policy_net.eval()
            self.eps_start = 0
            self.eps_end = 0
            self.train = False
        if fixed_strategy:
            self.strategy = FixedStrategy()
        else:
            self.strategy = EpsilonGreedyStrategy(self.eps_start, self.eps_end,
                                                  self.eps_decay)

        # Set the optimizer
        self.optimizer = optim.Adam(params=self.policy_net.parameters(),
                                    lr=self.lr)
        self.loss = None

        # Push to replay memory
        self.prev_state = None
        self.action = None
        self.reward = None
        self.current_state = None

        super().__init__(name)
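
`EpsilonGreedyStrategy` is referenced but not defined in this snippet. A minimal sketch, assuming the common exponentially decaying schedule driven by `self.current_step` (an assumption, not the author's implementation):

import math


class EpsilonGreedyStrategy:
    """Exploration rate that decays exponentially from `start` toward `end`."""

    def __init__(self, start, end, decay):
        self.start = start
        self.end = end
        self.decay = decay

    def get_exploration_rate(self, current_step):
        # Approaches `end` as current_step grows; `decay` controls the speed.
        return self.end + (self.start - self.end) * math.exp(-current_step * self.decay)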
Example #9
    # Initialize environment and config.
    env = gym.make(args.env)

    env_config = ENV_CONFIGS[args.env]
    env = gym.wrappers.AtariPreprocessing(env,
                                          screen_size=84,
                                          grayscale_obs=True,
                                          frame_skip=1,
                                          noop_max=30,
                                          scale_obs=True)
    # Initialize deep Q-networks.
    dqn = DQN(env_config=env_config).to(device)
    # Create and initialize target Q-network with the policy network's weights.
    target_dqn = DQN(env_config=env_config).to(device)
    target_dqn.load_state_dict(dqn.state_dict())
    target_dqn.eval()
    # Create replay memory.
    memory = ReplayMemory(env_config['memory_size'])

    # Initialize optimizer used for training the DQN. We use Adam rather than RMSProp.
    optimizer = torch.optim.Adam(dqn.parameters(), lr=env_config['lr'])

    # Keep track of best evaluation mean return achieved so far.
    best_mean_return = -float("Inf")
    for episode in range(env_config['n_episodes']):
        done = False
        obs = preprocess(env.reset(), envID=args.env, env=env).unsqueeze(0)
        obs_stack = torch.cat(env_config['obs_stack_size'] *
                              [obs]).unsqueeze(0).to(device)
        count = 0
        while not done:
            # Get action from the DQN.
            action = dqn.act(obs_stack)
Example #10
    def __init__(self):
        # self.config = config
        self.gamma = 0.75

        # self.logger = logging.getLogger("DQNAgent")

        self.screen_width = 600

        # define models (policy and target)
        self.policy_model = DQN()
        self.target_model = DQN()

        # define memory
        self.memory = ReplayMemory()

        # define loss
        self.loss = HuberLoss()

        # define optimizer
        self.optim = torch.optim.Adam(self.policy_model.parameters(),
                                      lr=0.0001)

        # define environment
        self.env = PyCar()  #TODO
        # self.cartpole = PyCar(self.screen_width)

        # initialize counter
        self.current_episode = 0
        self.current_iteration = 0
        self.episode_durations = []

        self.batch_size = 250

        # set cuda flag
        self.is_cuda = torch.cuda.is_available()

        self.cuda = self.is_cuda

        if self.cuda:
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.policy_model = self.policy_model.to(self.device)
        self.target_model = self.target_model.to(self.device)
        self.loss = self.loss.to(self.device)

        # Initialize Target model with policy model state dict
        self.target_model.load_state_dict(self.policy_model.state_dict())
        self.target_model.eval()

        self.savepath = os.path.join(os.getcwd(), "model") + "/"
        if not os.path.isdir(self.savepath):
            os.makedirs(self.savepath)

        t = time.localtime()
        self.save_tensorboard_path = os.path.join(
            os.getcwd(), "tensorboard_record") + "/run_" + time.strftime(
                "%d_%m_%Y_%H_%M", t) + "/"
        if not os.path.isdir(self.save_tensorboard_path):
            os.makedirs(self.save_tensorboard_path)
        self.writer = SummaryWriter(self.save_tensorboard_path)
Example #11
                            dtype=torch.long)


if __name__ == "__main__":
    BATCH_SIZE = 128
    GAMMA = 0.999
    EPS_START = 0.9
    EPS_END = 0.05
    EPS_DECAY = 200
    TARGET_UPDATE = 10
    MAX_T = 9999
    steps_done = 0
    timer = Timer()
    rect = util.get_screen_rect()
    region = (rect[0], rect[1], rect[2] - rect[0], rect[3] - rect[1])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    monitor = Monitor(device, region)
    env = gym.make("Game-v0")
    init_screen = monitor.get_screen(pytorch=True)
    _, _, height, width = init_screen.shape

    n_actions = env.action_space.n
    policy_net = DQN(width, height, n_actions).to(device)
    target_net = DQN(width, height, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = torch.optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(3000)
    simulate()
Example #12
    ACTION_BUILD_BARRACKS,
    ACTION_ATTACK,
    ACTION_SELECT_BARRACKS,
    ACTION_BUILD_MARINE,


]

KILL_UNIT_REWARD = 0.2
KILL_BUILDING_REWARD = 0.5

reward_check = []

model = DQN(6, 8)
optimizer = optim.RMSprop(model.parameters(), 1e-3)
memory = ReplayMemory(10000)


class DQNAgent(base_agent.BaseAgent):
    def __init__(self):
        super(DQNAgent, self).__init__()
        self.previous_state = None
        self.previous_action = None
        self.model = model
        self.memory = memory
        self.optimizer = optimizer
        self.diagnostics = [0, 0, 0, 0, 0, 0, 0, 0]

        self.base_top_left = None
        self.supply_depot_built = False
        self.scv_selected = False
Example #13
def train(args):
    device = torch.device("cuda" if args.gpu else "cpu")
    env = Environment(draw=False,
                      fps=args.fps,
                      debug=args.debug,
                      dist_to_pipe=args.dist_to_pipe,
                      dist_between_pipes=args.dist_between_pipes,
                      obs_this_pipe=args.obs_this_pipe)

    observation_space = env.get_observation_size_buffer()
    action_space = env.get_action_size()

    policy_network = DQN(observation_space, action_space).to(device)
    target_network = DQN(observation_space, action_space).to(device)

    optimizer = torch.optim.Adam(policy_network.parameters(), lr=args.lr)

    replay_buffer = ReplayMemory(args.replay_capacity)
    writer = SummaryWriter()

    if args.inference:
        target_network.load_checkpoint()

    best_reward = None
    iteration = 0
    total_reward = 0.0
    rewards = []
    state = env.reset()
    while True:
        epsilon = max(args.final_eps,
                      args.start_eps - iteration / args.eps_decay_final_step)

        iteration += 1
        episode_reward = None
        if np.random.rand() < epsilon:
            action = env.get_action_random()
        else:
            state_v = torch.tensor(np.array([state], copy=False)).to(device)
            q_vals_v = policy_network(state_v.float())
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        next_state, reward, done = env.step(action)
        total_reward += reward

        replay_buffer.push(state, action, next_state, reward, done)

        state = next_state

        if done:
            episode_reward = total_reward
            state = env.reset()
            total_reward = 0.0

        if episode_reward is not None:
            rewards.append(episode_reward)
            mean_reward = np.mean(rewards[-80:])
            print(
                f"Episode {iteration}:  eps {epsilon}  mean reward {mean_reward}  episode reward {episode_reward}"
            )

            writer.add_scalar("epsilon", epsilon, iteration)
            writer.add_scalar("mean_reward", mean_reward, iteration)
            writer.add_scalar("reward", episode_reward, iteration)

            if best_reward is None or best_reward < mean_reward:
                torch.save(policy_network.state_dict(),
                           f"./models/checkpoint_{iteration}")
                print(f"New best reward found: {best_reward} -> {mean_reward}")
                best_reward = mean_reward
            if mean_reward > args.goal_reward:
                print(f"Achieved in {iteration} steps.")
                break

        if len(replay_buffer) < args.replay_start_step:
            continue

        if iteration % args.target_update_iterations == 0:
            target_network.load_state_dict(policy_network.state_dict())

        optimizer.zero_grad()

        batch = replay_buffer.sample(args.batch_size)
        loss = calculate_loss(batch,
                              policy_network,
                              target_network,
                              args.gamma,
                              device=device)

        loss.backward()
        optimizer.step()
    writer.close()
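
The `ReplayMemory` consumed by this training loop is not shown. A minimal sketch matching the `push` / `sample` / `__len__` calls above; the exact transition layout expected by `calculate_loss` is an assumption:

import random
from collections import deque


class ReplayMemory:
    """Bounded FIFO buffer of (state, action, next_state, reward, done) tuples."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, next_state, reward, done):
        self.buffer.append((state, action, next_state, reward, done))

    def sample(self, batch_size):
        # Uniform sampling without replacement, as consumed by calculate_loss above.
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)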