Example #1
    def __init__(self, cfg, tetris):
        self.num_actions = cfg.MODEL.SIZE_ACTION
        self.gamma = cfg.SOLVER.GAMMA
        self.BATCH_SIZE = cfg.SOLVER.BATCH_SIZE

        # replay buffer stores (state, action, next_state, reward) transitions
        transition = namedtuple('Transicion',
                                ('state', 'action', 'next_state', 'reward'))
        self.memory = ReplayMemory(cfg.SOLVER.CAPACITY, transition)
        self.model = get_model(cfg)

        # the target network starts as an exact copy of the online model
        self.target_net = copy.deepcopy(self.model)
        self.target_net.load_state_dict(self.model.state_dict())

        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)

        self.tetris = tetris
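
The ReplayMemory class the snippet passes the namedtuple into is not shown. Below is a minimal sketch of a buffer with that constructor signature; the push/sample method names are assumptions, not taken from the original repository.

import random
from collections import namedtuple


class ReplayMemory:
    """Fixed-capacity ring buffer of transitions (illustrative sketch only)."""

    def __init__(self, capacity, transition):
        self.capacity = capacity
        self.transition = transition  # namedtuple class holding one transition
        self.memory = []
        self.position = 0

    def push(self, *args):
        # overwrite the oldest transition once the buffer is full
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = self.transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


# usage mirroring Example #1
transition = namedtuple('Transicion', ('state', 'action', 'next_state', 'reward'))
memory = ReplayMemory(capacity=10000, transition=transition)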
Example #2
 def __init__(
         self,
         network: nn.Module,
         actions: int,
         logger: Optional = None,
         learning_rate: float = 0.00025,
         replay_start_size: int = 50000,
         replay_size: int = 1000000,
         batch_size: int = 32,
         sync_target_step: int = 10000,
         update_frequency: int = 4,
         gradient_clipping: bool = False,
         reward_clipping: bool = True,
         gamma: float = 0.99,
         epsilon_start: float = 1.0,
         epsilon_end: float = 0.1,
         epsilon_end_step: int = 1000000,
         epsilon_testing: float = 0.05,
         training: bool = True,
         device: str = 'gpu',
         seed: Optional[int] = None
 ):
     """
     Initializes a DQN agent
     
     Args:
         network: a neural network to learn the Q-function
         actions: number of actions the agent can take
         logger: a logger that has a write method which receives scalars and a timestep
         learning_rate: the learning rate for the optimizer
          replay_start_size: minimum number of samples in memory before optimization starts; also the
              number of time steps taken before epsilon starts to decay
         replay_size: maximum size of the replay buffer
         batch_size: number of samples for each parameter update
         sync_target_step: number of policy updates before updating the target network parameters
         update_frequency: number of time steps between each learning step
         gradient_clipping: if True, the gradients are clipped between -1 and 1
         reward_clipping: if True, the rewards are clipped between -1 and 1
         gamma: the discount factor for the MDP
         epsilon_start: value of epsilon at start of training
         epsilon_end: value of epsilon at end of training
          epsilon_end_step: number of time steps over which epsilon is linearly decayed
         epsilon_testing: value of epsilon during testing
          training: if True the agent is training; if False, it is testing
          device: device to be used in PyTorch, either `gpu` or `cpu`
         seed: the random seed
     """
     
     if seed is not None:
         torch.random.manual_seed(seed)
     
     # selecting the device to use
     self._device = torch.device("cuda" if torch.cuda.is_available() and device == 'gpu' else "cpu")
     print(f"Using {self._device}...")
     
     # creating the target network, eval doesn't do anything since we are not using dropout
     self._policy_network = network.to(self._device)
     self._target_network = deepcopy(self._policy_network).to(self._device)
     self._target_network.eval()
     
     # saving the logger
     if logger is not None:
         self._logger = logger
     
     # initializing the optimizer and saving some optimization related parameters
     self._learning_rate = learning_rate
     # self._optimizer = RMSprop(self._policy_network.parameters(), self._learning_rate)
      # note: the learning_rate argument is not used here; a hard-coded Adam lr and eps are applied instead
      self._optimizer = torch.optim.Adam(self._policy_network.parameters(), lr=0.0000625, eps=0.00015)
     # self._optimizer = torch.optim.Adam(self._policy_network.parameters(), lr=0.0000125, eps=0.00015)
     self._batch_size = batch_size
     self._sync_target_step = sync_target_step
     self._update_frequency = update_frequency
     self._gradient_clipping = gradient_clipping
     self._loss_fn = torch.nn.L1Loss(reduction="none")
     self._reward_clipping = reward_clipping
     
     # setting the action space
     self._actions = actions
     self._num_steps = 0
     
     # setting the replay buffer
     self._replay_start_size = replay_start_size
     self._replay_size = replay_size
     self._memory = ReplayMemory(size=replay_size, seed=seed)
     
     # setting the MDP parameters
     self._gamma = gamma
     
     # setting the exploration parameters
     self._epsilon_end = epsilon_end
     self._epsilon_diff = epsilon_start - epsilon_end
     self._epsilon_end_step = epsilon_end_step
     self._epsilon_testing = epsilon_testing
     self._epsilon = epsilon_start
     
     # setting the training status
     self._training = training
     
     self._timestep = None
     self._next_timestep = None
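
Per the docstring, epsilon is held at epsilon_start until replay_start_size steps have been taken and is then decayed linearly to epsilon_end over epsilon_end_step steps. The standalone sketch below illustrates that schedule using the constructor's default values; it is not the agent's actual update code.

def linear_epsilon(step, epsilon_start=1.0, epsilon_end=0.1,
                   epsilon_end_step=1000000, replay_start_size=50000):
    """Linearly anneal epsilon after the replay warm-up period (sketch)."""
    # no decay while the replay buffer is still warming up
    decay_steps = max(step - replay_start_size, 0)
    fraction = min(decay_steps / epsilon_end_step, 1.0)
    return epsilon_start - fraction * (epsilon_start - epsilon_end)


# e.g. epsilon stays at 1.0 for the first 50,000 steps,
# then reaches 0.1 after 1,050,000 steps and stays there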
Example #3
# helper function for reshaping the CartPole observation into a (1, 4) batch
def reshape(state):
    return np.reshape(state, [1, 4])


if __name__ == '__main__':
    tf.compat.v1.disable_eager_execution()
    max_score = 0

    n_episodes = 5000
    max_env_steps = 1000

    env = gym.make('CartPole-v0')
    agent = DQNAgent(env=env,
                     net=NN(alpha=0.001, decay=0.0001),
                     memory=ReplayMemory(size=100000))

    if max_env_steps is not None:
        env._max_episode_steps = max_env_steps

    for e in range(n_episodes):
        # reset the env
        state = reshape(env.reset())
        done = False
        score = 0
        # play until env done
        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            # env.render()
            next_state = reshape(next_state)
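
The snippet is cut off inside the episode loop. The sketch below shows one common way such a loop continues; the agent.remember and agent.replay calls are assumed method names that do not appear in the snippet.

def run_episode(env, agent, batch_size=32):
    """Play one CartPole episode with experience replay (illustrative sketch)."""
    state = reshape(env.reset())
    done = False
    score = 0
    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        next_state = reshape(next_state)
        agent.remember(state, action, reward, next_state, done)  # assumed API: store the transition
        agent.replay(batch_size)                                 # assumed API: learn from a sampled minibatch
        state = next_state
        score += reward
    return score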
Example #4
def main():
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument(
        '-o', '--output', default='atari-v0', help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')

    parser.add_argument('--input_shape', nargs=2, type=int, default=[84, 84],
                        help='Input shape (height width); defaults to the standard 84x84 DQN crop')
    parser.add_argument('--num_frame', default=4, type=int,
                        help='Number of frames in a state')
    parser.add_argument('--discount', default=0.99, type=float,
                        help='Discount factor gamma')

    parser.add_argument('--online_train_interval', default=4, type=int,
                        help='Interval to train the online network')
    parser.add_argument('--target_reset_interval', default=10000, type=int,
                        help='Interval to reset the target network')
    parser.add_argument('--action_change_interval', default=1, type=int,
                        help='Interval to change action')
    parser.add_argument('--print_loss_interval', default=100, type=int,
                        help='Interval to print losses')

    parser.add_argument('--replay_buffer_size', default=100000, type=int,
                        help='Replay buffer size')
    parser.add_argument('--num_burn_in', default=25000, type=int,
                        help='Number of samples filled in memory before update')
    parser.add_argument('--batch_size', default=32, type=int,
                        help='How many samples in each minibatch')

    parser.add_argument('--learning_rate', default=1e-4, type=float,
                        help='Learning rate alpha')
    parser.add_argument('--explore_prob', default=0.05, type=float,
                        help='Exploration probability in epsilon-greedy')
    parser.add_argument('--decay_prob_start', default=1.0, type=float,
                        help='Starting probability in linear-decay epsilon-greedy')
    parser.add_argument('--decay_prob_end', default=0.1, type=float,
                        help='Ending probability in linear-decay epsilon-greedy')
    parser.add_argument('--decay_steps', default=1000000, type=int,
                        help='Decay steps in linear-decay epsilon-greedy')

    parser.add_argument('--num_train', default=5000000, type=int,
                        help='Number of training sampled interactions with the environment')
    parser.add_argument('--max_episode_length', default=999999, type=int,
                        help='Maximum length of an episode')
    parser.add_argument('--save_interval', default=100000, type=int,
                        help='Interval to save weights and memory')

    parser.add_argument('--model_name', default='dqn', type=str,
                        help='Model name')

    parser.add_argument('--eval_interval', default=10000, type=int,
                        help='Evaluation interval')
    parser.add_argument('--eval_episodes', default=20, type=int,
                        help='Number of episodes in evaluation')

    parser.add_argument('--double_q', action='store_true',
                        help='Invoke double Q net')

    parser.add_argument('--do_render', action='store_true',
                        help='Do rendering or not')

    parser.add_argument('--read_weights', default=None, type=str,
                        help='Read weights from file')
    parser.add_argument('--read_memory', default=None, type=str,
                        help='Read memory from file')

    args = parser.parse_args()
    print('########## All arguments ##########:', args)
    args.input_shape = tuple(args.input_shape)
    args.output = get_output_folder(args.output, args.env)

    env = gym.make(args.env)
    num_actions = env.action_space.n
    opt_adam = Adam(lr=args.learning_rate)

    model_online = create_model(args.num_frame, args.input_shape,
        num_actions, model_name=args.model_name)
    model_target = create_model(args.num_frame, args.input_shape,
        num_actions, model_name=args.model_name)

    q_network = {'online': model_online, 'target': model_target}

    preproc = AtariPreprocessor(args.input_shape)
    memory = ReplayMemory(args.replay_buffer_size, args.num_frame)

    policy_random = UniformRandomPolicy(num_actions)
    policy_train = LinearDecayGreedyEpsilonPolicy(args.decay_prob_start,
                                                  args.decay_prob_end,
                                                  args.decay_steps)
    policy_eval = GreedyEpsilonPolicy(args.explore_prob)
    policy = {'random': policy_random, 'train': policy_train, 'eval': policy_eval}

    agent = DQNAgent(num_actions, q_network, preproc, memory, policy, args)
    agent.compile([mean_huber_loss, null_loss], opt_adam)

    if args.read_weights is not None:
        agent.q_network['online'].load_weights(args.read_weights)
    if args.read_memory is not None:
        with open(args.read_memory, 'rb') as save_memory:
            agent.memory = pickle.load(save_memory)

    print('########## training #############')
    agent.fit(env)
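
agent.compile receives a mean_huber_loss whose definition is not shown. The NumPy sketch below implements the standard mean Huber loss it presumably corresponds to; the repository's Keras version may differ.

import numpy as np


def mean_huber_loss_np(y_true, y_pred, delta=1.0):
    """Mean Huber loss: quadratic near zero, linear for large errors (sketch)."""
    err = np.abs(y_true - y_pred)
    quadratic = 0.5 * np.square(err)
    linear = delta * (err - 0.5 * delta)
    return np.mean(np.where(err <= delta, quadratic, linear))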
Example #5
import gym

from dqn.bots import AtariBot
from dqn.policy import DDQNPolicy
from dqn.memory import ReplayMemory

GAME = 'Breakout-v0'

# TODO: list params to tune here; eventually migrate this to a README

if __name__ == "__main__":
    policy = DDQNPolicy()
    memory = ReplayMemory()
    game = gym.make(GAME)
    game.ale.setInt(b'frame_skip', 4)
    robot = AtariBot(policy=policy, memory=memory)
    robot.train(game=game, ckpt_dir="models")
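
DDQNPolicy suggests Double DQN, the same idea the --double_q flag toggles in Example #4: the online network picks the next action and the target network evaluates it. The NumPy sketch below shows that target computation under assumed array shapes; it is not the repository's actual code.

import numpy as np


def double_dqn_targets(rewards, dones, q_next_online, q_next_target, gamma=0.99):
    """Double DQN targets for a minibatch (sketch).

    q_next_online / q_next_target: arrays of shape (batch, num_actions).
    """
    # the online network selects a' = argmax_a Q_online(s', a)
    next_actions = np.argmax(q_next_online, axis=1)
    # the target network evaluates Q_target(s', a'); terminal states do not bootstrap
    next_q = q_next_target[np.arange(len(next_actions)), next_actions]
    return rewards + gamma * (1.0 - dones.astype(np.float32)) * next_q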