Example No. 1
    def __init__(self, state_size, action_size, seed, alpha, gamma, tau):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            alpha (float): learning rate for the optimizer
            gamma (float): discount factor
            tau (float): interpolation factor for the target-network soft update
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.alpha = alpha
        self.gamma = gamma
        self.tau = tau

        # Q Learning Network
        self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
        self.qnetwork_target = DQN(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.alpha)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
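The constructor stores tau but never uses it in this snippet; in agents shaped like this one, tau typically drives a soft update of the target network. A minimal sketch of such a helper, an assumption rather than part of the original example:

    def soft_update(self, local_model, target_model, tau):
        # Soft-update target parameters:
        # theta_target <- tau * theta_local + (1 - tau) * theta_target
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)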
Example No. 2
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.env = env
        self.buffer = ReplayBuffer()
        self.num_action = self.env.get_action_space().n
        self.cur_step = 0
        self.greedyPolicy = EpsilonGreedyStrategy(1, 0.025, 0.01)
        self.policy_net = DQN().to(self.device)
        self.target_net = DQN().to(self.device)
        self.num_episode = args.num_episode
        self.learning_rate = args.learning_rate
        self.sample_batch_size = args.sample_batch_size
        self.gamma = args.gamma
        self.e = 1
        if args.test_dqn:
            # you can load your model here
            print('loading trained model')
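EpsilonGreedyStrategy(1, 0.025, 0.01) is constructed above but not defined in this example. A minimal sketch of what such a class usually looks like, assuming the argument order is (start, end, decay) and that the agent later calls a get_exploration_rate method (both assumptions):

import math

class EpsilonGreedyStrategy:
    def __init__(self, start, end, decay):
        self.start = start  # initial exploration rate
        self.end = end      # final exploration rate
        self.decay = decay  # decay rate per step

    def get_exploration_rate(self, current_step):
        # exponential decay from start toward end
        return self.end + (self.start - self.end) * math.exp(-1.0 * current_step * self.decay)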
Example No. 3
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example:
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.env = env
        self.batch_size = BATCH_SIZE
        self.gamma = 0.999
        self.eps_start = EPS_START
        self.eps_decay = EPS_DECAY
        self.TARGET_UPDATE = TARGET_UPDATE

        self.policy_net = DQN(self.env.action_space.n)
        self.target_net = DQN(self.env.action_space.n)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        if use_cuda:
            self.policy_net.cuda()
            self.target_net.cuda()

        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=1e-5)
        self.memory = deque(maxlen=10000)

        if args.test_dqn:
            # you can load your model here
            print('loading trained model')
Example No. 4
    def __init__(self, env, args):
        # Parameters for q-learning

        super(Agent_DQN, self).__init__()

        self.env = env
        state = env.reset()
        state = state.transpose(2, 0, 1)

        self.policy_net = DQN(state.shape,
                              self.env.action_space.n)  # Behavior Q
        self.target_net = DQN(state.shape, self.env.action_space.n)  # Target Q
        self.target_net.load_state_dict(self.policy_net.state_dict())
        #Initial Q

        if USE_CUDA:
            print("Using CUDA . . .     ")
            self.policy_net = self.policy_net.cuda()
            self.target_net = self.target_net.cuda()

        print('hyperparameters and network initialized')

        if args.test_dqn or LOAD == True:
            print('loading trained model')
            checkpoint = torch.load('trainData')
            self.policy_net.load_state_dict(checkpoint['model_state_dict'])

        self.target_net.load_state_dict(self.policy_net.state_dict())
Example No. 5
    def __init__(self, params, player_n=0):
        # unpack the parameters:
        #### simulation
        self.device = params["device"]
        self.env_name = params["env_name"]
        self.training_frames = params["training_frames"]
        self.skip_frames = params["skip_frames"]
        self.nactions = params["nactions"]
        self.messages_enabled = params["messages_enabled"]
        self.selfplay = params["selfplay"]
        #### qnet model
        self.learning_rate = params["learning_rate"]
        self.sync = params["sync"]
        self.load_from = params["load_from"]
        #### buffer
        self.batch_size = params["batch_size"]
        self.replay_size = params["replay_size"]
        self.nstep = params["nstep"]
        #### agent model
        self.gamma = params["gamma"]
        self.eps_start = params["eps_start"]
        self.eps_end = params["eps_end"]
        self.eps_decay_rate = params["eps_decay_rate"]
        self.player_n = player_n
        self.double = params["double"]
        # initialize the simulation with shared properties
        self.env = gym.make(
            self.env_name
        )  # environment, agent etc. can't be created jointly in a server simulation
        self.net = DQN(self.env.observation_space.shape[0],
                       self.nactions**2).to(self.device)
Example No. 6
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
        """
        super(Agent_DQN, self).__init__(env)
        ###########################
        # initializations for replay memory
        self.env = env
        self.buffer = collections.deque(
            maxlen=REPLAY_SIZE)  # initializing a replay memory buffer

        #initializations of agent
        self._reset()
        self.last_action = 0
        self.net = DQN((4, 84, 84), self.env.action_space.n).to(DEVICE)
        self.target_net = DQN((4, 84, 84), self.env.action_space.n).to(DEVICE)
        LOAD_MODEL = True

        if args.test_dqn:
            #you can load your model here
            print('preparing to load trained model')
            ###########################
            LOAD_MODEL = True

        if LOAD_MODEL:
            self.net.load_state_dict(
                torch.load(MODEL, map_location=lambda storage, loc: storage))
            print('loaded trained model')
            self.target_net.load_state_dict(self.net.state_dict())
Example No. 7
def main():
    config = Config()
    config.mode = 'test'
    config.dropout = 1.0
    model = DQN(config)
    model.initialize()
    rewards = evaluate_policy(model, config.T, config.N)
    print(np.mean(rewards))
Example No. 8
    def __init__(self, env, args):

        # Training Parameters
        self.args = args
        self.env = env
        self.batch_size = args.batch_size
        self.lr = args.lr
        self.gamma = args.gamma_reward_decay
        self.n_actions = env.action_space.n
        self.output_logs = args.output_logs
        self.step = 8e6
        self.curr_step = 0
        self.ckpt_path = args.save_dir
        self.epsilon = args.eps_start
        self.eps_end = args.eps_end
        self.target_update = args.update_target
        self.observe_steps = args.observe_steps
        self.explore_steps = args.explore_steps
        self.saver_steps = args.saver_steps
        self.resume = args.resume
        self.writer = TensorboardSummary(self.args.log_dir).create_summary()
        # Model Settings

        self.cuda = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.policy_net = DQN(4, self.n_actions)
        self.target_net = DQN(4, self.n_actions)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        if self.cuda:
            self.policy_net.to(self.cuda)
            self.target_net.to(self.cuda)

        self.target_net.eval()
        train_params = self.policy_net.parameters()
        self.optimizer = optim.RMSprop(train_params,
                                       self.lr,
                                       momentum=0.95,
                                       eps=0.01)
        self.memory = ReplayMemory(args.replay_memory_size)

        if args.resume:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    args.resume))
            checkpoint = torch.load(args.resume)

            self.epsilon = checkpoint['epsilon']
            self.curr_step = checkpoint['step']

            self.policy_net.load_state_dict(checkpoint['policy_state_dict'])
            self.target_net.load_state_dict(checkpoint['target_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['episode']))
Example No. 9
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """
        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        #Gym parameters
        self.num_actions = env.action_space.n

        # parameters for repaly buffer
        self.buffer_max_len = 20000
        self.buffer = deque(maxlen=self.buffer_max_len)
        self.episode_reward_list = []
        self.moving_reward_avg = []

        # paramters for neural network
        self.batch_size = 32
        self.gamma = 0.999
        self.eps_threshold = 0
        self.eps_start = 1
        self.eps_end = 0.025
        self.max_expisode_decay = 10000
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        #Training
        self.steps_done = 0
        self.num_episode = 20000
        self.target_update = 5000
        self.learning_rate = 1.5e-4

        # Neural Network
        self.policy_net = DQN().to(self.device)
        self.target_net = DQN().to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.learning_rate)

        if args.test_dqn:
            #you can load your model here
            print('loading trained model')
            self.policy_net = torch.load('policy_net.hb5')
            self.policy_net.eval()
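The epsilon fields above (eps_start, eps_end, max_expisode_decay) imply a per-episode decay schedule that the snippet does not show. One plausible linear schedule, written as a hypothetical helper method:

    def epsilon_by_episode(self, episode):
        # linearly anneal eps_threshold from eps_start to eps_end
        # over max_expisode_decay episodes (assumed schedule)
        fraction = min(1.0, episode / self.max_expisode_decay)
        self.eps_threshold = self.eps_start + fraction * (self.eps_end - self.eps_start)
        return self.eps_threshold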
Example No. 10
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN, self).__init__(env)
        self.action = env.get_action_space()

        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        print('Using device:', self.device)
        self.model = DQN().to(self.device)
        self.model_target = DQN().to(self.device)
        self.episode = 100000
        self.max_steps_per_episode = 14000
        self.update_target_network = 10000
        self.epsilon = 1.0
        self.min_epsilon = 0.1
        self.step_epsilon = (self.epsilon - self.min_epsilon) / (1E6)
        self.env = env
        self.history = []
        self.buffer_size = min(args.history_size // 5, 2000)
        self.history_size = args.history_size
        self.learning_rate = 1e-4
        self.name = args.name
        self.batch_size = 32
        self.gamma = 0.99
        self.priority = []
        self.w = 144
        self.h = 256
        self.mode = args.mode
        self.delay = args.delay
        self.epoch = args.continue_epoch
        if args.test_dqn or self.epoch > 0:
            #you can load your model here
            print('loading trained model')
            ###########################
            self.model.load_state_dict(
                torch.load(self.name + '.pth', map_location=self.device))
            self.model_target.load_state_dict(
                torch.load(self.name + '.pth', map_location=self.device))
Example No. 11
def train():
    frame_idx = 0
    ts_frame = 0
    ts = time.time()
    env = gym.make(DEFAULT_ENV_NAME)
    model = DQN(env.observation_space.shape, env.action_space.n, LEARNING_RATE)
    state = env.reset()
    c = collections.Counter()
    buffer = ExperienceBuffer(REPLAY_BUFFER_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START
    total_rewards = []

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        while True:
            frame_idx += 1
            epsilon = max(EPSILON_FINAL,
                          EPSILON_START - frame_idx / EPSILON_DECAY_FRAMES)
            reward = agent.play_step(sess, model, epsilon)
            if reward is not None:
                total_rewards.append(reward)
                speed = (frame_idx - ts_frame) / (time.time() - ts)
                ts_frame = frame_idx
                ts = time.time()
                mean_reward = np.mean(total_rewards[-100:])
                print(
                    "%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s"
                    % (frame_idx, len(total_rewards), mean_reward, epsilon,
                       speed))
                if len(buffer) < REPLAY_MIN_SIZE:
                    continue
                batch = buffer.sample(BATCH_SIZE)
                print(len(batch[0]))
                agent.fit_batch(sess, model, batch)
Example No. 12
def infer(env, mlp=False):

    action_size = env.action_space.n

    if mlp:
        policy_net = MLP(num_actions=action_size)

    else:
        policy_net = DQN()

    policy_net.load_state_dict(
        torch.load('saved_weights/Episode 2078-checkpoint.pth'))

    for i in range(3):
        env.reset()
        state = get_screen(env)

        for j in range(200):
            action = policy_net(state).max(1)[1].view(1, 1)

            env.render()
            _, reward, done, _ = env.step(action.item())

            state = get_screen(env)
            if done:
                break

    env.close()
Example No. 13
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """
        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.replay_buffer_size = 10000
        self.start_to_learn = 5000
        self.update_target_net = 5000
        self.learning_rate = 1.5e-4
        self.batch_size = 32
        self.buffer_ = deque(maxlen=self.replay_buffer_size)
        self.epsilon = 0.99
        self.epsilon_ = 0.2

        if args.test_dqn:
            #you can load your model here
            print('loading trained model')
            ###########################
            # YOUR IMPLEMENTATION HERE #
            self.Net = DQN()
Example No. 14
    def _create_model(self):
        """
            Create a deep Q model for function approximation with Adam optimizer.

            :return: tgt_net, optimizer
            """
        tgt_net = DQN(self.env.observation_space.shape[0],
                      self.nactions**2).to(self.device)
        if self.load_from is not None:
            assert type(
                self.load_from
            ) == str, "Name of model to be loaded has to be a string!"
            self.net.load_state_dict(torch.load(self.load_from))
            tgt_net.load_state_dict(torch.load(self.load_from))
        optimizer = optim.Adam(self.net.parameters(), lr=self.learning_rate)
        return tgt_net, optimizer
Example No. 15
    def __init__(self,state_size, action_size
                 ,batch_size,learn_step_size,buffer_size
                 ,gamma , learning_rate, tau
                 ,seed):
        """
        Initialize the agent and its learning parameter set.

        Parameters
        =========
        state_size (int): Size of the state space
        action_size (int): Size of the action space

        batch_size (int): Number of samples used in each learning step
        learn_step_size (int): Number of steps until the agent is trained again
        buffer_size (int): Size of replay memory buffer

        gamma (float): Discount factor applied to future rewards
        learning_rate (float): Learning rate of the neural network
        tau (float): Update strength between local and target network

        seed (int): Random seed for initialization
        """

        # ----- Parameter init -----
        # State and action size from environment
        self.state_size = state_size
        self.action_size = action_size

        # Replay buffer and learning properties
        self.batch_size      = batch_size
        self.learn_step_size = learn_step_size
        self.gamma = gamma
        self.tau  = tau

        # General
        self.seed = random.seed(seed)


        # ----- Network and memory init -----
        # Init identical NN as local and target networks and set optimizer
        self.qnetwork_local  = DQN(state_size, action_size, seed).to(device)
        self.qnetwork_target = DQN(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=learning_rate)

        # Initialize replay memory and time step (for updating every learn_step_size steps)
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        self.t_step = 0
Example No. 16
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.memory = []
        self.env = env
        self.n_actions = env.env.action_space.n
        self.policy_net = DQN(4, self.n_actions).to(device).float()
        self.target_net = DQN(4, self.n_actions).to(device).float()
        # self.policy_net.load_state_dict(torch.load("best_weights_model.pt"))
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.eps_threshold = EPS_START
        self.args = args
        self.test_count = 0
        self.max_reward = 0
        self.max_reward_so_far = 0
        self.reward_buffer = []
        self.flag = 0
        self.steps_done = 0
        # self.target_net.eval()
        self.transition = []
        self.test_mean_reward = 0

        self.optimizer = torch.optim.Adam(self.policy_net.parameters(),
                                          lr=LEARNING_RATE)

        if args.test_dqn:
            #you can load your model here
            print('loading trained model')
            ###########################
            # YOUR IMPLEMENTATION HERE #
            self.policy_net.load_state_dict(
                torch.load("best_weights_model.pt",
                           map_location=torch.device('cpu')))
Example No. 17
def main(env, mlp=False):

    action_size = env.action_space.n

    if mlp:
        policy_net = MLP(num_actions=action_size)
        target_net = MLP(num_actions=action_size)

    else:
        policy_net = DQN()
        target_net = DQN()

    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.RMSprop(
        policy_net.parameters(),
        lr=0.00025)  # optim.Adam(policy_net.parameters(), lr=5e-4)
    memory = ReplayMemory(int(1e5))

    #     print('done')
    scores = train_dqn(env=env,
                       policy_net=policy_net,
                       target_net=target_net,
                       memory=memory,
                       GAMMA=GAMMA,
                       BATCH_SIZE=BATCH_SIZE,
                       EPS_START=EPS_START,
                       EPS_END=EPS_END,
                       EPS_DECAY=EPS_DECAY,
                       TARGET_UPDATE=TARGET_UPDATE,
                       n_episodes=n_episodes,
                       get_screen=get_screen,
                       optimizer=optimizer)

    # plot the scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()
    print('Complete')
Example No. 18
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: building your model
        """

        super(Agent_DQN, self).__init__(env)
        self.env = env
        self.args = args
        self.episode = 0
        self.n_actions = self.env.action_space.n
        self.epsilon_start = 1.0
        self.epsilon_final = 0.025
        self.epsilon_decay = 3000
        self.epsilon_by_frame = lambda frame_idx: self.epsilon_final + (
            self.epsilon_start - self.epsilon_final) * math.exp(
                -1. * frame_idx / self.epsilon_decay)
        self.epsilon = 0
        self.eval_net = DQN().cuda()
        self.target_net = DQN().cuda()
        self.target_net.load_state_dict(self.eval_net.state_dict())

        self.criterion = nn.MSELoss()
        #self._model = Net(self.env.observation_space.shape, self.env.action_space.n)
        self._use_cuda = torch.cuda.is_available()
        self.optim = torch.optim.Adam(self.eval_net.parameters(),
                                      lr=self.args.learning_rate)

        if self._use_cuda:
            self.eval_net = self.eval_net.cuda()
            self.target_net = self.target_net.cuda()
            self.criterion = self.criterion.cuda()

#       self.replaybuffer = ReplayBuffer(args.buffer_size)
        self.buffer = deque(maxlen=10000)
        if args.test_dqn:
            #you can load your model here
            print('loading trained model')
            self.eval_net.load_state_dict(torch.load(args.model_dqn))
            self.target_net.load_state_dict(self.eval_net.state_dict())
            if self._use_cuda:
                self.eval_net = self.eval_net.cuda()
                self.target_net = self.target_net.cuda()
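For context, epsilon_by_frame is typically consumed inside the agent's action selection, along the lines of the sketch below; frame_idx and the preprocessed state tensor are assumed to come from the training loop, and random is assumed to be imported:

epsilon = self.epsilon_by_frame(frame_idx)
if random.random() > epsilon:
    with torch.no_grad():
        action = self.eval_net(state).max(1)[1].item()
else:
    action = random.randrange(self.n_actions)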
Example No. 19
    def __init__(self, env, test=False):
        self.cuda = torch.device('cuda')
        print("Using device: " + torch.cuda.get_device_name(self.cuda),
              flush=True)

        self.env = env
        self.state_shape = env.observation_space.shape
        self.n_actions = env.action_space.n

        self.memory = deque(maxlen=100000)
        self.batch_size = 32
        self.mem_threshold = 50000

        self.gamma = 0.99

        self.learning_rate = 1e-4

        self.epsilon = 1.0
        self.epsilon_min = 0.05
        self.epsilon_period = 10000
        self.epsilon_decay = (self.epsilon -
                              self.epsilon_min) / self.epsilon_period

        self.update_rate = 4
        self.omega = 100

        self.start_epoch = 1
        self.epochs = 1
        self.epoch = 20000

        self.model = DQN(self.state_shape, self.n_actions).to(self.cuda)
        print("DQN parameters: {}".format(count_parameters(self.model)))

        self.target = DQN(self.state_shape, self.n_actions).to(self.cuda)
        self.target.eval()
        self.target_update = 10000

        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.learning_rate)

        if test:
            self.model.load_state_dict(torch.load('model.pt'))
Example No. 20
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN,self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.epochs = 10
        self.n_episodes = 1000000
        self.env = env
        self.nA = self.env.action_space.n
        # self.nS = self.env.observation_space
        self.batch_size = 32
        self.DQN = DQN()
        self.Target_DQN = DQN()
        self.buffer_memory = 1000000
        self.train_buffer_size = 4
        self.min_buffer_size = 10000
        self.target_update_buffer = 10000
        self.learning_rate = 0.0001
        self.discount_factor = 0.999
        self.epsilon = 1
        self.min_epsilon = 0.01
        # self.decay_rate = 0.999
        self.ep_decrement = (self.epsilon - self.min_epsilon)/self.n_episodes
        self.criteria = nn.MSELoss()
        self.optimiser = optim.Adam(self.DQN.parameters(),self.learning_rate)
        self.buffer=[]
        self.Evaluation = 100000
        self.total_evaluation__episodes = 100
        self.full_train = 100000
        if args.test_dqn:
            #you can load your model here
            print('loading trained model')
Example No. 21
    def __init__(self, FLAGS):

        self.FLAGS = FLAGS
        self.env = gym.make('CartPole-v1')
        self.state_size = len(self.env.observation_space.sample())
        self.num_episodes = 1000

        self.exp_replay = ExperienceReplay()

        target_network = DQN(scope='target',
                             env=self.env,
                             target_network=None,
                             flags=FLAGS,
                             exp_replay=None)
        self.q_network = DQN(scope='q_network',
                             env=self.env,
                             target_network=target_network,
                             flags=FLAGS,
                             exp_replay=self.exp_replay)

        init = tf.global_variables_initializer()
        session = tf.InteractiveSession()
        session.run(init)

        self.q_network.set_session(session)
        target_network.set_session(session)
Example No. 22
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: building your model
        """

        super(Agent_DQN, self).__init__(env)

        if args.test_dqn:
            # you can load your model here
            print('loading trained model')

        ##################
        # YOUR CODE HERE #
        ##################
        self.env = env
        self.batch_size = BATCH_SIZE
        self.gamma = GAMMA
        self.eps_start = EPS_START
        self.eps_decay = EPS_DECAY
        self.TARGET_UPDATE = TARGET_UPDATE

        self.policy_net = DQN(self.env.action_space.n)
        self.target_net = DQN(self.env.action_space.n)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.policy_net.to(device)
        self.target_net.to(device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=1.5e-4)
        self.memory = ReplayMemory(10000)

        if args.test_dqn:
            # you can load your model here
            print('loading trained model')
            self.policy_net.load_state_dict(
                torch.load(os.path.join('save_dir/'
                                        'model-best.pth'),
                           map_location=torch.device('cpu')))
            self.policy_net.eval()
Example No. 23
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN,self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.epsilon_start = 1
        self.epsilon_end = 0.02
        self.epsilon_decay = 200000
        self.epsilon = self.epsilon_start
        
        self.gamma = 0.99
        self.env = env
        
        self.buffer_size = 30000
        self.buffer = deque(maxlen=30000)
        
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.policy_net = DQN().to(self.device)
        self.target_net = DQN().to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        
        self.optimizer = torch.optim.Adam(self.policy_net.parameters(),lr=0.00015)
        self.reward_array = []
        self.reward_x_axis = []
        self.batch_size = 32
        if args.test_dqn:
            #you can load your model here
            print('loading trained model')
Example No. 24
class Agent_DQN():
    def __init__(self, env, args):
        # Parameters for q-learning

        super(Agent_DQN, self).__init__()

        self.env = env
        state = env.reset()
        state = state.transpose(2, 0, 1)

        self.policy_net = DQN(state.shape,
                              self.env.action_space.n)  # Behavior Q
        self.target_net = DQN(state.shape, self.env.action_space.n)  # Target Q
        self.target_net.load_state_dict(self.policy_net.state_dict())
        #Initial Q

        if USE_CUDA:
            print("Using CUDA . . .     ")
            self.policy_net = self.policy_net.cuda()
            self.target_net = self.target_net.cuda()

        print('hyperparameters and network initialized')

        if args.test_dqn or LOAD == True:
            print('loading trained model')
            checkpoint = torch.load('trainData')
            self.policy_net.load_state_dict(checkpoint['model_state_dict'])

        self.target_net.load_state_dict(self.policy_net.state_dict())

    def init_game_setting(self):
        print('loading trained model')
        checkpoint = torch.load('trainData')
        self.policy_net.load_state_dict(checkpoint['model_state_dict'])

    def push(self, state, action, reward, next_state, done):
        state = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)

        memory.append((state, action, reward, next_state, done))

    def replay_buffer(self):
        state, action, reward, next_state, done = zip(
            *random.sample(memory, batch_size))
        return np.concatenate(state), action, reward, np.concatenate(
            next_state), done

    def __len__(self):
        return len(self.buffer)

    def make_action(self, observation, test=True):

        observation = observation.transpose(2, 0, 1)

        if np.random.random() > EPSILON or test == True:
            observation = Variable(torch.FloatTensor(
                np.float32(observation)).unsqueeze(0),
                                   volatile=True)
            q_value = self.policy_net.forward(observation)
            action = q_value.max(1)[1].data[0]
            action = int(action.item())
        else:
            action = random.randrange(4)
        return action

    def optimize_model(self):

        states, actions, next_states, rewards, dones = self.replay_buffer()

        states_v = Variable(torch.FloatTensor(np.float32(states)))
        next_states_v = Variable(torch.FloatTensor(np.float32(next_states)),
                                 volatile=True)
        actions_v = Variable(torch.LongTensor(actions))
        rewards_v = Variable(torch.FloatTensor(rewards))
        done = Variable(torch.FloatTensor(dones))

        state_action_values = self.policy_net(states_v).gather(
            1, actions_v.unsqueeze(1)).squeeze(1)
        next_state_values = self.target_net(next_states_v).max(1)[0]
        expected_q_value = rewards_v + next_state_values * GAMMA * (
            1 - done)  #+ rewards_v

        loss = (state_action_values -
                Variable(expected_q_value.data)).pow(2).mean()
        return loss

    def train(self):
        global EPSILON  # let the decayed exploration rate be seen by make_action()
        optimizer = optim.Adam(self.policy_net.parameters(), lr=ALPHA)

        # Fill the memory with experiences
        print('Gathering experiences ...')
        meanScore = 0
        AvgRewards = []
        AllScores = []
        step = 1
        iEpisode = 0

        while meanScore < 50:

            state = self.env.reset()
            done = False
            EpisodeScore = 0
            tBegin = time.time()
            done = False

            while not done:

                action = self.make_action(state)
                nextState, reward, done, _ = self.env.step(action)
                self.push(state.transpose(2, 0, 1), action,
                          nextState.transpose(2, 0, 1), reward, done)

                state = nextState

                if len(memory) > StartLearning:
                    loss = self.optimize_model()
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                else:
                    iEpisode = 0
                    continue

                # Update exploration factor
                EPSILON = EPS_END + (EPS_START - EPS_END) * math.exp(
                    -1. * step / EPS_DECAY)
                storeEpsilon.append(EPSILON)
                step += 1

                EpisodeScore += reward

                if step % TARGET_UPDATE == 0:
                    print('Updating Target Network . . .')
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())

            iEpisode += 1
            AllScores.append(EpisodeScore)
            meanScore = np.mean(AllScores[-100:])
            AvgRewards.append(meanScore)

            if len(memory) > StartLearning:
                print('Episode: ', iEpisode, ' score:', EpisodeScore,
                      ' Avg Score:', meanScore, ' epsilon: ', EPSILON, ' t: ',
                      time.time() - tBegin, ' loss:', loss.item())
            else:
                print('Gathering Data . . .')

            if iEpisode % 500 == 0:
                torch.save(
                    {
                        'epoch': iEpisode,
                        'model_state_dict': self.policy_net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': loss,
                        'AvgRewards': AvgRewards
                    }, 'trainData')

                os.remove("Rewards.csv")
                with open('Rewards.csv', mode='w') as dataFile:
                    rewardwriter = csv.writer(dataFile,
                                              delimiter=',',
                                              quotechar='"',
                                              quoting=csv.QUOTE_MINIMAL)
                    rewardwriter.writerow(AvgRewards)

        print('======== Complete ========')
        torch.save(
            {
                'epoch': iEpisode,
                'model_state_dict': self.policy_net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
                'AvgRewards': AvgRewards
            }, 'trainData')

        with open('Rewards.csv', mode='w') as dataFile:
            rewardwriter = csv.writer(dataFile,
                                      delimiter=',',
                                      quotechar='"',
                                      quoting=csv.QUOTE_MINIMAL)
            rewardwriter.writerow(AvgRewards)
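This example reads several module-level names (memory, batch_size, EPSILON, the EPS_* constants, GAMMA, ALPHA, TARGET_UPDATE, StartLearning, storeEpsilon, USE_CUDA, LOAD) that are not shown. Plausible definitions, with values chosen only for illustration:

import collections
import torch

USE_CUDA = torch.cuda.is_available()
LOAD = False
memory = collections.deque(maxlen=100000)  # replay memory shared by push() / replay_buffer()
batch_size = 32
GAMMA = 0.99
ALPHA = 1e-4                               # optimizer learning rate
EPSILON = 1.0                              # current exploration rate
EPS_START, EPS_END, EPS_DECAY = 1.0, 0.02, 100000
TARGET_UPDATE = 10000                      # steps between target-network syncs
StartLearning = 10000                      # transitions to collect before training starts
storeEpsilon = []                          # history of exploration rates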
Example No. 25
    experience_replay_buffer = ReplayMemory()
    episode_rewards = np.zeros(num_episodes)
    episode_lens = np.zeros(num_episodes)

    epsilon = 1.0
    epsilon_min = 0.1
    epsilon_change = (epsilon - epsilon_min) / 500000

    env = gym.make('gym_quantum_pong:Quantum_Pong-v0')

    #monitor_dir = 'video'
    #env = wrappers.Monitor(env, monitor_dir)

    model = DQN(K=K,
                conv_layer_sizes=conv_layer_sizes,
                hidden_layer_sizes=hidden_layer_sizes,
                scope="model",
                image_size=IM_SIZE)

    target_model = DQN(K=K,
                       conv_layer_sizes=conv_layer_sizes,
                       hidden_layer_sizes=hidden_layer_sizes,
                       scope="target_model",
                       image_size=IM_SIZE)

    image_transformer = ImageTransformer(IM_SIZE)

    with tf.Session() as sess:
        model.set_session(sess)
        target_model.set_session(sess)
        #model.load()
Example No. 26
class Simulation:
    """
    Simulation for the game of 3D Pong.

    Parameters
    ----------
    params: dict
            Dictionary of all the simulation parameters
    """
    def __init__(self, params, player_n=0):
        # unpack the parameters:
        #### simulation
        self.device = params["device"]
        self.env_name = params["env_name"]
        self.training_frames = params["training_frames"]
        self.skip_frames = params["skip_frames"]
        self.nactions = params["nactions"]
        self.messages_enabled = params["messages_enabled"]
        self.selfplay = params["selfplay"]
        #### qnet model
        self.learning_rate = params["learning_rate"]
        self.sync = params["sync"]
        self.load_from = params["load_from"]
        #### buffer
        self.batch_size = params["batch_size"]
        self.replay_size = params["replay_size"]
        self.nstep = params["nstep"]
        #### agent model
        self.gamma = params["gamma"]
        self.eps_start = params["eps_start"]
        self.eps_end = params["eps_end"]
        self.eps_decay_rate = params["eps_decay_rate"]
        self.player_n = player_n
        self.double = params["double"]
        # initialize the simulation with shared properties
        self.env = gym.make(
            self.env_name
        )  # environment, agent etc. can't be created jointly in a server simulation
        self.net = DQN(self.env.observation_space.shape[0],
                       self.nactions**2).to(self.device)

    def _create_environment(self):
        """
            create a gym environment for the simulation.

            Actions are discretized into nactions and frames are skipped for faster training
            :return: env
            """
        env = gym.make(self.env_name)
        if self.selfplay:
            env.unwrapped.multiplayer(env,
                                      game_server_guid="selfplayer",
                                      player_n=self.player_n)
        env = wrappers.action_space_discretizer(env, n=self.nactions)
        env = wrappers.SkipEnv(env, skip=self.skip_frames)
        return env

    def _create_agent(self, env):
        """
            Create agent with buffer for the simulation.

            :return: agent
            """
        # buffer = ExperienceBuffer(self.replay_size)
        buffer = Extendedbuffer(self.replay_size,
                                nstep=self.nstep,
                                gamma=self.gamma)
        agent = pongagent.Pongagent(env, self.player_n, buffer)
        return agent

    def _create_model(self):
        """
            Create a deep Q model for function approximation with Adam optimizer.

            :return: tgt_net, optimizer
            """
        tgt_net = DQN(self.env.observation_space.shape[0],
                      self.nactions**2).to(self.device)
        if self.load_from is not None:
            assert type(
                self.load_from
            ) == str, "Name of model to be loaded has to be a string!"
            self.net.load_state_dict(torch.load(self.load_from))
            tgt_net.load_state_dict(torch.load(self.load_from))
        optimizer = optim.Adam(self.net.parameters(), lr=self.learning_rate)
        return tgt_net, optimizer

    def _init_non_shared(self, player_n):
        env = self._create_environment()
        tgt_net, optimizer = self._create_model()
        agent = self._create_agent(env)
        writer = SummaryWriter(
            comment="-" + "player" + str(player_n) + "batch" +
            str(self.batch_size) + "_n" + str(env.action_space.n) + "_eps" +
            str(self.eps_decay_rate) + "_skip" + str(self.skip_frames) +
            "learning_rate" + str(self.learning_rate))
        return env, agent, tgt_net, optimizer, writer

    def _fill_buffer(self, agent):
        if self.messages_enabled:
            print("Player populating Buffer ...")
        agent.exp_buffer.fill(agent.env, self.replay_size, self.nstep)
        if self.messages_enabled:
            print("Buffer_populated!")

    def train(self, net, player_n=0):
        self.net = net
        env, agent, tgt_net, optimizer, writer = self._init_non_shared(
            player_n)
        self._fill_buffer(agent)
        if self.messages_enabled:
            print("Player %i start training: " % player_n)
        reward = []
        for frame in range(self.training_frames):
            epsilon = max(self.eps_end,
                          self.eps_start - frame / self.eps_decay_rate)
            ep_reward = agent.play_step(net, epsilon, self.device)
            if ep_reward:
                reward.append(ep_reward)
                writer.add_scalar("episode_reward", ep_reward, frame)
                writer.add_scalar("mean100_reward", np.mean(reward[-100:]),
                                  frame)
            if (frame % self.sync) == 0:
                tgt_net.load_state_dict(
                    net.state_dict())  # Syncs target and Standard net
                if self.messages_enabled:
                    print("We are at: %7i / %7i frames" %
                          (frame, self.training_frames))
                if player_n == 0:
                    torch.save(net.state_dict(),
                               self.env_name + "-time_update.dat")

            optimizer.zero_grad()
            batch = agent.exp_buffer.sample(self.batch_size)
            loss_t = calc_loss(batch, net, tgt_net, self.gamma**self.nstep,
                               self.double, self.device)
            loss_t.backward()
            optimizer.step()

            writer.add_scalar("loss", loss_t, frame)
            writer.add_scalar("epsilon", epsilon, frame)

        writer.close()
        if self.messages_enabled:
            print("Player %i end training!" % player_n)
        torch.save(net.state_dict(), self.env_name + "end_of_training.dat")

        return np.mean(reward[-len(reward) // 2:])

    # TODO: clean this function!
    def run(self, mode="play"):
        """
        runs the simulation.
        :param mode: str, either "play" or "train"
        :return: mean reward over all episodes with eps_end
        """
        if mode == "train":
            reward = self.train(self.net)
            return reward
        elif mode == "play":
            # Run play.py to see model in action
            pass

        else:
            raise Exception("Mode should be either play or train")
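calc_loss is called in train() but not defined in this example. A self-contained sketch of an (optionally double) DQN loss with the same call signature, assuming the buffer returns batches ordered as (states, actions, rewards, dones, next_states):

import numpy as np
import torch
import torch.nn as nn


def calc_loss(batch, net, tgt_net, gamma, double=False, device="cpu"):
    states, actions, rewards, dones, next_states = batch  # assumed ordering
    states_v = torch.as_tensor(np.asarray(states), dtype=torch.float32, device=device)
    next_states_v = torch.as_tensor(np.asarray(next_states), dtype=torch.float32, device=device)
    actions_v = torch.as_tensor(actions, dtype=torch.int64, device=device)
    rewards_v = torch.as_tensor(rewards, dtype=torch.float32, device=device)
    done_mask = torch.as_tensor(dones, dtype=torch.bool, device=device)

    # Q(s, a) for the actions actually taken
    q_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        if double:
            # double DQN: the online net selects the action, the target net evaluates it
            next_actions = net(next_states_v).argmax(dim=1)
            next_q = tgt_net(next_states_v).gather(1, next_actions.unsqueeze(-1)).squeeze(-1)
        else:
            next_q = tgt_net(next_states_v).max(1)[0]
        next_q[done_mask] = 0.0
        target_q = rewards_v + gamma * next_q
    return nn.MSELoss()(q_values, target_q)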
Example No. 27
class Agent_DQN():
    def __init__(self, env, test=False):
        self.cuda = torch.device('cuda')
        print("Using device: " + torch.cuda.get_device_name(self.cuda),
              flush=True)

        self.env = env
        self.state_shape = env.observation_space.shape
        self.n_actions = env.action_space.n

        self.memory = deque(maxlen=100000)
        self.batch_size = 32
        self.mem_threshold = 50000

        self.gamma = 0.99

        self.learning_rate = 1e-4

        self.epsilon = 1.0
        self.epsilon_min = 0.05
        self.epsilon_period = 10000
        self.epsilon_decay = (self.epsilon -
                              self.epsilon_min) / self.epsilon_period

        self.update_rate = 4

        self.start_epoch = 1
        self.epochs = 10
        self.epoch = 10000

        self.model = DQN(self.state_shape, self.n_actions).to(self.cuda)
        print("DQN parameters: {}".format(count_parameters(self.model)))

        self.target = DQN(self.state_shape, self.n_actions).to(self.cuda)
        self.target.eval()
        self.target_update = 10000

        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.learning_rate)
        if test:
            self.model.load_state_dict(torch.load('model.pt'))

    def init_game_setting(self):
        pass

    def make_action(self, observation, test=False):
        epsilon = 0.01 if test else self.epsilon
        # turn action into tensor
        observation = torch.tensor(observation,
                                   device=self.cuda,
                                   dtype=torch.float)
        # turn off learning
        self.model.eval()
        # epsilon greedy policy
        if random.random() > epsilon:
            # no need to calculate gradient
            with torch.no_grad():
                # choose highest value action
                b = self.model(observation)
                b = b.cpu().data.numpy()
                action = np.random.choice(
                    np.flatnonzero(np.isclose(b, b.max())))
        else:
            # random action
            action = random.choice(np.arange(self.n_actions))
        # turn learning back on
        self.model.train()
        return action

    def replay_buffer(self):
        # Return tuple of sars transitions
        states, actions, rewards, next_states, dones = zip(
            *random.sample(self.memory, self.batch_size))
        states = torch.tensor(np.vstack(states),
                              device=self.cuda,
                              dtype=torch.float)
        actions = torch.tensor(np.array(actions),
                               device=self.cuda,
                               dtype=torch.long)
        rewards = torch.tensor(np.array(rewards, dtype=np.float32),
                               device=self.cuda,
                               dtype=torch.float)
        next_states = torch.tensor(np.vstack(next_states),
                                   device=self.cuda,
                                   dtype=torch.float)
        dones = torch.tensor(np.array(dones, dtype=np.float32),
                             device=self.cuda,
                             dtype=torch.float)
        return states, actions, rewards, next_states, dones

    def experience_replay(self, n=0):
        # clamp gradient
        clamp = False
        # Reset gradient (because it accumulates by default)
        self.optimizer.zero_grad()
        # sample experience memory
        states, actions, rewards, next_states, dones = self.replay_buffer()
        # get Q(s,a) for sample
        Q = self.model(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)
        # get max_a' Q(s',a')
        Q_prime = self.target(next_states).detach().max(1)[0]
        # calculate y = r + gamma * max_a' Q(s',a') for non-terminal states
        Y = rewards + (self.gamma * Q_prime) * (1 - dones)
        # Huber loss of Q and Y
        loss = F.smooth_l1_loss(Q, Y)
        # Compute dloss/dx
        loss.backward()
        # Clamp gradient
        if clamp:
            for param in self.model.parameters():
                param.grad.data.clamp_(-1, 1)
        # Change the weights
        self.optimizer.step()

    def train(self):
        step = 0
        learn_step = 0
        print("Begin Training:", flush=True)
        learn_curve = []
        last30 = deque(maxlen=30)
        for epoch in range(self.start_epoch, self.epochs + 1):
            durations = []
            rewards = []
            flag = []
            # progress bar
            epoch_bar = tqdm(range(self.epoch), total=self.epoch, ncols=200)
            for episode in epoch_bar:
                # reset state
                state = self.env.reset()
                # decay epsilon
                if self.epsilon > self.epsilon_min:
                    self.epsilon -= self.epsilon_decay
                # run one episode
                done = False
                ep_duration = 0
                ep_reward = 0
                while not done:
                    step += 1
                    ep_duration += 1
                    # get epsilon-greedy action
                    action = self.make_action(state)
                    # do action
                    next_state, reward, done, info = self.env.step(action)
                    ep_reward += reward
                    # add transition to replay memory
                    self.memory.append(
                        Transition(state, action, reward, next_state, done))
                    state = next_state
                    # learn from experience, if available
                    if step % self.update_rate == 0 and len(
                            self.memory) > self.mem_threshold:
                        self.experience_replay(learn_step)
                        learn_step += 1
                    # update target network
                    if step % self.target_update == 1:
                        self.target.load_state_dict(self.model.state_dict())

                durations.append(ep_duration)
                rewards.append(ep_reward)
                last30.append(ep_reward)
                learn_curve.append(np.mean(last30))
                flag.append(info['flag_get'])
                epoch_bar.set_description(
                    "epoch {}/{}, avg duration = {:.2f}, avg reward = {:.2f}, last30 = {:2f}"
                    .format(epoch, self.epochs, np.mean(durations),
                            np.mean(rewards), learn_curve[-1]))
            # save model every epoch
            plt.clf()
            plt.plot(learn_curve)
            plt.title(f"DQN Epoch {epoch} with {save_prefix} Reward")
            plt.xlabel('Episodes')
            plt.ylabel('Moving Average Reward')
            if not os.path.exists(f"{save_prefix}_DQN"):
                os.mkdir(f"{save_prefix}_DQN")
            torch.save(self.model.state_dict(),
                       f'{save_prefix}_DQN/DQN_model_ep{epoch}.pt')
            pickle.dump(
                rewards,
                open(f"{save_prefix}_DQN/DQN_reward_ep{epoch}.pkl", 'wb'))
            pickle.dump(flag,
                        open(f"{save_prefix}_DQN/flag_ep{epoch}.pkl", 'wb'))
            plt.savefig(f"{save_prefix}_DQN/epoch{epoch}.png")
            learn_curve = []
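Transition is used in train() above but never defined in this example. Given how replay_buffer() unpacks the stored tuples, it is presumably a namedtuple along these lines:

from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state', 'done'))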
Example No. 28
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of input channels
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history
            return model(Variable(obs, volatile=True)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function, i.e. build the model.
    ######

    model = DQN(in_channels=input_arg, num_actions=num_actions)
    target_Q = DQN(in_channels=input_arg, num_actions=num_actions)

    if USE_CUDA:
        target_Q = target_Q.cuda()
        model = model.cuda()

    ######

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(model.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, last_obs must point to the new latest observation.
        # Useful functions you'll need to call:
        # obs, reward, done, info = env.step(action)
        # this steps the environment forward one step
        # obs = env.reset()
        # this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)
        ##### OUR CODE
        idx = replay_buffer.store_frame(last_obs)
        encoded_obs = replay_buffer.encode_recent_observation()

        if t > learning_starts:
            action = select_epsilon_greedy_action(model, encoded_obs, t)[0]
        else:
            action = random.randrange(num_actions)

        obs, reward, done, info = env.step(action)
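        # Clip rewards to [-1, 1], as in the original DQN setup, so error magnitudes stay comparable across games.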
        reward = max(-1.0, min(reward, 1.0))

        replay_buffer.store_effect(idx, action, reward, done)
        if done:
            obs = env.reset()
        last_obs = obs
        #####

        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):

            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # Note: move the variables to the GPU if one is available.

            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)

            obs_batch = Variable(
                torch.from_numpy(obs_batch).type(dtype) / 255.0)

            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            act_batch = Variable(
                torch.Tensor(act_batch).type(torch.LongTensor))
            rew_batch = Variable(torch.from_numpy(rew_batch))

            done_mask = Variable(
                torch.Tensor([1. if val == 0 else 0. for val in done_mask]))
            if USE_CUDA:
                done_mask = done_mask.cuda()
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()
                obs_batch = obs_batch.cuda()
                next_obs_batch = next_obs_batch.cuda()

            # 3.b: fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the corresponding error.
            # Note: don't forget to clip the error between [-1,1], multiply is by -1 (since pytorch minimizes) and
            #       maskout post terminal status Q-values (see ReplayBuffer code).

            # We choose Q based on action taken.
            current_Q_values = model(obs_batch).gather(
                1, act_batch.unsqueeze(1))  #[0, act_batch]
            # Obtain max Q' for the next state and form the target for the chosen action using the Bellman equation.
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
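            # done_mask is 1 for non-terminal transitions and 0 for terminal ones, so the bootstrap term vanishes at episode ends.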
            next_Q_values = torch.mul(done_mask, next_max_q)

            target_Q_values = rew_batch + (gamma * next_Q_values)
            if USE_CUDA:
                target_Q_values = target_Q_values.cuda()
            d_error = target_Q_values.unsqueeze(1) - current_Q_values
            d_error = d_error.clamp(-1, 1) * -1
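            # d_error equals dLoss/dQ for the squared Bellman error, clipped to [-1, 1]; feeding it to backward() below applies the clipped gradient.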

            # 3.c: train the model using the Bellman error you calculated previously.
            # PyTorch will differentiate this error for you; to backpropagate it use the following API:
            #       current.backward(d_error.data)
            # where "current" is the variable holding the current Q-values and d_error is the clipped
            # Bellman error (already shaped [batch, 1] here).
            # Note: don't forget to call optimizer.zero_grad() before the backward call and
            #       optimizer.step() after the backward call.

            optimizer.zero_grad()
            current_Q_values.backward(d_error.data)
            optimizer.step()
            num_param_updates += 1

            # 3.d: periodically update the target network by loading the current Q network weights into the
            #      target_Q network. see state_dict() and load_state_dict() methods.
            #      you should update every target_update_freq steps, and you may find the
            #      variable num_param_updates useful for this (it was initialized to 0)
            #####
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(model.state_dict())


            #####

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
            if hasattr(exploration, 'add_reward'):
                exploration.add_reward(episode_rewards)
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % 'statistics.pkl')
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example:
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        # Declare variables
        self.exp_id = str(uuid.uuid4()).replace('-', '_')
        self.args = args
        self.env = env
        self.eps_threshold = None
        self.nA = env.action_space.n
        self.action_list = np.arange(self.nA)
        self.reward_list = deque(
            maxlen=args.window)  # np.zeros(args.window, np.float32)
        self.max_q_list = deque(
            maxlen=args.window)  # np.zeros(args.window, np.float32)
        self.loss_list = deque(
            maxlen=args.window)  # np.zeros(args.window, np.float32)
        self.probability_list = np.zeros(env.action_space.n, np.float32)
        self.cur_eps = self.args.eps
        self.t = 0
        self.ep_len = 0
        self.mode = None
        if self.args.use_pri_buffer:
            self.replay_buffer = NaivePrioritizedBuffer(
                capacity=self.args.capacity, args=self.args)
        else:
            self.replay_buffer = ReplayBuffer(capacity=self.args.capacity,
                                              args=self.args)
        self.position = 0

        self.args.save_dir += f'/{self.exp_id}/'
        os.system(f"mkdir -p {self.args.save_dir}")
        self.meta = MetaData(fp=open(
            os.path.join(self.args.save_dir, 'result.csv'), 'w'),
                             args=self.args)
        self.eps_delta = (self.args.eps -
                          self.args.eps_min) / self.args.eps_decay_window
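        # Anneal the importance-sampling exponent beta linearly from pri_beta_start to 1.0 over pri_beta_decay frames (used by the prioritized buffer).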
        self.beta_by_frame = lambda frame_idx: min(
            1.0, args.pri_beta_start + frame_idx *
            (1.0 - args.pri_beta_start) / args.pri_beta_decay)

        # Create Policy and Target Networks
        if self.args.use_dueling:
            print("Using dueling dqn . . .")
            self.policy_net = DuelingDQN(env, self.args).to(self.args.device)
            self.target_net = DuelingDQN(env, self.args).to(self.args.device)
        elif self.args.use_crnn:
            print("Using crnn dqn . . .")
            self.policy_net = CrnnDQN(env).to(self.args.device)
            self.target_net = CrnnDQN(env).to(self.args.device)
        else:
            self.policy_net = DQN(env, self.args).to(self.args.device)
            self.target_net = DQN(env, self.args).to(self.args.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.args.lr,
                                    eps=self.args.optimizer_eps)
        if self.args.lr_scheduler:
            print("Enabling LR Decay . . .")
            self.scheduler = optim.lr_scheduler.ExponentialLR(
                optimizer=self.optimizer, gamma=self.args.lr_decay)
        self.cur_lr = self.optimizer.param_groups[0]['lr']

        # Compute Huber loss
        self.loss = F.smooth_l1_loss

        # todo: Support for Multiprocessing. Bug in pytorch - https://github.com/pytorch/examples/issues/370
        self.policy_net.share_memory()
        self.target_net.share_memory()

        # Set defaults for networks
        self.policy_net.train()
        self.target_net.eval()
        self.target_net.load_state_dict(self.policy_net.state_dict())

        if args.test_dqn:
            # you can load your model here
            ###########################
            # YOUR IMPLEMENTATION HERE #
            print('loading trained model')
            self.load_model()

        if args.use_pri_buffer:
            print('Using priority buffer . . .')
        if args.use_double_dqn:
            print('Using double dqn . . .')

        if args.use_bnorm:
            print("Using batch normalization . . .")

        print("Arguments: \n", json.dumps(vars(self.args), indent=2), '\n')

    def init_game_setting(self):
        pass

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                the last 4 preprocessed frames stacked, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from the trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        with torch.no_grad():
            if self.args.test_dqn:
                q, argq = self.policy_net(
                    Variable(
                        self.channel_first(observation))).data.cpu().max(1)
                return self.action_list[argq.item()]
            # Fill up probability list equal for all actions
            self.probability_list.fill(self.cur_eps / self.nA)
            # Fetch q from the model prediction
            q, argq = self.policy_net(Variable(
                self.channel_first(observation))).data.cpu().max(1)
            # Increase the probability for the selected best action
            self.probability_list[argq[0].item()] += 1 - self.cur_eps
            # Use random choice to decide between a random action / best action
            action = torch.tensor(
                [np.random.choice(self.action_list, p=self.probability_list)])

        ###########################
        return action.item(), q.item()

    def optimize_model(self):
        """
        Perform one optimization step on the policy network.
        :return: loss value (0 until the replay buffer reaches mem_init_size)
        """
        # Return if initial buffer is not filled.
        if len(self.replay_buffer.memory) < self.args.mem_init_size:
            return 0
        if self.args.use_pri_buffer:
            batch_state, batch_action, batch_next_state, batch_reward, batch_done, indices, weights = self.replay_buffer.sample(
                self.args.batch_size, beta=self.beta_by_frame(self.t))
        else:
            batch_state, batch_action, batch_next_state, batch_reward, batch_done = self.replay_buffer.sample(
                self.args.batch_size)
        batch_state = Variable(
            self.channel_first(
                torch.tensor(np.array(batch_state), dtype=torch.float32)))
        batch_action = Variable(
            torch.tensor(np.array(batch_action), dtype=torch.long))
        batch_next_state = Variable(
            self.channel_first(
                torch.tensor(np.array(batch_next_state), dtype=torch.float32)))
        batch_reward = Variable(
            torch.tensor(np.array(batch_reward), dtype=torch.float32))
        batch_done = Variable(
            torch.tensor(np.array(batch_done), dtype=torch.float32))
        policy_max_q = self.policy_net(batch_state).gather(
            1, batch_action.unsqueeze(1)).squeeze(1)
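        # Double DQN: pick the next action with the policy net, but evaluate it with the target net to reduce Q-value overestimation.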
        if self.args.use_double_dqn:
            policy_ns_max_q = self.policy_net(batch_next_state)
            next_q_value = self.target_net(batch_next_state).detach().gather(
                1,
                torch.max(policy_ns_max_q, 1)[1].unsqueeze(1)).squeeze(1)
            target_max_q = next_q_value * self.args.gamma * (1 - batch_done)
        else:
            target_max_q = self.target_net(batch_next_state).detach().max(
                1)[0].squeeze(0) * self.args.gamma * (1 - batch_done)
        # Compute the loss: importance-weighted squared TD error for the prioritized buffer, Huber (smooth L1) loss otherwise.
        if self.args.use_pri_buffer:
            loss = (policy_max_q -
                    (batch_reward + target_max_q.detach())).pow(2) * Variable(
                        torch.tensor(weights, dtype=torch.float32))
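            # The per-sample errors (plus a small constant) become the updated priorities pushed back into the buffer below.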
            prios = loss + 1e-5
            loss = loss.mean()
        else:
            loss = self.loss(policy_max_q, batch_reward + target_max_q)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()

        # Clip gradients between -1 and 1
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)

        if self.args.use_pri_buffer:
            self.replay_buffer.update_priorities(indices,
                                                 prios.data.cpu().numpy())

        self.optimizer.step()
        return loss.cpu().detach().numpy()

    def train(self):
        """
        Implement your training algorithm here
        """

        ###########################
        # YOUR IMPLEMENTATION HERE #
        def train_fn():
            self.t = 1
            self.mode = "Random"
            train_start = time.time()
            if not self.args.load_dir == '':
                self.load_model()
            for i_episode in range(1, self.args.max_episodes + 1):
                # Initialize the environment and state
                start_time = time.time()
                state = self.env.reset()
                self.reward_list.append(0)
                self.loss_list.append(0)
                self.max_q_list.append(0)
                self.ep_len = 0
                done = False

                # Save Model
                self.save_model(i_episode)
                # Collect garbage
                self.collect_garbage(i_episode)

                # Run the game
                while not done:
                    # Update the target network, copying all weights and biases in DQN
                    if self.t % self.args.target_update == 0:
                        print("Updating target network . . .")
                        self.target_net.load_state_dict(
                            self.policy_net.state_dict())
                    # Select and perform an action
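                    # Linearly anneal epsilon toward eps_min by eps_delta per environment step.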
                    self.cur_eps = max(self.args.eps_min,
                                       self.cur_eps - self.eps_delta)
                    if self.cur_eps == self.args.eps_min:
                        self.mode = 'Exploit'
                    else:
                        self.mode = "Explore"
                    action, q = self.make_action(state)
                    next_state, reward, done, _ = self.env.step(action)
                    self.reward_list[-1] += reward
                    self.max_q_list[-1] = max(self.max_q_list[-1], q)
                    # Store the transition in memory
                    self.replay_buffer.push(state, action, next_state, reward,
                                            done)
                    self.meta.update_step(self.t, self.cur_eps,
                                          self.reward_list[-1],
                                          self.max_q_list[-1],
                                          self.loss_list[-1], self.cur_lr)

                    # Increment step and Episode Length
                    self.t += 1
                    self.ep_len += 1

                    # Move to the next state
                    state = next_state

                    # Perform one step of the optimization (on the policy network)
                    if self.ep_len % self.args.learn_freq == 0:
                        loss = self.optimize_model()
                        self.loss_list[-1] += loss
                self.loss_list[-1] /= self.ep_len

                # Decay Step:
                if self.args.lr_scheduler:
                    self.cur_lr = self.scheduler.get_lr()[0]
                    if i_episode % self.args.lr_decay_step == 0 and self.cur_lr > self.args.lr_min:
                        self.scheduler.step(i_episode)

                # Update meta
                self.meta.update_episode(
                    i_episode, self.t,
                    time.time() - start_time,
                    time.time() - train_start, self.ep_len,
                    len(self.replay_buffer.memory),
                    self.cur_eps, self.reward_list[-1],
                    np.mean(self.reward_list), self.max_q_list[-1],
                    np.mean(self.max_q_list), self.loss_list[-1],
                    np.mean(self.loss_list), self.mode, self.cur_lr)

        import multiprocessing as mp
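        # Spawn several worker processes that each run train_fn; policy_net and target_net were moved to shared memory in __init__ (see the PyTorch issue referenced there for caveats).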
        processes = []
        for rank in range(4):
            p = mp.Process(target=train_fn)
            p.start()
            processes.append(p)
        for p in processes:
            p.join()