Example #1
File: q_net.py Project: chz100p/lis
class QNet:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6
    hist_size = 1  # original: 4

    def __init__(self, use_gpu, enable_controller, dim):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim

        print("Initializing Q-Network...")

        hidden_dim = 256
        self.model = FunctionSet(l4=F.Linear(self.dim * self.hist_size,
                                             hidden_dim,
                                             wscale=np.sqrt(2)),
                                 q_value=F.Linear(
                                     hidden_dim,
                                     self.num_of_actions,
                                     initialW=np.zeros(
                                         (self.num_of_actions, hidden_dim),
                                         dtype=np.float32)))
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025,
                                                  alpha=0.95,
                                                  momentum=0.95,
                                                  eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.d = [
            np.zeros((self.data_size, self.hist_size, self.dim),
                     dtype=np.uint8),
            np.zeros(self.data_size, dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.int8),
            np.zeros((self.data_size, self.hist_size, self.dim),
                     dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.bool)
        ]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(reward[i]) + self.gamma * max_q_dash[i]
            else:
                tmp_ = np.sign(reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) >
                                                                 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions),
                            dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time, state, action, reward, state_dash,
                         episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
        else:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
            self.d[3][data_index] = state_dash
        self.d[4][data_index] = episode_end_flag

    def experience_replay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time,
                                                 (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size,
                                                 (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.hist_size,
                                         self.dim),
                                  dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1),
                                  dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size,
                                              self.dim),
                                       dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1),
                                            dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.d[0][replay_index[i]],
                                         dtype=np.float32)
                a_replay[i] = self.d[1][replay_index[i]]
                r_replay[i] = self.d[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.d[3][replay_index[i]],
                                            dtype=np.float32)
                episode_end_replay[i] = self.d[4][replay_index[i]]

            if self.use_gpu >= 0:
                s_replay = cuda.to_gpu(s_replay)
                s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay,
                                   episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))  # scale inputs to [0.0, 1.0], as in q_func_target
        q = self.model.q_value(h4)
        return q

    def q_func_target(self, state):
        h4 = F.relu(self.model_target.l4(state / 255.0))
        q = self.model_target.q_value(h4)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print(" Random"),
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            print("#Greedy"),
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
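The td_clip arithmetic in forward() above amounts to an element-wise clip of the TD error to [-1, 1]; the + 1000.0 term only keeps the masked-out division away from zero. A minimal NumPy sketch of the same arithmetic, with made-up TD errors (illustrative only, not part of the original project):

import numpy as np

td = np.array([0.3, -0.7, 2.5, -4.0], dtype=np.float32)  # hypothetical TD errors

small = np.abs(td) <= 1                      # inside the clipping interval
td_tmp = td + 1000.0 * small                 # keeps the division below away from zero
td_clip = td * small + td / np.abs(td_tmp) * (~small)

print(td_clip)                               # [ 0.3 -0.7  1.  -1. ], i.e. np.clip(td, -1, 1)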
Example #2
class DN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 100  # reduced from 10**4. Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 1, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Breakout"

        print "Initializing DN..."
#	Initialization of Chainer 1.1.0 or older.
#        print "CUDA init"
#        cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
            l4=F.Linear(3136, 256, wscale=np.sqrt(2)),
            l5=F.Linear(3136, 256, wscale=np.sqrt(2)),
            l6=F.Linear(256, 1, initialW=np.zeros((1, 256), dtype=np.float32)),
            l7=F.Linear(256, self.num_of_actions, initialW=np.zeros((self.num_of_actions, 256),
                                               dtype=np.float32)),
            q_value=DN_out.DN_out(1, self.num_of_actions, self.num_of_actions, nobias=True)
        ).to_gpu()
        
        if args.resumemodel:
            # load saved model
            serializers.load_npz(args.resumemodel, self.model)
            print "load model from resume.model"
        

        self.model_target = copy.deepcopy(self.model)

        print "Initizlizing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        if args.resumeD1 and args.resumeD2:
            # load saved D1 and D2
            npz_tmp1 = np.load(args.resumeD1)
            print "finished loading half of D data"
            npz_tmp2 = np.load(args.resumeD2)
            self.D = [npz_tmp1['D0'],
                      npz_tmp1['D1'],
                      npz_tmp1['D2'],
                      npz_tmp2['D3'],
                      npz_tmp2['D4']]
            npz_tmp1.close()
            npz_tmp2.close()
            print "loaded stored all D data"
        else:
            self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                      np.zeros(self.data_size, dtype=np.uint8),
                      np.zeros((self.data_size, 1), dtype=np.int8),
                      np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                      np.zeros((self.data_size, 1), dtype=np.bool)]
            print "initialized D data"

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value
        # Generate Target Signals
        tmp2 = self.Q_func(s_dash)
        tmp2 = list(map(np.argmax, tmp2.data.get()))  # argmax_a Q(s',a)
        tmp = self.Q_func_target(s_dash)  # Q'(s',*)
        tmp = list(tmp.data.get())
        # select Q'(s', argmax_a Q(s',a))  (Double DQN target)
        res1 = []
        for i in range(num_of_batch):
            res1.append(tmp[i][tmp2[i]])

        #max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        max_Q_dash = np.asanyarray(res1, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)
        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_
        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        print 'now running Q_func'
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        h4 = F.relu(self.model.l4(h3))  # state-value stream
        h5 = F.relu(self.model.l5(h3))  # advantage stream
        h6 = self.model.l6(h4)  # state value V(s)
        h7 = self.model.l7(h5)  # advantages A(s, a)
        Q = self.model.q_value(h6, h7)  # Q value
        return Q

    def Q_func_target(self, state):
        print 'now running Q_func_target'
        h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model_target.l2(h1))
        h3 = F.relu(self.model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3))  # state-value stream
        h5 = F.relu(self.model_target.l5(h3))  # advantage stream
        h6 = self.model_target.l6(h4)  # state value V(s)
        h7 = self.model_target.l7(h5)  # advantages A(s, a)
        Q = self.model_target.q_value(h6, h7)  # Q value
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
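Example #2 builds its target differently from the plain DQN examples: the next-state action is chosen with the online network (argmax_a Q(s',a)) but evaluated with the target network, i.e. a Double DQN-style target. A small NumPy sketch of that indexing, with made-up Q-values (illustrative only):

import numpy as np

# Hypothetical Q-values for a batch of 3 next-states and 4 actions.
q_online = np.array([[0.1, 0.5, 0.2, 0.0],
                     [1.0, 0.3, 0.7, 0.9],
                     [0.2, 0.2, 0.6, 0.4]], dtype=np.float32)
q_target = np.array([[0.0, 0.4, 0.3, 0.1],
                     [0.8, 0.2, 0.9, 0.5],
                     [0.1, 0.3, 0.5, 0.2]], dtype=np.float32)

best = q_online.argmax(axis=1)                   # argmax_a Q(s', a) per row
double_q = q_target[np.arange(len(best)), best]  # Q'(s', argmax_a Q(s', a))
print(double_q)                                  # [0.4 0.8 0.5]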
Example #3
class QNet:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**3  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6
    hist_size = 1  # original: 4

    def __init__(self, use_gpu, enable_controller, dim):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim

        print("Initializing Q-Network...")

        hidden_dim = 256
        self.model = FunctionSet(
            l4=F.Linear(self.dim*self.hist_size, hidden_dim, wscale=np.sqrt(2)),
            q_value=F.Linear(hidden_dim, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, hidden_dim),
                                               dtype=np.float32))
        )
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.d = [np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
        else:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
            self.d[3][data_index] = state_dash
        self.d[4][data_index] = episode_end_flag

    def experience_replay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.d[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.d[1][replay_index[i]]
                r_replay[i] = self.d[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.d[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.d[4][replay_index[i]]

            if self.use_gpu >= 0:
                s_replay = cuda.to_gpu(s_replay)
                s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))
        q = self.model.q_value(h4)
        return q

    def q_func_target(self, state):
        h4 = F.relu(self.model_target.l4(state / 255.0))
        q = self.model_target.q_value(h4)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print(" Random"),
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            print("#Greedy"),
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
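The replay memory in these examples is a fixed-size ring buffer: stock_experience() writes at index time % data_size, and experience_replay() samples uniformly over however much of the buffer has been filled. A toy sketch of that indexing and sampling, assuming a tiny capacity purely for readability:

import numpy as np

data_size = 8      # toy capacity (the examples use 10**5)
replay_size = 4

# Ring-buffer write index, as in stock_experience(): it wraps once full.
for time in (3, 7, 8, 13):
    print("%d -> %d" % (time, time % data_size))   # 3, 7, 0, 5

# Uniform sampling, as in experience_replay(): only over the filled part
# during the first sweep of the buffer.
time = 13
high = time if time < data_size else data_size
replay_index = np.random.randint(0, high, (replay_size, 1))
print(replay_index.ravel())                        # e.g. [6 1 4 0]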
Example #4
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."
        print "CUDA init"
        cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 16, ksize=8, stride=4, wscale=np.sqrt(2)),
            l2=F.Convolution2D(16, 32, ksize=4, stride=2, wscale=np.sqrt(2)),
            l3=F.Linear(2592, 256),
            q_value=F.Linear(256,
                             self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 256),
                                               dtype=np.float32))).to_gpu()

        print "Initizlizing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.0002,
                                                  alpha=0.3,
                                                  momentum=0.2)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [
            np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
            np.zeros(self.data_size, dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.int8),
            np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.bool)
        ]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        max_Q_dash_ = self.Q_func(s_dash)
        tmp = list(map(np.max, max_Q_dash_.data.get()))
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            target[i, self.action_to_index(action[i])] = tmp_

        loss = F.mean_squared_error(Variable(cuda.to_gpu(target)), Q)
        return loss, Q

    def stockExperience(self, time, state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time,
                                                 (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size,
                                                 (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84),
                                  dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1),
                                  dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84),
                                       dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1),
                                            dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]],
                                         dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]],
                                            dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay,
                                   episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        Q = self.model.q_value(h3)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"

        return self.index_to_action(index_action), Q

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
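The action selection in e_greedy() is the standard epsilon-greedy rule over the Q-vector. A framework-free sketch of the same decision; the helper name and sample values below are made up for illustration:

import numpy as np

def epsilon_greedy(q_values, epsilon, rng=np.random):
    # With probability epsilon pick a random action index, otherwise the
    # greedy one, mirroring e_greedy() above without the Chainer parts.
    if rng.rand() < epsilon:
        return rng.randint(0, len(q_values))
    return int(np.argmax(q_values))

q = np.array([0.1, 0.9, 0.4], dtype=np.float32)
print(epsilon_greedy(q, epsilon=0.05))   # almost always 1 (greedy), occasionally random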
Example #5
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor

    def __init__(self, enable_controller=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : full 18-action set

        print "Initializing DQN..."
#	Initialization of Chainer 1.1.0 or older.
#        print "CUDA init"
#        cuda.init()

        print "Model Building"
        w = math.sqrt(2)  # MSRA scaling
        self.model = FunctionSet(
            conv1=F.Convolution2D(3,   64,  7, wscale=w, stride=2, pad=3),
            conv2_1_1=F.Convolution2D(64,   64,  1, wscale=w, stride=1),
            conv2_1_2=F.Convolution2D(64,   64,  3, wscale=w, stride=1, pad=1),
            conv2_1_3=F.Convolution2D(64,  256,  1, wscale=w, stride=1),
            conv2_1_ex=F.Convolution2D(64,  256,  1, wscale=w, stride=1),
            conv2_2_1=F.Convolution2D(256,   64,  1, wscale=w, stride=1),
            conv2_2_2=F.Convolution2D(64,   64,  3, wscale=w, stride=1, pad=1),
            conv2_2_3=F.Convolution2D(64,  256,  1, wscale=w, stride=1),
            conv2_3_1=F.Convolution2D(256,   64,  1, wscale=w, stride=1),
            conv2_3_2=F.Convolution2D(64,   64,  3, wscale=w, stride=1, pad=1),
            conv2_3_3=F.Convolution2D(64,  256,  1, wscale=w, stride=1),
            conv3_1_1=F.Convolution2D(256,  128,  1, wscale=w, stride=2),
            conv3_1_2=F.Convolution2D(128,  128,  3, wscale=w, stride=1, pad=1),
            conv3_1_3=F.Convolution2D(128,  512,  1, wscale=w, stride=1),
            conv3_1_ex=F.Convolution2D(256,  512,  1, wscale=w, stride=2),
            conv3_2_1=F.Convolution2D(512,  128,  1, wscale=w, stride=1),
            conv3_2_2=F.Convolution2D(128,  128,  3, wscale=w, stride=1, pad=1),
            conv3_2_3=F.Convolution2D(128,  512,  1, wscale=w, stride=1),
            conv3_3_1=F.Convolution2D(512,  128,  1, wscale=w, stride=1),
            conv3_3_2=F.Convolution2D(128,  128,  3, wscale=w, stride=1, pad=1),
            conv3_3_3=F.Convolution2D(128,  512,  1, wscale=w, stride=1),
            conv3_4_1=F.Convolution2D(512,  128,  1, wscale=w, stride=1),
            conv3_4_2=F.Convolution2D(128,  128,  3, wscale=w, stride=1, pad=1),
            conv3_4_3=F.Convolution2D(128,  512,  1, wscale=w, stride=1),
            conv3_5_1=F.Convolution2D(512,  128,  1, wscale=w, stride=1),
            conv3_5_2=F.Convolution2D(128,  128,  3, wscale=w, stride=1, pad=1),
            conv3_5_3=F.Convolution2D(128,  512,  1, wscale=w, stride=1),
            conv3_6_1=F.Convolution2D(512,  128,  1, wscale=w, stride=1),
            conv3_6_2=F.Convolution2D(128,  128,  3, wscale=w, stride=1, pad=1),
            conv3_6_3=F.Convolution2D(128,  512,  1, wscale=w, stride=1),
            conv3_7_1=F.Convolution2D(512,  128,  1, wscale=w, stride=1),
            conv3_7_2=F.Convolution2D(128,  128,  3, wscale=w, stride=1, pad=1),
            conv3_7_3=F.Convolution2D(128,  512,  1, wscale=w, stride=1),
            conv3_8_1=F.Convolution2D(512,  128,  1, wscale=w, stride=1),
            conv3_8_2=F.Convolution2D(128,  128,  3, wscale=w, stride=1, pad=1),
            conv3_8_3=F.Convolution2D(128,  512,  1, wscale=w, stride=1),
            conv4_1_1=F.Convolution2D(512,  256,  1, wscale=w, stride=2),
            conv4_1_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_1_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_1_ex=F.Convolution2D(512,  1024,  1, wscale=w, stride=2),
            conv4_2_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_2_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_2_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_3_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_3_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_3_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_4_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_4_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_4_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_5_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_5_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_5_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_6_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_6_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_6_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_7_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_7_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_7_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_8_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_8_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_8_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_9_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_9_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_9_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_10_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_10_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_10_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_11_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_11_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_11_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_12_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_12_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_12_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_13_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_13_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_13_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_14_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_14_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_14_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_15_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_15_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_15_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_16_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_16_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_16_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_17_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_17_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_17_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_18_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_18_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_18_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_19_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_19_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_19_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_20_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_20_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_20_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_21_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_21_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_21_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_22_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_22_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_22_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_23_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_23_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_23_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_24_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_24_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_24_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_25_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_25_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_25_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_26_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_26_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_26_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_27_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_27_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_27_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_28_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_28_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_28_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_29_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_29_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_29_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_30_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_30_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_30_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_31_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_31_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_31_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_32_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_32_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_32_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_33_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_33_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_33_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_34_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_34_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_34_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_35_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_35_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_35_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_36_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_36_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_36_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv5_1_1=F.Convolution2D(1024,  512,  1, wscale=w, stride=2),
            conv5_1_2=F.Convolution2D(512,  512,  3, wscale=w, stride=1, pad=1),
            conv5_1_3=F.Convolution2D(512,  2048,  1, wscale=w, stride=1),
            conv5_1_ex=F.Convolution2D(1024,  2048,  1, wscale=w, stride=2),
            conv5_2_1=F.Convolution2D(2048,  512,  1, wscale=w, stride=1),
            conv5_2_2=F.Convolution2D(512,  512,  3, wscale=w, stride=1, pad=1),
            conv5_2_3=F.Convolution2D(512,  2048,  1, wscale=w, stride=1),
            conv5_3_1=F.Convolution2D(2048,  512,  1, wscale=w, stride=1),
            conv5_3_2=F.Convolution2D(512,  512,  3, wscale=w, stride=1, pad=1),
            conv5_3_3=F.Convolution2D(512,  2048,  1, wscale=w, stride=1),
            q_value=F.Linear(2048, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 2048),
                                               dtype=np.float32))
        )

        self.model_target = copy.deepcopy(self.model)

        print "Initizlizing Optimizer"
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model.collect_parameters())

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((num_of_batch, self.num_of_actions), dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def Q_func(self, state):
        h = F.relu(self.model.conv1(state))
        h = F.max_pooling_2d(h, 3, stride=2)

        h_rem = self.model.conv2_1_ex(h)
        h = F.relu(self.model.conv2_1_1(h))
        h = F.relu(self.model.conv2_1_2(h))
        h = self.model.conv2_1_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv2_2_1(h))
        h = F.relu(self.model.conv2_2_2(h))
        h = self.model.conv2_2_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv2_3_1(h))
        h = F.relu(self.model.conv2_3_2(h))
        h = self.model.conv2_3_3(h)
        h = F.relu(h + h_rem)

        h_rem = self.model.conv3_1_ex(h)
        h = F.relu(self.model.conv3_1_1(h))
        h = F.relu(self.model.conv3_1_2(h))
        h = self.model.conv3_1_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv3_2_1(h))
        h = F.relu(self.model.conv3_2_2(h))
        h = self.model.conv3_2_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv3_3_1(h))
        h = F.relu(self.model.conv3_3_2(h))
        h = self.model.conv3_3_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv3_4_1(h))
        h = F.relu(self.model.conv3_4_2(h))
        h = self.model.conv3_4_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv3_5_1(h))
        h = F.relu(self.model.conv3_5_2(h))
        h = self.model.conv3_5_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv3_6_1(h))
        h = F.relu(self.model.conv3_6_2(h))
        h = self.model.conv3_6_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv3_7_1(h))
        h = F.relu(self.model.conv3_7_2(h))
        h = self.model.conv3_7_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv3_8_1(h))
        h = F.relu(self.model.conv3_8_2(h))
        h = self.model.conv3_8_3(h)
        h = F.relu(h + h_rem)

        h_rem = self.model.conv4_1_ex(h)
        h = F.relu(self.model.conv4_1_1(h))
        h = F.relu(self.model.conv4_1_2(h))
        h = self.model.conv4_1_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_2_1(h))
        h = F.relu(self.model.conv4_2_2(h))
        h = self.model.conv4_2_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_3_1(h))
        h = F.relu(self.model.conv4_3_2(h))
        h = self.model.conv4_3_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_4_1(h))
        h = F.relu(self.model.conv4_4_2(h))
        h = self.model.conv4_4_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_5_1(h))
        h = F.relu(self.model.conv4_5_2(h))
        h = self.model.conv4_5_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_6_1(h))
        h = F.relu(self.model.conv4_6_2(h))
        h = self.model.conv4_6_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_7_1(h))
        h = F.relu(self.model.conv4_7_2(h))
        h = self.model.conv4_7_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_8_1(h))
        h = F.relu(self.model.conv4_8_2(h))
        h = self.model.conv4_8_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_9_1(h))
        h = F.relu(self.model.conv4_9_2(h))
        h = self.model.conv4_9_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_10_1(h))
        h = F.relu(self.model.conv4_10_2(h))
        h = self.model.conv4_10_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_11_1(h))
        h = F.relu(self.model.conv4_11_2(h))
        h = self.model.conv4_11_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_12_1(h))
        h = F.relu(self.model.conv4_12_2(h))
        h = self.model.conv4_12_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_13_1(h))
        h = F.relu(self.model.conv4_13_2(h))
        h = self.model.conv4_13_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_14_1(h))
        h = F.relu(self.model.conv4_14_2(h))
        h = self.model.conv4_14_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_15_1(h))
        h = F.relu(self.model.conv4_15_2(h))
        h = self.model.conv4_15_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_16_1(h))
        h = F.relu(self.model.conv4_16_2(h))
        h = self.model.conv4_16_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_17_1(h))
        h = F.relu(self.model.conv4_17_2(h))
        h = self.model.conv4_17_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_18_1(h))
        h = F.relu(self.model.conv4_18_2(h))
        h = self.model.conv4_18_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_19_1(h))
        h = F.relu(self.model.conv4_19_2(h))
        h = self.model.conv4_19_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_20_1(h))
        h = F.relu(self.model.conv4_20_2(h))
        h = self.model.conv4_20_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_21_1(h))
        h = F.relu(self.model.conv4_21_2(h))
        h = self.model.conv4_21_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_22_1(h))
        h = F.relu(self.model.conv4_22_2(h))
        h = self.model.conv4_22_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_23_1(h))
        h = F.relu(self.model.conv4_23_2(h))
        h = self.model.conv4_23_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_24_1(h))
        h = F.relu(self.model.conv4_24_2(h))
        h = self.model.conv4_24_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_25_1(h))
        h = F.relu(self.model.conv4_25_2(h))
        h = self.model.conv4_25_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_26_1(h))
        h = F.relu(self.model.conv4_26_2(h))
        h = self.model.conv4_26_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_27_1(h))
        h = F.relu(self.model.conv4_27_2(h))
        h = self.model.conv4_27_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_28_1(h))
        h = F.relu(self.model.conv4_28_2(h))
        h = self.model.conv4_28_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_29_1(h))
        h = F.relu(self.model.conv4_29_2(h))
        h = self.model.conv4_29_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_30_1(h))
        h = F.relu(self.model.conv4_30_2(h))
        h = self.model.conv4_30_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_31_1(h))
        h = F.relu(self.model.conv4_31_2(h))
        h = self.model.conv4_31_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_32_1(h))
        h = F.relu(self.model.conv4_32_2(h))
        h = self.model.conv4_32_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_33_1(h))
        h = F.relu(self.model.conv4_33_2(h))
        h = self.model.conv4_33_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_34_1(h))
        h = F.relu(self.model.conv4_34_2(h))
        h = self.model.conv4_34_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_35_1(h))
        h = F.relu(self.model.conv4_35_2(h))
        h = self.model.conv4_35_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_36_1(h))
        h = F.relu(self.model.conv4_36_2(h))
        h = self.model.conv4_36_3(h)
        h = F.relu(h + h_rem)

        h_rem = self.model.conv5_1_ex(h)
        h = F.relu(self.model.conv5_1_1(h))
        h = F.relu(self.model.conv5_1_2(h))
        h = self.model.conv5_1_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv5_2_1(h))
        h = F.relu(self.model.conv5_2_2(h))
        h = self.model.conv5_2_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv5_3_1(h))
        h = F.relu(self.model.conv5_3_2(h))
        h = self.model.conv5_3_3(h)
        h = F.relu(h + h_rem)

        h = F.average_pooling_2d(h, 7)
        Q = self.model.q_value(h)
        return Q

    def Q_func_target(self, state):
        h = F.relu(self.model_target.conv1(state))
        h = F.max_pooling_2d(h, 3, stride=2)

        h_rem = self.model_target.conv2_1_ex(h)
        h = F.relu(self.model_target.conv2_1_1(h))
        h = F.relu(self.model_target.conv2_1_2(h))
        h = self.model_target.conv2_1_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv2_2_1(h))
        h = F.relu(self.model_target.conv2_2_2(h))
        h = self.model_target.conv2_2_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv2_3_1(h))
        h = F.relu(self.model_target.conv2_3_2(h))
        h = self.model_target.conv2_3_3(h)
        h = F.relu(h + h_rem)

        h_rem = self.model_target.conv3_1_ex(h)
        h = F.relu(self.model_target.conv3_1_1(h))
        h = F.relu(self.model_target.conv3_1_2(h))
        h = self.model_target.conv3_1_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv3_2_1(h))
        h = F.relu(self.model_target.conv3_2_2(h))
        h = self.model_target.conv3_2_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv3_3_1(h))
        h = F.relu(self.model_target.conv3_3_2(h))
        h = self.model_target.conv3_3_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv3_4_1(h))
        h = F.relu(self.model_target.conv3_4_2(h))
        h = self.model_target.conv3_4_3(h)
        h = F.relu(h + h_rem)
        # Remaining conv3 residual blocks (identity shortcuts)
        for i in range(5, 9):
            h_rem = h
            h = F.relu(getattr(self.model_target, 'conv3_%d_1' % i)(h))
            h = F.relu(getattr(self.model_target, 'conv3_%d_2' % i)(h))
            h = getattr(self.model_target, 'conv3_%d_3' % i)(h)
            h = F.relu(h + h_rem)

        # conv4 stage: projection shortcut on the first block, then identity blocks
        h_rem = self.model_target.conv4_1_ex(h)
        h = F.relu(self.model_target.conv4_1_1(h))
        h = F.relu(self.model_target.conv4_1_2(h))
        h = self.model_target.conv4_1_3(h)
        h = F.relu(h + h_rem)
        for i in range(2, 37):
            h_rem = h
            h = F.relu(getattr(self.model_target, 'conv4_%d_1' % i)(h))
            h = F.relu(getattr(self.model_target, 'conv4_%d_2' % i)(h))
            h = getattr(self.model_target, 'conv4_%d_3' % i)(h)
            h = F.relu(h + h_rem)

        # conv5 stage: projection shortcut on the first block, then identity blocks
        h_rem = self.model_target.conv5_1_ex(h)
        h = F.relu(self.model_target.conv5_1_1(h))
        h = F.relu(self.model_target.conv5_1_2(h))
        h = self.model_target.conv5_1_3(h)
        h = F.relu(h + h_rem)
        for i in range(2, 4):
            h_rem = h
            h = F.relu(getattr(self.model_target, 'conv5_%d_1' % i)(h))
            h = F.relu(getattr(self.model_target, 'conv5_%d_2' % i)(h))
            h = getattr(self.model_target, 'conv5_%d_3' % i)(h)
            h = F.relu(h + h_rem)
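        # 7x7 average pooling (global pooling when the final feature map is 7x7),
        # followed by the linear Q-value head.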

        h = F.average_pooling_2d(h, 7)
        Q = self.model_target.q_value(h)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action)

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 100  #10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."
        #	Initialization of Chainer 1.1.0 or older.
        #        print "CUDA init"
        #        cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=convlstm_link.CONVLSTM(7056, 7056),
            l4=F.Linear(7056, 512, wscale=np.sqrt(2)),
            q_value=F.Linear(512,
                             self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 512),
                                               dtype=np.float32))).to_gpu()

        self.model_target = copy.deepcopy(self.model)

        print "Initizlizing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025,
                                                  alpha=0.95,
                                                  momentum=0.95,
                                                  eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [
            np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
            np.zeros(self.data_size, dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.int8),
            np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.bool)
        ]

    def reset_state(self):
        self.model.l1.reset_state()  # the recurrent CONVLSTM layer is l1 (this model has no l2)

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)
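        # Rewards are clipped to {-1, 0, +1} with np.sign (as in the original DQN
        # setup), and only the entry of the action actually taken is overwritten
        # with the Bellman target.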

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) >
                                                                 1)
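        # Note on the two lines above: td_clip equals td where |td| <= 1 and
        # sign(td) (i.e. td / |td|) where |td| > 1, so the TD error is clipped to
        # [-1, 1]. td_tmp only adds 1000 where |td| <= 1 so that the division
        # never hits zero on the branch that is masked out anyway. Regressing
        # td_clip against zero_val below simply minimises mean(td_clip^2).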

        zero_val = Variable(
            cuda.to_gpu(
                np.zeros((self.replay_size, self.num_of_actions),
                         dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time, state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size
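        # D is used as a circular buffer indexed by time modulo data_size; when the
        # episode ends, s' is left untouched because it is never read for that entry.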

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time,
                                                 (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size,
                                                 (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84),
                                  dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1),
                                  dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84),
                                       dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1),
                                            dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]],
                                         dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]],
                                            dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay,
                                   episode_end_replay)
            loss.backward()
            self.optimizer.update()

    '''
    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0 1.0]
        #h2 = F.relu(self.model.l2(h1))
        #h3 = F.relu(self.model.l3(h2))
        h4 = F.relu(self.model.l4(h1))
        Q = self.model.q_value(h4)
        return Q

    def Q_func_target(self, state):
        h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs in [0.0 1.0]
        #h2 = F.relu(self.model_target.l2(h1))
        #h3 = F.relu(self.model_target.l3(h2))
        h4 = F.relu(self.model.l4(h1))
        Q = self.model_target.q_value(h4)
        return Q
    '''

    def Q_func(self, state):
        self.model.l1.reset_state()
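        # The CONVLSTM state is cleared and the same stacked input is fed four
        # times, so the recurrent layer can build up its internal state before
        # the Q-values from the last pass are returned.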
        for i in range(4):
            h1 = F.relu(self.model.l1(state / 254.0))
            h4 = F.relu(self.model.l4(h1))
            Q = self.model.q_value(h4)
        return Q

    def Q_func_target(self, state):
        self.model_target.l1.reset_state()
        for i in range(4):
            h1 = F.relu(self.model_target.l1(state / 254.0))
            h4 = F.relu(self.model_target.l4(h1))
            Q = self.model_target.q_value(h4)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        #Q = self.Q_func(state)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 100  # 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."

        print "Model Building"
        self.CNN_model = FunctionSet(
            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
            )

        self.model = FunctionSet(
            l4=F.Linear(3136, 512, wscale=np.sqrt(2)),
            q_value=F.Linear(512, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 512),
                                               dtype=np.float32))
        ).to_gpu()
        
        d = 'elite/'
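        # Load pre-trained convolutional weights from the 'elite/' directory. Note
        # that only self.model (l4 and q_value) is passed to the optimizer below,
        # so the CNN feature extractor stays fixed during training.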
        
        self.CNN_model.l1.W.data = np.load(d+'l1_W.npy')#.astype(np.float32)
        self.CNN_model.l1.b.data = np.load(d+'l1_b.npy')#.astype(np.float32)
        self.CNN_model.l2.W.data = np.load(d+'l2_W.npy')#.astype(np.float32)
        self.CNN_model.l2.b.data = np.load(d+'l2_b.npy')#.astype(np.float32)
        self.CNN_model.l3.W.data = np.load(d+'l3_W.npy')#.astype(np.float32)
        self.CNN_model.l3.b.data = np.load(d+'l3_b.npy')#.astype(np.float32)

        self.CNN_model = self.CNN_model.to_gpu()
        self.CNN_model_target = copy.deepcopy(self.CNN_model)
        self.model_target = copy.deepcopy(self.model)


        
        print "Initizlizing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool),
                  np.zeros((self.data_size, 1), dtype=np.uint8)]
        


    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time,
                        state, action, lstm_reward, state_dash,
                        episode_end_flag, ale_reward):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = lstm_reward
            self.D[5][data_index] = ale_reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = lstm_reward
            self.D[3][data_index] = state_dash
            self.D[5][data_index] = ale_reward
        self.D[4][data_index] = episode_end_flag


    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.CNN_model.l1(state / 254.0))  # scale inputs in [0.0 1.0]
        h2 = F.relu(self.CNN_model.l2(h1))
        h3 = F.relu(self.CNN_model.l3(h2))
        h4 = F.relu(self.model.l4(h3))
        #test now
        #print h3.data.shape
        Q = self.model.q_value(h4)
        return Q 

    
    def Q_func_LSTM(self, state):
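        # Returns only the conv feature map, copied back to the CPU; unlike Q_func,
        # no Q-value head is applied (presumably consumed by the separate LSTM model
        # that produces lstm_reward).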
        h1 = F.relu(self.CNN_model.l1(state / 254.0))  # scale inputs in [0.0 1.0]
        h2 = F.relu(self.CNN_model.l2(h1))
        h3 = F.relu(self.CNN_model.l3(h2))
        return h3.data.get()
    
        
    def Q_func_target(self, state):
        h1 = F.relu(self.CNN_model_target.l1(state / 254.0))  # scale inputs in [0.0 1.0]
        h2 = F.relu(self.CNN_model_target.l2(h1))
        h3 = F.relu(self.CNN_model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3))
        Q = self.model_target.q_value(h4)
        return Q
    
    def LSTM_reward(self, lstm_out, state_next):
        lstm_reward = np.sign((self.lstm_loss - (lstm_out - state_next)**2))
        return lstm_reward

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
Example #8
0
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."
#	Initialization for Chainer 1.1.0 or older.
#        print "CUDA init"
#        cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 16, ksize=8, stride=4, wscale=np.sqrt(2)),
            l2=F.Convolution2D(16, 32, ksize=4, stride=2, wscale=np.sqrt(2)),
            l3=F.Linear(2592, 256),
            q_value=F.Linear(256, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 256),
                                               dtype=np.float32))
        ).to_gpu()

        print "Initizlizing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.0002, alpha=0.3, momentum=0.2)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
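        # Note: this example evaluates s' with the online network itself; unlike the
        # other variants, no separate target network is kept.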
        max_Q_dash_ = self.Q_func(s_dash)
        tmp = list(map(np.max, max_Q_dash_.data.get()))
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            target[i, self.action_to_index(action[i])] = tmp_

        loss = F.mean_squared_error(Variable(cuda.to_gpu(target)), Q)
        return loss, Q

    def stockExperience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        Q = self.model.q_value(h3)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"

        return self.index_to_action(index_action), Q

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 50000  # 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10 ** 4  # Target update frequency. original: 10^4
    data_size = 5 * (10 ** 5)  # Data size of history. original: 10^6

    field_num = 7
    field_size = 17

    def __init__(self, control_size=10, field_num=7, field_size=17):
        self.num_of_actions = control_size
        self.field_size = field_size

        # self.enable_controller = enable_controller  # Default setting : "Pong"


        print "Initializing DQN..."
        #	Initialization of Chainer 1.1.0 or older.
        # print "CUDA init"
        # cuda.init()

        self.field_num = field_num

        print "Model Building"
        self.model = FunctionSet(
                l1=F.Convolution2D(self.field_num * 4, 16, ksize=5, stride=1, nobias=False, wscale=np.sqrt(2)),
                l2=F.Convolution2D(16, 24, ksize=4, stride=1, nobias=False, wscale=np.sqrt(2)),
                l3=F.Linear(2400, 512, wscale=np.sqrt(2)),
                q_value=F.Linear(512, self.num_of_actions,
                                 initialW=np.zeros((self.num_of_actions, 512),
                                                   dtype=np.float32))
        ).to_gpu()

        self.model_target = copy.deepcopy(self.model)

        print "Initizlizing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        # self.optimizer.setup(self.model.collect_parameters())
        self.optimizer.setup(self.model)

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, self.field_num * 4, self.field_size, self.field_size), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.float32),
                  np.zeros((self.data_size, self.field_num * 4, self.field_size, self.field_size), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

                # action_index = self.action_to_index(action[i])
            target[i, action[i]] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        # td = Variable(target) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        # print "td_data " + str(td_clip.data)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)))
        # zero_val = Variable(np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.field_num * 4, self.field_size, self.field_size),
                                  dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.field_num * 4, self.field_size, self.field_size),
                                       dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)
            # no modification (unchanged)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        Q = self.model.q_value(h3)
        return Q

    def Q_func_target(self, state):
        h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs in [0.0 1.0]
        h2 = F.relu(self.model_target.l2(h1))
        h3 = F.relu(self.model_target.l3(h2))
        Q = self.model_target.q_value(h3)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return index_action, Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def save_model(self, model_name, opt_name):
        serializers.save_hdf5(model_name, self.model)
        serializers.save_hdf5(opt_name, self.optimizer)

    def read_model(self, model_name, opt_name):
        serializers.load_hdf5(model_name, self.model)
        serializers.load_hdf5(opt_name, self.optimizer)
        self.model_target = copy.deepcopy(self.model)
Example #10
0
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99                       # Discount factor
    initial_exploration = 5*10**4      # 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32                   # Replay (batch) size
    target_model_update_freq = 10**4   # Target update frequency. original: 10^4
    data_size = 10**6                  # Data size of history. original: 10^6
    num_of_actions = 2                 # Action dimension
    num_of_states = 12                 # State dimension
    
    def __init__(self):
                  
        print "Initializing DQN..."
#	Initialization of Chainer 1.1.0 or older.
#        print "CUDA init"
#        cuda.init()

        print "Model Building"
#        self.model = FunctionSet(
#            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
#            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
#            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
#            l4=F.Linear(3136, 512, wscale=np.sqrt(2)),
#            q_value=F.Linear(512, self.num_of_actions,
#                             initialW=np.zeros((self.num_of_actions, 512),
#                                               dtype=np.float32))
#        ).to_gpu()
        
#        self.critic = FunctionSet(
#            l1=F.Linear(self.num_of_actions+self.num_of_states,512),
#            l2=F.Linear(512,256),
#            l3=F.Linear(256,128),
#            q_value=F.Linear(128,1,initialW=np.zeros((1,128),dtype=np.float32))
#        ).to_gpu()
#        
#        self.actor = FunctionSet(
#            l1=F.Linear(self.num_of_states,512),
#            l2=F.Linear(512,256),
#            l3=F.Linear(256,128),
#            a_value=F.Linear(128,self.num_of_actions,initialW=np.zeros((1,128),dtype=np.float32))
#        ).to_gpu()
        
        self.critic = FunctionSet(
            l1=F.Linear(self.num_of_actions+self.num_of_states,1024),
            l2=F.Linear(1024,512),
            l3=F.Linear(512,256),
            l4=F.Linear(256,128),
            q_value=F.Linear(128,1,initialW=np.zeros((1,128),dtype=np.float32))
        ).to_gpu()
        
        self.actor = FunctionSet(
            l1=F.Linear(self.num_of_states,1024),
            l2=F.Linear(1024,512),
            l3=F.Linear(512,256),
            l4=F.Linear(256,128),
            a_value=F.Linear(128,self.num_of_actions,initialW=np.zeros((1,128),dtype=np.float32))
        ).to_gpu()
        
#        self.critic = FunctionSet(
#            l1=F.Linear(self.num_of_actions+self.num_of_states,1024,wscale=0.01*math.sqrt(self.num_of_actions+self.num_of_states)),
#            l2=F.Linear(1024,512,wscale=0.01*math.sqrt(1024)),
#            l3=F.Linear(512,256,wscale=0.01*math.sqrt(512)),
#            l4=F.Linear(256,128,wscale=0.01*math.sqrt(256)),
#            q_value=F.Linear(128,1,wscale=0.01*math.sqrt(128))
#        ).to_gpu()
#        
#        self.actor = FunctionSet(
#            l1=F.Linear(self.num_of_states,1024,wscale=0.01*math.sqrt(self.num_of_states)),
#            l2=F.Linear(1024,512,wscale=0.01*math.sqrt(1024)),
#            l3=F.Linear(512,256,wscale=0.01*math.sqrt(512)),
#            l4=F.Linear(256,128,wscale=0.01*math.sqrt(256)),
#            a_value=F.Linear(128,self.num_of_actions,wscale=0.01*math.sqrt(128))
#        ).to_gpu()
        
        self.critic_target = copy.deepcopy(self.critic) 
        self.actor_target = copy.deepcopy(self.actor)
        
        print "Initizlizing Optimizer"
        #self.optim_critic = optimizers.RMSpropGraves(lr=0.0001, alpha=0.95, momentum=0.95, eps=0.0001)
        #self.optim_actor = optimizers.RMSpropGraves(lr=0.0001, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optim_critic = optimizers.Adam(alpha=0.00001)
        self.optim_actor = optimizers.Adam(alpha=0.00001)
        self.optim_critic.setup(self.critic)
        self.optim_actor.setup(self.actor)
        
#        self.optim_critic.add_hook(chainer.optimizer.WeightDecay(0.00001))
#        self.optim_critic.add_hook(chainer.optimizer.GradientClipping(10))
#        self.optim_actor.add_hook(chainer.optimizer.WeightDecay(0.00001))
#        self.optim_actor.add_hook(chainer.optimizer.GradientClipping(10))

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, self.num_of_states), dtype=np.float32),
                  np.zeros((self.data_size, self.num_of_actions), dtype=np.float32),
                  np.zeros((self.data_size, 1), dtype=np.float32),
                  np.zeros((self.data_size, self.num_of_states), dtype=np.float32),
                  np.zeros((self.data_size, 1), dtype=np.bool)]
                  
#        with open('dqn_dump.json', 'a') as f:
#            json.dump(datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'), f)
#            f.write('\n')
#            json.dump({"alpha": 0.00001, "beta1": 0.7, "beta2": 0.999, "weight_decay": 0.00001}, f)
#            f.write('\n')
#            f.close()
        #self.x_PID = Hover_PID_Controller(12.1, 1.25)
        #self.y_PID = Hover_PID_Controller(12.1, 1.25)

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        
        s = Variable(cuda.to_gpu(np.concatenate([state, action],1)))
        s_dash = Variable(cuda.to_gpu(state_dash))

        Q = self.Q_func(s)  # Get Q-value
        
        # Generate Target through target nets
        action_dash_tmp = self.A_func_target(s_dash) 
        action_dash = np.asanyarray(action_dash_tmp.data.get(), dtype=np.float32)
        tmp_dash = Variable(cuda.to_gpu(np.concatenate([state_dash, action_dash],1)))
        Q_dash_tmp = self.Q_func_target(tmp_dash)
        Q_dash = np.asanyarray(Q_dash_tmp.data.get(), dtype=np.float32)       
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = Reward[i] + self.gamma * Q_dash[i]
            else:
                tmp_ = Reward[i]

            target[i] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, 1), dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        
        return loss, Q

    def updateActor(self, state):
        num_of_batch = state.shape[0]
        A_max = 1.0
        A_min = -1.0
        
        A = self.A_func(Variable(cuda.to_gpu(state)))
        tmp = Variable(cuda.to_gpu(np.concatenate([state, A.data.get()],1)))
        Q = self.Q_func(tmp)
        
        # Backward prop towards actor net
        #self.critic.zerograds()
        #self.actor.zerograds()
        Q.grad = cuda.to_gpu(np.ones((num_of_batch, 1), dtype=np.float32)*(-1.0))
#        Q.grad = Q.data*(-1.0)
        Q.backward()
        A.grad = tmp.grad[:,-self.num_of_actions:]
        print("sample_A.grad: "+str(A.grad[0]))
        for i in xrange(num_of_batch):
            for j in xrange(self.num_of_actions):
                if A.grad[i][j] < 0:
                    A.grad[i][j] *= (A_max-A.data[i][j])/(A_max-A_min)
                elif A.grad[i][j] > 0:
                    A.grad[i][j] *= (A.data[i][j]-A_min)/(A_max-A_min)
            
        A.backward()
        self.optim_actor.update()
        print("sample_A.grad: "+str(A.grad[0]))
        
    def stockExperience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))
                #reward_list = list(self.D[2])
                #replay_index = [i[0] for i in sorted(enumerate(reward_list),key=itemgetter(1),reverse=True)[:32]]
                #replay_index = np.asarray(replay_index).reshape(32,1)
                
            s_replay = np.ndarray(shape=(self.replay_size, self.num_of_states), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, self.num_of_actions), dtype=np.float32)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.num_of_states), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = np.asarray(self.D[1][replay_index[i]], dtype=np.float32)
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.asarray(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            #s_replay = cuda.to_gpu(s_replay)
            #s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based critic update
            self.optim_critic.zero_grads()
            loss, q = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optim_critic.update()
            
            # Update the actor
            self.optim_critic.zero_grads()
            self.optim_actor.zero_grads()
            self.updateActor(s_replay)
            
            self.soft_target_model_update()
            
            print "AVG_Q %f" %(np.average(q.data.get()))
            print("loss " + str(loss.data))
            
#            with open('dqn_dump.json', 'a') as f:
#                json.dump({"time": time, "avg_Q": float(np.average(q.data.get())), "loss": float(loss.data)}, f)
#                f.write('\n')
#                f.close()

    def Q_func(self, state):
#        h1 = F.relu(self.critic.l1(state))
#        h2 = F.relu(self.critic.l2(h1))
#        h3 = F.relu(self.critic.l3(h2))
#        Q = self.critic.q_value(h3)
        h1 = F.relu(self.critic.l1(state))
        h2 = F.relu(self.critic.l2(h1))
        h3 = F.relu(self.critic.l3(h2))
        h4 = F.relu(self.critic.l4(h3))
        Q = self.critic.q_value(h4)
        return Q

    def Q_func_target(self, state):
#        h1 = F.relu(self.critic_target.l1(state))
#        h2 = F.relu(self.critic_target.l2(h1))
#        h3 = F.relu(self.critic.l3(h2))
#        Q = self.critic_target.q_value(h3)   
        h1 = F.relu(self.critic_target.l1(state))
        h2 = F.relu(self.critic_target.l2(h1))
        h3 = F.relu(self.critic_target.l3(h2))
        h4 = F.relu(self.critic_target.l4(h3))
        Q = self.critic_target.q_value(h4)
        return Q
        
    def A_func(self, state):
#        h1 = F.relu(self.actor.l1(state))
#        h2 = F.relu(self.actor.l2(h1))
#        h3 = F.relu(self.actor.l3(h2))
#        A = self.actor.a_value(h3)
        h1 = F.relu(self.actor.l1(state))
        h2 = F.relu(self.actor.l2(h1))
        h3 = F.relu(self.actor.l3(h2))
        h4 = F.relu(self.actor.l4(h3))
        A = self.actor.a_value(h4)
        return A

    def A_func_target(self, state):
#        h1 = F.relu(self.actor_target.l1(state))
#        h2 = F.relu(self.actor_target.l2(h1))
#        h3 = F.relu(self.actor.l3(h2))
#        A = self.actor_target.a_value(h3)
        h1 = F.relu(self.actor_target.l1(state))
        h2 = F.relu(self.actor_target.l2(h1))
        h3 = F.relu(self.actor_target.l3(h2))
        h4 = F.relu(self.actor_target.l4(h3))
        A = self.actor_target.a_value(h4)
        return A

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        A = self.A_func(s)
        A = A.data
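        # Continuous-action analogue of epsilon-greedy: with probability epsilon a
        # uniform random action in [-1, 1]^num_of_actions is returned, otherwise
        # the deterministic actor output is used.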
        if np.random.rand() < epsilon:
            action = np.random.uniform(-1.,1.,(1,self.num_of_actions)).astype(np.float32)
#            action = np.zeros((1,self.num_of_actions),dtype=np.float32)
#            if state[0,0] > 0:
#                action[0,0] = np.random.uniform(0.0,0.5)
#            elif state[0,0] < 0:
#                action[0,0] = np.random.uniform(-0.5,0.0)                
#            if state[0,1] < 0:            
#                action[0,1] = np.random.uniform(0.0,0.5)
#            elif state[0,1] > 0:
#                action[0,1] = np.random.uniform(-0.5,0.0)
            #print("teststate"+str(state))
            #action[0,0] = -self.x_PID.getCorrection(state[0][0], 0.0)
            #action[0,1] = self.y_PID.getCorrection(state[0][1], 0.0)
            print "RANDOM"
        else:
            action = A.get()
            print "GREEDY"
            #print(str(action))
        return action

    def hard_target_model_update(self):
        self.critic_target = copy.deepcopy(self.critic)
        self.actor_target = copy.deepcopy(self.actor)

    def soft_target_model_update(self, tau=0.001):
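        # Polyak averaging: theta_target <- tau * theta + (1 - tau) * theta_target.
        # Note that only the weight matrices (W) are blended here; the bias vectors
        # are only synchronised by hard_target_model_update().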
        self.critic_target.l1.W.data = tau*self.critic.l1.W.data + (1-tau)*self.critic_target.l1.W.data
        self.critic_target.l2.W.data = tau*self.critic.l2.W.data + (1-tau)*self.critic_target.l2.W.data
        self.critic_target.l3.W.data = tau*self.critic.l3.W.data + (1-tau)*self.critic_target.l3.W.data
        self.critic_target.l4.W.data = tau*self.critic.l4.W.data + (1-tau)*self.critic_target.l4.W.data
        self.critic_target.q_value.W.data = tau*self.critic.q_value.W.data + (1-tau)*self.critic_target.q_value.W.data
        self.actor_target.l1.W.data = tau*self.actor.l1.W.data + (1-tau)*self.actor_target.l1.W.data
        self.actor_target.l2.W.data = tau*self.actor.l2.W.data + (1-tau)*self.actor_target.l2.W.data
        self.actor_target.l3.W.data = tau*self.actor.l3.W.data + (1-tau)*self.actor_target.l3.W.data
        self.actor_target.l4.W.data = tau*self.actor.l4.W.data + (1-tau)*self.actor_target.l4.W.data
        self.actor_target.a_value.W.data = tau*self.actor.a_value.W.data + (1-tau)*self.actor_target.a_value.W.data
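
A minimal driver sketch for the actor-critic class above, added for illustration only. The environment object `env`, its `reset()`/`step()` interface, and the episode bookkeeping are hypothetical placeholders, not part of the original project; states are assumed to be 12-dimensional float32 vectors and rewards scalars.

agent = DQN_class()
epsilon = 1.0
state = env.reset().astype(np.float32).reshape(1, agent.num_of_states)  # hypothetical env

for time in xrange(10**6):
    # act (e_greedy expects a GPU array because the actor lives on the GPU)
    action = agent.e_greedy(cuda.to_gpu(state), epsilon)

    # hypothetical environment step returning (next state, reward, done flag)
    state_dash, reward, episode_end = env.step(action)
    state_dash = state_dash.astype(np.float32).reshape(1, agent.num_of_states)

    agent.stockExperience(time, state, action, reward, state_dash, episode_end)
    agent.experienceReplay(time)

    epsilon = max(0.1, epsilon - 1e-6)  # anneal exploration over time
    if episode_end:
        state = env.reset().astype(np.float32).reshape(1, agent.num_of_states)
    else:
        state = state_dash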
Example #11
0
class QNet:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**3  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6
    hist_size = 1  # original: 4

    def __init__(self, use_gpu, enable_controller, dim, epsilon, epsilon_delta, min_eps):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim
        self.epsilon = epsilon
        self.epsilon_delta = epsilon_delta
        self.min_eps = min_eps
        self.time = 0

        app_logger.info("Initializing Q-Network...")

        hidden_dim = 256
        self.model = FunctionSet(
            l4=F.Linear(self.dim*self.hist_size, hidden_dim, wscale=np.sqrt(2)),
            q_value=F.Linear(hidden_dim, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, hidden_dim),
                                               dtype=np.float32))
        )
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.d = [np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))
        q = self.model.q_value(h4)
        return q

    def q_func_target(self, state):
        h4 = F.relu(self.model_target.l4(state / 255.0))
        q = self.model_target.q_value(h4)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            app_logger.info(" Random")
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            app_logger.info("#Greedy")
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)

    def start(self, feature):
        self.state = np.zeros((self.hist_size, self.dim), dtype=np.uint8)
        self.state[0] = feature

        state_ = np.asanyarray(self.state.reshape(1, self.hist_size, self.dim), dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Generate an Action e-greedy
        action, q_now = self.e_greedy(state_, self.epsilon)
        return_action = action

        return return_action

    def update_model(self, replayed_experience):
        if replayed_experience[0]:
            self.optimizer.zero_grads()
            loss, _ = self.forward(replayed_experience[1], replayed_experience[2],
                                        replayed_experience[3], replayed_experience[4], replayed_experience[5])
            loss.backward()
            self.optimizer.update()

        # Target model update
        if replayed_experience[0] and np.mod(self.time, self.target_model_update_freq) == 0:
            app_logger.info("Model Updated")
            self.target_model_update()

        self.time += 1
        app_logger.info("step: {}".format(self.time))

    def step(self, features):
        if self.hist_size == 4:
            self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], features], dtype=np.uint8)
        elif self.hist_size == 2:
            self.state = np.asanyarray([self.state[1], features], dtype=np.uint8)
        elif self.hist_size == 1:
            self.state = np.asanyarray([features], dtype=np.uint8)
        else:
            app_logger.error("self.DQN.hist_size err")

        state_ = np.asanyarray(self.state.reshape(1, self.hist_size, self.dim), dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Exploration decays along the time sequence
        if self.initial_exploration < self.time:
            self.epsilon -= self.epsilon_delta
            if self.epsilon < self.min_eps:
                self.epsilon = self.min_eps
            eps = self.epsilon
        else:  # Initial Exploration Phase
            app_logger.info("Initial Exploration : {}/{} steps".format(self.time, self.initial_exploration))
            eps = 1.0

        # Generate an Action by e-greedy action selection
        action, q_now = self.e_greedy(state_, eps)

        if self.use_gpu >= 0:
            q_max = np.max(q_now.get())
        else:
            q_max = np.max(q_now)

        return action, eps, q_max
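
For orientation, a minimal sketch of how the start()/step()/update_model() interface of the QNet above is typically driven. The names `first_feature`, `feature_stream`, and `replayed_experience` are hypothetical placeholders for whatever external module produces feature vectors of length `dim` and replay minibatches; they are not part of the original code.

q_net = QNet(use_gpu=-1, enable_controller=[0, 1, 2], dim=256,
             epsilon=1.0, epsilon_delta=1e-6, min_eps=0.1)

action = q_net.start(first_feature)            # seed the state history, act once
for feature in feature_stream:                 # hypothetical stream of features
    action, eps, q_max = q_net.step(feature)   # epsilon-greedy action + diagnostics
    # replayed_experience = (is_ready, s, a, r, s_dash, episode_end) minibatch
    q_net.update_model(replayed_experience)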
Example #12
0
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 100  #10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 30000  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4,
                               32,
                               ksize=8,
                               stride=4,
                               nobias=False,
                               wscale=np.sqrt(2)),
            l2=F.Convolution2D(32,
                               64,
                               ksize=4,
                               stride=2,
                               nobias=False,
                               wscale=np.sqrt(2)),
            l3=F.Convolution2D(64,
                               64,
                               ksize=3,
                               stride=1,
                               nobias=False,
                               wscale=np.sqrt(2)),
            l4=F.Linear(3136, 512, wscale=np.sqrt(2)),
            q_value=F.Linear(512,
                             self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 512),
                                               dtype=np.float32)))  #.to_gpu()
        self.model.l1.W = np.load('elite/l1_W.npy')
        self.model.l1.b = np.load('elite/l1_b.npy')
        self.model.l2.W = np.load('elite/l2_W.npy')
        self.model.l2.b = np.load('elite/l2_b.npy')
        self.model.l3.W = np.load('elite/l3_W.npy')
        self.model.l3.b = np.load('elite/l3_b.npy')
        self.model.l4.W = np.load('elite/l4_W.npy')
        self.model.l4.b = np.load('elite/l4_b.npy')
        self.model.q_value.W = np.load('elite/q_value_W.npy')
        self.model.q_value.b = np.load('elite/q_value_b.npy')

        self.model_target = copy.deepcopy(self.model)

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [
            np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
            np.zeros(self.data_size, dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.int8),
            np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.bool)
        ]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(target) - Q
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) >
                                                                 1)
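        # Value-wise this keeps td where |td| <= 1 and replaces it with
        # td / |td| (i.e. +/-1) where |td| > 1, clipping the error to [-1, 1].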

        zero_val = Variable(
            np.zeros((self.replay_size, self.num_of_actions),
                     dtype=np.float32))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time, state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        self.D[0][data_index] = state
        self.D[1][data_index] = action
        self.D[2][data_index] = reward
        if episode_end_flag is not True:
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time,
                                                 (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size,
                                                 (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84),
                                  dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1),
                                  dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84),
                                       dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1),
                                            dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]],
                                         dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]],
                                            dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            # Loss computation only: this example loads pretrained weights and
            # defines no optimizer, so there is no gradient step here.
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay,
                                   episode_end_replay)

    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs into [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        h4 = F.relu(self.model.l4(h3))
        Q = self.model.q_value(h4)
        return Q

    def Q_func_target(self, state):
        h1 = F.relu(self.model_target.l1(state /
                                         254.0))  # scale inputs into [0.0, 1.0]
        h2 = F.relu(self.model_target.l2(h1))
        h3 = F.relu(self.model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3))  # use the target network's l4 as well
        Q = self.model_target.q_value(h4)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            #index_action = np.argmax(Q.get())
            index_action = np.argmax(Q)
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
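
Because this example loads pretrained parameters from elite/*.npy and defines no optimizer, a typical use is pure evaluation. Below is a minimal sketch (not from the original project) that runs e_greedy on a random 4x84x84 frame stack; it assumes numpy and chainer are imported as in the example and that the weight files are present.

# Evaluation-only sketch for the pretrained DQN_class above (assumptions noted).
agent = DQN_class(enable_controller=[0, 3, 4])
state = np.asarray(np.random.randint(0, 256, (1, 4, 84, 84)), dtype=np.float32)
action, Q = agent.e_greedy(state, epsilon=0.05)    # mostly greedy
print action, Q.shape                              # chosen controller button, (1, num_of_actions)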
Example #13
0
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**3  # Initial exploration. original
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**2  # Target update frequency. original
    data_size = 10**5  # Data size of history. original
     
    # actions are 0 => do nothing, 1 => buy, -1 => sell
    def __init__(self, input_vector_length, enable_controller=[0, 1, 2]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"
        self.input_vector_length = input_vector_length

        print "Initializing DQN..."
#   Initialization for Chainer 1.1.0 or older.
#        print "CUDA init"
#        cuda.init()
        
        # inputs --> 5 * 14 (with 10 temporality) + 5 (of the last hour) + 5 (of the last 24 hours)
        print "Model Building"
        self.model = FunctionSet(
            l1=F.Linear(input_vector_length, 500),
            l2=F.Linear(500, 250),
            l3=F.Linear(250, 80),
            q_value=F.Linear(80, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 80),
                                               dtype=np.float32))
        ).to_gpu()

        print "Initizlizing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.0002, alpha=0.3, momentum=0.2)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, self.input_vector_length), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, self.input_vector_length), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
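        # Note: the next-state value is evaluated with the online Q_func below;
        # unlike the earlier examples, no separate target network is used here.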
        max_Q_dash_ = self.Q_func(s_dash)
        tmp = list(map(np.max, max_Q_dash_.data.get()))
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            target[i, self.action_to_index(action[i])] = tmp_

        loss = F.mean_squared_error(Variable(cuda.to_gpu(target)), Q)
        return loss, Q

    def stockExperience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        self.D[0][data_index] = state
        self.D[1][data_index] = action
        self.D[2][data_index] = reward
        if episode_end_flag is not True:
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.input_vector_length), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.input_vector_length), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        # TODO: might want to normalize the input; for now that is done outside this class
        h1 = F.relu(self.model.l1(state))  
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        Q = self.model.q_value(h3)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"

        return self.index_to_action(index_action), Q
    
    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)
    
    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
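
As with the previous examples, the class above only bundles the network and replay machinery; the driving loop has to be supplied by the caller. The sketch below is hypothetical: get_state and take_action are placeholder market-data functions, input_vector_length=80 and the epsilon schedule are assumptions, and cuda/np are the module-level imports the example already relies on.

# Hypothetical training loop for the trading DQN above (sketch only).
agent = DQN_class(input_vector_length=80)              # 80 is an assumed feature size
epsilon, min_eps = 1.0, 0.1
state = get_state(0)                                   # placeholder: current market features
for time in xrange(10 ** 5):
    state_gpu = cuda.to_gpu(np.asarray([state], dtype=np.float32))
    action, Q = agent.e_greedy(state_gpu, epsilon)
    state_dash, reward, episode_end = take_action(action)   # placeholder environment step
    agent.stockExperience(time, state, action, reward, state_dash, episode_end)
    agent.experienceReplay(time)                       # trains once initial_exploration has passed
    epsilon = max(min_eps, epsilon - 1e-5)             # simple linear decay (assumed schedule)
    state = state_dash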