class CNN_class:
    def __init__(self):
        self.model = FunctionSet(l1=F.Convolution2D(4,
                                                    32,
                                                    ksize=8,
                                                    stride=4,
                                                    nobias=False,
                                                    wscale=np.sqrt(2)),
                                 l2=F.Convolution2D(32,
                                                    64,
                                                    ksize=4,
                                                    stride=2,
                                                    nobias=False,
                                                    wscale=np.sqrt(2)),
                                 l3=F.Convolution2D(64,
                                                    64,
                                                    ksize=3,
                                                    stride=1,
                                                    nobias=False,
                                                    wscale=np.sqrt(2)))

        self.model.l1.W = np.load('elite/l1_W.npy')
        self.model.l1.b = np.load('elite/l1_b.npy')
        self.model.l2.W = np.load('elite/l2_W.npy')
        self.model.l2.b = np.load('elite/l2_b.npy')
        self.model.l3.W = np.load('elite/l3_W.npy')
        self.model.l3.b = np.load('elite/l3_b.npy')

    def CNN_forward(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))

        return h3
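A minimal usage sketch for the CNN_class above (assumptions: Chainer 1.x, the pretrained 'elite/*.npy' files present on disk, and the 4x84x84 stacked-frame input shape used by the DQN examples later in this listing):

import numpy as np
from chainer import Variable

cnn = CNN_class()
# dummy batch of one 4-frame stack; real frames would come from the emulator
state = Variable(np.zeros((1, 4, 84, 84), dtype=np.float32))
features = cnn.CNN_forward(state)
print(features.data.shape)  # (1, 64, 7, 7) for 84x84 inputs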
Example #2
class Model1:

    def __init__(self, model):
        if isinstance(model, tuple):
            input_dims, n_units, output_dims = model
            self.model = FunctionSet(l1=F.Linear(input_dims, n_units),
                                     l2=F.Linear(n_units, n_units),
                                     l3=F.Linear(n_units, output_dims))
        else:
            self.model = model

    def __call__(self):
        return self.model

    # Neural net architecture
    def forward(self, x_data, y_data, train=True):
        x = Variable(x_data)
        if y_data is not None: t = Variable(y_data)
        h1 = F.dropout(F.relu(self.model.l1(x)),  train=train)
        h2 = F.dropout(F.relu(self.model.l2(h1)), train=train)
        y  = self.model.l3(h2)
        if y_data is not None:
            # Multi-class classification, so use softmax cross-entropy
            # as the loss function to compute the error
            return F.softmax_cross_entropy(y, t), F.accuracy(y, t), y
        else:
            return y

    def evaluate(self, x_data):
        return self.forward(x_data, None, train=False)
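A hedged single-minibatch training sketch for Model1, following the Adam / zero_grads / backward / update pattern used elsewhere in this listing; the layer sizes and random data below are placeholders, not taken from the source:

import numpy as np
from chainer import optimizers

net = Model1((784, 100, 10))      # (input_dims, n_units, output_dims)
optimizer = optimizers.Adam()
optimizer.setup(net())            # net() returns the underlying FunctionSet

x_batch = np.random.rand(32, 784).astype(np.float32)
y_batch = np.random.randint(0, 10, 32).astype(np.int32)

optimizer.zero_grads()
loss, accuracy, _ = net.forward(x_batch, y_batch, train=True)
loss.backward()
optimizer.update()
print('loss={}, accuracy={}'.format(float(loss.data), float(accuracy.data)))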
class DeepLearning:
    def __init__(self, input_size, hidden_size, output_size):
        self.model = FunctionSet(l1=F.Linear(input_size, hidden_size),
                    l2=F.Linear(hidden_size, hidden_size),
                    l3=F.Linear(hidden_size, output_size))
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model.collect_parameters())



    def batch(self, X_train, y_train, batch_size, perm):
        train_size = X_train.shape[0]

        for i in xrange(0, train_size, batch_size):
            X_batch = X_train[perm[i: i+batch_size]]
            y_batch = y_train[perm[i: i+batch_size]]

            # Convert to Chainer Variables
            x = Variable(X_batch)
            t = Variable(y_batch)

            self.optimizer.zero_grads()
            y = self.forward(x)  # prediction

            loss = F.softmax_cross_entropy(y, t)
            loss.backward()

            self.optimizer.update()


    def forward(self, x, train=True):
        h1 = F.dropout(F.sigmoid(self.model.l1(x)),  train=train)
        h2 = F.dropout(F.sigmoid(self.model.l2(h1)), train=train)
        return self.model.l3(h2)


    def predicate(self, x_data):
        x = np.array([x_data], dtype=np.float32)
        x = Variable(x)
        y = self.forward(x, train=False)
        return np.argmax(y.data)


    def save(self, fpath):
        pickle.dump(self.model, open(fpath, 'wb'), -1)


    def load(self, fpath):
        self.model = pickle.load(open(fpath,'rb'))
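A hypothetical end-to-end run of DeepLearning with random data standing in for a real dataset; the pickle path is made up, and the class is assumed to have numpy, pickle and Chainer imported in its own module:

import numpy as np

dl = DeepLearning(input_size=784, hidden_size=100, output_size=10)
X_train = np.random.rand(256, 784).astype(np.float32)
y_train = np.random.randint(0, 10, 256).astype(np.int32)
perm = np.random.permutation(X_train.shape[0])

dl.batch(X_train, y_train, batch_size=32, perm=perm)
print(dl.predicate(X_train[0]))   # predicted class index for one sample

dl.save('deep_learning.pkl')      # hypothetical file name
dl.load('deep_learning.pkl')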
class NN3_Model(ModelBase):
    def __init__(self, input_dim=748, n_units=1000):
        super(NN3_Model, self).__init__()
        self.n_units = n_units
        self.model = FunctionSet(l1=F.Linear(input_dim, n_units),
                                 l2=F.Linear(n_units, n_units),
                                 l3=F.Linear(n_units, 2))

    def forward(self, x_data, y_data, train=True):
        u"""return loss, accuracy"""
        x, t = Variable(x_data), Variable(y_data)
        h1 = F.dropout(F.relu(self.model.l1(x)), train=train)
        h2 = F.dropout(F.relu(self.model.l2(h1)), train=train)
        y = self.model.l3(h2)
        # Multi-class classification, so use softmax cross-entropy as the
        # loss function to compute the error. At minimum, "loss" is required
        return {
            "loss": F.softmax_cross_entropy(y, t),
            "accuracy": F.accuracy(y, t)
        }
class CNN_class:
    def __init__(self):
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2))
        )

        self.model.l1.W = np.load('elite/l1_W.npy')
        self.model.l1.b = np.load('elite/l1_b.npy')
        self.model.l2.W = np.load('elite/l2_W.npy')
        self.model.l2.b = np.load('elite/l2_b.npy')
        self.model.l3.W = np.load('elite/l3_W.npy')
        self.model.l3.b = np.load('elite/l3_b.npy')


    def CNN_forward(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))

        return h3
Example #8
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."
#	Initialization for Chainer 1.1.0 or older.
#        print "CUDA init"
#        cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 16, ksize=8, stride=4, wscale=np.sqrt(2)),
            l2=F.Convolution2D(16, 32, ksize=4, stride=2, wscale=np.sqrt(2)),
            l3=F.Linear(2592, 256),
            q_value=F.Linear(256, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 256),
                                               dtype=np.float32))
        ).to_gpu()

        print "Initizlizing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.0002, alpha=0.3, momentum=0.2)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        max_Q_dash_ = self.Q_func(s_dash)
        tmp = list(map(np.max, max_Q_dash_.data.get()))
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            target[i, self.action_to_index(action[i])] = tmp_

        loss = F.mean_squared_error(Variable(cuda.to_gpu(target)), Q)
        return loss, Q

    def stockExperience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        Q = self.model.q_value(h3)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"

        return self.index_to_action(index_action), Q

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
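A hedged interaction-loop sketch for the DQN_class above. It requires a CUDA GPU because __init__ moves the model to the GPU; the environment step is stubbed out, and the 4x84x84 uint8 frame stack is a placeholder for real preprocessed frames:

import numpy as np
from chainer import cuda

dqn = DQN_class(enable_controller=[0, 3, 4])
epsilon = 1.0
state = np.zeros((1, 4, 84, 84), dtype=np.uint8)   # placeholder frame stack

for time in xrange(100):
    s_gpu = cuda.to_gpu(np.asarray(state, dtype=np.float32))
    action, q_values = dqn.e_greedy(s_gpu, epsilon)

    # the real emulator step would go here
    reward, state_dash, episode_end = 0.0, state, False

    dqn.stockExperience(time, state[0], action, reward, state_dash[0], episode_end)
    dqn.experienceReplay(time)   # no-op until time exceeds initial_exploration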
Example #9
class MLP(object):
	def __init__(
		self,
		data,
		target,
		n_inputs=784,
		n_hidden=784,
		n_outputs=10,
		gpu=-1
	):

		self.model = FunctionSet(
			l1=F.Linear(n_inputs, n_hidden),
			l2=F.Linear(n_hidden, n_hidden),
			l3=F.Linear(n_hidden, n_outputs)
		)

		if gpu >= 0:
			self.model.to_gpu()

		self.x_train, self.x_test = data
		self.y_train, self.y_test = target

		self.n_train = len(self.y_train)
		self.n_test = len(self.y_test)

		self.gpu = gpu
		self.optimizer = optimizers.Adam()
		self.optimizer.setup(self.model)

		self.train_accuracies = []
		self.train_losses = []
		self.test_accuracies = []
		self.test_losses = []

	@property
	def xp(self):
		return cuda.cupy if self.gpu >= 0 else numpy

	def forward(self, x_data, y_data, train=True):
		x, t = Variable(x_data), Variable(y_data)
		h1 = F.dropout(F.relu(self.model.l1(x)), train=train)
		h2 = F.dropout(F.relu(self.model.l2(h1)), train=train)
		y = self.model.l3(h2)
		return F.softmax_cross_entropy(y, t), F.accuracy(y, t)

	def train_and_test(self, n_epoch=20, batchsize=100):
		for epoch in xrange(1, n_epoch + 1):
			logging.info('epoch {}'.format(epoch))

			perm = numpy.random.permutation(self.n_train)
			sum_accuracy = 0
			sum_loss = 0
			for i in xrange(0, self.n_train, batchsize):
				x_batch = self.xp.asarray(self.x_train[perm[i:i+batchsize]])
				y_batch = self.xp.asarray(self.y_train[perm[i:i+batchsize]])

				real_batchsize = len(x_batch)

				self.optimizer.zero_grads()
				loss, acc = self.forward(x_batch, y_batch)
				loss.backward()
				self.optimizer.update()

				sum_loss += float(cuda.to_cpu(loss.data)) * real_batchsize
				sum_accuracy += float(cuda.to_cpu(acc.data)) * real_batchsize

			self.train_accuracies.append(sum_accuracy / self.n_train)
			self.train_losses.append(sum_loss / self.n_train)

			logging.info(
				'train mean loss={}, accuracy={}'.format(
					sum_loss / self.n_train,
					sum_accuracy / self.n_train
				)
			)

			# evaluation
			sum_accuracy = 0
			sum_loss = 0
			for i in xrange(0, self.n_test, batchsize):
				x_batch = self.xp.asarray(self.x_test[i:i+batchsize])
				y_batch = self.xp.asarray(self.y_test[i:i+batchsize])

				real_batchsize = len(x_batch)

				loss, acc = self.forward(x_batch, y_batch, train=False)

				sum_loss += float(cuda.to_cpu(loss.data)) * real_batchsize
				sum_accuracy += float(cuda.to_cpu(acc.data)) * real_batchsize

			self.test_accuracies.append(sum_accuracy / self.n_test)
			self.test_losses.append(sum_loss / self.n_test)
			logging.info(
				'test mean loss={}, accuracy={}'.format(
					sum_loss / self.n_test,
					sum_accuracy / self.n_test
				)
			)
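A hypothetical driver for the MLP class above, using random MNIST-shaped arrays in place of the real dataset and running on the CPU (gpu=-1):

import logging
import numpy

logging.basicConfig(level=logging.INFO)

x_train = numpy.random.rand(1000, 784).astype(numpy.float32)
x_test = numpy.random.rand(200, 784).astype(numpy.float32)
y_train = numpy.random.randint(0, 10, 1000).astype(numpy.int32)
y_test = numpy.random.randint(0, 10, 200).astype(numpy.int32)

mlp = MLP(data=(x_train, x_test), target=(y_train, y_test), gpu=-1)
mlp.train_and_test(n_epoch=1, batchsize=100)
print(mlp.test_accuracies)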
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 50000  # 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10 ** 4  # Target update frequency. original: 10^4
    data_size = 5 * (10 ** 5)  # Data size of history. original: 10^6

    field_num = 7
    field_size = 17

    def __init__(self, control_size=10, field_num=7, field_size=17):
        self.num_of_actions = control_size
        self.field_size = field_size

        # self.enable_controller = enable_controller  # Default setting : "Pong"


        print "Initializing DQN..."
        #	Initialization for Chainer 1.1.0 or older.
        # print "CUDA init"
        # cuda.init()

        self.field_num = field_num

        print "Model Building"
        self.model = FunctionSet(
                l1=F.Convolution2D(self.field_num * 4, 16, ksize=5, stride=1, nobias=False, wscale=np.sqrt(2)),
                l2=F.Convolution2D(16, 24, ksize=4, stride=1, nobias=False, wscale=np.sqrt(2)),
                l3=F.Linear(2400, 512, wscale=np.sqrt(2)),
                q_value=F.Linear(512, self.num_of_actions,
                                 initialW=np.zeros((self.num_of_actions, 512),
                                                   dtype=np.float32))
        ).to_gpu()

        self.model_target = copy.deepcopy(self.model)

        print "Initizlizing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        # self.optimizer.setup(self.model.collect_parameters())
        self.optimizer.setup(self.model)

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, self.field_num * 4, self.field_size, self.field_size), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.float32),
                  np.zeros((self.data_size, self.field_num * 4, self.field_size, self.field_size), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

                # action_index = self.action_to_index(action[i])
            target[i, action[i]] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        # td = Variable(target) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)
        # print "td_data " + str(td_clip.data)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)))
        # zero_val = Variable(np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.field_num * 4, self.field_size, self.field_size),
                                  dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.field_num * 4, self.field_size, self.field_size),
                                       dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)
            ## no changes

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        Q = self.model.q_value(h3)
        return Q

    def Q_func_target(self, state):
        h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs in [0.0 1.0]
        h2 = F.relu(self.model_target.l2(h1))
        h3 = F.relu(self.model_target.l3(h2))
        Q = self.model_target.q_value(h3)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return index_action, Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def save_model(self, model_name, opt_name):
        serializers.save_hdf5(model_name, self.model)
        serializers.save_hdf5(opt_name, self.optimizer)

    def read_model(self, model_name, opt_name):
        serializers.load_hdf5(model_name, self.model)
        serializers.load_hdf5(opt_name, self.optimizer)
        self.model_target = copy.deepcopy(self.model)
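The TD-error clipping in forward() leaves errors with |td| <= 1 unchanged and replaces larger errors by their sign, so very large errors cannot dominate the squared loss. A numpy-only check of that identity (independent of Chainer, values chosen arbitrarily):

import numpy as np

td = np.array([-2.5, -0.3, 0.0, 0.7, 4.0], dtype=np.float32)
td_tmp = td + 1000.0 * (np.abs(td) <= 1)          # avoid division by zero
td_clip = td * (np.abs(td) <= 1) + td / np.abs(td_tmp) * (np.abs(td) > 1)
print(td_clip)   # [-1.  -0.3  0.   0.7  1. ]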
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 100  # 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."

        print "Model Building"
        self.CNN_model = FunctionSet(
            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
            )

        self.model = FunctionSet(
            l4=F.Linear(3136, 512, wscale=np.sqrt(2)),
            q_value=F.Linear(512, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 512),
                                               dtype=np.float32))
        ).to_gpu()
        
        d = 'elite/'
        
        self.CNN_model.l1.W.data = np.load(d+'l1_W.npy')#.astype(np.float32)
        self.CNN_model.l1.b.data = np.load(d+'l1_b.npy')#.astype(np.float32)
        self.CNN_model.l2.W.data = np.load(d+'l2_W.npy')#.astype(np.float32)
        self.CNN_model.l2.b.data = np.load(d+'l2_b.npy')#.astype(np.float32)
        self.CNN_model.l3.W.data = np.load(d+'l3_W.npy')#.astype(np.float32)
        self.CNN_model.l3.b.data = np.load(d+'l3_b.npy')#.astype(np.float32)

        self.CNN_model = self.CNN_model.to_gpu()
        self.CNN_model_target = copy.deepcopy(self.CNN_model)
        self.model_target = copy.deepcopy(self.model)


        
        print "Initizlizing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool),
                  np.zeros((self.data_size, 1), dtype=np.uint8)]
        


    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time,
                        state, action, lstm_reward, state_dash,
                        episode_end_flag, ale_reward):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = lstm_reward
            self.D[5][data_index] = ale_reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = lstm_reward
            self.D[3][data_index] = state_dash
            self.D[5][data_index] = ale_reward
        self.D[4][data_index] = episode_end_flag


    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.CNN_model.l1(state / 254.0))  # scale inputs in [0.0 1.0]
        h2 = F.relu(self.CNN_model.l2(h1))
        h3 = F.relu(self.CNN_model.l3(h2))
        h4 = F.relu(self.model.l4(h3))
        # test now
        # print h3.data.shape
        Q = self.model.q_value(h4)
        return Q

    def Q_func_LSTM(self, state):
        h1 = F.relu(self.CNN_model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.CNN_model.l2(h1))
        h3 = F.relu(self.CNN_model.l3(h2))
        return h3.data.get()

    def Q_func_target(self, state):
        h1 = F.relu(self.CNN_model_target.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.CNN_model_target.l2(h1))
        h3 = F.relu(self.CNN_model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3))  # use the target network's l4 here as well
        Q = self.model_target.q_value(h4)
        return Q

    def LSTM_reward(self, lstm_out, state_next):
        lstm_reward = np.sign((self.lstm_loss - (lstm_out - state_next)**2))
        return lstm_reward

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
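A hedged sketch of extracting the pretrained convolutional features through Q_func_LSTM, which this class apparently hands to an external LSTM. It needs a GPU and the 'elite/*.npy' weight files; the input below is a dummy frame stack:

import numpy as np
from chainer import cuda, Variable

dqn = DQN_class(enable_controller=[0, 3, 4])
state = Variable(cuda.to_gpu(np.zeros((1, 4, 84, 84), dtype=np.float32)))
features = dqn.Q_func_LSTM(state)    # copied back to the CPU by .get()
print(features.shape)                # (1, 64, 7, 7), i.e. 3136 values per stack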
Example #12
print y.data

y.grad = np.ones((2, 2), dtype=np.float32)  # have to set init grad values

y.backward()
print f.gW, f.gb
y.backward()
print f.gW, f.gb

f.gW.fill(0), f.gb.fill(0)  # have to fill
y.backward()
print f.gW, f.gb

# function set
print "# function set"
model = FunctionSet(
    l1=F.Linear(4, 3),
    l2=F.Linear(3, 2),
)
print model
model.l3 = F.Linear(2, 2)

x = Variable(np.array([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=np.float32))
h1 = model.l1(x)
h2 = model.l2(h1)
h3 = model.l3(h2)

# have to set init grad values and fill grads, which will be done using optimizer
print model.parameters
print model.gradients
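As the final comment notes, the gradient bookkeeping done by hand above is normally delegated to an optimizer. A short sketch of that, reusing the same FunctionSet and the old collect_parameters() setup seen elsewhere in this listing (SGD is an arbitrary choice):

import numpy as np
from chainer import FunctionSet, Variable, optimizers
import chainer.functions as F

model = FunctionSet(l1=F.Linear(4, 3), l2=F.Linear(3, 2))
optimizer = optimizers.SGD(lr=0.01)
optimizer.setup(model.collect_parameters())

x = Variable(np.array([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=np.float32))
t = Variable(np.array([0, 1], dtype=np.int32))

optimizer.zero_grads()                 # fills every gradient with zeros
loss = F.softmax_cross_entropy(model.l2(model.l1(x)), t)
loss.backward()
optimizer.update()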
Example #13
class MLP(Base):
    def __init__(self,
                 data=None,
                 target=None,
                 n_inputs=784,
                 n_hidden=784,
                 n_outputs=10,
                 gpu=-1):
        self.excludes.append('xp')
        self.model = FunctionSet(l1=F.Linear(n_inputs, n_hidden),
                                 l2=F.Linear(n_hidden, n_hidden),
                                 l3=F.Linear(n_hidden, n_outputs))

        if gpu >= 0:
            self.model.to_gpu()
            self.xp = cuda.cupy
        else:
            self.xp = np

        if data is not None:
            self.x_train, self.x_test = data
        else:
            self.x_train, self.x_test = None, None

        if target is not None:
            self.y_train, self.y_test = target
            self.n_train = len(self.y_train)
            self.n_test = len(self.y_test)
        else:
            self.y_train, self.y_test = None, None
            self.n_train = 0
            self.n_test = 0

        self.gpu = gpu
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model)

    def forward(self, x_data, y_data, train=True):
        x, t = Variable(x_data), Variable(y_data)
        h1 = F.dropout(F.relu(self.model.l1(x)), train=train)
        h2 = F.dropout(F.relu(self.model.l2(h1)), train=train)
        y = self.model.l3(h2)
        return F.softmax_cross_entropy(y, t), F.accuracy(y, t)

    def train_and_test(self, n_epoch=20, batchsize=100):
        for epoch in xrange(1, n_epoch + 1):
            print 'epoch', epoch

            perm = np.random.permutation(self.n_train)
            sum_accuracy = 0
            sum_loss = 0
            for i in xrange(0, self.n_train, batchsize):
                x_batch = self.xp.asarray(self.x_train[perm[i:i + batchsize]])
                y_batch = self.xp.asarray(self.y_train[perm[i:i + batchsize]])

                real_batchsize = len(x_batch)

                self.optimizer.zero_grads()
                loss, acc = self.forward(x_batch, y_batch)
                loss.backward()
                self.optimizer.update()

                sum_loss += float(cuda.to_cpu(loss.data)) * real_batchsize
                sum_accuracy += float(cuda.to_cpu(acc.data)) * real_batchsize

            print 'train mean loss={}, accuracy={}'.format(
                sum_loss / self.n_train, sum_accuracy / self.n_train)

            # evaluation
            sum_accuracy = 0
            sum_loss = 0
            for i in xrange(0, self.n_test, batchsize):
                x_batch = self.xp.asarray(self.x_test[i:i + batchsize])
                y_batch = self.xp.asarray(self.y_test[i:i + batchsize])

                real_batchsize = len(x_batch)

                loss, acc = self.forward(x_batch, y_batch, train=False)

                sum_loss += float(cuda.to_cpu(loss.data)) * real_batchsize
                sum_accuracy += float(cuda.to_cpu(acc.data)) * real_batchsize

            print 'test mean loss={}, accuracy={}'.format(
                sum_loss / self.n_test, sum_accuracy / self.n_test)
class DQN_class:
	gamma = 0.99
	initial_exploration = 10**2
	replay_size = 32  # Replay (batch) size
	target_model_update_freq = 10**4  # Target update frequency. original: 10^4
	data_size = 10**2

	def __init__(self, enable_controller=[0, 1, 2, 3, 4, 5, 6, 7, 8]):
		#	"""	[ 0, 0],
		#		[ 0, 1],
		#		[ 0,-1],
		#		[ 1, 0],
		#		[ 1, 1],
		#		[ 1,-1],
		#		[-1, 0],
		#		[-1, 1],
		#		[-1,-1]]):"""
		self.num_of_actions = len(enable_controller)
		self.enable_controller = enable_controller

		print "Initializing DQN..."
		print "CUDA init"
		#cuda.init()

		print "Model Building"
		self.model = FunctionSet(
			l1 = F.Linear(INPUT_SIZE, 5000),	# input map[100, 100] + v[2] + w[1] + wp[2]
			l2 = F.Linear(5000, 1000),
			l3 = F.Linear(1000, 100),
			l4 = F.Linear(100, self.num_of_actions,
						initialW=np.zeros((self.num_of_actions, 100), dtype=np.float32))
		).to_gpu()

		self.model_target = copy.deepcopy(self.model)
		
		print "Initizlizing Optimizer"
		self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)	### 重要!!!!  RMSProp!!
		self.optimizer.setup(self.model.collect_parameters())

		# History Data :  D=[s, a, r, s_dash, end_episode_flag]
		self.D = [np.zeros((self.data_size, INPUT_SIZE), dtype=np.float32),
				  np.zeros(self.data_size, dtype=np.uint8),
				  np.zeros((self.data_size, 1), dtype=np.float32),
				  np.zeros((self.data_size, INPUT_SIZE), dtype=np.float32),
				  np.zeros((self.data_size, 1), dtype=np.bool)]
		#self.D = [np.zeros((self.data_size, INPUT_SIZE), dtype=np.uint8),
		#		  np.zeros(self.data_size, dtype=np.uint8),
		#		  np.zeros((self.data_size, 1), dtype=np.int8),
		#		  np.zeros((self.data_size, INPUT_SIZE), dtype=np.uint8),
		#		  np.zeros((self.data_size, 1), dtype=np.bool)]

	def forward(self, state, action, Reward, state_dash, episode_end):
		num_of_batch = state.shape[0]
		s = Variable(state)
		s_dash = Variable(state_dash)

		Q = self.Q_func(s)  # Get Q-value

		# Generate Target Signals
		tmp = self.Q_func_target(s_dash)  # Q(s',*)
		tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
		max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
		target = np.asanyarray(Q.data.get(), dtype=np.float32)

		for i in xrange(num_of_batch):
			if not episode_end[i][0]:
				tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
			else:
				tmp_ = np.sign(Reward[i])

			#action_index = self.action_to_index(action[i])
			#target[i, action_index] = tmp_
			target[i, action[i]] = tmp_

		# TD-error clipping
		td = Variable(cuda.to_gpu(target)) - Q  # TD error
		#print "td-error"
		print "np.max(td.data) : ",
		print np.max(td.data.get())
		# Unclear what this is for; td effectively becomes td_clip
		td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
		td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)
		#print "td_clip.data :",
		#print td_clip.data

		zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions))).astype(np.float32))
		#zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions))))
		loss = F.mean_squared_error(td_clip, zero_val)
		return loss, Q

	# Store the experience data
	def stockExperience(self, time,
						state, action, reward, state_dash,
						episode_end_flag):
		data_index = time % self.data_size

		if episode_end_flag is True:
			self.D[0][data_index] = state
			self.D[1][data_index] = action
			self.D[2][data_index] = reward
		else:
			self.D[0][data_index] = state
			self.D[1][data_index] = action
			self.D[2][data_index] = reward
			self.D[3][data_index] = state_dash
		self.D[4][data_index] = episode_end_flag

	# Mini-batch learning
	def experienceReplay(self, time):
		if self.initial_exploration < time:
			# Pick up replay_size number of samples from the Data
			if time < self.data_size:  # during the first sweep of the History Data
				replay_index = np.random.randint(0, time, (self.replay_size, 1))
			else:
				replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

			#s_replay = np.ndarray(shape=(self.replay_size, 100, 100), dtype=np.float32)
			s_replay = np.ndarray(shape=(self.replay_size, INPUT_SIZE), dtype=np.float32)
			a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
			r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
			s_dash_replay = np.ndarray(shape=(self.replay_size, INPUT_SIZE), dtype=np.float32)
			episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
			for i in xrange(self.replay_size):
				s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
				a_replay[i] = self.D[1][replay_index[i]]
				r_replay[i] = self.D[2][replay_index[i]]
				s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
				episode_end_replay[i] = self.D[4][replay_index[i]]

			s_replay = cuda.to_gpu(s_replay)
			s_dash_replay = cuda.to_gpu(s_dash_replay)

			# Gradient-based update
			self.optimizer.zero_grads()
			loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
			loss.backward()  # backpropagation
			self.optimizer.update()  # learning: update the network

	def Q_func(self, state):
		h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs into [0.0 1.0]
		h2 = F.relu(self.model.l2(h1))
		h3 = F.relu(self.model.l3(h2))
		Q = self.model.l4(h3)
		return Q

	def Q_func_target(self, state):
		h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs into [0.0 1.0]
		h2 = F.relu(self.model_target.l2(h1))
		h3 = F.relu(self.model_target.l3(h2))
		Q = self.model_target.l4(h3)  # use the target network's output layer
		return Q

	def e_greedy(self, state, epsilon):
		s = Variable(state)
		Q = self.Q_func(s)
		Q = Q.data

		if np.random.rand() < epsilon:
			#index_action = np.random.randint(0, self.num_of_actions)
			action = np.random.randint(0, self.num_of_actions)
			print "RANDOM"
		else:
			#index_action = np.argmax(Q.get())
			action = np.argmax(Q.get())
			print "GREEDY"
		#return self.index_to_action(index_action), Q
		return action, Q

	def action_to_vec(self, action, vec):
		#	"""	[ 0, 0],
		#		[ 0, 1],
		#		[ 0,-1],
		#		[ 1, 0],
		#		[ 1, 1],
		#		[ 1,-1],
		#		[-1, 0],
		#		[-1, 1],
		#		[-1,-1]]):"""
		#vec = Twist()
		if action == 3 or action == 4 or action == 5:
			vec.linear.x += 0.1
		elif action == 6 or action == 7 or action == 8:
			vec.linear.x -= 0.1

		if action == 1 or action == 4 or action == 7:
			vec.angular.z += 0.1
		elif action == 2 or action == 5 or action == 8:
			vec.angular.z -= 0.1

		if vec.linear.x > 1:
			vec.linear.x = 1
		elif vec.linear.x < -1:
			vec.linear.x = -1

		if vec.angular.z > 1:
			vec.angular.z = 1
		elif vec.angular.z < -1:
			vec.angular.z = -1

		return vec
Example #15
class SDA(object):
	def __init__(
		self,
		rng,
		data,
		target,
		n_inputs=784,
		n_hidden=[784,784,784],
		n_outputs=10,
		corruption_levels=[0.1,0.2,0.3],
		gpu=-1):

		self.model = FunctionSet(
			l1=F.Linear(n_inputs, n_hidden[0]),
			l2=F.Linear(n_hidden[0], n_hidden[1]),
			l3=F.Linear(n_hidden[1], n_hidden[2]),
			l4=F.Linear(n_hidden[2], n_outputs)
		)

		if gpu >= 0:
			self.model.to_gpu()

		self.rng = rng
		self.gpu = gpu
		self.data = data
		self.target = target

		self.x_train, self.x_test = data
		self.y_train, self.y_test = target

		self.n_train = len(self.y_train)
		self.n_test = len(self.y_test)

		self.corruption_levels = corruption_levels
		self.n_inputs = n_inputs
		self.n_hidden = n_hidden
		self.n_outputs = n_outputs

		self.dae1 = None
		self.dae2 = None
		self.dae3 = None
		self.optimizer = None
		self.setup_optimizer()

		self.train_accuracies = []
		self.train_losses = []

		self.test_accuracies = []
		self.test_losses = []

	def setup_optimizer(self):
		self.optimizer = optimizers.AdaDelta()
		self.optimizer.setup(self.model)

	@property
	def xp(self):
		return cuda.cupy if self.gpu >= 0 else numpy

	def pre_train(self, n_epoch=20, batchsize=100):
		first_inputs = self.data

		# initialize first dAE
		self.dae1 = DA(self.rng,
					   data=first_inputs,
					   n_inputs=self.n_inputs,
					   n_hidden=self.n_hidden[0],
					   corruption_level=self.corruption_levels[0],
					   gpu=self.gpu)
		# train first dAE
		logging.info("--------First DA training has started!--------")
		self.dae1.train_and_test(n_epoch=n_epoch, batchsize=batchsize)
		self.dae1.to_cpu()
		# compute second inputs for the second dAE
		tmp1 = self.dae1.compute_hidden(first_inputs[0])
		tmp2 = self.dae1.compute_hidden(first_inputs[1])
		if self.gpu >= 0:
			self.dae1.to_gpu()
		second_inputs = [tmp1, tmp2]

		# initialize second dAE
		self.dae2 = DA(
			self.rng,
			data=second_inputs,
			n_inputs=self.n_hidden[0],
			n_hidden=self.n_hidden[1],
			corruption_level=self.corruption_levels[1],
			gpu=self.gpu
		)
		# train second dAE
		logging.info("--------Second DA training has started!--------")
		self.dae2.train_and_test(n_epoch=n_epoch, batchsize=batchsize)
		self.dae2.to_cpu()
		# compute third inputs for third dAE
		tmp1 = self.dae2.compute_hidden(second_inputs[0])
		tmp2 = self.dae2.compute_hidden(second_inputs[1])
		if self.gpu >= 0:
			self.dae2.to_gpu()
		third_inputs = [tmp1, tmp2]

		# initialize third dAE
		self.dae3 = DA(
			self.rng,
			data=third_inputs,
			n_inputs=self.n_hidden[1],
			n_hidden=self.n_hidden[2],
			corruption_level=self.corruption_levels[2],
			gpu=self.gpu
		)
		# train third dAE
		logging.info("--------Third DA training has started!--------")
		self.dae3.train_and_test(n_epoch=n_epoch, batchsize=batchsize)

		# update model parameters
		self.model.l1 = self.dae1.encoder()
		self.model.l2 = self.dae2.encoder()
		self.model.l3 = self.dae3.encoder()

		self.setup_optimizer()

	def forward(self, x_data, y_data, train=True):
		x, t = Variable(x_data), Variable(y_data)
		h1 = F.dropout(F.relu(self.model.l1(x)), train=train)
		h2 = F.dropout(F.relu(self.model.l2(h1)), train=train)
		h3 = F.dropout(F.relu(self.model.l3(h2)), train=train)
		y = self.model.l4(h3)
		return F.softmax_cross_entropy(y, t), F.accuracy(y, t)

	def fine_tune(self, n_epoch=20, batchsize=100):
		for epoch in xrange(1, n_epoch+1):
			logging.info('fine tuning epoch {}'.format(epoch))

			perm = self.rng.permutation(self.n_train)
			sum_accuracy = 0
			sum_loss = 0
			for i in xrange(0, self.n_train, batchsize):
				x_batch = self.xp.asarray(self.x_train[perm[i:i+batchsize]])
				y_batch = self.xp.asarray(self.y_train[perm[i:i+batchsize]])

				real_batchsize = len(x_batch)

				self.optimizer.zero_grads()
				loss, acc = self.forward(x_batch, y_batch)
				loss.backward()
				self.optimizer.update()

				sum_loss += float(cuda.to_cpu(loss.data)) * real_batchsize
				sum_accuracy += float(cuda.to_cpu(acc.data)) * real_batchsize

			logging.info(
				'fine tuning train mean loss={}, accuracy={}'.format(
					sum_loss / self.n_train,
					sum_accuracy / self.n_train
				)
			)
			self.train_accuracies.append(sum_accuracy / self.n_train)
			self.train_losses.append(sum_loss / self.n_train)

			# evaluation
			sum_accuracy = 0
			sum_loss = 0
			for i in xrange(0, self.n_test, batchsize):
				x_batch = self.xp.asarray(self.x_test[i:i+batchsize])
				y_batch = self.xp.asarray(self.y_test[i:i+batchsize])

				real_batchsize = len(x_batch)

				loss, acc = self.forward(x_batch, y_batch, train=False)

				sum_loss += float(cuda.to_cpu(loss.data)) * real_batchsize
				sum_accuracy += float(cuda.to_cpu(acc.data)) * real_batchsize

			logging.info(
				'fine tuning test mean loss={}, accuracy={}'.format(
					sum_loss / self.n_test,
					sum_accuracy / self.n_test
				)
			)
			self.test_accuracies.append(sum_accuracy / self.n_test)
			self.test_losses.append(sum_loss / self.n_test)

		return self.train_accuracies, self.test_accuracies
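A hypothetical driver for SDA. It depends on the DA (denoising autoencoder) class defined elsewhere in the original project, so this only sketches the intended call order with random stand-in data on the CPU:

import numpy

rng = numpy.random.RandomState(1)
x_train = numpy.random.rand(1000, 784).astype(numpy.float32)
x_test = numpy.random.rand(200, 784).astype(numpy.float32)
y_train = numpy.random.randint(0, 10, 1000).astype(numpy.int32)
y_test = numpy.random.randint(0, 10, 200).astype(numpy.int32)

sda = SDA(rng, data=(x_train, x_test), target=(y_train, y_test), gpu=-1)
sda.pre_train(n_epoch=1, batchsize=100)                 # layer-wise dAE pretraining
train_acc, test_acc = sda.fine_tune(n_epoch=1, batchsize=100)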
Example #16
class MLP(Base):
    def __init__(self, data=None, target=None, n_inputs=784, n_hidden=784, n_outputs=10, gpu=-1):
        self.excludes.append('xp')
        self.model = FunctionSet(l1=F.Linear(n_inputs, n_hidden),
                                 l2=F.Linear(n_hidden, n_hidden),
                                 l3=F.Linear(n_hidden, n_outputs))

        if gpu >= 0:
            self.model.to_gpu()
            self.xp = cuda.cupy
        else:
            self.xp = np

        if data is not None:
            self.x_train, self.x_test = data
        else:
            self.x_train, self.x_test = None, None

        if target is not None:
            self.y_train, self.y_test = target
            self.n_train = len(self.y_train)
            self.n_test = len(self.y_test)
        else:
            self.y_train, self.y_test = None, None
            self.n_train = 0
            self.n_test = 0

        self.gpu = gpu
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model)

    def forward(self, x_data, y_data, train=True):
        x, t = Variable(x_data), Variable(y_data)
        h1 = F.dropout(F.relu(self.model.l1(x)), train=train)
        h2 = F.dropout(F.relu(self.model.l2(h1)), train=train)
        y = self.model.l3(h2)
        return F.softmax_cross_entropy(y, t), F.accuracy(y, t)

    def train_and_test(self, n_epoch=20, batchsize=100):
        for epoch in xrange(1, n_epoch+1):
            print 'epoch', epoch

            perm = np.random.permutation(self.n_train)
            sum_accuracy = 0
            sum_loss = 0
            for i in xrange(0, self.n_train, batchsize):
                x_batch = self.xp.asarray(self.x_train[perm[i:i+batchsize]])
                y_batch = self.xp.asarray(self.y_train[perm[i:i+batchsize]])

                real_batchsize = len(x_batch)

                self.optimizer.zero_grads()
                loss, acc = self.forward(x_batch, y_batch)
                loss.backward()
                self.optimizer.update()

                sum_loss += float(cuda.to_cpu(loss.data)) * real_batchsize
                sum_accuracy += float(cuda.to_cpu(acc.data)) * real_batchsize

            print 'train mean loss={}, accuracy={}'.format(sum_loss/self.n_train, sum_accuracy/self.n_train)

            # evaluation
            sum_accuracy = 0
            sum_loss = 0
            for i in xrange(0, self.n_test, batchsize):
                x_batch = self.xp.asarray(self.x_test[i:i+batchsize])
                y_batch = self.xp.asarray(self.y_test[i:i+batchsize])

                real_batchsize = len(x_batch)

                loss, acc = self.forward(x_batch, y_batch, train=False)

                sum_loss += float(cuda.to_cpu(loss.data)) * real_batchsize
                sum_accuracy += float(cuda.to_cpu(acc.data)) * real_batchsize

            print 'test mean loss={}, accuracy={}'.format(sum_loss/self.n_test, sum_accuracy/self.n_test)
    plt.gray()
    plt.tick_params(labelbottom="off")
    plt.tick_params(labelleft="off")
plt.figure(figsize=(15,15))


cnt = 0
# for idx in np.random.permutation(N)[:1000]:
for idx in range(N):
    if mod(idx , 1000):
        pass
        
    xxx = x_train[idx].astype(np.float32)
    h1 = F.dropout(F.relu(model.l1(Variable(xxx.reshape(1,784)))), train=False)
    h2 = F.dropout(F.relu(model.l2(h1)), train=False)
    y = model.l3(h2)
    
    # Show only the misclassified samples
    if y_train[idx] != np.argmax(y.data):
        cnt += 1
        draw_digit3(x_train[idx], cnt, y_train[idx], np.argmax(y.data))
plt.show()
print("Fin")


# ## Visualizing the first-layer parameter W

# In[140]:

def draw_digit2(data, n, i):
    size = 28
Example #18
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 100  # 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."

        print "Model Building"
        self.CNN_model = FunctionSet(
            l1=F.Convolution2D(4,
                               32,
                               ksize=8,
                               stride=4,
                               nobias=False,
                               wscale=np.sqrt(2)),
            l2=F.Convolution2D(32,
                               64,
                               ksize=4,
                               stride=2,
                               nobias=False,
                               wscale=np.sqrt(2)),
            l3=F.Convolution2D(64,
                               64,
                               ksize=3,
                               stride=1,
                               nobias=False,
                               wscale=np.sqrt(2)),
        )

        self.model = FunctionSet(
            l4=F.Linear(3136, 512, wscale=np.sqrt(2)),
            q_value=F.Linear(512,
                             self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 512),
                                               dtype=np.float32))).to_gpu()

        d = 'elite/'

        self.CNN_model.l1.W.data = np.load(d +
                                           'l1_W.npy')  #.astype(np.float32)
        self.CNN_model.l1.b.data = np.load(d +
                                           'l1_b.npy')  #.astype(np.float32)
        self.CNN_model.l2.W.data = np.load(d +
                                           'l2_W.npy')  #.astype(np.float32)
        self.CNN_model.l2.b.data = np.load(d +
                                           'l2_b.npy')  #.astype(np.float32)
        self.CNN_model.l3.W.data = np.load(d +
                                           'l3_W.npy')  #.astype(np.float32)
        self.CNN_model.l3.b.data = np.load(d +
                                           'l3_b.npy')  #.astype(np.float32)

        self.CNN_model = self.CNN_model.to_gpu()
        self.CNN_model_target = copy.deepcopy(self.CNN_model)
        self.model_target = copy.deepcopy(self.model)

        print "Initizlizing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025,
                                                  alpha=0.95,
                                                  momentum=0.95,
                                                  eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [
            np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
            np.zeros(self.data_size, dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.int8),
            np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.bool),
            np.zeros((self.data_size, 1), dtype=np.uint8)
        ]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) >
                                                                 1)

        zero_val = Variable(
            cuda.to_gpu(
                np.zeros((self.replay_size, self.num_of_actions),
                         dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time, state, action, lstm_reward, state_dash,
                        episode_end_flag, ale_reward):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = lstm_reward
            self.D[5][data_index] = ale_reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = lstm_reward
            self.D[3][data_index] = state_dash
            self.D[5][data_index] = ale_reward
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time,
                                                 (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size,
                                                 (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84),
                                  dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1),
                                  dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84),
                                       dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1),
                                            dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]],
                                         dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]],
                                            dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay,
                                   episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.CNN_model.l1(state /
                                      254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.CNN_model.l2(h1))
        h3 = F.relu(self.CNN_model.l3(h2))
        h4 = F.relu(self.model.l4(h3))
        #test now
        #print h3.data.shape
        Q = self.model.q_value(h4)
        return Q

    def Q_func_LSTM(self, state):
        h1 = F.relu(self.CNN_model.l1(state /
                                      254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.CNN_model.l2(h1))
        h3 = F.relu(self.CNN_model.l3(h2))
        return h3.data.get()

    def Q_func_target(self, state):
        h1 = F.relu(self.CNN_model_target.l1(
            state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.CNN_model_target.l2(h1))
        h3 = F.relu(self.CNN_model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3))
        Q = self.model_target.q_value(h4)
        return Q

    def LSTM_reward(self, lstm_out, state_next):
        lstm_reward = np.sign((self.lstm_loss - (lstm_out - state_next)**2))
        return lstm_reward

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
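
The forward() method above clips the TD error to [-1, 1] before taking a mean-squared error against zeros, which effectively bounds the gradient of the loss. A stand-alone numpy sketch of the same arithmetic (plain arrays instead of Chainer Variables), just to make the clipping step explicit:

import numpy as np

# Numpy-only sketch of the TD-error clipping used in forward() above.
td = np.array([0.3, -0.7, 2.5, -4.0], dtype=np.float32)

td_tmp = td + 1000.0 * (np.abs(td) <= 1)   # keep the divisor non-zero for small errors
td_clip = td * (np.abs(td) <= 1) + td / np.abs(td_tmp) * (np.abs(td) > 1)

print td_clip   # -> approximately [ 0.3 -0.7  1.  -1. ]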
Example #19
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**3  # Initial exploration. original
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**2  # Target update frequency. original
    data_size = 10**5  # Data size of history. original
     
    # actions are 0 => do nothing, 1 => buy, 2 => sell
    def __init__(self, input_vector_length, enable_controller=[0, 1, 2]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # default action set: [0, 1, 2]
        self.input_vector_length = input_vector_length

        print "Initializing DQN..."
#   Initialization for Chainer 1.1.0 or older.
#        print "CUDA init"
#        cuda.init()
        
        #inputs --> 5 * 14 (with 10 temporality) + 5 (of last one hour) + 5 (of last 24 hour)
        print "Model Building"
        self.model = FunctionSet(
            l1=F.Linear(input_vector_length, 500),
            l2=F.Linear(500, 250),
            l3=F.Linear(250, 80),
            q_value=F.Linear(80, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 80),
                                               dtype=np.float32))
        ).to_gpu()

        print "Initizlizing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.0002, alpha=0.3, momentum=0.2)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, self.input_vector_length), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, self.input_vector_length), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        max_Q_dash_ = self.Q_func(s_dash)
        tmp = list(map(np.max, max_Q_dash_.data.get()))
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            target[i, self.action_to_index(action[i])] = tmp_

        loss = F.mean_squared_error(Variable(cuda.to_gpu(target)), Q)
        return loss, Q

    def stockExperience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.input_vector_length), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.input_vector_length), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        #todo might want to normalize input, but for now I will do that outside this class 
        h1 = F.relu(self.model.l1(state))  
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        Q = self.model.q_value(h3)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"

        return self.index_to_action(index_action), Q
    
    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)
    
    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
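
The class above only implements the learner; it still needs an outer loop that feeds it states, stores transitions and triggers replay. A minimal sketch of such a loop, where get_market_state and execute_trade are hypothetical stand-ins for the data feed and the broker interface (they are not part of the original code), and the epsilon schedule is an arbitrary choice:

# Hypothetical driver loop for the trading DQN_class above.
agent = DQN_class(input_vector_length=160)   # 160 is an arbitrary example size
epsilon = 1.0
state = get_market_state()                   # np.float32 vector of length input_vector_length

for time in xrange(10**6):
    gpu_state = cuda.to_gpu(state.reshape(1, -1).astype(np.float32))
    action, Q = agent.e_greedy(gpu_state, epsilon)
    reward, state_dash, episode_end = execute_trade(action)

    agent.stockExperience(time, state, action, reward, state_dash, episode_end)
    agent.experienceReplay(time)
    if time % agent.target_model_update_freq == 0:
        agent.target_model_update()          # kept for completeness; this class's forward() targets use the online net

    epsilon = max(0.1, epsilon - 1e-5)       # linear annealing, an assumption
    state = get_market_state() if episode_end else state_dash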
Example #20
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99                       # Discount factor
    initial_exploration = 5*10**4      # 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32                   # Replay (batch) size
    target_model_update_freq = 10**4   # Target update frequency. original: 10^4
    data_size = 10**6                  # Data size of history. original: 10^6
    num_of_actions = 2                 # Action dimension
    num_of_states = 12                 # State dimension
    
    def __init__(self):
                  
        print "Initializing DQN..."
#	Initialization of Chainer 1.1.0 or older.
#        print "CUDA init"
#        cuda.init()

        print "Model Building"
#        self.model = FunctionSet(
#            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
#            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
#            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
#            l4=F.Linear(3136, 512, wscale=np.sqrt(2)),
#            q_value=F.Linear(512, self.num_of_actions,
#                             initialW=np.zeros((self.num_of_actions, 512),
#                                               dtype=np.float32))
#        ).to_gpu()
        
#        self.critic = FunctionSet(
#            l1=F.Linear(self.num_of_actions+self.num_of_states,512),
#            l2=F.Linear(512,256),
#            l3=F.Linear(256,128),
#            q_value=F.Linear(128,1,initialW=np.zeros((1,128),dtype=np.float32))
#        ).to_gpu()
#        
#        self.actor = FunctionSet(
#            l1=F.Linear(self.num_of_states,512),
#            l2=F.Linear(512,256),
#            l3=F.Linear(256,128),
#            a_value=F.Linear(128,self.num_of_actions,initialW=np.zeros((1,128),dtype=np.float32))
#        ).to_gpu()
        
        self.critic = FunctionSet(
            l1=F.Linear(self.num_of_actions+self.num_of_states,1024),
            l2=F.Linear(1024,512),
            l3=F.Linear(512,256),
            l4=F.Linear(256,128),
            q_value=F.Linear(128,1,initialW=np.zeros((1,128),dtype=np.float32))
        ).to_gpu()
        
        self.actor = FunctionSet(
            l1=F.Linear(self.num_of_states,1024),
            l2=F.Linear(1024,512),
            l3=F.Linear(512,256),
            l4=F.Linear(256,128),
            a_value=F.Linear(128,self.num_of_actions,initialW=np.zeros((1,128),dtype=np.float32))
        ).to_gpu()
        
#        self.critic = FunctionSet(
#            l1=F.Linear(self.num_of_actions+self.num_of_states,1024,wscale=0.01*math.sqrt(self.num_of_actions+self.num_of_states)),
#            l2=F.Linear(1024,512,wscale=0.01*math.sqrt(1024)),
#            l3=F.Linear(512,256,wscale=0.01*math.sqrt(512)),
#            l4=F.Linear(256,128,wscale=0.01*math.sqrt(256)),
#            q_value=F.Linear(128,1,wscale=0.01*math.sqrt(128))
#        ).to_gpu()
#        
#        self.actor = FunctionSet(
#            l1=F.Linear(self.num_of_states,1024,wscale=0.01*math.sqrt(self.num_of_states)),
#            l2=F.Linear(1024,512,wscale=0.01*math.sqrt(1024)),
#            l3=F.Linear(512,256,wscale=0.01*math.sqrt(512)),
#            l4=F.Linear(256,128,wscale=0.01*math.sqrt(256)),
#            a_value=F.Linear(128,self.num_of_actions,wscale=0.01*math.sqrt(128))
#        ).to_gpu()
        
        self.critic_target = copy.deepcopy(self.critic) 
        self.actor_target = copy.deepcopy(self.actor)
        
        print "Initizlizing Optimizer"
        #self.optim_critic = optimizers.RMSpropGraves(lr=0.0001, alpha=0.95, momentum=0.95, eps=0.0001)
        #self.optim_actor = optimizers.RMSpropGraves(lr=0.0001, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optim_critic = optimizers.Adam(alpha=0.00001)
        self.optim_actor = optimizers.Adam(alpha=0.00001)
        self.optim_critic.setup(self.critic)
        self.optim_actor.setup(self.actor)
        
#        self.optim_critic.add_hook(chainer.optimizer.WeightDecay(0.00001))
#        self.optim_critic.add_hook(chainer.optimizer.GradientClipping(10))
#        self.optim_actor.add_hook(chainer.optimizer.WeightDecay(0.00001))
#        self.optim_actor.add_hook(chainer.optimizer.GradientClipping(10))

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, self.num_of_states), dtype=np.float32),
                  np.zeros((self.data_size, self.num_of_actions), dtype=np.float32),
                  np.zeros((self.data_size, 1), dtype=np.float32),
                  np.zeros((self.data_size, self.num_of_states), dtype=np.float32),
                  np.zeros((self.data_size, 1), dtype=np.bool)]
                  
#        with open('dqn_dump.json', 'a') as f:
#            json.dump(datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'), f)
#            f.write('\n')
#            json.dump({"alpha": 0.00001, "beta1": 0.7, "beta2": 0.999, "weight_decay": 0.00001}, f)
#            f.write('\n')
#            f.close()
        #self.x_PID = Hover_PID_Controller(12.1, 1.25)
        #self.y_PID = Hover_PID_Controller(12.1, 1.25)

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        
        s = Variable(cuda.to_gpu(np.concatenate([state, action],1)))
        s_dash = Variable(cuda.to_gpu(state_dash))

        Q = self.Q_func(s)  # Get Q-value
        
        # Generate Target through target nets
        action_dash_tmp = self.A_func_target(s_dash) 
        action_dash = np.asanyarray(action_dash_tmp.data.get(), dtype=np.float32)
        tmp_dash = Variable(cuda.to_gpu(np.concatenate([state_dash, action_dash],1)))
        Q_dash_tmp = self.Q_func_target(tmp_dash)
        Q_dash = np.asanyarray(Q_dash_tmp.data.get(), dtype=np.float32)       
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = Reward[i] + self.gamma * Q_dash[i]
            else:
                tmp_ = Reward[i]

            target[i] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, 1), dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        
        return loss, Q

    def updateActor(self, state):
        num_of_batch = state.shape[0]
        A_max = 1.0
        A_min = -1.0
        
        A = self.A_func(Variable(cuda.to_gpu(state)))
        tmp = Variable(cuda.to_gpu(np.concatenate([state, A.data.get()],1)))
        Q = self.Q_func(tmp)
        
        # Backward prop towards actor net
        #self.critic.zerograds()
        #self.actor.zerograds()
        Q.grad = cuda.to_gpu(np.ones((num_of_batch, 1), dtype=np.float32)*(-1.0))
#        Q.grad = Q.data*(-1.0)
        Q.backward()
        A.grad = tmp.grad[:,-self.num_of_actions:]
        print("sample_A.grad: "+str(A.grad[0]))
        for i in xrange(num_of_batch):
            for j in xrange(self.num_of_actions):
                if A.grad[i][j] < 0:
                    A.grad[i][j] *= (A_max-A.data[i][j])/(A_max-A_min)
                elif A.grad[i][j] > 0:
                    A.grad[i][j] *= (A.data[i][j]-A_min)/(A_max-A_min)
            
        A.backward()
        self.optim_actor.update()
        print("sample_A.grad: "+str(A.grad[0]))
        
    def stockExperience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))
                #reward_list = list(self.D[2])
                #replay_index = [i[0] for i in sorted(enumerate(reward_list),key=itemgetter(1),reverse=True)[:32]]
                #replay_index = np.asarray(replay_index).reshape(32,1)
                
            s_replay = np.ndarray(shape=(self.replay_size, self.num_of_states), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, self.num_of_actions), dtype=np.float32)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.num_of_states), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = np.asarray(self.D[1][replay_index[i]], dtype=np.float32)
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.asarray(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            #s_replay = cuda.to_gpu(s_replay)
            #s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based critic update
            self.optim_critic.zero_grads()
            loss, q = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optim_critic.update()
            
            # Update the actor
            self.optim_critic.zero_grads()
            self.optim_actor.zero_grads()
            self.updateActor(s_replay)
            
            self.soft_target_model_update()
            
            print "AVG_Q %f" %(np.average(q.data.get()))
            print("loss " + str(loss.data))
            
#            with open('dqn_dump.json', 'a') as f:
#                json.dump({"time": time, "avg_Q": float(np.average(q.data.get())), "loss": float(loss.data)}, f)
#                f.write('\n')
#                f.close()

    def Q_func(self, state):
#        h1 = F.relu(self.critic.l1(state))
#        h2 = F.relu(self.critic.l2(h1))
#        h3 = F.relu(self.critic.l3(h2))
#        Q = self.critic.q_value(h3)
        h1 = F.relu(self.critic.l1(state))
        h2 = F.relu(self.critic.l2(h1))
        h3 = F.relu(self.critic.l3(h2))
        h4 = F.relu(self.critic.l4(h3))
        Q = self.critic.q_value(h4)
        return Q

    def Q_func_target(self, state):
#        h1 = F.relu(self.critic_target.l1(state))
#        h2 = F.relu(self.critic_target.l2(h1))
#        h3 = F.relu(self.critic.l3(h2))
#        Q = self.critic_target.q_value(h3)   
        h1 = F.relu(self.critic_target.l1(state))
        h2 = F.relu(self.critic_target.l2(h1))
        h3 = F.relu(self.critic_target.l3(h2))
        h4 = F.relu(self.critic_target.l4(h3))
        Q = self.critic_target.q_value(h4)
        return Q
        
    def A_func(self, state):
#        h1 = F.relu(self.actor.l1(state))
#        h2 = F.relu(self.actor.l2(h1))
#        h3 = F.relu(self.actor.l3(h2))
#        A = self.actor.a_value(h3)
        h1 = F.relu(self.actor.l1(state))
        h2 = F.relu(self.actor.l2(h1))
        h3 = F.relu(self.actor.l3(h2))
        h4 = F.relu(self.actor.l4(h3))
        A = self.actor.a_value(h4)
        return A

    def A_func_target(self, state):
#        h1 = F.relu(self.actor_target.l1(state))
#        h2 = F.relu(self.actor_target.l2(h1))
#        h3 = F.relu(self.actor.l3(h2))
#        A = self.actor_target.a_value(h3)
        h1 = F.relu(self.actor_target.l1(state))
        h2 = F.relu(self.actor_target.l2(h1))
        h3 = F.relu(self.actor_target.l3(h2))
        h4 = F.relu(self.actor_target.l4(h3))
        A = self.actor_target.a_value(h4)
        return A

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        A = self.A_func(s)
        A = A.data
        if np.random.rand() < epsilon:
            action = np.random.uniform(-1.,1.,(1,self.num_of_actions)).astype(np.float32)
#            action = np.zeros((1,self.num_of_actions),dtype=np.float32)
#            if state[0,0] > 0:
#                action[0,0] = np.random.uniform(0.0,0.5)
#            elif state[0,0] < 0:
#                action[0,0] = np.random.uniform(-0.5,0.0)                
#            if state[0,1] < 0:            
#                action[0,1] = np.random.uniform(0.0,0.5)
#            elif state[0,1] > 0:
#                action[0,1] = np.random.uniform(-0.5,0.0)
            #print("teststate"+str(state))
            #action[0,0] = -self.x_PID.getCorrection(state[0][0], 0.0)
            #action[0,1] = self.y_PID.getCorrection(state[0][1], 0.0)
            print "RANDOM"
        else:
            action = A.get()
            print "GREEDY"
            #print(str(action))
        return action

    def hard_target_model_update(self):
        self.critic_target = copy.deepcopy(self.critic)
        self.actor_target = copy.deepcopy(self.actor)

    def soft_target_model_update(self, tau=0.001):
        self.critic_target.l1.W.data = tau*self.critic.l1.W.data + (1-tau)*self.critic_target.l1.W.data
        self.critic_target.l2.W.data = tau*self.critic.l2.W.data + (1-tau)*self.critic_target.l2.W.data
        self.critic_target.l3.W.data = tau*self.critic.l3.W.data + (1-tau)*self.critic_target.l3.W.data
        self.critic_target.l4.W.data = tau*self.critic.l4.W.data + (1-tau)*self.critic_target.l4.W.data
        self.critic_target.q_value.W.data = tau*self.critic.q_value.W.data + (1-tau)*self.critic_target.q_value.W.data
        self.actor_target.l1.W.data = tau*self.actor.l1.W.data + (1-tau)*self.actor_target.l1.W.data
        self.actor_target.l2.W.data = tau*self.actor.l2.W.data + (1-tau)*self.actor_target.l2.W.data
        self.actor_target.l3.W.data = tau*self.actor.l3.W.data + (1-tau)*self.actor_target.l3.W.data
        self.actor_target.l4.W.data = tau*self.actor.l4.W.data + (1-tau)*self.actor_target.l4.W.data
        self.actor_target.a_value.W.data = tau*self.actor.a_value.W.data + (1-tau)*self.actor_target.a_value.W.data
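
soft_target_model_update above nudges every weight matrix of the target networks toward the online networks with rate tau (note that only the W matrices are blended; the biases are left untouched in the code above). A small numpy sketch of that exponential moving average on a single dummy parameter:

import numpy as np

# Numpy-only sketch of the soft (Polyak) target update:
#   theta_target <- tau * theta_online + (1 - tau) * theta_target
def soft_update(theta_online, theta_target, tau=0.001):
    return tau * theta_online + (1.0 - tau) * theta_target

w_online = np.ones((2, 2), dtype=np.float32)
w_target = np.zeros((2, 2), dtype=np.float32)
for _ in xrange(5):
    w_target = soft_update(w_online, w_target)
print w_target   # each entry is 1 - (1 - 0.001)**5, roughly 0.005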
Example #21
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."
        print "CUDA init"
        cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 16, ksize=8, stride=4, wscale=np.sqrt(2)),
            l2=F.Convolution2D(16, 32, ksize=4, stride=2, wscale=np.sqrt(2)),
            l3=F.Linear(2592, 256),
            q_value=F.Linear(256,
                             self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 256),
                                               dtype=np.float32))).to_gpu()

        print "Initizlizing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.0002,
                                                  alpha=0.3,
                                                  momentum=0.2)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [
            np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
            np.zeros(self.data_size, dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.int8),
            np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.bool)
        ]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        max_Q_dash_ = self.Q_func(s_dash)
        tmp = list(map(np.max, max_Q_dash_.data.get()))
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            target[i, self.action_to_index(action[i])] = tmp_

        loss = F.mean_squared_error(Variable(cuda.to_gpu(target)), Q)
        return loss, Q

    def stockExperience(self, time, state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time,
                                                 (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size,
                                                 (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84),
                                  dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1),
                                  dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84),
                                       dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1),
                                            dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]],
                                         dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]],
                                            dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay,
                                   episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        Q = self.model.q_value(h3)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"

        return self.index_to_action(index_action), Q

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
Example #22
class ConvQAgent(Agent):
	def __init__(self, frames_per_action=4):
		super(ConvQAgent, self).__init__()
		cuda.init()
		self.epsilon = 1.0
		self.gamma = 0.99
		self.iterations = 0
		
		self.model = FunctionSet(
			l1 = F.Convolution2D(frames_per_action, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
			l2 = F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
			l3 = F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
			l4 = F.Linear(64 * 7 * 7, 512),
			l5 = F.Linear(512, 2)
		).to_gpu()

		self.optimizer = optimizers.RMSprop(lr=1e-5)
		self.optimizer.setup(self.model)
		self.update_target()

		self.num_frames = 0
		self.frames_per_action = frames_per_action
		self.prev_reward = 0.0

		self.history = ConvHistory((frames_per_action, 84, 84))

	def update_target(self):
		self.target_model = copy.deepcopy(self.model)
		self.target_model = self.target_model.to_gpu()

	def act(self, state):
		self.update_state_vector(state)

		if self.num_frames < self.frames_per_action - 1 or self.num_frames % self.frames_per_action != 0:
			return None

		if random.random() < 0.001:
			print 'Epsilon: {}'.format(self.epsilon)

		if self.epsilon > 0.05:
			self.epsilon -= (0.95 / 300000)

		if random.random() < self.epsilon:
			return random.random() > 0.375

		q = self.get_q(Variable(cuda.to_gpu(self.curr_state[np.newaxis, :, :, :])))

		if random.random() < 0.01:
			if q.data[0,1] > q.data[0,0]:
				print 'On: {}'.format(q.data)
			else:
				print 'Off: {}'.format(q.data)

		return q.data[0,1] > q.data[0,0]

	def update_state_vector(self, state):
		if self.num_frames < self.frames_per_action:
			if self.num_frames == 0:
				self.curr_state = np.zeros((self.frames_per_action, 84, 84), dtype=np.float32)
			self.curr_state[self.num_frames, :, :] = state
		else:
			if self.num_frames == self.frames_per_action:
				self.prev_state = np.zeros((self.frames_per_action, 84, 84), dtype=np.float32)
			self.prev_state[1:, :, :] = self.prev_state[:-1, :, :]
			self.prev_state[0, :, :] = self.curr_state[-1, :, :]

			self.curr_state[1:, :, :] = self.curr_state[:-1, :, :]
			self.curr_state[0, :, :] = state

		self.num_frames += 1

	def accept_reward(self, state, action, reward, new_state, is_terminal):
		self.prev_reward += reward

		if not (is_terminal or self.num_frames % self.frames_per_action == 0):
			return

		if self.num_frames == self.frames_per_action:
			self.prev_reward = 0.0
			self.prev_action = action
			return

		self.history.add((self.prev_state, self.prev_action, self.prev_reward,
			self.curr_state, is_terminal))
		self.prev_reward = 0.0
		self.prev_action = action

		self.iterations += 1
		if self.iterations % 10000 == 0:
			print '*** UPDATING TARGET NETWORK ***'
			self.update_target()
		
		state, action, reward, new_state, is_terminal = self.history.get(num=32)

		state = cuda.to_gpu(state)
		action = cuda.to_gpu(action)
		new_state = cuda.to_gpu(new_state)
		reward = cuda.to_gpu(reward)

		loss, q = self.forward(state, action, reward, new_state, is_terminal)
		self.optimizer.zero_grads()
		loss.backward()
		self.optimizer.update()

	def forward(self, state, action, reward, new_state, is_terminal):
		q = self.get_q(Variable(state))
		q_target = self.get_target_q(Variable(new_state))

		max_target_q = cp.max(q_target.data, axis=1)

		target = cp.copy(q.data)

		for i in xrange(target.shape[0]):
			curr_action = int(action[i, 0])
			if is_terminal[i]:
				target[i, curr_action] = reward[i]
			else:
				target[i, curr_action] = reward[i] + self.gamma * max_target_q[i]
		
		loss = F.mean_squared_error(Variable(target), q)
		return loss, 0.0 #cp.mean(q.data[:, action[i]])

	def get_q(self, state):
		h1 = F.relu(self.model.l1(state))
		h2 = F.relu(self.model.l2(h1))
		h3 = F.relu(self.model.l3(h2))
		h4 = self.model.l4(h3)
		return self.model.l5(h4)

	def get_target_q(self, state):
		h1 = F.relu(self.target_model.l1(state))
		h2 = F.relu(self.target_model.l2(h1))
		h3 = F.relu(self.target_model.l3(h2))
		h4 = self.target_model.l4(h3)
		return self.target_model.l5(h4)

	def save(self, file_name):
		with open(file_name, 'wb') as out_file:
			pickle.dump((self.model, self.optimizer), out_file)

	def load(self, file_name):
		self.epsilon = 0.0

		with open(file_name, 'rb') as in_file:
			model, optimizer = pickle.load(in_file)
			self.model.copy_parameters_from(model.parameters)
			self.optimizer = optimizer

	def start_new_game(self):
		self.num_frames = 0
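
update_state_vector above maintains a rolling stack of the last frames_per_action screens, with the newest frame at index 0. A toy numpy sketch of the same rolling-buffer idea, using 2x2 "frames" instead of 84x84 screens so the result is easy to print:

import numpy as np

# Toy illustration of the rolling frame stack in update_state_vector above:
# the newest frame goes to index 0, older frames are shifted toward the end.
frames_per_action = 4
curr_state = np.zeros((frames_per_action, 2, 2), dtype=np.float32)

for t in xrange(6):
    frame = np.full((2, 2), t, dtype=np.float32)   # stand-in for an 84x84 screen
    curr_state[1:, :, :] = curr_state[:-1, :, :]   # shift older frames back
    curr_state[0, :, :] = frame                    # newest frame in front

print curr_state[:, 0, 0]   # -> [ 5.  4.  3.  2.], most recent frame first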
Example #23
def draw_digit3(data, n, ans, recog):  # signature inferred from the call below and from draw_digit2
    size = 28
    plt.subplot(10, 10, n)
    Z = data.reshape(size, size)
    Z = Z[::-1, :]  # flip vertically
    plt.xlim(0, 27)
    plt.ylim(0, 27)
    plt.pcolor(Z)
    plt.title('ans={}, recog={}'.format(ans, recog), size=8)
    plt.gray()
    plt.tick_params(labelbottom='off')
    plt.tick_params(labelleft='off')


plt.figure(figsize=(15, 15))
cnt = 0
for idx in np.random.permutation(N)[:100]:
    xxx = x_train[idx].astype(np.float32)
    h1 = F.dropout(F.relu(model.l1(Variable(xxx.reshape(1, 784)))),
                   train=False)
    h2 = F.dropout(F.relu(model.l2(h1)), train=False)
    y = model.l3(h2)
    cnt += 1
    draw_digit3(x_train[idx], cnt, y_train[idx], np.argmax(y.data))

plt.show()


def draw_digit2(data, n, i_):
    size = 28
    plt.subplot(10, 10, n)
    Z = data.reshape(size, size)
    Z = Z[::-1, :]  # flip vertically
    plt.xlim(0, 27)
    plt.ylim(0, 27)
    plt.pcolor(Z)
    plt.title('{}'.format(i_), size=9)
Example #24
class ChainerAgent(Agent):
	def __init__(self, epsilon=1.0, frames_per_action=4):
		super(ChainerAgent, self).__init__()
		cuda.init()
		self.epsilon = epsilon
		self.gamma = 0.99
		self.iterations = 0
		
		self.model = FunctionSet(
			l1 = F.Linear(9 * frames_per_action, 256),
			l2 = F.Linear(256, 256),
			l3 = F.Linear(256, 256),
			l4 = F.Linear(256, 2),
		).to_gpu()

		self.optimizer = optimizers.RMSprop(lr=1e-5)
		self.optimizer.setup(self.model)
		self.update_target()

		self.num_frames = 0
		self.frames_per_action = frames_per_action
		self.prev_reward = 0.0

		self.history = ChainHistory(state_len=(9 * frames_per_action))

	def forward(self, state, action, reward, new_state, is_terminal):
		q = self.get_q(Variable(state))
		q_target = self.get_target_q(Variable(new_state))

		max_target_q = cp.max(q_target.data, axis=1)

		target = cp.copy(q.data)

		for i in xrange(target.shape[0]):
			curr_action = int(action[i])
			if is_terminal[i]:
				target[i, curr_action] = reward[i]
			else:
				target[i, curr_action] = reward[i] + self.gamma * max_target_q[i]
		
		loss = F.mean_squared_error(Variable(target), q)
		return loss, 0.0 #cp.mean(q.data[:, action[i]])

	def get_q(self, state):
		h1 = F.relu(self.model.l1(state))
		h2 = F.relu(self.model.l2(h1))
		h3 = F.relu(self.model.l3(h2))
		return self.model.l4(h3)

	def get_target_q(self, state):
		h1 = F.relu(self.target_model.l1(state))
		h2 = F.relu(self.target_model.l2(h1))
		h3 = F.relu(self.target_model.l3(h2))
		return self.target_model.l4(h3)

	def accept_reward(self, state, action, reward, new_state, is_terminal):
		self.prev_reward += reward

		if not (is_terminal or self.num_frames % self.frames_per_action == 0):
			return

		if self.num_frames == self.frames_per_action:
			self.prev_reward = 0.0
			self.prev_action = action
			return

		self.history.add((self.prev_state, self.prev_action, self.prev_reward,
			self.curr_state, is_terminal))
		self.prev_reward = 0.0
		self.prev_action = action

		self.iterations += 1
		if self.iterations % 10000 == 0:
			print '*** UPDATING TARGET NETWORK ***'
			self.update_target()
		
		state, action, reward, new_state, is_terminal = self.history.get(num=32)

		state = cuda.to_gpu(state)
		action = cuda.to_gpu(action)
		new_state = cuda.to_gpu(new_state)
		reward = cuda.to_gpu(reward)

		loss, q = self.forward(state, action, reward, new_state, is_terminal)
		self.optimizer.zero_grads()
		loss.backward()
		self.optimizer.update()

	def update_state_vector(self, state):
		if self.num_frames < self.frames_per_action:
			if self.num_frames == 0:
				self.curr_state = state
			else:
				self.curr_state = np.hstack((self.curr_state, state))
		else:
			if self.num_frames < 2 * self.frames_per_action:
				if self.num_frames == self.frames_per_action:
					self.prev_state = np.copy(self.curr_state[:, :9])
				else:
					self.prev_state = np.hstack((self.prev_state, self.curr_state[:, :9]))
			else:
				self.prev_state[:, :-9] = self.prev_state[:, 9:]
				self.prev_state[:, -9:] = self.curr_state[:, :9]

			self.curr_state[:, :-9] = self.curr_state[:, 9:]
			self.curr_state[:, -9:] = state

		self.num_frames += 1

	def act(self, state):
		self.update_state_vector(state)

		if self.num_frames < self.frames_per_action - 1 or self.num_frames % self.frames_per_action != 0:
			return None

		if self.epsilon > 0.05:
			self.epsilon -= (0.95 / 1000000)

		if random.random() < 0.0001:
			print 'Epsilon greedy strategy current epsilon: {}'.format(self.epsilon)

		if random.random() < self.epsilon:
			return random.random() > 0.375

		q = self.get_q(Variable(cuda.to_gpu(self.curr_state)))

		if random.random() < 0.01:
			if q.data[0,1] > q.data[0,0]:
				print 'On: {}'.format(q.data)
			else:
				print 'Off: {}'.format(q.data)

		return q.data[0,1] > q.data[0,0]

	def save(self, file_name):
		with open(file_name, 'wb') as out_file:
			pickle.dump(self.model, out_file)

	def load(self, file_name):
		self.epsilon = 0.0

		with open(file_name, 'rb') as in_file:
			model = pickle.load(in_file)
			self.model.copy_parameters_from(model.parameters)

	def update_target(self):
		self.target_model = copy.deepcopy(self.model)
		self.target_model = self.target_model.to_gpu()

	def start_new_game(self):
		self.num_frames = 0
Example #25
class DN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 100  # 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 1, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Breakout"

        print "Initializing DN..."
#	Initialization of Chainer 1.1.0 or older.
#        print "CUDA init"
#        cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
            l4=F.Linear(3136, 256, wscale=np.sqrt(2)),
            l5=F.Linear(3136, 256, wscale=np.sqrt(2)),
            l6=F.Linear(256, 1, initialW=np.zeros((1, 256), dtype=np.float32)),
            l7=F.Linear(256, self.num_of_actions, initialW=np.zeros((self.num_of_actions, 256),
                                               dtype=np.float32)),
            q_value=DN_out.DN_out(1, self.num_of_actions, self.num_of_actions, nobias = True)
        ).to_gpu()
        
        if args.resumemodel:
            # load saved model
            serializers.load_npz(args.resumemodel, self.model)
            print "load model from resume.model"
        

        self.model_target = copy.deepcopy(self.model)

        print "Initizlizing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        if args.resumeD1 and args.resumeD2:
            # load saved D1 and D2
            npz_tmp1 = np.load(args.resumeD1)
            print "finished loading half of D data"
            npz_tmp2 = np.load(args.resumeD2)
            self.D = [npz_tmp1['D0'],
                      npz_tmp1['D1'],
                      npz_tmp1['D2'],
                      npz_tmp2['D3'],
                      npz_tmp2['D4']]
            npz_tmp1.close()
            npz_tmp2.close()
            print "loaded stored all D data"
        else:
            self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                      np.zeros(self.data_size, dtype=np.uint8),
                      np.zeros((self.data_size, 1), dtype=np.int8),
                      np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                      np.zeros((self.data_size, 1), dtype=np.bool)]
            print "initialized D data"

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value
        # Generate Target Signals
        tmp2 = self.Q_func(s_dash)
        tmp2 = list(map(np.argmax, tmp2.data.get()))  # argmaxQ(s',a)
        tmp = self.Q_func_target(s_dash)  # Q'(s',*)
        tmp = list(tmp.data.get())
        # select Q'(s',*) due to argmaxQ(s',a)
        res1 = []
        for i in range(num_of_batch):
            res1.append(tmp[i][tmp2[i]])

        #max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        max_Q_dash = np.asanyarray(res1, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)
        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_
        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        print 'now Q_func is implemented'
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        h4 = F.relu(self.model.l4(h3)) # left side connected with s value
        h5 = F.relu(self.model.l5(h3)) # right side connected with A value
        h6 = self.model.l6(h4) # s value
        h7 = self.model.l7(h5) # A value
        Q = self.model.q_value(h6, h7) # Q value
        return Q

    def Q_func_target(self, state):
        print 'now Q_func_target is implemented'
        h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model_target.l2(h1))
        h3 = F.relu(self.model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3)) # left side connected with s value
        h5 = F.relu(self.model_target.l5(h3)) # right side connected with A value
        h6 = self.model_target.l6(h4) # s value
        h7 = self.model_target.l7(h5) # A value
        Q = self.model_target.q_value(h6, h7) # Q value
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
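
forward() above implements a Double-DQN style target: the online network chooses the greedy next action and the target network supplies its value. A numpy sketch of just that selection step, with made-up Q-values for a batch of two states:

import numpy as np

# Numpy sketch of the Double-DQN target selection in forward() above.
q_online_dash = np.array([[1.0, 3.0, 2.0],
                          [0.5, 0.2, 0.9]], dtype=np.float32)   # Q(s', *) from the online net
q_target_dash = np.array([[1.5, 2.5, 2.2],
                          [0.4, 0.3, 1.1]], dtype=np.float32)   # Q(s', *) from the target net

a_star = np.argmax(q_online_dash, axis=1)                       # argmax_a Q(s', a)
max_Q_dash = q_target_dash[np.arange(len(a_star)), a_star]      # Q_target(s', a*)
print max_Q_dash   # -> [ 2.5  1.1]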
Example #26
#!/usr/bin/env python
# coding: utf-8

__author__ = 'k_morishita'

import numpy as np
from chainer import cuda, Function, FunctionSet, gradient_check, Variable, optimizers
import chainer.functions as F


model = FunctionSet(
    l1=F.Linear(4, 3),
    l2=F.Linear(3, 2),
    l3=F.Linear(2, 2)
)

x = Variable(np.array([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=np.float32))
h1 = model.l1(x)
h2 = model.l2(h1)
h3 = model.l3(h2)

optimizer = optimizers.SGD()
optimizer.setup(model.collect_parameters())
optimizer.zero_grads()

xx = Variable(np.array([[1,2,3,4], [0, 1, 0.5, 0.2]], dtype=np.float32))
print F.accuracy(xx, Variable(np.array([3, 1], dtype=np.int32))).data
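
The script above only builds the graph and prints an accuracy on dummy scores; it never computes a loss or updates the weights. A hedged continuation showing what a single gradient step on the same toy model could look like (the integer labels below are made up for illustration):

# Illustrative continuation (not part of the original snippet):
# one gradient step on the toy model, treating h3 as 2-class scores.
t = Variable(np.array([1, 0], dtype=np.int32))   # made-up class labels for the two rows of x

loss = F.softmax_cross_entropy(h3, t)
optimizer.zero_grads()
loss.backward()
optimizer.update()
print loss.data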