def main(log_file, h_sizes, improve_loss_min=0.001):
    x_train, y_train, x_test, y_test = generate_cases(log_file)

    in_size = LINE_MAX_CHAR
    out_size = 2
    layers = [in_size] + h_sizes + [out_size]
    model = FunctionSet()
    for li in range(1, len(layers)):
        setattr(model, "l%d" % li, F.Linear(layers[li-1], layers[li]))

    optimizer = optimizers.SGD()
    optimizer.setup(model.collect_parameters())
    last_loss = None
    for epoch in range(3000000):
        optimizer.zero_grads()
        loss, accuracy = forward(model, x_train, y_train)
        loss.backward()

        if epoch % 100 == 0:
            print "epoch: %s, loss: %s, accuracy: %s" % (epoch, loss.data, accuracy.data)
            if last_loss is not None and last_loss - improve_loss_min < loss.data:
                print "Finish Training"
                break
            last_loss = loss.data

        optimizer.update()
        if epoch % 1000 == 0:
            loss, accuracy = forward(model, x_test, y_test)
            print "epoch: %s, Try Test Result: loss: %s, accuracy: %s" % (epoch, loss.data, accuracy.data)

    # result
    loss, accuracy = forward(model, x_test, y_test)
    print "epoch: %s, Test Result: loss: %s, accuracy: %s" % (epoch, loss.data, accuracy.data)
    return epoch, accuracy.data
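The `forward` helper called above is not defined in this snippet; a minimal sketch, assuming it chains the registered layers l1..lN with ReLU activations and a softmax cross-entropy output like the other examples in this collection, could look like this:

# Hypothetical sketch of the missing forward() helper; the real one may differ.
def forward(model, x_data, y_data):
    x = Variable(x_data)
    t = Variable(y_data)
    h = x
    li = 1
    # Pass through every hidden layer l1, l2, ... with ReLU.
    while hasattr(model, "l%d" % (li + 1)):
        h = F.relu(getattr(model, "l%d" % li)(h))
        li += 1
    y = getattr(model, "l%d" % li)(h)  # final layer gives the class scores
    return F.softmax_cross_entropy(y, t), F.accuracy(y, t)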
Example #2
def main(args):
    def forward(x_data, y_data):
        x = Variable(x_data)
        t = Variable(y_data)
        h1 = F.relu(model.l1(x))  # activation function
        h2 = F.relu(model.l2(h1)) # ReLU does not have parameters to optimize
        y = model.l3(h2)
        # the loss function of softmax regression
        return F.softmax_cross_entropy(y, t), F.accuracy(y, t)  # current accuracy

    def evaluate():
        sum_loss, sum_accuracy = 0, 0
        for i in xrange(0, 10000, batchsize):
            x_batch = x_test[i:i+batchsize]
            y_batch = y_test[i:i+batchsize]
            loss, accuracy = forward(x_batch, y_batch)
            sum_loss += loss.data * batchsize
            sum_accuracy += accuracy.data * batchsize
        mean_loss = sum_loss / 10000
        mean_accuracy = sum_accuracy / 10000
        print(mean_loss[0], mean_accuracy)
        return

    global debug, verbose
    debug = args.debug
    if debug:
        verbose = True
    else:
        verbose = args.verbose

    mnist = fetch_mldata('MNIST original')
    x_all = mnist.data.astype(np.float32) / 255  # Scaling features to [0, 1]
    y_all = mnist.target.astype(np.int32)
    x_train, x_test = np.split(x_all, [60000])   # 60000 for training, 10000 for test
    y_train, y_test = np.split(y_all, [60000])

    # Simple three-layer rectifier network
    model = FunctionSet(
        l1 = F.Linear(784, 100),  # 784 pixels -> 100 units
        l2 = F.Linear(100, 100),  # 100 units -> 100 units
        l3 = F.Linear(100, 10),   # 100 units -> 10 digits
    )
    optimizer = optimizers.SGD()
    optimizer.setup(model.collect_parameters())

    batchsize = 100
    for epoch in xrange(20):
        if verbose: logger.info('epoch: {}'.format(epoch))
        indexes = np.random.permutation(60000)
        for i in xrange(0, 60000, batchsize):
            x_batch = x_train[indexes[i:i+batchsize]]
            y_batch = y_train[indexes[i:i+batchsize]]

            optimizer.zero_grads()  # Initialize gradient arrays
            loss, accuracy = forward(x_batch, y_batch)  # loss function
            loss.backward()  # Backpropagation
            optimizer.update()
        evaluate()

    return 0
Example #3
class LinearModel(object):
    UNIT_NUM = 10
    BATCH_SIZE = 32
    EPOCH = 100

    def __init__(self, optimizer):
        self.model = FunctionSet(
            l = Linear(self.UNIT_NUM, 2)
        )
        self.optimizer = optimizer
        # true parameters
        self.w      = np.random.uniform(-1, 1, (self.UNIT_NUM, 1)).astype(np.float32)
        self.b      = np.random.uniform(-1, 1, (1, )).astype(np.float32)

    def _train_linear_classifier(self, model, optimizer, gpu):
        def _make_label(x):
            a = (np.dot(x, self.w) + self.b).reshape((self.BATCH_SIZE, ))
            t = np.empty_like(a).astype(np.int32)
            t[a>=0] = 0
            t[a< 0] = 1
            return t

        def _make_dataset(batch_size, unit_num, gpu):
            x_data = np.random.uniform(-1, 1, (batch_size, unit_num)).astype(np.float32)
            t_data = _make_label(x_data)
            if gpu:
                x_data = cuda.to_gpu(x_data)
                t_data = cuda.to_gpu(t_data)
            x = Variable(x_data)
            t = Variable(t_data)
            return x, t

        for epoch in xrange(self.EPOCH):
            x, t = _make_dataset(self.BATCH_SIZE, self.UNIT_NUM, gpu)
            optimizer.zero_grads()
            y = model.l(x)
            loss = softmax_cross_entropy(y, t)
            loss.backward()
            optimizer.update()

        x_test, t_test = _make_dataset(self.BATCH_SIZE, self.UNIT_NUM, gpu)
        y_test = model.l(x_test)
        return accuracy(y_test, t_test)

    def _accuracy_cpu(self):
        self.optimizer.setup(self.model.collect_parameters())
        return self._train_linear_classifier(self.model, self.optimizer, False)

    def _accuracy_gpu(self):
        model = self.model
        optimizer = self.optimizer
        model.to_gpu()
        optimizer.setup(model.collect_parameters())
        return self._train_linear_classifier(model, optimizer, True)

    def accuracy(self, gpu):
        if gpu:
            return cuda.to_cpu(self._accuracy_gpu().data)
        else:
            return self._accuracy_cpu().data
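A minimal usage sketch (not part of the original snippet; the SGD optimizer and that `optimizers` is imported are assumptions):

# Hypothetical CPU-only usage of LinearModel.
model = LinearModel(optimizers.SGD())
print "accuracy:", model.accuracy(gpu=False)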
def main():
    if P.use_mean_var:
        conv6_output = 126
    else:
        conv6_output = 128

    if P.model_name is None:
        model = FunctionSet(
            conv1 = F.Convolution2D( 1, 128, 3, stride=1),
            conv2 = F.Convolution2D(128, 128, 3, stride=1),
            conv3 = F.Convolution2D(128, 128, 3, stride=1),
            conv4 = F.Convolution2D(128, 128, 3, stride=1),
            conv5 = F.Convolution2D(128, 128, 3, stride=1),
            conv6 = F.Convolution2D(128, conv6_output, 3, stride=1),
            conv7 = F.Convolution2D(128, 128, 1, stride=1),
            conv8 = F.Convolution2D(128, 1, 1, stride=1)
            )
        if P.gpu >= 0:
            cuda.init(P.gpu)
            model.to_gpu()
    else:
        if P.gpu >= 0:
            cuda.init(P.gpu)
        model = pickle.load(open(os.path.join(P.model_dir, P.model_name), 'rb'))

    optimizer = optimizers.MomentumSGD(lr=P.lr, momentum=P.momentum)
    optimizer.setup(model.collect_parameters())

    train(model, optimizer)
    return
def setup_model(n_dimention, n_units):

    model = FunctionSet(l1=F.Linear(n_dimention, n_units),
                        l2=F.Linear(n_units, n_dimention))
    # Setup optimizer
    optimizer = optimizers.Adam()
    optimizer.setup(model.collect_parameters())

    return model, optimizer
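A usage sketch (assumed; the original does not show how the returned pair is used, and the sigmoid/MSE reconstruction loss below is a guess):

# Hypothetical one-step update with the model returned by setup_model.
model, optimizer = setup_model(n_dimention=784, n_units=64)
x = Variable(np.random.rand(32, 784).astype(np.float32))
optimizer.zero_grads()
h = F.sigmoid(model.l1(x))         # encode
y = model.l2(h)                    # decode back to the input dimension
loss = F.mean_squared_error(y, x)  # reconstruction loss (assumption)
loss.backward()
optimizer.update()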
Example #6
class ConvolutionalDenoisingAutoencoder():
    def __init__(self, imgsize, n_in_channels, n_out_channels, ksize, stride=1, pad=0, use_cuda=False):
        self.model = FunctionSet(
            encode=F.Convolution2D(n_in_channels, n_out_channels, ksize, stride, pad),
            decode=F.Linear(n_out_channels*(int(math.floor((imgsize+2*pad-ksize)/stride))+1)**2, n_in_channels*imgsize**2)  # cast to int so the layer size is an integer
        )
        self.use_cuda = use_cuda

        if self.use_cuda:
            self.model.to_gpu()

        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model.collect_parameters())

    def encode(self, x_var):
        return F.sigmoid(self.model.encode(x_var))

    def decode(self, x_var):
        return self.model.decode(x_var)

    def predict(self, x_data):
        if self.use_cuda:
            x_data = cuda.to_gpu(x_data)
        x = Variable(x_data)
        p = self.encode(x)
        if self.use_cuda:
            return cuda.to_cpu(p.data)
        else:
            return p.data

    def cost(self, x_data):
        x = Variable(x_data)
        t = Variable(x_data.reshape(x_data.shape[0], x_data.shape[1]*x_data.shape[2]*x_data.shape[3]))
        h = F.dropout(x)
        h = self.encode(h)
        y = self.decode(h)
        return F.mean_squared_error(y, t)

    def train(self, x_data):
        if self.use_cuda:
            x_data = cuda.to_gpu(x_data)
        self.optimizer.zero_grads()
        loss = self.cost(x_data)
        loss.backward()
        self.optimizer.update()
        if self.use_cuda:
            return float(cuda.to_cpu(loss.data))
        else:
            return loss.data

    def test(self, x_data):
        if self.use_cuda:
            x_data = cuda.to_gpu(x_data)
        loss = self.cost(x_data)
        return float(cuda.to_cpu(loss.data))
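A short usage sketch (the image size, channel counts, and random batch below are assumptions):

# Hypothetical training loop for the convolutional denoising autoencoder.
cdae = ConvolutionalDenoisingAutoencoder(imgsize=28, n_in_channels=1,
                                         n_out_channels=8, ksize=5)
x_batch = np.random.rand(16, 1, 28, 28).astype(np.float32)
for epoch in xrange(10):
    print epoch, cdae.train(x_batch)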
Example #7
class NNQLearningPlayer(object):
    ALPHA = 0.1
    GAMMA = 0.99
    E_GREEDY = 0.3

    def __init__(self):
        self.actions = [1, 2, 3, 4]
        self.model = FunctionSet(
            l1=F.EmbedID(10, 10),
            l2=F.Linear(10, 10),
            l3=F.Linear(10, 4),
        )
        self.optimizer = optimizers.SGD()
        self.optimizer.setup(self.model.collect_parameters())
        self.last_action = None
        self.last_q_list = None
        self.training = True

    def action(self, state, last_reward):
        if self.last_action is not None and self.training:
            self.update_q_table(self.last_action, state, last_reward)
        next_action = self.select_action(state)
        self.last_action = next_action
        return self.actions[next_action]

    def forward(self, state):
        x = Variable(np.array([state], dtype=np.int32))
        y = None
        for i in range(1, 1000):  # 1000 is just an arbitrary upper bound on the layer count
            if hasattr(self.model, "l%d" % i):
                x = getattr(self.model, "l%d" % i)(x)
            else:
                y = x
                break
        return y

    def select_action(self, state):
        self.last_q_list = self.forward(state)
        if self.training and random() < self.E_GREEDY:  # http://www.sist.ac.jp/~kanakubo/research/reinforcement_learning.html
            return randint(0, len(self.actions)-1)
        else:
            return np.argmax(self.last_q_list.data)

    def update_q_table(self, last_action, cur_state, last_reward):
        target_val = last_reward + self.GAMMA * np.max(self.forward(cur_state).data)
        self.optimizer.zero_grads()
        # Compute the loss in a rather hacky way; not confident about this part of the implementation
        tt = np.copy(self.last_q_list.data)
        tt[0][last_action] = target_val
        target = Variable(tt)
        loss = 0.5 * (target - self.last_q_list) ** 2
        loss.grad = np.array([[self.ALPHA]], dtype=np.float32)
        loss.backward()
        self.optimizer.update()
class DenoisingAutoencoder:
    def __init__(
        self,
        n_input,
        n_hidden,
        tied=True,
        noise=None,
        ratio=None,
        optimizer=optimizers.Adam(),
        loss_function=F.sigmoid_cross_entropy,
        activation_function=F.sigmoid,
    ):
        self.model = FunctionSet(encoder=F.Linear(n_input, n_hidden), decoder=F.Linear(n_hidden, n_input))
        if tied:
            self.model.decoder.W = self.model.encoder.W.T
        self.noise = noise
        self.ratio = ratio
        self.optimizer = optimizer
        self.optimizer.setup(self.model.collect_parameters())
        self.loss_function = loss_function
        self.activation_function = activation_function

    def train(self, x_data):
        self.optimizer.zero_grads()
        loss = self.autoencode(x_data, train=True)
        loss.backward()
        self.optimizer.update()
        return loss

    def test(self, x_data):
        return self.autoencode(x_data, train=False)

    def autoencode(self, x_data, train=True):
        x = Variable(x_data)
        if self.noise and train:
            nx = Variable(self.noise.noise(x_data))
        else:
            nx = Variable(x_data)
        if self.ratio:
            h = F.dropout(self.encode(nx), ratio=self.ratio, train=train)
        else:
            h = self.encode(nx)
        y = self.decode(h)
        return self.loss_function(y, x)

    def encode(self, x):
        return self.activation_function(self.model.encoder(x))

    def decode(self, x):
        return self.activation_function(self.model.decoder(x))
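A minimal usage sketch (assumed; mean squared error is substituted for the default loss because the random inputs here are real-valued):

# Hypothetical one-step training example for the tied-weight denoising autoencoder.
dae = DenoisingAutoencoder(n_input=784, n_hidden=128,
                           loss_function=F.mean_squared_error)
x_batch = np.random.rand(32, 784).astype(np.float32)
loss = dae.train(x_batch)
print "train loss:", loss.data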
Example #9
def setup_model(gpu_id, n_channel, n_output):
    model = FunctionSet(
        conv1=F.Convolution2D(n_channel, 32, 5, pad=2),
        conv2=F.Convolution2D(32, 32, 5, pad=2),
        conv3=F.Convolution2D(32, 64, 5, pad=2),
        fl5=F.Linear(960, 64),
        fl6=F.Linear(64, n_output),
    )
    # optimizer = optimizers.MomentumSGD(lr=1e-03)
    optimizer = optimizers.AdaGrad()
    optimizer.setup(model.collect_parameters())

    mlp = ChainerModel(model, optimizer, forward_function=forward)
    return mlp
class DeepLearning:
    def __init__(self, input_size, hidden_size, output_size):
        self.model = FunctionSet(l1=F.Linear(input_size, hidden_size),
                    l2=F.Linear(hidden_size, hidden_size),
                    l3=F.Linear(hidden_size, output_size))
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model.collect_parameters())



    def batch(self, X_train, y_train, batch_size, perm):
        train_size = X_train.shape[0]

        for i in xrange(0, train_size, batch_size):
            X_batch = X_train[perm[i: i+batch_size]]
            y_batch = y_train[perm[i: i+batch_size]]

            # Convert to Chainer Variable types
            x = Variable(X_batch)
            t = Variable(y_batch)

            self.optimizer.zero_grads()
            y = self.forward(x)  # prediction

            loss = F.softmax_cross_entropy(y, t)
            loss.backward()

            self.optimizer.update()


    def forward(self, x, train=True):
        h1 = F.dropout(F.sigmoid(self.model.l1(x)),  train=train)
        h2 = F.dropout(F.sigmoid(self.model.l2(h1)), train=train)
        return self.model.l3(h2)


    def predicate(self, x_data):
        x = np.array([x_data], dtype=np.float32)
        x = Variable(x)
        y = self.forward(x, train=False)
        return np.argmax(y.data)


    def save(self, fpath):
        pickle.dump(self.model, open(fpath, 'wb'), -1)


    def load(self, fpath):
        self.model = pickle.load(open(fpath,'rb'))
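A usage sketch for the DeepLearning class (the MNIST-like shapes and random data are assumptions):

# Hypothetical training run for the DeepLearning class.
dl = DeepLearning(input_size=784, hidden_size=100, output_size=10)
X_train = np.random.rand(1000, 784).astype(np.float32)
y_train = np.random.randint(0, 10, 1000).astype(np.int32)
for epoch in xrange(5):
    perm = np.random.permutation(X_train.shape[0])
    dl.batch(X_train, y_train, batch_size=100, perm=perm)
print "predicted class:", dl.predicate(X_train[0])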
Example #11
def main(n_bit, h_sizes):
    in_size = n_bit+n_bit+1
    out_size = 2**(n_bit+1)
    layers = [in_size] + h_sizes + [out_size]
    model = FunctionSet()
    for li in range(1, len(layers)):
        setattr(model, "l%d" % li, F.Linear(layers[li-1], layers[li]))
    optimizer = optimizers.SGD()
    optimizer.setup(model.collect_parameters())
    x_data, t_data = generate_training_cases(n_bit)
    for epoch in range(3000000):
        optimizer.zero_grads()
        loss, accuracy = forward(model, x_data, t_data)
        loss.backward()
        if epoch % 100 == 0:
            print "epoch: %s, loss: %s, accuracy: %s" % (epoch, loss.data, accuracy.data)
        if accuracy.data == 1:
            break
        optimizer.update()
    print "epoch: %s, loss: %s, accuracy: %s" % (epoch, loss.data, accuracy.data)
    return epoch, accuracy.data
Example #12
class TestNestedFunctionSet(TestCase):
    def setUp(self):
        self.fs1 = FunctionSet(
            a = MockFunction((1, 2)))
        self.fs2 = FunctionSet(
            fs1 = self.fs1,
            b  = MockFunction((3, 4)))

    def test_get_sorted_funcs(self):
        self.assertItemsEqual([k for (k, v) in self.fs2._get_sorted_funcs()], ('b', 'fs1'))

    def test_collect_parameters(self):
        p_b = np.zeros((3, 4)).astype(np.float32)
        p_a = np.zeros((1, 2)).astype(np.float32)
        gp_b = np.ones((3, 4)).astype(np.float32)
        gp_a = np.ones((1, 2)).astype(np.float32)

        actual = self.fs2.collect_parameters()
        self.assertTrue(map(len, actual) == [2, 2])
        self.assertTrue((actual[0][0] == p_b).all())
        self.assertTrue((actual[0][1] == p_a).all())
        self.assertTrue((actual[1][0] == gp_b).all())
        self.assertTrue((actual[1][1] == gp_a).all())

    def test_pickle_cpu(self):
        fs2_serialized = pickle.dumps(self.fs2)
        fs2_loaded = pickle.loads(fs2_serialized)
        self.assertTrue((self.fs2.b.p == fs2_loaded.b.p).all())
        self.assertTrue((self.fs2.fs1.a.p == fs2_loaded.fs1.a.p).all())

    @attr.gpu
    def test_pickle_gpu(self):
        self.fs2.to_gpu()
        fs2_serialized = pickle.dumps(self.fs2)
        fs2_loaded = pickle.loads(fs2_serialized)
        fs2_loaded.to_cpu()
        self.fs2.to_cpu()

        self.assertTrue((self.fs2.b.p == fs2_loaded.b.p).all())
        self.assertTrue((self.fs2.fs1.a.p == fs2_loaded.fs1.a.p).all())
class MNISTNet():
    def __init__(self):
        n_in = 28 * 28
        n_hidden = 100
        self.model = FunctionSet(
            encode=F.Linear(n_in, n_hidden),
            decode=F.Linear(n_hidden, n_in)
        )
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model.collect_parameters())

    def encode(self, x_var):
        return F.sigmoid(self.model.encode(x_var))

    def decode(self, x_var):
        return F.sigmoid(self.model.decode(x_var))

    def predict(self, x_data):
        x = Variable(x_data)
        p = self.encode(x)
        return p.data

    def cost(self, x_data, dropout=True):
        x = Variable(x_data)
        t = Variable(x_data)
        if dropout:
            x_n = F.dropout(x, ratio=0.4)
        else:
            x_n = x
        h = self.encode(x_n)
        y = self.decode(h)
        return F.mean_squared_error(y, t)

    def train(self, x_data):
        self.optimizer.zero_grads()
        loss = self.cost(x_data)
        loss.backward()
        self.optimizer.update()
        return float(loss.data)
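A short usage sketch (a random batch stands in for flattened MNIST digits here):

# Hypothetical usage of MNISTNet: one training step plus a look at the hidden codes.
net = MNISTNet()
x_batch = np.random.rand(100, 28 * 28).astype(np.float32)
print "loss:", net.train(x_batch)
print "code shape:", net.predict(x_batch).shape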
Example #14
def main(n_bit, h1_size):
    if h1_size > 0:
        model = FunctionSet(
            l1=F.Linear(n_bit, h1_size),
            l2=F.Linear(h1_size, 2**n_bit)
        )
    else:
        model = FunctionSet(
            l1=F.Linear(n_bit, 2**n_bit)
        )
    optimizer = optimizers.SGD()
    optimizer.setup(model.collect_parameters())
    x_data, t_data = generate_training_cases(n_bit)
    for epoch in range(100000):
        optimizer.zero_grads()
        loss, accuracy = forward(model, x_data, t_data)
        loss.backward()
        if epoch % 100 == 0:
            print "epoch: %s, loss: %s, accuracy: %s" % (epoch, loss.data, accuracy.data)
        if accuracy.data == 1:
            break
        optimizer.update()
    print "epoch: %s, loss: %s, accuracy: %s" % (epoch, loss.data, accuracy.data)
    return epoch, accuracy.data
class DQN_class:
	gamma = 0.99
	#initial_exploration = 10**2
	initial_exploration = 10**2
	replay_size = 32  # Replay (batch) size
	target_model_update_freq = 10**4  # Target update frequency. original: 10^4
	#data_size = 10**6
	data_size = 10**6

	def __init__(self, enable_controller=[0, 1, 2, 3, 4, 5, 6, 7, 8]):
		#	"""	[ 0, 0],
		#		[ 0, 1],
		#		[ 0,-1],
		#		[ 1, 0],
		#		[ 1, 1],
		#		[ 1,-1],
		#		[-1, 0],
		#		[-1, 1],
		#		[-1,-1]]):"""
		self.num_of_actions = len(enable_controller)
		self.enable_controller = enable_controller

		print "Initializing DQN..."
		print "CUDA init"
		#cuda.init()

		print "Model Building"
		self.model = FunctionSet(
			#l1 = F.Linear(INPUT_SIZE, 5000),	# input map[100, 100] + v[2] + w[1] + wp[2]
			l1 = F.Linear(INPUT_SIZE, 100),	# input map[100, 100] + v[2] + w[1] + wp[2]
			#l2 = F.Linear(5000, 1000),
			#l3 = F.Linear(1000, 500),
			#l4 = F.Linear(500, 100),
			#l5 = F.Linear(100, self.num_of_actions,
			l2 = F.Linear(100, self.num_of_actions,
						initialW=np.zeros((self.num_of_actions, 100), dtype=np.float32))
		).to_gpu()

		self.model_target = copy.deepcopy(self.model)
		
		print "Initizlizing Optimizer"
		self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)	### 重要!!!!  RMSProp!!
		self.optimizer.setup(self.model.collect_parameters())

		# History Data :  D=[s, a, r, s_dash, end_episode_flag]
		self.D = [np.zeros((self.data_size, INPUT_SIZE), dtype=np.float32),
				  np.zeros(self.data_size, dtype=np.uint8),
				  np.zeros((self.data_size, 1), dtype=np.float32),
				  np.zeros((self.data_size, INPUT_SIZE), dtype=np.float32),
				  np.zeros((self.data_size, 1), dtype=np.bool)]
		#self.D = [np.zeros((self.data_size, INPUT_SIZE), dtype=np.uint8),
		#		  np.zeros(self.data_size, dtype=np.uint8),
		#		  np.zeros((self.data_size, 1), dtype=np.int8),
		#		  np.zeros((self.data_size, INPUT_SIZE), dtype=np.uint8),
		#		  np.zeros((self.data_size, 1), dtype=np.bool)]

	def forward(self, state, action, Reward, state_dash, episode_end):
		num_of_batch = state.shape[0]
		s = Variable(state)
		s_dash = Variable(state_dash)

		Q = self.Q_func(s)  # Get Q-value

		# Generate Target Signals
		tmp = self.Q_func_target(s_dash)  # Q(s',*)
		tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
		max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
		target = np.asanyarray(Q.data.get(), dtype=np.float32)

		for i in xrange(num_of_batch):
			if not episode_end[i][0]:
				tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
			else:
				tmp_ = np.sign(Reward[i])

			#action_index = self.action_to_index(action[i])
			#target[i, action_index] = tmp_
			target[i, action[i]] = tmp_

		# TD-error clipping
		td = Variable(cuda.to_gpu(target)) - Q  # TD error
		#print "td-error"
		#print "np.max(td.data) : ",
		#print np.max(td.data.get())
		# Purpose unclear; this effectively leaves td = td_clip
		td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
		td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)
		#print "td_clip.data :",
		#print td_clip.data

		zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions))).astype(np.float32))
		#zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions))))
		loss = F.mean_squared_error(td_clip, zero_val)
		return loss, Q

	# Store the experience data
	def stockExperience(self, time,
						state, action, reward, state_dash,
						episode_end_flag):
		data_index = time % self.data_size

		if episode_end_flag is True:
			self.D[0][data_index] = state
			self.D[1][data_index] = action
			self.D[2][data_index] = reward
		else:
			self.D[0][data_index] = state
			self.D[1][data_index] = action
			self.D[2][data_index] = reward
			self.D[3][data_index] = state_dash
		self.D[4][data_index] = episode_end_flag

	# Mini-batch learning
	def experienceReplay(self, time):
		if self.initial_exploration < time:
			# Pick up replay_size number of samples from the Data
			if time < self.data_size:  # during the first sweep of the History Data
				replay_index = np.random.randint(0, time, (self.replay_size, 1))
			else:
				replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

			#s_replay = np.ndarray(shape=(self.replay_size, 100, 100), dtype=np.float32)
			s_replay = np.ndarray(shape=(self.replay_size, INPUT_SIZE), dtype=np.float32)
			a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
			r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
			s_dash_replay = np.ndarray(shape=(self.replay_size, INPUT_SIZE), dtype=np.float32)
			episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
			for i in xrange(self.replay_size):
				s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
				a_replay[i] = self.D[1][replay_index[i]]
				r_replay[i] = self.D[2][replay_index[i]]
				s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
				episode_end_replay[i] = self.D[4][replay_index[i]]
				if i == 0:
					print "s", s_replay[0][0], s_replay[0][1]*180/np.pi
					print "a", a_replay[0]
					print "s\'", s_dash_replay[0][0], s_dash_replay[0][1]*180/np.pi
					print "r", r_replay[0]

			s_replay = cuda.to_gpu(s_replay)
			s_dash_replay = cuda.to_gpu(s_dash_replay)

			# Gradient-based update
			self.optimizer.zero_grads()
			loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
			loss.backward()  # backpropagation
			self.optimizer.update()  # learning step: update the network parameters

	def Q_func(self, state):
		#h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs into [0.0 1.0]
		h1 = F.relu(self.model.l1(state))  # note: the [0, 1] input scaling is commented out above
		#h2 = F.relu(self.model.l2(h1))
		#h3 = F.relu(self.model.l3(h2))
		#h4 = F.relu(self.model.l4(h3))
		#Q = self.model.l5(h4)
		Q = self.model.l2(h1)
		return Q

	def Q_func_target(self, state):
		#h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs into [0.0 1.0]
		h1 = F.relu(self.model_target.l1(state))  # note: the [0, 1] input scaling is commented out above
		#h2 = F.relu(self.model_target.l2(h1))
		#h3 = F.relu(self.model_target.l3(h2))
		#h4 = F.relu(self.model_target.l4(h3))
		#Q = self.model.l5(h4)
		Q = self.model_target.l2(h1)  # use the target network's layer here (the original used self.model.l2, which looks like a bug)
		return Q

	def e_greedy(self, state, epsilon):
		s = Variable(state)
		Q = self.Q_func(s)
		Q = Q.data

		if np.random.rand() < epsilon:
			#index_action = np.random.randint(0, self.num_of_actions)
			action = np.random.randint(0, self.num_of_actions)
			print "RANDOM"
		else:
			#index_action = np.argmax(Q.get())
			action = np.argmax(Q.get())
			print "GREEDY"
		#return self.index_to_action(index_action), Q
		return action, Q

	def action_to_vec(self, action, vec):
		#	"""	[ 0, 0],
		#		[ 0, 1],
		#		[ 0,-1],
		#		[ 1, 0],
		#		[ 1, 1],
		#		[ 1,-1],
		#		[-1, 0],
		#		[-1, 1],
		#		[-1,-1]]):"""
		#vec = Twist()
		if action == 3 or action == 4 or action == 5:
			#vec.linear.x += 0.1
			vec.linear.x = 0.3
		elif action == 6 or action == 7 or action == 8:
			#vec.linear.x -= 0.1
			vec.linear.x = -0.3
		else:
			vec.linear.x = 0.0

		if action == 1 or action == 4 or action == 7:
			#vec.angular.z += 0.1
			vec.angular.z = 0.3
		elif action == 2 or action == 5 or action == 8:
			#vec.angular.z -= 0.1
			vec.angular.z = -0.3
		else:
			vec.angular.z = 0.0

		if vec.linear.x > 1:
			vec.linear.x = 1
		elif vec.linear.x < -1:
			vec.linear.x = -1

		if vec.angular.z > 1:
			vec.angular.z = 1
		elif vec.angular.z < -1:
			vec.angular.z = -1

		return vec
Example #16
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**3  # Initial exploration. original
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**2  # Target update frequency. original
    data_size = 10**5  # Data size of history. original
     
    #actions are 0 => do nothing, 1 -> buy, -1 sell
    def __init__(self, input_vector_length,enable_controller=[0, 1, 2]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"
        self.input_vector_length = input_vector_length

        print "Initializing DQN..."
#   Initialization for Chainer 1.1.0 or older.
#        print "CUDA init"
#        cuda.init()
        
        #inputs --> 5 * 14 (with 10 temporality) + 5 (of last one hour) + 5 (of last 24 hour)
        print "Model Building"
        self.model = FunctionSet(
            l1=F.Linear(input_vector_length, 500),
            l2=F.Linear(500, 250),
            l3=F.Linear(250, 80),
            q_value=F.Linear(80, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 80),
                                               dtype=np.float32))
        ).to_gpu()

        print "Initizlizing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.0002, alpha=0.3, momentum=0.2)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, self.input_vector_length), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, self.input_vector_length), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        max_Q_dash_ = self.Q_func(s_dash)
        tmp = list(map(np.max, max_Q_dash_.data.get()))
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            target[i, self.action_to_index(action[i])] = tmp_

        loss = F.mean_squared_error(Variable(cuda.to_gpu(target)), Q)
        return loss, Q

    def stockExperience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.input_vector_length), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.input_vector_length), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        #todo might want to normalize input, but for now I will do that outside this class 
        h1 = F.relu(self.model.l1(state))  
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        Q = self.model.q_value(h3)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"

        return self.index_to_action(index_action), Q
    
    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)
    
    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
Example #17
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor

    def __init__(self, enable_controller=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."
#	Initialization of Chainer 1.1.0 or older.
#        print "CUDA init"
#        cuda.init()

        print "Model Building"
        w = math.sqrt(2)  # MSRA scaling
        self.model = FunctionSet(
            conv1=F.Convolution2D(3,   64,  7, wscale=w, stride=2, pad=3),
            conv2_1_1=F.Convolution2D(64,   64,  1, wscale=w, stride=1),
            conv2_1_2=F.Convolution2D(64,   64,  3, wscale=w, stride=1, pad=1),
            conv2_1_3=F.Convolution2D(64,  256,  1, wscale=w, stride=1),
            conv2_1_ex=F.Convolution2D(64,  256,  1, wscale=w, stride=1),
            conv2_2_1=F.Convolution2D(256,   64,  1, wscale=w, stride=1),
            conv2_2_2=F.Convolution2D(64,   64,  3, wscale=w, stride=1, pad=1),
            conv2_2_3=F.Convolution2D(64,  256,  1, wscale=w, stride=1),
            conv2_3_1=F.Convolution2D(256,   64,  1, wscale=w, stride=1),
            conv2_3_2=F.Convolution2D(64,   64,  3, wscale=w, stride=1, pad=1),
            conv2_3_3=F.Convolution2D(64,  256,  1, wscale=w, stride=1),
            conv3_1_1=F.Convolution2D(256,  128,  1, wscale=w, stride=2),
            conv3_1_2=F.Convolution2D(128,  128,  3, wscale=w, stride=1, pad=1),
            conv3_1_3=F.Convolution2D(128,  512,  1, wscale=w, stride=1),
            conv3_1_ex=F.Convolution2D(256,  512,  1, wscale=w, stride=2),
            conv3_2_1=F.Convolution2D(512,  128,  1, wscale=w, stride=1),
            conv3_2_2=F.Convolution2D(128,  128,  3, wscale=w, stride=1, pad=1),
            conv3_2_3=F.Convolution2D(128,  512,  1, wscale=w, stride=1),
            conv3_3_1=F.Convolution2D(512,  128,  1, wscale=w, stride=1),
            conv3_3_2=F.Convolution2D(128,  128,  3, wscale=w, stride=1, pad=1),
            conv3_3_3=F.Convolution2D(128,  512,  1, wscale=w, stride=1),
            conv3_4_1=F.Convolution2D(512,  128,  1, wscale=w, stride=1),
            conv3_4_2=F.Convolution2D(128,  128,  3, wscale=w, stride=1, pad=1),
            conv3_4_3=F.Convolution2D(128,  512,  1, wscale=w, stride=1),
            conv3_5_1=F.Convolution2D(512,  128,  1, wscale=w, stride=1),
            conv3_5_2=F.Convolution2D(128,  128,  3, wscale=w, stride=1, pad=1),
            conv3_5_3=F.Convolution2D(128,  512,  1, wscale=w, stride=1),
            conv3_6_1=F.Convolution2D(512,  128,  1, wscale=w, stride=1),
            conv3_6_2=F.Convolution2D(128,  128,  3, wscale=w, stride=1, pad=1),
            conv3_6_3=F.Convolution2D(128,  512,  1, wscale=w, stride=1),
            conv3_7_1=F.Convolution2D(512,  128,  1, wscale=w, stride=1),
            conv3_7_2=F.Convolution2D(128,  128,  3, wscale=w, stride=1, pad=1),
            conv3_7_3=F.Convolution2D(128,  512,  1, wscale=w, stride=1),
            conv3_8_1=F.Convolution2D(512,  128,  1, wscale=w, stride=1),
            conv3_8_2=F.Convolution2D(128,  128,  3, wscale=w, stride=1, pad=1),
            conv3_8_3=F.Convolution2D(128,  512,  1, wscale=w, stride=1),
            conv4_1_1=F.Convolution2D(512,  256,  1, wscale=w, stride=2),
            conv4_1_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_1_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_1_ex=F.Convolution2D(512,  1024,  1, wscale=w, stride=2),
            conv4_2_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_2_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_2_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_3_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_3_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_3_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_4_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_4_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_4_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_5_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_5_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_5_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_6_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_6_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_6_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_7_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_7_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_7_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_8_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_8_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_8_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_9_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_9_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_9_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_10_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_10_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_10_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_11_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_11_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_11_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_12_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_12_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_12_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_13_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_13_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_13_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_14_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_14_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_14_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_15_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_15_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_15_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_16_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_16_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_16_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_17_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_17_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_17_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_18_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_18_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_18_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_19_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_19_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_19_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_20_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_20_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_20_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_21_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_21_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_21_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_22_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_22_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_22_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_23_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_23_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_23_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_24_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_24_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_24_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_25_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_25_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_25_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_26_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_26_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_26_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_27_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_27_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_27_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_28_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_28_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_28_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_29_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_29_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_29_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_30_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_30_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_30_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_31_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_31_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_31_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_32_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_32_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_32_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_33_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_33_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_33_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_34_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_34_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_34_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_35_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_35_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_35_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv4_36_1=F.Convolution2D(1024,  256,  1, wscale=w, stride=1),
            conv4_36_2=F.Convolution2D(256,  256,  3, wscale=w, stride=1, pad=1),
            conv4_36_3=F.Convolution2D(256,  1024,  1, wscale=w, stride=1),
            conv5_1_1=F.Convolution2D(1024,  512,  1, wscale=w, stride=2),
            conv5_1_2=F.Convolution2D(512,  512,  3, wscale=w, stride=1, pad=1),
            conv5_1_3=F.Convolution2D(512,  2048,  1, wscale=w, stride=1),
            conv5_1_ex=F.Convolution2D(1024,  2048,  1, wscale=w, stride=2),
            conv5_2_1=F.Convolution2D(2048,  512,  1, wscale=w, stride=1),
            conv5_2_2=F.Convolution2D(512,  512,  3, wscale=w, stride=1, pad=1),
            conv5_2_3=F.Convolution2D(512,  2048,  1, wscale=w, stride=1),
            conv5_3_1=F.Convolution2D(2048,  512,  1, wscale=w, stride=1),
            conv5_3_2=F.Convolution2D(512,  512,  3, wscale=w, stride=1, pad=1),
            conv5_3_3=F.Convolution2D(512,  2048,  1, wscale=w, stride=1),
            q_value=F.Linear(2048, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 2048),
                                               dtype=np.float32))
        )

        self.model_target = copy.deepcopy(self.model)

        print "Initizlizing Optimizer"
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model.collect_parameters())

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((num_of_batch, self.num_of_actions), dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def Q_func(self, state):
        h = F.relu(self.model.conv1(state))
        h = F.max_pooling_2d(h, 3, stride=2)

        h_rem = self.model.conv2_1_ex(h)
        h = F.relu(self.model.conv2_1_1(h))
        h = F.relu(self.model.conv2_1_2(h))
        h = self.model.conv2_1_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv2_2_1(h))
        h = F.relu(self.model.conv2_2_2(h))
        h = self.model.conv2_2_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv2_3_1(h))
        h = F.relu(self.model.conv2_3_2(h))
        h = self.model.conv2_3_3(h)
        h = F.relu(h + h_rem)

        h_rem = self.model.conv3_1_ex(h)
        h = F.relu(self.model.conv3_1_1(h))
        h = F.relu(self.model.conv3_1_2(h))
        h = self.model.conv3_1_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv3_2_1(h))
        h = F.relu(self.model.conv3_2_2(h))
        h = self.model.conv3_2_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv3_3_1(h))
        h = F.relu(self.model.conv3_3_2(h))
        h = self.model.conv3_3_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv3_4_1(h))
        h = F.relu(self.model.conv3_4_2(h))
        h = self.model.conv3_4_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv3_5_1(h))
        h = F.relu(self.model.conv3_5_2(h))
        h = self.model.conv3_5_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv3_6_1(h))
        h = F.relu(self.model.conv3_6_2(h))
        h = self.model.conv3_6_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv3_7_1(h))
        h = F.relu(self.model.conv3_7_2(h))
        h = self.model.conv3_7_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv3_8_1(h))
        h = F.relu(self.model.conv3_8_2(h))
        h = self.model.conv3_8_3(h)
        h = F.relu(h + h_rem)

        h_rem = self.model.conv4_1_ex(h)
        h = F.relu(self.model.conv4_1_1(h))
        h = F.relu(self.model.conv4_1_2(h))
        h = self.model.conv4_1_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_2_1(h))
        h = F.relu(self.model.conv4_2_2(h))
        h = self.model.conv4_2_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_3_1(h))
        h = F.relu(self.model.conv4_3_2(h))
        h = self.model.conv4_3_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_4_1(h))
        h = F.relu(self.model.conv4_4_2(h))
        h = self.model.conv4_4_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_5_1(h))
        h = F.relu(self.model.conv4_5_2(h))
        h = self.model.conv4_5_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_6_1(h))
        h = F.relu(self.model.conv4_6_2(h))
        h = self.model.conv4_6_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_7_1(h))
        h = F.relu(self.model.conv4_7_2(h))
        h = self.model.conv4_7_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_8_1(h))
        h = F.relu(self.model.conv4_8_2(h))
        h = self.model.conv4_8_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_9_1(h))
        h = F.relu(self.model.conv4_9_2(h))
        h = self.model.conv4_9_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_10_1(h))
        h = F.relu(self.model.conv4_10_2(h))
        h = self.model.conv4_10_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_11_1(h))
        h = F.relu(self.model.conv4_11_2(h))
        h = self.model.conv4_11_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_12_1(h))
        h = F.relu(self.model.conv4_12_2(h))
        h = self.model.conv4_12_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_13_1(h))
        h = F.relu(self.model.conv4_13_2(h))
        h = self.model.conv4_13_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_14_1(h))
        h = F.relu(self.model.conv4_14_2(h))
        h = self.model.conv4_14_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_15_1(h))
        h = F.relu(self.model.conv4_15_2(h))
        h = self.model.conv4_15_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_16_1(h))
        h = F.relu(self.model.conv4_16_2(h))
        h = self.model.conv4_16_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_17_1(h))
        h = F.relu(self.model.conv4_17_2(h))
        h = self.model.conv4_17_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_18_1(h))
        h = F.relu(self.model.conv4_18_2(h))
        h = self.model.conv4_18_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_19_1(h))
        h = F.relu(self.model.conv4_19_2(h))
        h = self.model.conv4_19_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_20_1(h))
        h = F.relu(self.model.conv4_20_2(h))
        h = self.model.conv4_20_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_21_1(h))
        h = F.relu(self.model.conv4_21_2(h))
        h = self.model.conv4_21_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_22_1(h))
        h = F.relu(self.model.conv4_22_2(h))
        h = self.model.conv4_22_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_23_1(h))
        h = F.relu(self.model.conv4_23_2(h))
        h = self.model.conv4_23_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_24_1(h))
        h = F.relu(self.model.conv4_24_2(h))
        h = self.model.conv4_24_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_25_1(h))
        h = F.relu(self.model.conv4_25_2(h))
        h = self.model.conv4_25_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_26_1(h))
        h = F.relu(self.model.conv4_26_2(h))
        h = self.model.conv4_26_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_27_1(h))
        h = F.relu(self.model.conv4_27_2(h))
        h = self.model.conv4_27_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_28_1(h))
        h = F.relu(self.model.conv4_28_2(h))
        h = self.model.conv4_28_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_29_1(h))
        h = F.relu(self.model.conv4_29_2(h))
        h = self.model.conv4_29_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_30_1(h))
        h = F.relu(self.model.conv4_30_2(h))
        h = self.model.conv4_30_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_31_1(h))
        h = F.relu(self.model.conv4_31_2(h))
        h = self.model.conv4_31_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_32_1(h))
        h = F.relu(self.model.conv4_32_2(h))
        h = self.model.conv4_32_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_33_1(h))
        h = F.relu(self.model.conv4_33_2(h))
        h = self.model.conv4_33_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_34_1(h))
        h = F.relu(self.model.conv4_34_2(h))
        h = self.model.conv4_34_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_35_1(h))
        h = F.relu(self.model.conv4_35_2(h))
        h = self.model.conv4_35_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv4_36_1(h))
        h = F.relu(self.model.conv4_36_2(h))
        h = self.model.conv4_36_3(h)
        h = F.relu(h + h_rem)

        h_rem = self.model.conv5_1_ex(h)
        h = F.relu(self.model.conv5_1_1(h))
        h = F.relu(self.model.conv5_1_2(h))
        h = self.model.conv5_1_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv5_2_1(h))
        h = F.relu(self.model.conv5_2_2(h))
        h = self.model.conv5_2_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model.conv5_3_1(h))
        h = F.relu(self.model.conv5_3_2(h))
        h = self.model.conv5_3_3(h)
        h = F.relu(h + h_rem)

        h = F.average_pooling_2d(h, 7)
        Q = self.model.q_value(h)
        return Q

    def Q_func_target(self, state):
        h = F.relu(self.model_target.conv1(state))
        h = F.max_pooling_2d(h, 3, stride=2)

        h_rem = self.model_target.conv2_1_ex(h)
        h = F.relu(self.model_target.conv2_1_1(h))
        h = F.relu(self.model_target.conv2_1_2(h))
        h = self.model_target.conv2_1_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv2_2_1(h))
        h = F.relu(self.model_target.conv2_2_2(h))
        h = self.model_target.conv2_2_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv2_3_1(h))
        h = F.relu(self.model_target.conv2_3_2(h))
        h = self.model_target.conv2_3_3(h)
        h = F.relu(h + h_rem)

        h_rem = self.model_target.conv3_1_ex(h)
        h = F.relu(self.model_target.conv3_1_1(h))
        h = F.relu(self.model_target.conv3_1_2(h))
        h = self.model_target.conv3_1_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv3_2_1(h))
        h = F.relu(self.model_target.conv3_2_2(h))
        h = self.model_target.conv3_2_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv3_3_1(h))
        h = F.relu(self.model_target.conv3_3_2(h))
        h = self.model_target.conv3_3_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv3_4_1(h))
        h = F.relu(self.model_target.conv3_4_2(h))
        h = self.model_target.conv3_4_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv3_5_1(h))
        h = F.relu(self.model_target.conv3_5_2(h))
        h = self.model_target.conv3_5_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv3_6_1(h))
        h = F.relu(self.model_target.conv3_6_2(h))
        h = self.model_target.conv3_6_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv3_7_1(h))
        h = F.relu(self.model_target.conv3_7_2(h))
        h = self.model_target.conv3_7_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv3_8_1(h))
        h = F.relu(self.model_target.conv3_8_2(h))
        h = self.model_target.conv3_8_3(h)
        h = F.relu(h + h_rem)

        h_rem = self.model_target.conv4_1_ex(h)
        h = F.relu(self.model_target.conv4_1_1(h))
        h = F.relu(self.model_target.conv4_1_2(h))
        h = self.model_target.conv4_1_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_2_1(h))
        h = F.relu(self.model_target.conv4_2_2(h))
        h = self.model_target.conv4_2_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_3_1(h))
        h = F.relu(self.model_target.conv4_3_2(h))
        h = self.model_target.conv4_3_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_4_1(h))
        h = F.relu(self.model_target.conv4_4_2(h))
        h = self.model_target.conv4_4_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_5_1(h))
        h = F.relu(self.model_target.conv4_5_2(h))
        h = self.model_target.conv4_5_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_6_1(h))
        h = F.relu(self.model_target.conv4_6_2(h))
        h = self.model_target.conv4_6_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_7_1(h))
        h = F.relu(self.model_target.conv4_7_2(h))
        h = self.model_target.conv4_7_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_8_1(h))
        h = F.relu(self.model_target.conv4_8_2(h))
        h = self.model_target.conv4_8_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_9_1(h))
        h = F.relu(self.model_target.conv4_9_2(h))
        h = self.model_target.conv4_9_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_10_1(h))
        h = F.relu(self.model_target.conv4_10_2(h))
        h = self.model_target.conv4_10_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_11_1(h))
        h = F.relu(self.model_target.conv4_11_2(h))
        h = self.model_target.conv4_11_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_12_1(h))
        h = F.relu(self.model_target.conv4_12_2(h))
        h = self.model_target.conv4_12_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_13_1(h))
        h = F.relu(self.model_target.conv4_13_2(h))
        h = self.model_target.conv4_13_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_14_1(h))
        h = F.relu(self.model_target.conv4_14_2(h))
        h = self.model_target.conv4_14_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_15_1(h))
        h = F.relu(self.model_target.conv4_15_2(h))
        h = self.model_target.conv4_15_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_16_1(h))
        h = F.relu(self.model_target.conv4_16_2(h))
        h = self.model_target.conv4_16_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_17_1(h))
        h = F.relu(self.model_target.conv4_17_2(h))
        h = self.model_target.conv4_17_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_18_1(h))
        h = F.relu(self.model_target.conv4_18_2(h))
        h = self.model_target.conv4_18_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_19_1(h))
        h = F.relu(self.model_target.conv4_19_2(h))
        h = self.model_target.conv4_19_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_20_1(h))
        h = F.relu(self.model_target.conv4_20_2(h))
        h = self.model_target.conv4_20_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_21_1(h))
        h = F.relu(self.model_target.conv4_21_2(h))
        h = self.model_target.conv4_21_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_22_1(h))
        h = F.relu(self.model_target.conv4_22_2(h))
        h = self.model_target.conv4_22_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_23_1(h))
        h = F.relu(self.model_target.conv4_23_2(h))
        h = self.model_target.conv4_23_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_24_1(h))
        h = F.relu(self.model_target.conv4_24_2(h))
        h = self.model_target.conv4_24_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_25_1(h))
        h = F.relu(self.model_target.conv4_25_2(h))
        h = self.model_target.conv4_25_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_26_1(h))
        h = F.relu(self.model_target.conv4_26_2(h))
        h = self.model_target.conv4_26_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_27_1(h))
        h = F.relu(self.model_target.conv4_27_2(h))
        h = self.model_target.conv4_27_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_28_1(h))
        h = F.relu(self.model_target.conv4_28_2(h))
        h = self.model_target.conv4_28_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_29_1(h))
        h = F.relu(self.model_target.conv4_29_2(h))
        h = self.model_target.conv4_29_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_30_1(h))
        h = F.relu(self.model_target.conv4_30_2(h))
        h = self.model_target.conv4_30_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_31_1(h))
        h = F.relu(self.model_target.conv4_31_2(h))
        h = self.model_target.conv4_31_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_32_1(h))
        h = F.relu(self.model_target.conv4_32_2(h))
        h = self.model_target.conv4_32_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_33_1(h))
        h = F.relu(self.model_target.conv4_33_2(h))
        h = self.model_target.conv4_33_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_34_1(h))
        h = F.relu(self.model_target.conv4_34_2(h))
        h = self.model_target.conv4_34_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_35_1(h))
        h = F.relu(self.model_target.conv4_35_2(h))
        h = self.model_target.conv4_35_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv4_36_1(h))
        h = F.relu(self.model_target.conv4_36_2(h))
        h = self.model_target.conv4_36_3(h)
        h = F.relu(h + h_rem)

        h_rem = self.model_target.conv5_1_ex(h)
        h = F.relu(self.model_target.conv5_1_1(h))
        h = F.relu(self.model_target.conv5_1_2(h))
        h = self.model_target.conv5_1_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv5_2_1(h))
        h = F.relu(self.model_target.conv5_2_2(h))
        h = self.model_target.conv5_2_3(h)
        h = F.relu(h + h_rem)
        h_rem = h
        h = F.relu(self.model_target.conv5_3_1(h))
        h = F.relu(self.model_target.conv5_3_2(h))
        h = self.model_target.conv5_3_3(h)
        h = F.relu(h + h_rem)

        h = F.average_pooling_2d(h, 7)
        Q = self.model_target.q_value(h)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action)

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
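The two forward passes above (Q_func and Q_func_target) repeat the same three-convolution bottleneck pattern for every residual block. As a minimal sketch that is not part of the original example, the pattern can be factored into a helper that looks the pre-built layers up by name; the helper name `bottleneck` and the loop bounds below are illustrative assumptions.

import chainer.functions as F

def bottleneck(model, prefix, h, h_rem):
    # One residual bottleneck block: <prefix>_1 -> <prefix>_2 -> <prefix>_3,
    # then add the shortcut h_rem and apply ReLU.
    out = F.relu(getattr(model, prefix + "_1")(h))
    out = F.relu(getattr(model, prefix + "_2")(out))
    out = getattr(model, prefix + "_3")(out)
    return F.relu(out + h_rem)

# Usage sketch mirroring the conv4_* stage above:
# h_rem = model.conv4_1_ex(h)          # projection shortcut for the first block
# for i in xrange(1, 37):
#     h = bottleneck(model, "conv4_%d" % i, h, h_rem)
#     h_rem = h                         # identity shortcut for the next block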
Example #18
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."
        # Initialization for Chainer 1.1.0 or older:
        # print "CUDA init"
        # cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 16, ksize=8, stride=4, wscale=np.sqrt(2)),
            l2=F.Convolution2D(16, 32, ksize=4, stride=2, wscale=np.sqrt(2)),
            l3=F.Linear(2592, 256),
            q_value=F.Linear(256, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 256),
                                               dtype=np.float32))
        ).to_gpu()

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.0002, alpha=0.3, momentum=0.2)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        max_Q_dash_ = self.Q_func(s_dash)
        tmp = list(map(np.max, max_Q_dash_.data.get()))
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            target[i, self.action_to_index(action[i])] = tmp_

        loss = F.mean_squared_error(Variable(cuda.to_gpu(target)), Q)
        return loss, Q

    def stockExperience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        Q = self.model.q_value(h3)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"

        return self.index_to_action(index_action), Q

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
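For reference, forward() above builds its training target by clipping the raw reward to its sign and backing up the discounted greedy value of the next state; only the entry of the action actually taken is changed. A tiny NumPy-only sketch of the same rule, with toy numbers that are not from the source:

import numpy as np

gamma = 0.99
Q_s = np.array([0.5, -0.2, 1.3], dtype=np.float32)  # Q(s, *) from the network
max_Q_dash = 2.0                                     # max_a Q(s', a)
reward = 4.0                                         # raw score delta
action_index = 1
episode_end = False

target = Q_s.copy()
if not episode_end:
    target[action_index] = np.sign(reward) + gamma * max_Q_dash  # 1 + 0.99*2 = 2.98
else:
    target[action_index] = np.sign(reward)
# The loss is then mean_squared_error(target, Q_s), so only the taken
# action's entry contributes a non-zero error.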
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 100  # reduced from 10**4. Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."

        print "Model Building"
        self.CNN_model = FunctionSet(
            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
            )

        self.model = FunctionSet(
            l4=F.Linear(3136, 512, wscale=np.sqrt(2)),
            q_value=F.Linear(512, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 512),
                                               dtype=np.float32))
        ).to_gpu()
        
        d = 'elite/'
        
        self.CNN_model.l1.W.data = np.load(d+'l1_W.npy')#.astype(np.float32)
        self.CNN_model.l1.b.data = np.load(d+'l1_b.npy')#.astype(np.float32)
        self.CNN_model.l2.W.data = np.load(d+'l2_W.npy')#.astype(np.float32)
        self.CNN_model.l2.b.data = np.load(d+'l2_b.npy')#.astype(np.float32)
        self.CNN_model.l3.W.data = np.load(d+'l3_W.npy')#.astype(np.float32)
        self.CNN_model.l3.b.data = np.load(d+'l3_b.npy')#.astype(np.float32)

        self.CNN_model = self.CNN_model.to_gpu()
        self.CNN_model_target = copy.deepcopy(self.CNN_model)
        self.model_target = copy.deepcopy(self.model)


        
        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool),
                  np.zeros((self.data_size, 1), dtype=np.uint8)]
        


    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time,
                        state, action, lstm_reward, state_dash,
                        episode_end_flag, ale_reward):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = lstm_reward
            self.D[5][data_index] = ale_reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = lstm_reward
            self.D[3][data_index] = state_dash
            self.D[5][data_index] = ale_reward
        self.D[4][data_index] = episode_end_flag


    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.CNN_model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.CNN_model.l2(h1))
        h3 = F.relu(self.CNN_model.l3(h2))
        h4 = F.relu(self.model.l4(h3))
        #test now
        #print h3.data.shape
        Q = self.model.q_value(h4)
        return Q 

    
    def Q_func_LSTM(self, state):
        h1 = F.relu(self.CNN_model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.CNN_model.l2(h1))
        h3 = F.relu(self.CNN_model.l3(h2))
        return h3.data.get()
    
        
    def Q_func_target(self, state):
        h1 = F.relu(self.CNN_model_target.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.CNN_model_target.l2(h1))
        h3 = F.relu(self.CNN_model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3))  # use the target network's hidden layer
        Q = self.model_target.q_value(h4)
        return Q
    
    def LSTM_reward(self, lstm_out, state_next):
        lstm_reward = np.sign((self.lstm_loss - (lstm_out - state_next)**2))
        return lstm_reward

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
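The TD-error clipping in forward() keeps the quadratic loss inside [-1, 1] and normalizes larger errors to +/-1, with td_tmp only there to avoid dividing by zero. A small NumPy sketch of the same element-wise rule, with illustrative values only:

import numpy as np

td = np.array([0.3, -0.7, 2.5, -4.0], dtype=np.float32)   # raw TD errors
td_tmp = td + 1000.0 * (np.abs(td) <= 1)                   # avoid zero division
td_clip = td * (np.abs(td) <= 1) + td / np.abs(td_tmp) * (np.abs(td) > 1)
print td_clip   # [ 0.3 -0.7  1.  -1. ] -> errors beyond +/-1 are clipped to +/-1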
Example #20
# Neural net architecture
def forward(x_data, y_data, train=True):
    x, t = Variable(x_data), Variable(y_data)

    h1 = F.dropout(F.relu(model.l1(x)), train=train)
    h2 = F.dropout(F.relu(model.l2(h1)), train=train)
    y = model.l3(h2)
    # Since this is multi-class classification, use softmax cross-entropy
    # as the loss function to compute the error
    return F.softmax_cross_entropy(y, t), F.accuracy(y, t)


# Setup optimizer
optimizer = optimizers.Adam()
optimizer.setup(model.collect_parameters())

train_loss = []
train_acc = []
test_loss = []
test_acc = []

l1_W = []
l2_W = []
l3_W = []

# Learning loop
for epoch in xrange(1, n_epoch + 1):
    print 'epoch', epoch

    # training
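    # The example is cut off here. Following the pattern of the other MNIST
    # examples in this collection (N, batchsize, x_train and y_train assumed
    # to be defined as usual), the training part typically continues roughly
    # like this sketch:
    perm = np.random.permutation(N)
    sum_accuracy, sum_loss = 0, 0
    for i in xrange(0, N, batchsize):
        x_batch = x_train[perm[i:i + batchsize]]
        y_batch = y_train[perm[i:i + batchsize]]

        optimizer.zero_grads()
        loss, acc = forward(x_batch, y_batch)
        loss.backward()
        optimizer.update()

        sum_loss += float(loss.data) * batchsize
        sum_accuracy += float(acc.data) * batchsize

    print 'train mean loss={}, accuracy={}'.format(sum_loss / N, sum_accuracy / N)
    train_loss.append(sum_loss / N)
    train_acc.append(sum_accuracy / N)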
Example #21
class QNet:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**3  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6
    hist_size = 4

    def __init__(self, use_gpu, num_of_action_type, num_of_pad, dim):
        self.use_gpu = use_gpu
        self.num_of_action_type = num_of_action_type
        self.num_of_pad = num_of_pad
        self.num_of_actions = num_of_action_type * num_of_pad
        self.dim = dim

        print("Initializing Q-Network...\n")

        self.q_net_filename = "q_net.pickle"
        if os.path.exists(self.q_net_filename):
            print("Loading Q-Network Model...\n")
            self.model = self.load_model()
        else:
            hidden_dim = 256
            self.model = FunctionSet(l4=F.Linear(self.dim * self.hist_size,
                                                 hidden_dim,
                                                 wscale=np.sqrt(2)),
                                     q_value=F.Linear(
                                         hidden_dim,
                                         self.num_of_actions,
                                         initialW=np.zeros(
                                             (self.num_of_actions, hidden_dim),
                                             dtype=np.float32)))

        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025,
                                                  alpha=0.95,
                                                  momentum=0.95,
                                                  eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.d = [
            np.zeros((self.data_size, self.hist_size, self.dim),
                     dtype=np.uint8),
            np.zeros((self.data_size, self.num_of_pad), dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.int8),
            np.zeros((self.data_size, self.hist_size, self.dim),
                     dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.bool)
        ]

    def load_model(self):
        return pickle.load(open(self.q_net_filename, 'rb'))

    def dump_model(self):
        pickle.dump(self.model, open(self.q_net_filename, 'wb'))

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]

            action_indexes = self.action_to_indexes(action[i])
            for index in action_indexes:
                target[i, index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) >
                                                                 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions),
                            dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time, state, action, reward, state_dash,
                         episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
        else:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
            self.d[3][data_index] = state_dash
        self.d[4][data_index] = episode_end_flag

    def experience_replay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time,
                                                 (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size,
                                                 (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.hist_size,
                                         self.dim),
                                  dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, self.num_of_pad),
                                  dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1),
                                  dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size,
                                              self.dim),
                                       dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1),
                                            dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.d[0][replay_index[i]],
                                         dtype=np.float32)
                a_replay[i] = self.d[1][replay_index[i]]
                r_replay[i] = self.d[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.d[3][replay_index[i]],
                                            dtype=np.float32)
                episode_end_replay[i] = self.d[4][replay_index[i]]

            if self.use_gpu >= 0:
                s_replay = cuda.to_gpu(s_replay)
                s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay,
                                   episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state))
        q = self.model.q_value(h4 / 255.0)
        return q

    def q_func_target(self, state):
        h4 = F.relu(self.model_target.l4(state / 255.0))
        q = self.model_target.q_value(h4)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            print(" Random")
            action = [
                np.random.randint(0, self.num_of_action_type)
                for i in range(self.num_of_pad)
            ]
        else:
            print("#Greedy")
            if self.use_gpu >= 0:
                action = self.indexes_to_action([
                    np.argmax(sq)
                    for sq in np.split(q.get()[0], self.num_of_pad)
                ])
            else:
                action = self.indexes_to_action(
                    [np.argmax(sq) for sq in np.split(q[0], self.num_of_pad)])
        return action, q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def indexes_to_action(self, indexes_of_action):
        return [index % self.num_of_action_type for index in indexes_of_action]

    def action_to_indexes(self, action):
        return [
            self.num_of_action_type * i + a for (i, a) in enumerate(action)
        ]
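In this QNet an action is a list with one sub-action per pad, and the flat Q-vector is laid out pad by pad, so action_to_indexes() maps pad i's sub-action a to position num_of_action_type * i + a, while indexes_to_action() and the per-chunk argmax in e_greedy() go the other way. A quick worked example with toy sizes (not from the source):

num_of_action_type = 3   # assumed: 3 possible sub-actions per pad
num_of_pad = 2           # assumed: 2 pads

action = [2, 0]          # pad 0 chooses sub-action 2, pad 1 chooses sub-action 0

# action_to_indexes: positions of the chosen sub-actions in the flat Q-vector
indexes = [num_of_action_type * i + a for (i, a) in enumerate(action)]
print indexes            # [2, 3] -> entries 0-2 belong to pad 0, entries 3-5 to pad 1

# indexes_to_action: recover the per-pad sub-action from the flat indexes
print [index % num_of_action_type for index in indexes]   # [2, 0]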
Example #22
def Main():
  import argparse
  import numpy as np
  from chainer import cuda, Variable, FunctionSet, optimizers
  import chainer.functions  as F

  parser = argparse.ArgumentParser(description='Chainer example: regression')
  parser.add_argument('--gpu', '-g', default=-1, type=int,
                      help='GPU ID (negative value indicates CPU)')
  args = parser.parse_args()

  batchsize = 10
  n_epoch   = NEpoch
  n_units   = 300  #TEST

  # Prepare dataset
  data_x, data_y = LoadData()
  batchsize= max(1,min(batchsize, len(data_y)/20))  #TEST: adjust batchsize
  #dx2,dy2=GenData(300, noise=0.0); data_x.extend(dx2); data_y.extend(dy2)
  data = np.array(data_x).astype(np.float32)
  target = np.array(data_y).astype(np.int32)  #DIFF_REG

  N= len(data) #batchsize * 30
  x_train= data
  y_train= target

  #For test:
  mi,ma,me= GetStat(data_x)
  f_reduce=lambda xa:[xa[0],xa[1]]
  f_repair=lambda xa:[xa[0],xa[1]]
  nt= 20+1
  N_test= nt*nt
  x_test= np.array(sum([[f_repair([x1,x2]) for x2 in FRange1(f_reduce(mi)[1],f_reduce(ma)[1],nt)] for x1 in FRange1(f_reduce(mi)[0],f_reduce(ma)[0],nt)],[])).astype(np.float32)
  y_test= np.array([0.0 for x in x_test]).astype(np.int32)  #DIFF_REG
  #No true test data (just for plotting)

  print 'Num of samples for train:',len(y_train),'batchsize:',batchsize
  # Dump data for plot:
  DumpData('/tmp/nn/smpl_train.dat', x_train, [[y] for y in y_train], f_reduce)  #DIFF_REG

  # Prepare multi-layer perceptron model
  model = FunctionSet(l1=F.Linear(2, n_units),
                      l2=F.Linear(n_units, n_units),
                      l3=F.Linear(n_units, 3))
  #TEST: Random bias initialization
  #, bias=Rand()
  #model.l1.b[:]= [Rand() for k in range(n_units)]
  #model.l2.b[:]= [Rand() for k in range(n_units)]
  #model.l3.b[:]= [Rand() for k in range(1)]
  #print model.l2.__dict__
  if args.gpu >= 0:
    cuda.init(args.gpu)
    model.to_gpu()

  # Neural net architecture
  def forward(x_data, y_data, train=True):
    #train= False  #TEST: Turn off dropout
    dratio= 0.2  #0.5  #TEST: Dropout ratio
    x, t = Variable(x_data), Variable(y_data)
    h1 = F.dropout(F.relu(model.l1(x)),  ratio=dratio, train=train)
    h2 = F.dropout(F.relu(model.l2(h1)), ratio=dratio, train=train)
    #h1 = F.dropout(F.leaky_relu(model.l1(x),slope=0.2),  ratio=dratio, train=train)
    #h2 = F.dropout(F.leaky_relu(model.l2(h1),slope=0.2), ratio=dratio, train=train)
    #h1 = F.dropout(F.sigmoid(model.l1(x)),  ratio=dratio, train=train)
    #h2 = F.dropout(F.sigmoid(model.l2(h1)), ratio=dratio, train=train)
    #h1 = F.dropout(F.tanh(model.l1(x)),  ratio=dratio, train=train)
    #h2 = F.dropout(F.tanh(model.l2(h1)), ratio=dratio, train=train)
    #h1 = F.dropout(model.l1(x),  ratio=dratio, train=train)
    #h2 = F.dropout(model.l2(h1), ratio=dratio, train=train)
    #h1 = F.relu(model.l1(x))
    #h2 = F.relu(model.l2(h1))
    #h1 = model.l1(x)
    #h2 = model.l2(h1)
    y  = model.l3(h2)
    #return F.mean_squared_error(y, t), y
    return F.softmax_cross_entropy(y, t), F.softmax(y)  #DIFF_REG

  # Setup optimizer
  optimizer = optimizers.AdaDelta(rho=0.9)
  #optimizer = optimizers.AdaGrad(lr=0.5)
  #optimizer = optimizers.RMSprop()
  #optimizer = optimizers.MomentumSGD()
  #optimizer = optimizers.SGD(lr=0.8)
  optimizer.setup(model.collect_parameters())

  # Learning loop
  for epoch in xrange(1, n_epoch+1):
    print 'epoch', epoch

    # training
    perm = np.random.permutation(N)
    sum_loss = 0

    for i in xrange(0, N, batchsize):
      x_batch = x_train[perm[i:i+batchsize]]
      y_batch = y_train[perm[i:i+batchsize]]
      if args.gpu >= 0:
        x_batch = cuda.to_gpu(x_batch)
        y_batch = cuda.to_gpu(y_batch)

      optimizer.zero_grads()
      loss, pred = forward(x_batch, y_batch)
      loss.backward()  #Computing gradients
      optimizer.update()

      sum_loss += float(cuda.to_cpu(loss.data)) * batchsize

    print 'train mean loss={}'.format(
        sum_loss / N)


    if epoch%10==0:
      #'''
      # testing all data
      preds = []
      x_batch = x_test[:]
      y_batch = y_test[:]
      if args.gpu >= 0:
        x_batch = cuda.to_gpu(x_batch)
        y_batch = cuda.to_gpu(y_batch)
      loss, pred = forward(x_batch, y_batch, train=False)
      preds = cuda.to_cpu(pred.data)
      sum_loss = float(cuda.to_cpu(loss.data)) * len(y_test)
      #'''

      print 'test  mean loss={}'.format(
          sum_loss / N_test)

      # Dump data for plot:
      y_pred= [[y.index(max(y))]+y for y in preds.tolist()]  #DIFF_REG
      DumpData('/tmp/nn/nn_test%04i.dat'%epoch, x_test, y_pred, f_reduce, lb=nt+1)
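The y_pred line above picks each row's class by converting the softmax output to a Python list and taking the index of its maximum. An equivalent, slightly more direct formulation with NumPy (a small sketch, assuming preds is the (N_test, 3) softmax output as above):

  import numpy as np
  y_pred = [[int(np.argmax(p))] + p.tolist() for p in preds]  # [label, p0, p1, p2] per row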
Example #23
class QNet:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**3  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6
    hist_size = 1  # original: 4

    def __init__(self, use_gpu, enable_controller, dim, epsilon, epsilon_delta, min_eps):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim
        self.epsilon = epsilon
        self.epsilon_delta = epsilon_delta
        self.min_eps = min_eps
        self.time = 0

        app_logger.info("Initializing Q-Network...")

        hidden_dim = 256
        self.model = FunctionSet(
            l4=F.Linear(self.dim*self.hist_size, hidden_dim, wscale=np.sqrt(2)),
            q_value=F.Linear(hidden_dim, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, hidden_dim),
                                               dtype=np.float32))
        )
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.d = [np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))
        q = self.model.q_value(h4)
        return q

    def q_func_target(self, state):
        h4 = F.relu(self.model_target.l4(state / 255.0))
        q = self.model_target.q_value(h4)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            app_logger.info(" Random")
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            app_logger.info("#Greedy")
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)

    def start(self, feature):
        self.state = np.zeros((self.hist_size, self.dim), dtype=np.uint8)
        self.state[0] = feature

        state_ = np.asanyarray(self.state.reshape(1, self.hist_size, self.dim), dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Generate an Action e-greedy
        action, q_now = self.e_greedy(state_, self.epsilon)
        return_action = action

        return return_action

    def update_model(self, replayed_experience):
        if replayed_experience[0]:
            self.optimizer.zero_grads()
            loss, _ = self.forward(replayed_experience[1], replayed_experience[2],
                                        replayed_experience[3], replayed_experience[4], replayed_experience[5])
            loss.backward()
            self.optimizer.update()

        # Target model update
        if replayed_experience[0] and np.mod(self.time, self.target_model_update_freq) == 0:
            app_logger.info("Model Updated")
            self.target_model_update()

        self.time += 1
        app_logger.info("step: {}".format(self.time))

    def step(self, features):
        if self.hist_size == 4:
            self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], features], dtype=np.uint8)
        elif self.hist_size == 2:
            self.state = np.asanyarray([self.state[1], features], dtype=np.uint8)
        elif self.hist_size == 1:
            self.state = np.asanyarray([features], dtype=np.uint8)
        else:
            app_logger.error("self.DQN.hist_size err")

        state_ = np.asanyarray(self.state.reshape(1, self.hist_size, self.dim), dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Exploration decays along the time sequence
        if self.initial_exploration < self.time:
            self.epsilon -= self.epsilon_delta
            if self.epsilon < self.min_eps:
                self.epsilon = self.min_eps
            eps = self.epsilon
        else:  # Initial Exploration Phase
            app_logger.info("Initial Exploration : {}/{} steps".format(self.time, self.initial_exploration))
            eps = 1.0

        # Generate an Action by e-greedy action selection
        action, q_now = self.e_greedy(state_, eps)

        if self.use_gpu >= 0:
            q_max = np.max(q_now.get())
        else:
            q_max = np.max(q_now)

        return action, eps, q_max
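step() above anneals epsilon linearly by epsilon_delta per step once the initial exploration phase is over, clamping it at min_eps. A small sketch of how epsilon_delta is usually chosen for a desired annealing horizon (the 10**6-step figure is an assumption, not taken from this code):

epsilon_start = 1.0
min_eps = 0.1
anneal_steps = 10**6                                  # assumed annealing horizon
epsilon_delta = (epsilon_start - min_eps) / anneal_steps

epsilon = epsilon_start
for t in xrange(anneal_steps + 100):
    epsilon = max(min_eps, epsilon - epsilon_delta)
# epsilon now sits at min_eps and stays there, exactly as in QNet.step()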
Example #24
class ConvolutionalDenoisingAutoencoder():
    def __init__(self,
                 imgsize,
                 n_in_channels,
                 n_out_channels,
                 ksize,
                 stride=1,
                 pad=0,
                 use_cuda=False):
        self.model = FunctionSet(
            encode=F.Convolution2D(n_in_channels, n_out_channels, ksize,
                                   stride, pad),
            decode=F.Linear(
                n_out_channels * (math.floor(
                    (imgsize + 2 * pad - ksize) / stride) + 1)**2,
                n_in_channels * imgsize**2))
        self.use_cuda = use_cuda

        if self.use_cuda:
            self.model.to_gpu()

        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model.collect_parameters())

    def encode(self, x_var):
        return F.sigmoid(self.model.encode(x_var))

    def decode(self, x_var):
        return self.model.decode(x_var)

    def predict(self, x_data):
        if self.use_cuda:
            x_data = cuda.to_gpu(x_data)
        x = Variable(x_data)
        p = self.encode(x)
        if self.use_cuda:
            return cuda.to_cpu(p.data)
        else:
            return p.data

    def cost(self, x_data):
        x = Variable(x_data)
        t = Variable(
            x_data.reshape(x_data.shape[0], x_data.shape[1] * x_data.shape[2] *
                           x_data.shape[3]))
        h = F.dropout(x)
        h = self.encode(h)
        y = self.decode(h)
        return F.mean_squared_error(y, t)

    def train(self, x_data):
        if self.use_cuda:
            x_data = cuda.to_gpu(x_data)
        self.optimizer.zero_grads()
        loss = self.cost(x_data)
        loss.backward()
        self.optimizer.update()
        if self.use_cuda:
            return float(cuda.to_cpu(loss.data))
        else:
            return loss.data

    def test(self, x_data):
        if self.use_cuda:
            x_data = cuda.to_gpu(x_data)
        loss = self.cost(x_data)
        return float(cuda.to_cpu(loss.data))
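The decode layer's input size in the constructor above is n_out_channels times the squared spatial side of the convolution output, floor((imgsize + 2*pad - ksize)/stride) + 1, and its output size restores the flattened input image. A worked check with an assumed toy configuration (not from the source):

import math

imgsize, ksize, stride, pad = 28, 5, 1, 0            # assumed toy configuration
n_in_channels, n_out_channels = 1, 8

out_side = math.floor((imgsize + 2 * pad - ksize) / stride) + 1   # 24
decode_in = n_out_channels * out_side**2                          # 8 * 576 = 4608
decode_out = n_in_channels * imgsize**2                           # 784
print out_side, decode_in, decode_out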
# Prepare multi-layer perceptron model
model = FunctionSet(l1=F.Linear(784, n_units),
                    l2=F.Linear(n_units, n_units),
                    l3=F.Linear(n_units, 10))
# Neural net architecture
def forward(x_data, y_data, train=True):
    x, t = Variable(x_data), Variable(y_data)
    h1 = F.dropout(F.relu(model.l1(x)),  train=train)
    h2 = F.dropout(F.relu(model.l2(h1)), train=train)
    y  = model.l3(h2)
    return F.softmax_cross_entropy(y, t), F.accuracy(y, t)

# Setup optimizer
optimizer = optimizers.Adam()
optimizer.setup(model.collect_parameters())
# Learning loop
for epoch in xrange(1, n_epoch+1):
    print 'epoch', epoch
    # training
    perm = np.random.permutation(N)
    sum_accuracy = 0
    sum_loss = 0
    for i in xrange(0, N, batchsize):
        x_batch = x_train[perm[i:i+batchsize]]
        y_batch = y_train[perm[i:i+batchsize]]
        optimizer.zero_grads()
        loss, acc = forward(x_batch, y_batch)
        loss.backward()
        optimizer.update()
        sum_loss     += float(loss.data) * batchsize
Example #26
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 100  # reduced from 10**4. Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."

        print "Model Building"
        self.CNN_model = FunctionSet(
            l1=F.Convolution2D(4,
                               32,
                               ksize=8,
                               stride=4,
                               nobias=False,
                               wscale=np.sqrt(2)),
            l2=F.Convolution2D(32,
                               64,
                               ksize=4,
                               stride=2,
                               nobias=False,
                               wscale=np.sqrt(2)),
            l3=F.Convolution2D(64,
                               64,
                               ksize=3,
                               stride=1,
                               nobias=False,
                               wscale=np.sqrt(2)),
        )

        self.model = FunctionSet(
            l4=F.Linear(3136, 512, wscale=np.sqrt(2)),
            q_value=F.Linear(512,
                             self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 512),
                                               dtype=np.float32))).to_gpu()

        d = 'elite/'

        self.CNN_model.l1.W.data = np.load(d +
                                           'l1_W.npy')  #.astype(np.float32)
        self.CNN_model.l1.b.data = np.load(d +
                                           'l1_b.npy')  #.astype(np.float32)
        self.CNN_model.l2.W.data = np.load(d +
                                           'l2_W.npy')  #.astype(np.float32)
        self.CNN_model.l2.b.data = np.load(d +
                                           'l2_b.npy')  #.astype(np.float32)
        self.CNN_model.l3.W.data = np.load(d +
                                           'l3_W.npy')  #.astype(np.float32)
        self.CNN_model.l3.b.data = np.load(d +
                                           'l3_b.npy')  #.astype(np.float32)

        self.CNN_model = self.CNN_model.to_gpu()
        self.CNN_model_target = copy.deepcopy(self.CNN_model)
        self.model_target = copy.deepcopy(self.model)

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025,
                                                  alpha=0.95,
                                                  momentum=0.95,
                                                  eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [
            np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
            np.zeros(self.data_size, dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.int8),
            np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.bool),
            np.zeros((self.data_size, 1), dtype=np.uint8)
        ]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) >
                                                                 1)

        zero_val = Variable(
            cuda.to_gpu(
                np.zeros((self.replay_size, self.num_of_actions),
                         dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time, state, action, lstm_reward, state_dash,
                        episode_end_flag, ale_reward):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = lstm_reward
            self.D[5][data_index] = ale_reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = lstm_reward
            self.D[3][data_index] = state_dash
            self.D[5][data_index] = ale_reward
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time,
                                                 (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size,
                                                 (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84),
                                  dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1),
                                  dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84),
                                       dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1),
                                            dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]],
                                         dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]],
                                            dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay,
                                   episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.CNN_model.l1(state /
                                      254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.CNN_model.l2(h1))
        h3 = F.relu(self.CNN_model.l3(h2))
        h4 = F.relu(self.model.l4(h3))
        #test now
        #print h3.data.shape
        Q = self.model.q_value(h4)
        return Q

    def Q_func_LSTM(self, state):
        h1 = F.relu(self.CNN_model.l1(state /
                                      254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.CNN_model.l2(h1))
        h3 = F.relu(self.CNN_model.l3(h2))
        return h3.data.get()

    def Q_func_target(self, state):
        h1 = F.relu(self.CNN_model_target.l1(
            state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.CNN_model_target.l2(h1))
        h3 = F.relu(self.CNN_model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3))  # use the target network's hidden layer
        Q = self.model_target.q_value(h4)
        return Q

    def LSTM_reward(self, lstm_out, state_next):
        lstm_reward = np.sign((self.lstm_loss - (lstm_out - state_next)**2))
        return lstm_reward

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
class QNet:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**3  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6
    hist_size = 1  #original: 4

    def __init__(self, use_gpu, enable_controller, dim):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim

        print("Initializing Q-Network...")

        #hidden_dim = 256
        hidden_dim128 = 128

        self.model = FunctionSet(
            l4=F.Linear(self.dim * self.hist_size,
                        hidden_dim128,
                        wscale=np.sqrt(2)),
            l5=F.Linear(self.dim * self.hist_size,
                        hidden_dim128,
                        wscale=np.sqrt(2)),
            l6=F.Linear(hidden_dim128,
                        1,
                        wscale=np.sqrt(2),
                        initialW=np.zeros((1, hidden_dim128),
                                          dtype=np.float32)),  #V(s,a)
            l7=F.Linear(hidden_dim128,
                        self.num_of_actions,
                        wscale=np.sqrt(2),
                        initialW=np.zeros((self.num_of_actions, hidden_dim128),
                                          dtype=np.float32)),  #A(a)         
            q_value=DN_out.DN_out(1,
                                  self.num_of_actions,
                                  self.num_of_actions,
                                  nobias=True))
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025,
                                                  alpha=0.95,
                                                  momentum=0.95,
                                                  eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.d = [
            np.zeros((self.data_size, self.hist_size, self.dim),
                     dtype=np.uint8),
            np.zeros(self.data_size, dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.int8),
            np.zeros((self.data_size, self.hist_size, self.dim),
                     dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.bool)
        ]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) >
                                                                 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions),
                            dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time, state, action, reward, state_dash,
                         episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
        else:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
            self.d[3][data_index] = state_dash
        self.d[4][data_index] = episode_end_flag

    def experience_replay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time,
                                                 (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size,
                                                 (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.hist_size,
                                         self.dim),
                                  dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1),
                                  dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size,
                                              self.dim),
                                       dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1),
                                            dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.d[0][replay_index[i]],
                                         dtype=np.float32)
                a_replay[i] = self.d[1][replay_index[i]]
                r_replay[i] = self.d[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.d[3][replay_index[i]],
                                            dtype=np.float32)
                episode_end_replay[i] = self.d[4][replay_index[i]]

            if self.use_gpu >= 0:
                s_replay = cuda.to_gpu(s_replay)
                s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay,
                                   episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))
        h5 = F.relu(self.model.l5(state / 255.0))
        #h6 = F.relu(self.model.l6(h4))
        #h7 = relu_l7.relu(self.model.l7(h5))
        h6 = self.model.l6(h4)
        h7 = self.model.l7(h5)
        q = self.model.q_value(h6, h7)
        return q

    def q_func_target(self, state):
        #h4 = F.relu(self.model_target.l4(state / 255.0))
        #q = self.model_target.q_value(h4)
        h4 = F.relu(self.model_target.l4(state / 255.0))
        h5 = F.relu(self.model_target.l5(state / 255.0))
        #h6 = F.relu(self.model_target.l6(h4))
        #h7 = relu_l7.relu(self.model_target.l7(h5))
        h6 = self.model_target.l6(h4)
        h7 = self.model_target.l7(h5)
        q = self.model_target.q_value(h6, h7)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print(" Random"),
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            print("#Greedy"),
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
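
DN_out above is a custom output function that is not included in this snippet; judging from its arguments (a 1-unit V stream from l6 and a num_of_actions-unit A stream from l7), it presumably implements the standard dueling aggregation. A minimal NumPy sketch of that aggregation, under that assumption:

import numpy as np

def dueling_q(v, a):
    # v: (batch, 1) state-value stream, a: (batch, n_actions) advantage stream
    # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
    return v + a - a.mean(axis=1, keepdims=True)

v = np.array([[1.0]], dtype=np.float32)
a = np.array([[0.5, -0.5, 0.0]], dtype=np.float32)
print(dueling_q(v, a))  # [[ 1.5  0.5  1. ]]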
Example #28
0
def Main():
    import argparse
    import numpy as np
    from sklearn.datasets import load_diabetes
    from chainer import cuda, Variable, FunctionSet, optimizers
    import chainer.functions as F

    parser = argparse.ArgumentParser(description='Chainer example: regression')
    parser.add_argument('--gpu',
                        '-g',
                        default=-1,
                        type=int,
                        help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    batchsize = 13
    n_epoch = 100
    n_units = 30

    # Prepare dataset
    print 'fetch diabetes dataset'
    diabetes = load_diabetes()
    data = diabetes['data'].astype(np.float32)
    target = diabetes['target'].astype(np.float32).reshape(
        len(diabetes['target']), 1)

    N = batchsize * 30  #Number of training data
    x_train, x_test = np.split(data, [N])
    y_train, y_test = np.split(target, [N])
    N_test = y_test.size

    print 'Num of samples for train:', len(y_train)
    print 'Num of samples for test:', len(y_test)
    # Dump data for plot:
    fp1 = file('/tmp/smpl_train.dat', 'w')
    for x, y in zip(x_train, y_train):
        fp1.write('%s #%i# %s\n' %
                  (' '.join(map(str, x)), len(x) + 1, ' '.join(map(str, y))))
    fp1.close()
    # Dump data for plot:
    fp1 = file('/tmp/smpl_test.dat', 'w')
    for x, y in zip(x_test, y_test):
        fp1.write('%s #%i# %s\n' %
                  (' '.join(map(str, x)), len(x) + 1, ' '.join(map(str, y))))
    fp1.close()

    # Prepare multi-layer perceptron model
    model = FunctionSet(l1=F.Linear(10, n_units),
                        l2=F.Linear(n_units, n_units),
                        l3=F.Linear(n_units, 1))
    if args.gpu >= 0:
        cuda.init(args.gpu)
        model.to_gpu()

    # Neural net architecture
    def forward(x_data, y_data, train=True):
        x, t = Variable(x_data), Variable(y_data)
        h1 = F.dropout(F.relu(model.l1(x)), train=train)
        h2 = F.dropout(F.relu(model.l2(h1)), train=train)
        y = model.l3(h2)
        return F.mean_squared_error(y, t), y

    # Setup optimizer
    optimizer = optimizers.AdaDelta(rho=0.9)
    optimizer.setup(model.collect_parameters())

    # Learning loop
    for epoch in xrange(1, n_epoch + 1):
        print 'epoch', epoch

        # training
        perm = np.random.permutation(N)
        sum_loss = 0

        for i in xrange(0, N, batchsize):
            x_batch = x_train[perm[i:i + batchsize]]
            y_batch = y_train[perm[i:i + batchsize]]
            if args.gpu >= 0:
                x_batch = cuda.to_gpu(x_batch)
                y_batch = cuda.to_gpu(y_batch)

            optimizer.zero_grads()
            loss, pred = forward(x_batch, y_batch)
            loss.backward()
            optimizer.update()

            sum_loss += float(cuda.to_cpu(loss.data)) * batchsize

        print 'train mean loss={}'.format(sum_loss / N)
        '''
    # testing per batch
    sum_loss     = 0
    preds = []
    for i in xrange(0, N_test, batchsize):
      x_batch = x_test[i:i+batchsize]
      y_batch = y_test[i:i+batchsize]
      if args.gpu >= 0:
        x_batch = cuda.to_gpu(x_batch)
        y_batch = cuda.to_gpu(y_batch)

      loss, pred = forward(x_batch, y_batch, train=False)
      preds.extend(cuda.to_cpu(pred.data))
      sum_loss     += float(cuda.to_cpu(loss.data)) * batchsize
    pearson = np.corrcoef(np.asarray(preds).reshape(len(preds),), np.asarray(y_test).reshape(len(preds),))
    #'''

        #'''
        # testing all data
        preds = []
        x_batch = x_test[:]
        y_batch = y_test[:]
        if args.gpu >= 0:
            x_batch = cuda.to_gpu(x_batch)
            y_batch = cuda.to_gpu(y_batch)
        loss, pred = forward(x_batch, y_batch, train=False)
        preds = cuda.to_cpu(pred.data)
        sum_loss = float(cuda.to_cpu(loss.data)) * len(y_test)
        pearson = np.corrcoef(
            np.asarray(preds).reshape(len(preds), ),
            np.asarray(y_test).reshape(len(preds), ))
        #'''

        print 'test  mean loss={}, corrcoef={}'.format(sum_loss / N_test,
                                                       pearson[0][1])

        # Dump data for plot:
        fp1 = file('/tmp/nn_test%04i.dat' % epoch, 'w')
        for x, y in zip(x_test, preds):
            fp1.write(
                '%s #%i# %s\n' %
                (' '.join(map(str, x)), len(x) + 1, ' '.join(map(str, y))))
        fp1.close()
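
np.corrcoef above returns the full 2x2 correlation matrix of the two flattened vectors, so pearson[0][1] is the prediction/target correlation. A small standalone check of that indexing, with made-up numbers:

import numpy as np

preds = np.array([1.0, 2.0, 3.0, 4.0])
targets = np.array([1.1, 1.9, 3.2, 3.8])
pearson = np.corrcoef(preds, targets)
print(pearson.shape)   # (2, 2)
print(pearson[0][1])   # correlation between preds and targets (close to 1.0)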
Example #29
0
da3 = FunctionSet(
    l1 = da1.model.encoder,
    l2 = da2.model.encoder,
    l3 = F.Linear(49, 10),
    )

# Definition of the forward (training) pass
def forward(x_data, y_data, train=True):
    x, t = Variable(x_data), Variable(y_data)
    h1 = F.dropout(F.sigmoid(da3.l1(x)),  train=train)
    h2 = F.dropout(F.sigmoid(da3.l2(h1)), train=train)
    y = da3.l3(h2)
    return F.softmax_cross_entropy(y, t), F.accuracy(y, t)

# Definition of the optimizer
optimizer = optimizers.Adam()
optimizer.setup(da3.collect_parameters())

# Training
n_epoch = 200
all_loss_accuracy = []
for epoch in xrange(n_epoch):
    print 'epoch', epoch
    indexes = np.random.permutation(N)
    loss_accuracy = []
    sum_loss, sum_accuracy = 0, 0
    for i in xrange(0, N, batchsize):
        x_batch = x_train[indexes[i:i+batchsize]]
        y_batch = y_train[indexes[i:i+batchsize]]
        optimizer.zero_grads()
        loss, accuracy = forward(x_batch, y_batch)
        loss.backward()
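
The loop above stops at loss.backward(); in every other training loop in this collection the step that follows is optimizer.update(), usually together with loss/accuracy accumulation for the epoch statistics. A minimal sketch of how the inner loop typically continues, assuming batchsize and N are defined as in the surrounding example:

        optimizer.update()                                # apply the accumulated gradients
        sum_loss += float(loss.data) * batchsize          # accumulate per-epoch statistics
        sum_accuracy += float(accuracy.data) * batchsize
    print 'train mean loss={}, accuracy={}'.format(sum_loss / N, sum_accuracy / N)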
Example #30
0
class QNet:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**3  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6
    hist_size = 1 #original: 4

    def __init__(self, use_gpu, enable_controller, dim):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim

        print("Initializing Q-Network...")

        hidden_dim = 256
        self.model = FunctionSet(
            l4=F.Linear(self.dim*self.hist_size, hidden_dim, wscale=np.sqrt(2)),
            q_value=F.Linear(hidden_dim, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, hidden_dim),
                                               dtype=np.float32))
        )
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.d = [np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
        else:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
            self.d[3][data_index] = state_dash
        self.d[4][data_index] = episode_end_flag

    def experience_replay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.d[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.d[1][replay_index[i]]
                r_replay[i] = self.d[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.d[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.d[4][replay_index[i]]

            if self.use_gpu >= 0:
                s_replay = cuda.to_gpu(s_replay)
                s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))
        q = self.model.q_value(h4)
        return q

    def q_func_target(self, state):
        h4 = F.relu(self.model_target.l4(state / 255.0))
        q = self.model_target.q_value(h4)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print(" Random"),
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            print("#Greedy"),
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
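
forward() above clips the TD error elementwise before the squared loss: entries with |td| <= 1 are kept as-is, entries with |td| > 1 are replaced by their sign (td / |td|), and the +1000 term only prevents a division by zero on the untouched entries. A small NumPy sketch of the same expression:

import numpy as np

td = np.array([0.3, -0.7, 2.5, -4.0], dtype=np.float32)
td_tmp = td + 1000.0 * (np.abs(td) <= 1)  # avoid dividing by a near-zero |td|
td_clip = td * (np.abs(td) <= 1) + td / np.abs(td_tmp) * (np.abs(td) > 1)
print(td_clip)  # [ 0.3 -0.7  1.  -1. ]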
Example #31
0
class QNet:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor (values tried: 0.99, 0.39)
    initial_exploration = 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6
    hist_size = 1  # original: 4

    def __init__(self, use_gpu, enable_controller, dim):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim

        print("Initializing Q-Network...")

        hidden_dim = 256  # values tried: 256, 128
        self.model = FunctionSet(
            l4=F.Linear(self.dim * self.hist_size,
                        hidden_dim,
                        wscale=np.sqrt(2)),  # also tried wscale=None
            q_value=F.Linear(hidden_dim,
                             self.num_of_actions,
                             initialW=np.zeros(
                                 (self.num_of_actions, hidden_dim),
                                 dtype=np.float32)))
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)
        # self.optimizer = optimizers.Adam(alpha=0.001, beta1=0.9, beta2=0.999, eps=1e-08)  # alpha values tried: 0.0015, 0.0125, 0.005, 0.001
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025,
                                                  alpha=0.95,
                                                  momentum=0.95,
                                                  eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]  (reward stored as float32 here instead of int8)
        self.d = [
            np.zeros((self.data_size, self.hist_size, self.dim),
                     dtype=np.uint8),
            np.zeros(self.data_size, dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.float32),
            np.zeros((self.data_size, self.hist_size, self.dim),
                     dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.bool)
        ]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]  # equals replay_size (32); see the s_replay allocation in experience_replay
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) >
                                                                 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions),
                            dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time, state, action, reward, state_dash,
                         episode_end_flag):  #def agent_end
        self.data_index = time % self.data_size  # circular-buffer index

        if episode_end_flag is True:
            self.d[0][self.data_index] = state
            self.d[1][self.data_index] = action
            self.d[2][self.data_index] = reward
            #print d[2]
        else:
            self.d[0][self.data_index] = state
            self.d[1][self.data_index] = action
            self.d[2][self.data_index] = reward

            self.d[3][self.data_index] = state_dash
        self.d[4][self.data_index] = episode_end_flag

    def experience_replay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time,
                                                 (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size,
                                                 (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.hist_size,
                                         self.dim),
                                  dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1),
                                  dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size,
                                              self.dim),
                                       dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1),
                                            dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.d[0][replay_index[i]],
                                         dtype=np.float32)
                a_replay[i] = self.d[1][replay_index[i]]
                r_replay[i] = self.d[2][replay_index[i]]
                # if not (r_replay[i] == 1.):
                #     r_replay[i] = -3.
                s_dash_replay[i] = np.array(self.d[3][replay_index[i]],
                                            dtype=np.float32)
                episode_end_replay[i] = self.d[4][replay_index[i]]

            if self.use_gpu >= 0:
                s_replay = cuda.to_gpu(s_replay)
                s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(
                s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay
            )  # def forward(self, state, action, reward, state_dash, episode_end):
            loss.backward()
            self.optimizer.update()

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))  # also tried F.leaky_relu
        # dp4 = F.dropout(h4, ratio=0.4, train=True)  # ratios tried: 0.3, 0.4
        q = self.model.q_value(h4)
        return q

    def q_func_target(self, state):
        h4 = F.relu(self.model_target.l4(state / 255.0))  # also tried F.leaky_relu
        # dp4 = F.dropout(h4, ratio=0.3, train=True)
        q = self.model_target.q_value(h4)  # pass dp4 instead when dropout is enabled
        return q

    def e_greedy(self, state, epsilon):  # epsilon starts at 1.0 in agent_start
        s = Variable(state)
        q = self.q_func(s)
        q = q.data
        # print q.data.size

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print(" Random"),  #¥---------------------2 " Random"
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            print("#Greedy"),
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]  # index_of_action = np.argmax(q) in the greedy case

    def action_to_index(self, action):
        return self.enable_controller.index(action)
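
stock_experience and experience_replay above implement the replay memory as a fixed-size circular buffer: time % data_size overwrites the oldest entry, and sample indices are drawn from [0, time) until the buffer has been filled once. A minimal sketch of just that indexing logic, with hypothetical sizes:

import numpy as np

data_size = 5      # hypothetical buffer capacity
replay_size = 3    # hypothetical minibatch size
buf = np.zeros(data_size, dtype=np.float32)

for time in range(8):                    # store 8 observations into a size-5 buffer
    buf[time % data_size] = float(time)  # the oldest entries get overwritten

# sample indices: restrict to the filled part until one full sweep has happened
time = 8
high = time if time < data_size else data_size
replay_index = np.random.randint(0, high, (replay_size, 1))
print(buf)                   # [5. 6. 7. 3. 4.]
print(replay_index.ravel())  # three indices in [0, 5)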
Example #32
0
class DN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 100  # 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 1, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Breakout"

        print "Initializing DN..."
#	Initialization of Chainer 1.1.0 or older.
#        print "CUDA init"
#        cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
            l4=F.Linear(3136, 256, wscale=np.sqrt(2)),
            l5=F.Linear(3136, 256, wscale=np.sqrt(2)),
            l6=F.Linear(256, 1, initialW=np.zeros((1, 256), dtype=np.float32)),
            l7=F.Linear(256, self.num_of_actions, initialW=np.zeros((self.num_of_actions, 256),
                                               dtype=np.float32)),
            q_value=DN_out.DN_out(1, self.num_of_actions, self.num_of_actions, nobias = True)
        ).to_gpu()
        
        if args.resumemodel:
            # load saved model
            serializers.load_npz(args.resumemodel, self.model)
            print "load model from resume.model"
        

        self.model_target = copy.deepcopy(self.model)

        print "Initizlizing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        if args.resumeD1 and args.resumeD2:
            # load saved D1 and D2
            npz_tmp1 = np.load(args.resumeD1)
            print "finished loading half of D data"
            npz_tmp2 = np.load(args.resumeD2)
            self.D = [npz_tmp1['D0'],
                      npz_tmp1['D1'],
                      npz_tmp1['D2'],
                      npz_tmp2['D3'],
                      npz_tmp2['D4']]
            npz_tmp1.close()
            npz_tmp2.close()
            print "loaded stored all D data"
        else:
            self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                      np.zeros(self.data_size, dtype=np.uint8),
                      np.zeros((self.data_size, 1), dtype=np.int8),
                      np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                      np.zeros((self.data_size, 1), dtype=np.bool)]
            print "initialized D data"

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value
        # Generate Target Signals
        tmp2 = self.Q_func(s_dash)
        tmp2 = list(map(np.argmax, tmp2.data.get()))  # argmaxQ(s',a)
        tmp = self.Q_func_target(s_dash)  # Q'(s',*)
        tmp = list(tmp.data.get())
        # select Q'(s',*) due to argmaxQ(s',a)
        res1 = []
        for i in range(num_of_batch):
            res1.append(tmp[i][tmp2[i]])

        #max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        max_Q_dash = np.asanyarray(res1, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)
        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_
        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        print 'now Q_func is implemented'
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        h4 = F.relu(self.model.l4(h3))  # state-value (V) stream
        h5 = F.relu(self.model.l5(h3))  # advantage (A) stream
        h6 = self.model.l6(h4)  # V(s)
        h7 = self.model.l7(h5)  # A(s, a)
        Q = self.model.q_value(h6, h7)  # aggregated Q value
        return Q

    def Q_func_target(self, state):
        print 'now Q_func_target is implemented'
        h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs in [0.0 1.0]
        h2 = F.relu(self.model_target.l2(h1))
        h3 = F.relu(self.model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3))  # state-value (V) stream
        h5 = F.relu(self.model_target.l5(h3))  # advantage (A) stream
        h6 = self.model_target.l6(h4)  # V(s)
        h7 = self.model_target.l7(h5)  # A(s, a)
        Q = self.model_target.q_value(h6, h7)  # aggregated Q value
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
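
Unlike the other QNet examples, forward() in this DN_class picks the next action with the online network (argmax of Q_func on s') and evaluates it with the target network (Q_func_target), which is the Double DQN target. A small NumPy sketch of that target computation, with made-up Q values and the same sign-clipped rewards:

import numpy as np

gamma = 0.99
reward = np.array([1.0, 0.0], dtype=np.float32)
q_online_dash = np.array([[0.2, 0.9, 0.1],     # Q(s', a) from the online network
                          [0.5, 0.4, 0.6]], dtype=np.float32)
q_target_dash = np.array([[0.3, 0.7, 0.2],     # Q'(s', a) from the target network
                          [0.4, 0.8, 0.5]], dtype=np.float32)

a_star = np.argmax(q_online_dash, axis=1)                   # a* = argmax_a Q(s', a)
max_q_dash = q_target_dash[np.arange(len(a_star)), a_star]  # Q'(s', a*)
target = np.sign(reward) + gamma * max_q_dash               # non-terminal case in forward()
print(target)  # [ 1.693  0.495]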
Example #33
0
class QNet:
    # Hyper-Parameters
    gamma = 0.99  # Reward discount factor

    initial_exploration = 10**3  # Initial exploration. original: 5x10^4

    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6
    hist_size = 1 #original: 4
    save_model_freq = 10**4  # How often the model is saved

    def __init__(self, use_gpu, enable_controller, dim):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim

        print("Initializing Q-Network...")
        print("Input Dim of Q-Network : ",self.dim*self.hist_size)

        hidden_dim = 256
        self.model = FunctionSet(
            l4=F.Linear(self.dim*self.hist_size, hidden_dim,
                            wscale=np.sqrt(2)),
            q_value=F.Linear(hidden_dim, self.num_of_actions,
                            initialW=np.zeros((self.num_of_actions, hidden_dim),
                            dtype=np.float32))
        )

        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.d = [np.zeros((self.data_size, self.hist_size, self.dim),
                    dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, self.hist_size, self.dim),
                    dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time,state, action, reward,
                        state_dash,episode_end_flag):

        data_index = time % self.data_size  # using time as the index makes this a circular queue
        if episode_end_flag is True:  # if episode_end is True, state_dash stays all zeros
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
        else:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
            self.d[3][data_index] = state_dash
        self.d[4][data_index] = episode_end_flag

    def experience_replay(self, time):
        if self.initial_exploration < time:
            if time < self.data_size: #during the first sweep of the History
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.d[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.d[1][replay_index[i]]
                r_replay[i] = self.d[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.d[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.d[4][replay_index[i]]

            if self.use_gpu >= 0:
                s_replay = cuda.to_gpu(s_replay)
                s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))
        q = self.model.q_value(h4)
        return q

    def q_func_target(self, state):
        h4 = F.relu(self.model_target.l4(state / 255.0))
        q = self.model_target.q_value(h4)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print(" Random"),
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            print("#Greedy"),
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)

    def save_model(self,folder,time):
        try:
            model_path = "./%s/%dmodel"%(folder,time)
            serializers.save_npz(model_path,self.model)
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()
        print "model is saved!!(Model_Path=%s)"%(model_path)
        print "----------------------------------------------"


    def load_model(self,folder,model_num):
        try:
            model_path = "./%s/%dmodel"%(folder,model_num)
            serializers.load_npz(model_path,self.model)
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()
        print "model load is done!!(Model_Path=%s)"%(model_path)
        print "----------------------------------------------"
        self.target_model_update()
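
save_model and load_model above wrap chainer.serializers with a fixed "./folder/timemodel" naming scheme, and save_model_freq suggests the intended cadence. A minimal usage sketch, assuming a QNet instance q_net, a step counter time, and a "models" folder that already exists (all hypothetical names):

# periodic checkpointing inside the agent's main loop
if time % q_net.save_model_freq == 0:
    q_net.save_model("models", time)  # writes ./models/<time>model via serializers.save_npz

# resuming later from a specific checkpoint
q_net.load_model("models", 10000)     # reads ./models/10000model and re-syncs the target network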