class Model1:
    def __init__(self, model):
        if isinstance(model, tuple):
            input_dims, n_units, output_dims = model
            self.model = FunctionSet(l1=F.Linear(input_dims, n_units),
                                     l2=F.Linear(n_units, n_units),
                                     l3=F.Linear(n_units, output_dims))
        else:
            self.model = model

    def __call__(self):
        return self.model

    # Neural net architecture
    def forward(self, x_data, y_data, train=True):
        x = Variable(x_data)
        if y_data is not None:
            t = Variable(y_data)
        h1 = F.dropout(F.relu(self.model.l1(x)), train=train)
        h2 = F.dropout(F.relu(self.model.l2(h1)), train=train)
        y = self.model.l3(h2)
        if y_data is not None:
            # Multi-class classification, so derive the loss with
            # softmax cross-entropy as the error function
            return F.softmax_cross_entropy(y, t), F.accuracy(y, t), y
        else:
            return y

    def evaluate(self, x_data):
        return self.forward(x_data, None, train=False)
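A minimal usage sketch (my addition, not from the original project): Chainer 1.x expects float32 features and int32 labels, and the tuple form of the constructor builds the three-layer FunctionSet. The toy arrays below are hypothetical.

import numpy as np

# Hypothetical toy data: 5 samples, 4 features, 3 classes
x_data = np.random.rand(5, 4).astype(np.float32)
y_data = np.random.randint(0, 3, 5).astype(np.int32)

net = Model1((4, 100, 3))  # (input_dims, n_units, output_dims)
loss, acc, y = net.forward(x_data, y_data)
pred = np.argmax(net.evaluate(x_data).data, axis=1)  # dropout disabled for evaluation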
class CNN3_Model(ModelBase):
    u"""see: http://aidiary.hatenablog.com/entry/20151007/1444223445"""

    def __init__(self, input_size=32):
        super(CNN3_Model, self).__init__()
        # F.Convolution2D(in_channel, out_channel, filter_size)
        self.model = FunctionSet(
            # 1*32*32 -(conv)-> 20*28*28 -(pool)-> 20*14*14
            conv1=F.Convolution2D(1, 20, 5),
            # 20*14*14 -(conv)-> 50*10*10 -(pool)-> 50*5*5=1250
            conv2=F.Convolution2D(20, 50, 5),
            l1=F.Linear(1250, 300),
            l2=F.Linear(300, 2))

    def forward(self, x_data, y_data, train=True):
        u"""return loss, accuracy"""
        x, t = Variable(x_data), Variable(y_data)
        h1 = F.max_pooling_2d(F.relu(self.model.conv1(x)), 2)
        h2 = F.max_pooling_2d(F.relu(self.model.conv2(h1)), 2)
        h3 = F.dropout(F.relu(self.model.l1(h2)), train=train)
        y = self.model.l2(h3)
        # Multi-class classification, so derive the loss with softmax
        # cross-entropy; at minimum the loss is required
        return {
            "loss": F.softmax_cross_entropy(y, t),
            "accuracy": F.accuracy(y, t)
        }
class CNN_class:
    def __init__(self):
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)))
        # Load pre-trained parameters
        self.model.l1.W = np.load('elite/l1_W.npy')
        self.model.l1.b = np.load('elite/l1_b.npy')
        self.model.l2.W = np.load('elite/l2_W.npy')
        self.model.l2.b = np.load('elite/l2_b.npy')
        self.model.l3.W = np.load('elite/l3_W.npy')
        self.model.l3.b = np.load('elite/l3_b.npy')

    def CNN_forward(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs into [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        return h3
class DeepLearning:
    def __init__(self, input_size, hidden_size, output_size):
        self.model = FunctionSet(l1=F.Linear(input_size, hidden_size),
                                 l2=F.Linear(hidden_size, hidden_size),
                                 l3=F.Linear(hidden_size, output_size))
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model.collect_parameters())

    def batch(self, X_train, y_train, batch_size, perm):
        train_size = X_train.shape[0]
        for i in xrange(0, train_size, batch_size):
            X_batch = X_train[perm[i:i + batch_size]]
            y_batch = y_train[perm[i:i + batch_size]]
            # Convert to Chainer Variables
            x = Variable(X_batch)
            t = Variable(y_batch)
            self.optimizer.zero_grads()
            y = self.forward(x)  # prediction
            loss = F.softmax_cross_entropy(y, t)
            loss.backward()
            self.optimizer.update()

    def forward(self, x, train=True):
        h1 = F.dropout(F.sigmoid(self.model.l1(x)), train=train)
        h2 = F.dropout(F.sigmoid(self.model.l2(h1)), train=train)
        return self.model.l3(h2)

    def predicate(self, x_data):
        x = np.array([x_data], dtype=np.float32)
        x = Variable(x)
        y = self.forward(x, train=False)
        return np.argmax(y.data)

    def save(self, fpath):
        pickle.dump(self.model, open(fpath, 'wb'), -1)

    def load(self, fpath):
        self.model = pickle.load(open(fpath, 'rb'))
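A hedged training-loop sketch (my own; the toy arrays are placeholders): the caller supplies a fresh permutation each epoch and batch() runs one pass of mini-batch training with Adam.

import numpy as np

X_train = np.random.rand(1000, 20).astype(np.float32)     # hypothetical data
y_train = np.random.randint(0, 5, 1000).astype(np.int32)

dl = DeepLearning(input_size=20, hidden_size=64, output_size=5)
for epoch in xrange(10):
    perm = np.random.permutation(X_train.shape[0])
    dl.batch(X_train, y_train, batch_size=100, perm=perm)
print dl.predicate(X_train[0])  # index of the highest-scoring class
dl.save('model.pkl')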
class LogisticRegressionEstimator(ChainerClassifier):
    def __init__(self, net_hidden=100, net_out=5, **params):
        ChainerClassifier.__init__(self, **params)
        self.net_hidden = net_hidden
        self.net_out = net_out
        self.param_names.append('net_hidden')
        self.param_names.append('net_out')

    def setup_network(self, n_features):
        self.network = FunctionSet(l1=F.Linear(n_features, self.net_hidden),
                                   l2=F.Linear(self.net_hidden, self.net_out))

    def forward_inner(self, x, train=True):
        h = F.relu(self.network.l1(x))
        y = self.network.l2(h)
        return y
class DQN_CNN(object):
    def __init__(self, n_act):
        N_output = n_act
        self.model = FunctionSet(
            conv1=F.Convolution2D(1, 16, 3, pad=1),
            conv2=F.Convolution2D(16, 16, 3, pad=1),
            l1=F.Linear(256, 256),
            l2=F.Linear(256, N_output))

    def Q_func(self, x):
        N, h, w = x.shape
        x = x.reshape(N, 1, h, w)  # add a channel axis
        x = Variable(x)
        h = F.relu(self.model.conv1(x))
        h = F.max_pooling_2d(F.relu(self.model.conv2(h)), 2)
        # With 8x8 inputs the pooled feature map is 16*4*4 = 256,
        # matching l1's input size
        h = F.relu(self.model.l1(h))
        y = self.model.l2(h)
        return y
class DQN_NN(object):
    def __init__(self, n_act):
        self.N_input = 64
        N_output = n_act
        #N_unit = (self.N_input-1)*2
        N_unit = 64
        self.model = FunctionSet(
            l1=F.Linear(self.N_input, N_unit),
            #l2=F.Linear(N_unit, N_unit),
            #l3=F.Linear(N_unit, N_unit),
            l4=F.Linear(N_unit, N_output,
                        initialW=np.zeros((N_output, N_unit), dtype=np.float32)))

    def Q_func(self, x):
        N, h, w = x.shape
        x = x.reshape(N, h * w)  # flatten the 2D state into a vector
        x = Variable(x)
        h = F.leaky_relu(self.model.l1(x))
        #h = F.leaky_relu(self.model.l2(h))
        #h = F.leaky_relu(self.model.l3(h))
        y = self.model.l4(h)
        return y
class NN3_Model(ModelBase):
    def __init__(self, input_dim=748, n_units=1000):
        super(NN3_Model, self).__init__()
        self.n_units = n_units
        self.model = FunctionSet(l1=F.Linear(input_dim, n_units),
                                 l2=F.Linear(n_units, n_units),
                                 l3=F.Linear(n_units, 2))

    def forward(self, x_data, y_data, train=True):
        u"""return loss, accuracy"""
        x, t = Variable(x_data), Variable(y_data)
        h1 = F.dropout(F.relu(self.model.l1(x)), train=train)
        h2 = F.dropout(F.relu(self.model.l2(h1)), train=train)
        y = self.model.l3(h2)
        # Multi-class classification, so derive the loss with softmax
        # cross-entropy; at minimum the loss is required
        return {
            "loss": F.softmax_cross_entropy(y, t),
            "accuracy": F.accuracy(y, t)
        }
class DQN_class:
    # Hyper-parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."
        # Initialization for Chainer 1.1.0 or older.
        # print "CUDA init"
        # cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 16, ksize=8, stride=4, wscale=np.sqrt(2)),
            l2=F.Convolution2D(16, 32, ksize=4, stride=2, wscale=np.sqrt(2)),
            l3=F.Linear(2592, 256),
            q_value=F.Linear(256, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 256),
                                               dtype=np.float32))
        ).to_gpu()

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.0002, alpha=0.3, momentum=0.2)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data : D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        max_Q_dash_ = self.Q_func(s_dash)
        tmp = list(map(np.max, max_Q_dash_.data.get()))
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            target[i, self.action_to_index(action[i])] = tmp_

        loss = F.mean_squared_error(Variable(cuda.to_gpu(target)), Q)
        return loss, Q

    def stockExperience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size
        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        Q = self.model.q_value(h3)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
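A hedged interaction-loop sketch (mine, not from the source; `env` is a placeholder for whatever ALE wrapper the project uses, returning 4x84x84 frame stacks): it shows how e_greedy, stockExperience, and experienceReplay fit together.

dqn = DQN_class(enable_controller=[0, 3, 4])
epsilon = 1.0
state = env.reset()  # hypothetical emulator wrapper
for time in xrange(10**6):
    s_gpu = cuda.to_gpu(np.asarray([state], dtype=np.float32))
    action, Q = dqn.e_greedy(s_gpu, epsilon)
    state_dash, reward, episode_end = env.step(action)
    dqn.stockExperience(time, state, action, reward, state_dash, episode_end)
    dqn.experienceReplay(time)  # no-op until initial_exploration steps have passed
    epsilon = max(0.1, epsilon - 1e-6)  # anneal exploration
    state = env.reset() if episode_end else state_dash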
# output loss
f = open(loss_output, 'w')
for i in range(1, n_epoch + 1):
    strs = '{0:05.0f} {1:.2f} {2:.2f}\n'.format(i, loss_train[i - 1], loss_test[i - 1])
    f.writelines(strs)
f.close()

# test
print('-----')
print('starting to make test data with model')
x = Variable(cuda.to_gpu(x_test[1000:1000 + 200].reshape((200, default_bitrate))))
h1 = F.dropout(F.relu(model.l1(x)), train=False)
y = F.dropout(model.l2(h1), train=False)
#print(x.data)
#print(y.data)

x_range = np.arange(0, default_bitrate * 200, 1)
print('test mean loss={}'.format(F.mean_squared_error(y, x).data))
#print(x.data.ndim, x_range.ndim)
#plt.plot(x_range, y.data[0])
#plt.plot(x_range, t.data[0])
#plt.show()

x_datas = []
t_datas = []
y_datas = []
x_datas.extend(x_range)
class DQN_class:
    # Hyper-parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 100  # 10**4 # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."

        print "Model Building"
        self.CNN_model = FunctionSet(
            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
        )
        self.model = FunctionSet(
            l4=F.Linear(3136, 512, wscale=np.sqrt(2)),
            q_value=F.Linear(512, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 512),
                                               dtype=np.float32))
        ).to_gpu()

        # Load pre-trained convolutional weights
        d = 'elite/'
        self.CNN_model.l1.W.data = np.load(d + 'l1_W.npy')  # .astype(np.float32)
        self.CNN_model.l1.b.data = np.load(d + 'l1_b.npy')  # .astype(np.float32)
        self.CNN_model.l2.W.data = np.load(d + 'l2_W.npy')  # .astype(np.float32)
        self.CNN_model.l2.b.data = np.load(d + 'l2_b.npy')  # .astype(np.float32)
        self.CNN_model.l3.W.data = np.load(d + 'l3_W.npy')  # .astype(np.float32)
        self.CNN_model.l3.b.data = np.load(d + 'l3_b.npy')  # .astype(np.float32)
        self.CNN_model = self.CNN_model.to_gpu()
        self.CNN_model_target = copy.deepcopy(self.CNN_model)
        self.model_target = copy.deepcopy(self.model)

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data : D=[s, a, r, s_dash, end_episode_flag, ale_reward]
        self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool),
                  np.zeros((self.data_size, 1), dtype=np.uint8)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions),
                                                 dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time, state, action, lstm_reward, state_dash,
                        episode_end_flag, ale_reward):
        data_index = time % self.data_size
        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = lstm_reward
            self.D[5][data_index] = ale_reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = lstm_reward
            self.D[3][data_index] = state_dash
            self.D[5][data_index] = ale_reward
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.CNN_model.l1(state / 254.0))  # scale inputs into [0.0, 1.0]
        h2 = F.relu(self.CNN_model.l2(h1))
        h3 = F.relu(self.CNN_model.l3(h2))
        h4 = F.relu(self.model.l4(h3))
        #test now
        #print h3.data.shape
        Q = self.model.q_value(h4)
        return Q

    def Q_func_LSTM(self, state):
        h1 = F.relu(self.CNN_model.l1(state / 254.0))  # scale inputs into [0.0, 1.0]
        h2 = F.relu(self.CNN_model.l2(h1))
        h3 = F.relu(self.CNN_model.l3(h2))
        return h3.data.get()

    def Q_func_target(self, state):
        h1 = F.relu(self.CNN_model_target.l1(state / 254.0))  # scale inputs into [0.0, 1.0]
        h2 = F.relu(self.CNN_model_target.l2(h1))
        h3 = F.relu(self.CNN_model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3))
        Q = self.model_target.q_value(h4)
        return Q

    def LSTM_reward(self, lstm_out, state_next):
        lstm_reward = np.sign((self.lstm_loss - (lstm_out - state_next)**2))
        return lstm_reward

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
# -*- coding: utf-8 -*-
import chainer.links as L
import chainer.functions as F
from chainer import FunctionSet, Variable
from chainer import optimizers, cuda, serializers
import numpy as np

## Build a model with a single unit ##
model = FunctionSet(l1=L.Linear(4, 1))  # one unit: four inputs, one output
x_data = np.random.rand(1, 4) * 100     # create four random input values
x_data = x_data.astype(np.float32)      # the cast to float32 was necessary
x = Variable(x_data)                    # wrap the array in a Variable
print(float(model.l1(x).data))
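To see what the single unit computes, here is a small check (my addition): an L.Linear layer stores its parameters in W (shape (1, 4)) and b (shape (1,)), so its output should equal the affine map x.dot(W.T) + b.

# Manual check, assuming model and x_data are defined as above
W = model.l1.W.data  # (1, 4) weight matrix
b = model.l1.b.data  # (1,) bias
manual = x_data.dot(W.T) + b
print(float(manual))  # matches float(model.l1(x).data)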
class DQN_class:
    gamma = 0.99
    #initial_exploration = 10**2
    initial_exploration = 10**2
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    #data_size = 10**6
    data_size = 10**6

    def __init__(self, enable_controller=[0, 1, 2, 3, 4, 5, 6, 7, 8]):
        # """ [ 0, 0],
        #     [ 0, 1],
        #     [ 0,-1],
        #     [ 1, 0],
        #     [ 1, 1],
        #     [ 1,-1],
        #     [-1, 0],
        #     [-1, 1],
        #     [-1,-1]]):"""
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller

        print "Initializing DQN..."
        print "CUDA init"
        #cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            #l1 = F.Linear(INPUT_SIZE, 5000),  # input map[100, 100] + v[2] + w[1] + wp[2]
            l1=F.Linear(INPUT_SIZE, 100),  # input map[100, 100] + v[2] + w[1] + wp[2]
            #l2 = F.Linear(5000, 1000),
            #l3 = F.Linear(1000, 500),
            #l4 = F.Linear(500, 100),
            #l5 = F.Linear(100, self.num_of_actions,
            l2=F.Linear(100, self.num_of_actions,
                        initialW=np.zeros((self.num_of_actions, 100),
                                          dtype=np.float32))
        ).to_gpu()
        self.model_target = copy.deepcopy(self.model)

        print "Initializing Optimizer"
        # Important: RMSpropGraves!
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95,
                                                  momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data : D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, INPUT_SIZE), dtype=np.float32),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.float32),
                  np.zeros((self.data_size, INPUT_SIZE), dtype=np.float32),
                  np.zeros((self.data_size, 1), dtype=np.bool)]
        #self.D = [np.zeros((self.data_size, INPUT_SIZE), dtype=np.uint8),
        #          np.zeros(self.data_size, dtype=np.uint8),
        #          np.zeros((self.data_size, 1), dtype=np.int8),
        #          np.zeros((self.data_size, INPUT_SIZE), dtype=np.uint8),
        #          np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            #action_index = self.action_to_index(action[i])
            #target[i, action_index] = tmp_
            target[i, action[i]] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        #print "td-error"
        #print "np.max(td.data) : ",
        #print np.max(td.data.get())
        # Unclear what this is for; td effectively ends up equal to td_clip
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)
        #print "td_clip.data :",
        #print td_clip.data

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions))).astype(np.float32))
        #zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions))))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    # Store experience data
    def stockExperience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size
        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    # Mini-batch training
    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            #s_replay = np.ndarray(shape=(self.replay_size, 100, 100), dtype=np.float32)
            s_replay = np.ndarray(shape=(self.replay_size, INPUT_SIZE), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, INPUT_SIZE), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]
                if i == 0:
                    print "s", s_replay[0][0], s_replay[0][1]*180/np.pi
                    print "a", a_replay[0]
                    print "s'", s_dash_replay[0][0], s_dash_replay[0][1]*180/np.pi
                    print "r", r_replay[0]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()          # backpropagation
            self.optimizer.update()  # update the network

    def Q_func(self, state):
        #h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs into [0.0, 1.0]
        h1 = F.relu(self.model.l1(state))
        #h2 = F.relu(self.model.l2(h1))
        #h3 = F.relu(self.model.l3(h2))
        #h4 = F.relu(self.model.l4(h3))
        #Q = self.model.l5(h4)
        Q = self.model.l2(h1)
        return Q

    def Q_func_target(self, state):
        #h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs into [0.0, 1.0]
        h1 = F.relu(self.model_target.l1(state))
        #h2 = F.relu(self.model_target.l2(h1))
        #h3 = F.relu(self.model_target.l3(h2))
        #h4 = F.relu(self.model_target.l4(h3))
        #Q = self.model.l5(h4)
        Q = self.model_target.l2(h1)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            action = np.argmax(Q.get())
            print "GREEDY"
        return action, Q

    def action_to_vec(self, action, vec):
        # """ [ 0, 0],
        #     [ 0, 1],
        #     [ 0,-1],
        #     [ 1, 0],
        #     [ 1, 1],
        #     [ 1,-1],
        #     [-1, 0],
        #     [-1, 1],
        #     [-1,-1]]):"""
        #vec = Twist()
        if action == 3 or action == 4 or action == 5:
            #vec.linear.x += 0.1
            vec.linear.x = 0.3
        elif action == 6 or action == 7 or action == 8:
            #vec.linear.x -= 0.1
            vec.linear.x = -0.3
        else:
            vec.linear.x = 0.0

        if action == 1 or action == 4 or action == 7:
            #vec.angular.z += 0.1
            vec.angular.z = 0.3
        elif action == 2 or action == 5 or action == 8:
            #vec.angular.z -= 0.1
            vec.angular.z = -0.3
        else:
            vec.angular.z = 0.0

        # Clamp the command to [-1, 1]
        if vec.linear.x > 1:
            vec.linear.x = 1
        elif vec.linear.x < -1:
            vec.linear.x = -1
        if vec.angular.z > 1:
            vec.angular.z = 1
        elif vec.angular.z < -1:
            vec.angular.z = -1
        return vec
class DQN_class:
    # Hyper-parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 5*10**4  # 10**4 # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**6  # Data size of history. original: 10^6
    num_of_actions = 2  # Action dimension
    num_of_states = 12  # State dimension

    def __init__(self):
        print "Initializing DQN..."
        # Initialization of Chainer 1.1.0 or older.
        # print "CUDA init"
        # cuda.init()

        print "Model Building"
        # self.model = FunctionSet(
        #     l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
        #     l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
        #     l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
        #     l4=F.Linear(3136, 512, wscale=np.sqrt(2)),
        #     q_value=F.Linear(512, self.num_of_actions,
        #                      initialW=np.zeros((self.num_of_actions, 512),
        #                                        dtype=np.float32))
        # ).to_gpu()

        # self.critic = FunctionSet(
        #     l1=F.Linear(self.num_of_actions+self.num_of_states,512),
        #     l2=F.Linear(512,256),
        #     l3=F.Linear(256,128),
        #     q_value=F.Linear(128,1,initialW=np.zeros((1,128),dtype=np.float32))
        # ).to_gpu()
        #
        # self.actor = FunctionSet(
        #     l1=F.Linear(self.num_of_states,512),
        #     l2=F.Linear(512,256),
        #     l3=F.Linear(256,128),
        #     a_value=F.Linear(128,self.num_of_actions,initialW=np.zeros((1,128),dtype=np.float32))
        # ).to_gpu()

        self.critic = FunctionSet(
            l1=F.Linear(self.num_of_actions+self.num_of_states, 1024),
            l2=F.Linear(1024, 512),
            l3=F.Linear(512, 256),
            l4=F.Linear(256, 128),
            q_value=F.Linear(128, 1, initialW=np.zeros((1, 128), dtype=np.float32))
        ).to_gpu()

        self.actor = FunctionSet(
            l1=F.Linear(self.num_of_states, 1024),
            l2=F.Linear(1024, 512),
            l3=F.Linear(512, 256),
            l4=F.Linear(256, 128),
            a_value=F.Linear(128, self.num_of_actions, initialW=np.zeros((1, 128), dtype=np.float32))
        ).to_gpu()

        # self.critic = FunctionSet(
        #     l1=F.Linear(self.num_of_actions+self.num_of_states,1024,wscale=0.01*math.sqrt(self.num_of_actions+self.num_of_states)),
        #     l2=F.Linear(1024,512,wscale=0.01*math.sqrt(1024)),
        #     l3=F.Linear(512,256,wscale=0.01*math.sqrt(512)),
        #     l4=F.Linear(256,128,wscale=0.01*math.sqrt(256)),
        #     q_value=F.Linear(128,1,wscale=0.01*math.sqrt(128))
        # ).to_gpu()
        #
        # self.actor = FunctionSet(
        #     l1=F.Linear(self.num_of_states,1024,wscale=0.01*math.sqrt(self.num_of_states)),
        #     l2=F.Linear(1024,512,wscale=0.01*math.sqrt(1024)),
        #     l3=F.Linear(512,256,wscale=0.01*math.sqrt(512)),
        #     l4=F.Linear(256,128,wscale=0.01*math.sqrt(256)),
        #     a_value=F.Linear(128,self.num_of_actions,wscale=0.01*math.sqrt(128))
        # ).to_gpu()

        self.critic_target = copy.deepcopy(self.critic)
        self.actor_target = copy.deepcopy(self.actor)

        print "Initializing Optimizer"
        #self.optim_critic = optimizers.RMSpropGraves(lr=0.0001, alpha=0.95, momentum=0.95, eps=0.0001)
        #self.optim_actor = optimizers.RMSpropGraves(lr=0.0001, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optim_critic = optimizers.Adam(alpha=0.00001)
        self.optim_actor = optimizers.Adam(alpha=0.00001)
        self.optim_critic.setup(self.critic)
        self.optim_actor.setup(self.actor)
        # self.optim_critic.add_hook(chainer.optimizer.WeightDecay(0.00001))
        # self.optim_critic.add_hook(chainer.optimizer.GradientClipping(10))
        # self.optim_actor.add_hook(chainer.optimizer.WeightDecay(0.00001))
        # self.optim_actor.add_hook(chainer.optimizer.GradientClipping(10))

        # History Data : D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, self.num_of_states), dtype=np.float32),
                  np.zeros((self.data_size, self.num_of_actions), dtype=np.float32),
                  np.zeros((self.data_size, 1), dtype=np.float32),
                  np.zeros((self.data_size, self.num_of_states), dtype=np.float32),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

        # with open('dqn_dump.json', 'a') as f:
        #     json.dump(datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'), f)
        #     f.write('\n')
        #     json.dump({"alpha": 0.00001, "beta1": 0.7, "beta2": 0.999, "weight_decay": 0.00001}, f)
        #     f.write('\n')
        #     f.close()

        #self.x_PID = Hover_PID_Controller(12.1, 1.25)
        #self.y_PID = Hover_PID_Controller(12.1, 1.25)

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(cuda.to_gpu(np.concatenate([state, action], 1)))
        s_dash = Variable(cuda.to_gpu(state_dash))

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target through target nets
        action_dash_tmp = self.A_func_target(s_dash)
        action_dash = np.asanyarray(action_dash_tmp.data.get(), dtype=np.float32)
        tmp_dash = Variable(cuda.to_gpu(np.concatenate([state_dash, action_dash], 1)))
        Q_dash_tmp = self.Q_func_target(tmp_dash)
        Q_dash = np.asanyarray(Q_dash_tmp.data.get(), dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = Reward[i] + self.gamma * Q_dash[i]
            else:
                tmp_ = Reward[i]
            target[i] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, 1), dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def updateActor(self, state):
        num_of_batch = state.shape[0]
        A_max = 1.0
        A_min = -1.0
        A = self.A_func(Variable(cuda.to_gpu(state)))
        tmp = Variable(cuda.to_gpu(np.concatenate([state, A.data.get()], 1)))
        Q = self.Q_func(tmp)

        # Backward prop towards actor net
        #self.critic.zerograds()
        #self.actor.zerograds()
        Q.grad = cuda.to_gpu(np.ones((num_of_batch, 1), dtype=np.float32) * (-1.0))
        # Q.grad = Q.data*(-1.0)
        Q.backward()
        A.grad = tmp.grad[:, -self.num_of_actions:]
        print("sample_A.grad: " + str(A.grad[0]))
        # Scale gradients near the action bounds so actions stay in [A_min, A_max]
        for i in xrange(num_of_batch):
            for j in xrange(self.num_of_actions):
                if A.grad[i][j] < 0:
                    A.grad[i][j] *= (A_max - A.data[i][j]) / (A_max - A_min)
                elif A.grad[i][j] > 0:
                    A.grad[i][j] *= (A.data[i][j] - A_min) / (A_max - A_min)
        A.backward()
        self.optim_actor.update()
        print("sample_A.grad: " + str(A.grad[0]))

    def stockExperience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size
        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))
            #reward_list = list(self.D[2])
            #replay_index = [i[0] for i in sorted(enumerate(reward_list),key=itemgetter(1),reverse=True)[:32]]
            #replay_index = np.asarray(replay_index).reshape(32,1)

            s_replay = np.ndarray(shape=(self.replay_size, self.num_of_states), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, self.num_of_actions), dtype=np.float32)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.num_of_states), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = np.asarray(self.D[1][replay_index[i]], dtype=np.float32)
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.asarray(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]
            #s_replay = cuda.to_gpu(s_replay)
            #s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based critic update
            self.optim_critic.zero_grads()
            loss, q = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optim_critic.update()

            # Update the actor
            self.optim_critic.zero_grads()
            self.optim_actor.zero_grads()
            self.updateActor(s_replay)
            self.soft_target_model_update()

            print "AVG_Q %f" % (np.average(q.data.get()))
            print("loss " + str(loss.data))
            # with open('dqn_dump.json', 'a') as f:
            #     json.dump({"time": time, "avg_Q": float(np.average(q.data.get())), "loss": float(loss.data)}, f)
            #     f.write('\n')
            #     f.close()

    def Q_func(self, state):
        h1 = F.relu(self.critic.l1(state))
        h2 = F.relu(self.critic.l2(h1))
        h3 = F.relu(self.critic.l3(h2))
        h4 = F.relu(self.critic.l4(h3))
        Q = self.critic.q_value(h4)
        return Q

    def Q_func_target(self, state):
        h1 = F.relu(self.critic_target.l1(state))
        h2 = F.relu(self.critic_target.l2(h1))
        h3 = F.relu(self.critic_target.l3(h2))
        h4 = F.relu(self.critic_target.l4(h3))
        Q = self.critic_target.q_value(h4)
        return Q

    def A_func(self, state):
        h1 = F.relu(self.actor.l1(state))
        h2 = F.relu(self.actor.l2(h1))
        h3 = F.relu(self.actor.l3(h2))
        h4 = F.relu(self.actor.l4(h3))
        A = self.actor.a_value(h4)
        return A

    def A_func_target(self, state):
        h1 = F.relu(self.actor_target.l1(state))
        h2 = F.relu(self.actor_target.l2(h1))
        h3 = F.relu(self.actor_target.l3(h2))
        h4 = F.relu(self.actor_target.l4(h3))
        A = self.actor_target.a_value(h4)
        return A

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        A = self.A_func(s)
        A = A.data

        if np.random.rand() < epsilon:
            action = np.random.uniform(-1., 1., (1, self.num_of_actions)).astype(np.float32)
            # action = np.zeros((1,self.num_of_actions),dtype=np.float32)
            # if state[0,0] > 0:
            #     action[0,0] = np.random.uniform(0.0,0.5)
            # elif state[0,0] < 0:
            #     action[0,0] = np.random.uniform(-0.5,0.0)
            # if state[0,1] < 0:
            #     action[0,1] = np.random.uniform(0.0,0.5)
            # elif state[0,1] > 0:
            #     action[0,1] = np.random.uniform(-0.5,0.0)
            #print("teststate"+str(state))
            #action[0,0] = -self.x_PID.getCorrection(state[0][0], 0.0)
            #action[0,1] = self.y_PID.getCorrection(state[0][1], 0.0)
            print "RANDOM"
        else:
            action = A.get()
            print "GREEDY"
        #print(str(action))
        return action

    def hard_target_model_update(self):
        self.critic_target = copy.deepcopy(self.critic)
        self.actor_target = copy.deepcopy(self.actor)

    def soft_target_model_update(self, tau=0.001):
        # Note: only the weight matrices are blended; the biases are left as-is
        self.critic_target.l1.W.data = tau*self.critic.l1.W.data + (1-tau)*self.critic_target.l1.W.data
        self.critic_target.l2.W.data = tau*self.critic.l2.W.data + (1-tau)*self.critic_target.l2.W.data
        self.critic_target.l3.W.data = tau*self.critic.l3.W.data + (1-tau)*self.critic_target.l3.W.data
        self.critic_target.l4.W.data = tau*self.critic.l4.W.data + (1-tau)*self.critic_target.l4.W.data
        self.critic_target.q_value.W.data = tau*self.critic.q_value.W.data + (1-tau)*self.critic_target.q_value.W.data
        self.actor_target.l1.W.data = tau*self.actor.l1.W.data + (1-tau)*self.actor_target.l1.W.data
        self.actor_target.l2.W.data = tau*self.actor.l2.W.data + (1-tau)*self.actor_target.l2.W.data
        self.actor_target.l3.W.data = tau*self.actor.l3.W.data + (1-tau)*self.actor_target.l3.W.data
        self.actor_target.l4.W.data = tau*self.actor.l4.W.data + (1-tau)*self.actor_target.l4.W.data
        self.actor_target.a_value.W.data = tau*self.actor.a_value.W.data + (1-tau)*self.actor_target.a_value.W.data
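The soft update above is the Polyak-averaging rule theta_target = tau * theta + (1 - tau) * theta_target, applied weight matrix by weight matrix. A more generic sketch (my own, assuming old-style FunctionSet models whose .parameters tuples align) blends every parameter array in place, biases included:

def soft_update(src_model, dst_model, tau=0.001):
    # dst = tau * src + (1 - tau) * dst, for every parameter array
    for src_p, dst_p in zip(src_model.parameters, dst_model.parameters):
        dst_p *= (1.0 - tau)
        dst_p += tau * src_p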
class ConvQAgent(Agent):
    def __init__(self, frames_per_action=4):
        super(ConvQAgent, self).__init__()
        cuda.init()
        self.epsilon = 1.0
        self.gamma = 0.99
        self.iterations = 0
        self.model = FunctionSet(
            l1=F.Convolution2D(frames_per_action, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
            l4=F.Linear(64 * 7 * 7, 512),
            l5=F.Linear(512, 2)
        ).to_gpu()
        self.optimizer = optimizers.RMSprop(lr=1e-5)
        self.optimizer.setup(self.model)
        self.update_target()
        self.num_frames = 0
        self.frames_per_action = frames_per_action
        self.prev_reward = 0.0
        self.history = ConvHistory((frames_per_action, 84, 84))

    def update_target(self):
        self.target_model = copy.deepcopy(self.model)
        self.target_model = self.target_model.to_gpu()

    def act(self, state):
        self.update_state_vector(state)
        if self.num_frames < self.frames_per_action - 1 or self.num_frames % self.frames_per_action != 0:
            return None
        if random.random() < 0.001:
            print 'Epsilon: {}'.format(self.epsilon)
        if self.epsilon > 0.05:
            self.epsilon -= (0.95 / 300000)
        if random.random() < self.epsilon:
            return random.random() > 0.375
        q = self.get_q(Variable(cuda.to_gpu(self.curr_state[np.newaxis, :, :, :])))
        if random.random() < 0.01:
            if q.data[0, 1] > q.data[0, 0]:
                print 'On: {}'.format(q.data)
            else:
                print 'Off: {}'.format(q.data)
        return q.data[0, 1] > q.data[0, 0]

    def update_state_vector(self, state):
        if self.num_frames < self.frames_per_action:
            if self.num_frames == 0:
                self.curr_state = np.zeros((self.frames_per_action, 84, 84), dtype=np.float32)
            self.curr_state[self.num_frames, :, :] = state
        else:
            if self.num_frames == self.frames_per_action:
                self.prev_state = np.zeros((self.frames_per_action, 84, 84), dtype=np.float32)
            self.prev_state[1:, :, :] = self.prev_state[:-1, :, :]
            self.prev_state[0, :, :] = self.curr_state[-1, :, :]
            self.curr_state[1:, :, :] = self.curr_state[:-1, :, :]
            self.curr_state[0, :, :] = state
        self.num_frames += 1

    def accept_reward(self, state, action, reward, new_state, is_terminal):
        self.prev_reward += reward
        if not (is_terminal or self.num_frames % self.frames_per_action == 0):
            return
        if self.num_frames == self.frames_per_action:
            self.prev_reward = 0.0
            self.prev_action = action
            return
        self.history.add((self.prev_state, self.prev_action, self.prev_reward, self.curr_state, is_terminal))
        self.prev_reward = 0.0
        self.prev_action = action
        self.iterations += 1
        if self.iterations % 10000 == 0:
            print '*** UPDATING TARGET NETWORK ***'
            self.update_target()
        state, action, reward, new_state, is_terminal = self.history.get(num=32)
        state = cuda.to_gpu(state)
        action = cuda.to_gpu(action)
        new_state = cuda.to_gpu(new_state)
        reward = cuda.to_gpu(reward)
        loss, q = self.forward(state, action, reward, new_state, is_terminal)
        self.optimizer.zero_grads()
        loss.backward()
        self.optimizer.update()

    def forward(self, state, action, reward, new_state, is_terminal):
        q = self.get_q(Variable(state))
        q_target = self.get_target_q(Variable(new_state))
        max_target_q = cp.max(q_target.data, axis=1)
        target = cp.copy(q.data)
        for i in xrange(target.shape[0]):
            curr_action = int(action[i, 0])
            if is_terminal[i]:
                target[i, curr_action] = reward[i]
            else:
                target[i, curr_action] = reward[i] + self.gamma * max_target_q[i]
        loss = F.mean_squared_error(Variable(target), q)
        return loss, 0.0  #cp.mean(q.data[:, action[i]])

    def get_q(self, state):
        h1 = F.relu(self.model.l1(state))
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        h4 = self.model.l4(h3)
        return self.model.l5(h4)

    def get_target_q(self, state):
        h1 = F.relu(self.target_model.l1(state))
        h2 = F.relu(self.target_model.l2(h1))
        h3 = F.relu(self.target_model.l3(h2))
        h4 = self.target_model.l4(h3)
        return self.target_model.l5(h4)

    def save(self, file_name):
        with open(file_name, 'wb') as out_file:
            pickle.dump((self.model, self.optimizer), out_file)

    def load(self, file_name):
        self.epsilon = 0.0
        with open(file_name, 'rb') as in_file:
            model, optimizer = pickle.load(in_file)
        self.model.copy_parameters_from(model.parameters)
        self.optimizer = optimizer

    def start_new_game(self):
        self.num_frames = 0
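A hedged driver sketch (not in the source; Agent and ConvHistory live elsewhere, and env is a placeholder returning 84x84 float32 frames): act() yields None until a full frame stack exists, then a boolean on/off action every frames_per_action frames.

agent = ConvQAgent(frames_per_action=4)
agent.start_new_game()
state = env.reset()  # hypothetical environment
while True:
    action = agent.act(state)
    new_state, reward, done = env.step(action)
    agent.accept_reward(state, action, reward, new_state, done)
    if done:
        break
    state = new_state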
plt.savefig('img/loss.png')

# output loss
f = open('loss.txt', 'w')
for i in range(1, n_epoch + 1):
    strs = '{0:05.0f} {1:.2f} {2:.2f}\n'.format(i, log_loss_train[i - 1], log_loss_test[i - 1])
    f.writelines(strs)
f.close()

plt.clf()
raw_data = np.array(create_data_with_fourier_basis(), dtype=np.float32)
data = raw_data.reshape((1, n_sampling_rate))
x, t = Variable(data), Variable(data)
#h1 = fixed_model.l1(x)
h1 = model.l1(x)
y = model.r1(h1)
print(h1.data)

x_range = np.arange(0, n_sampling_rate, 1)
plt.plot(x_range, data[0], label='source')
plt.plot(x_range, y.data[0], label='result')
plt.legend()
#plt.show()
plt.savefig('img/example.png')

for i in range(0, n_units):
    plt.clf()
class DQN_class:
    # Hyper-parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**3  # Initial exploration. original
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**2  # Target update frequency. original
    data_size = 10**5  # Data size of history. original

    # actions are 0 => do nothing, 1 -> buy, -1 sell
    def __init__(self, input_vector_length, enable_controller=[0, 1, 2]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"
        self.input_vector_length = input_vector_length

        print "Initializing DQN..."
        # Initialization for Chainer 1.1.0 or older.
        # print "CUDA init"
        # cuda.init()

        # inputs --> 5 * 14 (with 10 temporality) + 5 (of last one hour) + 5 (of last 24 hour)
        print "Model Building"
        self.model = FunctionSet(
            l1=F.Linear(input_vector_length, 500),
            l2=F.Linear(500, 250),
            l3=F.Linear(250, 80),
            q_value=F.Linear(80, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 80),
                                               dtype=np.float32))
        ).to_gpu()

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.0002, alpha=0.3, momentum=0.2)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data : D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, self.input_vector_length), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, self.input_vector_length), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        max_Q_dash_ = self.Q_func(s_dash)
        tmp = list(map(np.max, max_Q_dash_.data.get()))
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            target[i, self.action_to_index(action[i])] = tmp_

        loss = F.mean_squared_error(Variable(cuda.to_gpu(target)), Q)
        return loss, Q

    def stockExperience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size
        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.input_vector_length), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.input_vector_length), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        # todo: might want to normalize input, but for now I will do that outside this class
        h1 = F.relu(self.model.l1(state))
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        Q = self.model.q_value(h3)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
class DQN_class:
    # Hyper-parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 100  # 10**4 # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 30000  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
            l4=F.Linear(3136, 512, wscale=np.sqrt(2)),
            q_value=F.Linear(512, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 512),
                                               dtype=np.float32)))  # .to_gpu()

        # Load pre-trained "elite" parameters; note that no optimizer is set up,
        # so this agent only evaluates the loaded network
        self.model.l1.W = np.load('elite/l1_W.npy')
        self.model.l1.b = np.load('elite/l1_b.npy')
        self.model.l2.W = np.load('elite/l2_W.npy')
        self.model.l2.b = np.load('elite/l2_b.npy')
        self.model.l3.W = np.load('elite/l3_W.npy')
        self.model.l3.b = np.load('elite/l3_b.npy')
        self.model.l4.W = np.load('elite/l4_W.npy')
        self.model.l4.b = np.load('elite/l4_b.npy')
        self.model.q_value.W = np.load('elite/q_value_W.npy')
        self.model.q_value.b = np.load('elite/q_value_b.npy')

        self.model_target = copy.deepcopy(self.model)

        # History Data : D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(target) - Q
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size
        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            # Evaluation only: the loss is computed but no optimizer update follows
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)

    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        h4 = F.relu(self.model.l4(h3))
        Q = self.model.q_value(h4)
        return Q

    def Q_func_target(self, state):
        h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model_target.l2(h1))
        h3 = F.relu(self.model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3))
        Q = self.model_target.q_value(h4)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            #index_action = np.argmax(Q.get())
            index_action = np.argmax(Q)
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
    plt.pcolor(Z)
    plt.title("ans=%d, recog=%d" % (ans, recog), size=8)
    plt.gray()
    plt.tick_params(labelbottom="off")
    plt.tick_params(labelleft="off")

plt.figure(figsize=(15, 15))
cnt = 0
# for idx in np.random.permutation(N)[:1000]:
for idx in range(N):
    if mod(idx, 1000):
        pass
    xxx = x_train[idx].astype(np.float32)
    h1 = F.dropout(F.relu(model.l1(Variable(xxx.reshape(1, 784)))), train=False)
    h2 = F.dropout(F.relu(model.l2(h1)), train=False)
    y = model.l3(h2)
    # Show only the misclassified samples
    if y_train[idx] != np.argmax(y.data):
        cnt += 1
        draw_digit3(x_train[idx], cnt, y_train[idx], np.argmax(y.data))
plt.show()

print("Fin")

# ## Visualizing the first-layer weight matrix W

# In[140]:
class MLP(object):
    def __init__(self, data, target, n_inputs=784, n_hidden=784, n_outputs=10, gpu=-1):
        self.model = FunctionSet(
            l1=F.Linear(n_inputs, n_hidden),
            l2=F.Linear(n_hidden, n_hidden),
            l3=F.Linear(n_hidden, n_outputs)
        )
        if gpu >= 0:
            self.model.to_gpu()

        self.x_train, self.x_test = data
        self.y_train, self.y_test = target

        self.n_train = len(self.y_train)
        self.n_test = len(self.y_test)

        self.gpu = gpu
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model)

        self.train_accuracies = []
        self.train_losses = []
        self.test_accuracies = []
        self.test_losses = []

    @property
    def xp(self):
        return cuda.cupy if self.gpu >= 0 else numpy

    def forward(self, x_data, y_data, train=True):
        x, t = Variable(x_data), Variable(y_data)
        h1 = F.dropout(F.relu(self.model.l1(x)), train=train)
        h2 = F.dropout(F.relu(self.model.l2(h1)), train=train)
        y = self.model.l3(h2)
        return F.softmax_cross_entropy(y, t), F.accuracy(y, t)

    def train_and_test(self, n_epoch=20, batchsize=100):
        for epoch in xrange(1, n_epoch + 1):
            logging.info('epoch {}'.format(epoch))

            perm = numpy.random.permutation(self.n_train)
            sum_accuracy = 0
            sum_loss = 0
            for i in xrange(0, self.n_train, batchsize):
                x_batch = self.xp.asarray(self.x_train[perm[i:i+batchsize]])
                y_batch = self.xp.asarray(self.y_train[perm[i:i+batchsize]])
                real_batchsize = len(x_batch)

                self.optimizer.zero_grads()
                loss, acc = self.forward(x_batch, y_batch)
                loss.backward()
                self.optimizer.update()

                sum_loss += float(cuda.to_cpu(loss.data)) * real_batchsize
                sum_accuracy += float(cuda.to_cpu(acc.data)) * real_batchsize

            self.train_accuracies.append(sum_accuracy / self.n_train)
            self.train_losses.append(sum_loss / self.n_train)
            logging.info(
                'train mean loss={}, accuracy={}'.format(
                    sum_loss / self.n_train, sum_accuracy / self.n_train
                )
            )

            # evaluation
            sum_accuracy = 0
            sum_loss = 0
            for i in xrange(0, self.n_test, batchsize):
                x_batch = self.xp.asarray(self.x_test[i:i+batchsize])
                y_batch = self.xp.asarray(self.y_test[i:i+batchsize])
                real_batchsize = len(x_batch)

                loss, acc = self.forward(x_batch, y_batch, train=False)

                sum_loss += float(cuda.to_cpu(loss.data)) * real_batchsize
                sum_accuracy += float(cuda.to_cpu(acc.data)) * real_batchsize

            self.test_accuracies.append(sum_accuracy / self.n_test)
            self.test_losses.append(sum_loss / self.n_test)
            logging.info(
                'test mean loss={}, accuracy={}'.format(
                    sum_loss / self.n_test, sum_accuracy / self.n_test
                )
            )
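A hedged usage sketch (mine; random arrays stand in for real MNIST splits): data and target are (train, test) pairs, and gpu=-1 keeps everything on the CPU.

import numpy

x_train = numpy.random.rand(1000, 784).astype(numpy.float32)  # hypothetical data
x_test = numpy.random.rand(200, 784).astype(numpy.float32)
y_train = numpy.random.randint(0, 10, 1000).astype(numpy.int32)
y_test = numpy.random.randint(0, 10, 200).astype(numpy.int32)

mlp = MLP(data=(x_train, x_test), target=(y_train, y_test), gpu=-1)
mlp.train_and_test(n_epoch=5, batchsize=100)
print(mlp.test_accuracies[-1])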
    Z = data.reshape(size, size)
    Z = Z[::-1, :]  # flip vertically
    plt.xlim(0, 27)
    plt.ylim(0, 27)
    plt.pcolor(Z)
    plt.title('ans={}, recog={}'.format(ans, recog), size=8)
    plt.gray()
    plt.tick_params(labelbottom='off')
    plt.tick_params(labelleft='off')

plt.figure(figsize=(15, 15))
cnt = 0
for idx in np.random.permutation(N)[:100]:
    xxx = x_train[idx].astype(np.float32)
    h1 = F.dropout(F.relu(model.l1(Variable(xxx.reshape(1, 784)))), train=False)
    h2 = F.dropout(F.relu(model.l2(h1)), train=False)
    y = model.l3(h2)
    cnt += 1
    draw_digit3(x_train[idx], cnt, y_train[idx], np.argmax(y.data))
plt.show()

def draw_digit2(data, n, i_):
    size = 28
    plt.subplot(10, 10, n)
    Z = data.reshape(size, size)
    Z = Z[::-1, :]  # flip vertically
    plt.xlim(0, 27)
class MLP(Base):

    def __init__(self, data=None, target=None, n_inputs=784, n_hidden=784,
                 n_outputs=10, gpu=-1):

        self.excludes.append('xp')
        self.model = FunctionSet(l1=F.Linear(n_inputs, n_hidden),
                                 l2=F.Linear(n_hidden, n_hidden),
                                 l3=F.Linear(n_hidden, n_outputs))
        if gpu >= 0:
            self.model.to_gpu()
            self.xp = cuda.cupy
        else:
            self.xp = np

        if data is not None:
            self.x_train, self.x_test = data
        else:
            self.x_train, self.x_test = None, None

        if target is not None:
            self.y_train, self.y_test = target
            self.n_train = len(self.y_train)
            self.n_test = len(self.y_test)
        else:
            self.y_train, self.y_test = None, None
            self.n_train = 0
            self.n_test = 0

        self.gpu = gpu
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model)

    def forward(self, x_data, y_data, train=True):
        x, t = Variable(x_data), Variable(y_data)
        h1 = F.dropout(F.relu(self.model.l1(x)), train=train)
        h2 = F.dropout(F.relu(self.model.l2(h1)), train=train)
        y = self.model.l3(h2)
        return F.softmax_cross_entropy(y, t), F.accuracy(y, t)

    def train_and_test(self, n_epoch=20, batchsize=100):
        for epoch in xrange(1, n_epoch + 1):
            print 'epoch', epoch

            perm = np.random.permutation(self.n_train)
            sum_accuracy = 0
            sum_loss = 0
            for i in xrange(0, self.n_train, batchsize):
                x_batch = self.xp.asarray(self.x_train[perm[i:i + batchsize]])
                y_batch = self.xp.asarray(self.y_train[perm[i:i + batchsize]])
                real_batchsize = len(x_batch)

                self.optimizer.zero_grads()
                loss, acc = self.forward(x_batch, y_batch)
                loss.backward()
                self.optimizer.update()

                sum_loss += float(cuda.to_cpu(loss.data)) * real_batchsize
                sum_accuracy += float(cuda.to_cpu(acc.data)) * real_batchsize

            print 'train mean loss={}, accuracy={}'.format(
                sum_loss / self.n_train, sum_accuracy / self.n_train)

            # evaluation
            sum_accuracy = 0
            sum_loss = 0
            for i in xrange(0, self.n_test, batchsize):
                x_batch = self.xp.asarray(self.x_test[i:i + batchsize])
                y_batch = self.xp.asarray(self.y_test[i:i + batchsize])
                real_batchsize = len(x_batch)

                loss, acc = self.forward(x_batch, y_batch, train=False)

                sum_loss += float(cuda.to_cpu(loss.data)) * real_batchsize
                sum_accuracy += float(cuda.to_cpu(acc.data)) * real_batchsize

            print 'test mean loss={}, accuracy={}'.format(
                sum_loss / self.n_test, sum_accuracy / self.n_test)
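# The Base class this MLP inherits from is not shown in this excerpt. Judging
# from self.excludes.append('xp'), Base presumably keeps a list of attribute
# names to drop when pickling, since a cupy/numpy module handle cannot be
# serialized. A minimal sketch under that assumption:
class Base(object):
    excludes = []

    def __getstate__(self):
        # drop the attributes named in excludes (e.g. 'xp') before pickling
        return dict((k, v) for k, v in self.__dict__.items()
                    if k not in self.excludes)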
class SDA(object):

    def __init__(self, rng, data, target, n_inputs=784,
                 n_hidden=[784, 784, 784], n_outputs=10,
                 corruption_levels=[0.1, 0.2, 0.3], gpu=-1):
        self.model = FunctionSet(
            l1=F.Linear(n_inputs, n_hidden[0]),
            l2=F.Linear(n_hidden[0], n_hidden[1]),
            l3=F.Linear(n_hidden[1], n_hidden[2]),
            l4=F.Linear(n_hidden[2], n_outputs)
        )

        if gpu >= 0:
            self.model.to_gpu()

        self.rng = rng
        self.gpu = gpu
        self.data = data
        self.target = target

        self.x_train, self.x_test = data
        self.y_train, self.y_test = target

        self.n_train = len(self.y_train)
        self.n_test = len(self.y_test)

        self.corruption_levels = corruption_levels
        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_outputs = n_outputs

        self.dae1 = None
        self.dae2 = None
        self.dae3 = None
        self.optimizer = None
        self.setup_optimizer()

        self.train_accuracies = []
        self.train_losses = []
        self.test_accuracies = []
        self.test_losses = []

    def setup_optimizer(self):
        self.optimizer = optimizers.AdaDelta()
        self.optimizer.setup(self.model)

    @property
    def xp(self):
        return cuda.cupy if self.gpu >= 0 else numpy

    def pre_train(self, n_epoch=20, batchsize=100):
        first_inputs = self.data

        # initialize first dAE
        self.dae1 = DA(self.rng,
                       data=first_inputs,
                       n_inputs=self.n_inputs,
                       n_hidden=self.n_hidden[0],
                       corruption_level=self.corruption_levels[0],
                       gpu=self.gpu)
        # train first dAE
        logging.info("--------First DA training has started!--------")
        self.dae1.train_and_test(n_epoch=n_epoch, batchsize=batchsize)
        self.dae1.to_cpu()
        # compute second inputs for second dAE
        tmp1 = self.dae1.compute_hidden(first_inputs[0])
        tmp2 = self.dae1.compute_hidden(first_inputs[1])
        if self.gpu >= 0:
            self.dae1.to_gpu()
        second_inputs = [tmp1, tmp2]

        # initialize second dAE
        self.dae2 = DA(self.rng,
                       data=second_inputs,
                       n_inputs=self.n_hidden[0],
                       n_hidden=self.n_hidden[1],
                       corruption_level=self.corruption_levels[1],
                       gpu=self.gpu)
        # train second dAE
        logging.info("--------Second DA training has started!--------")
        self.dae2.train_and_test(n_epoch=n_epoch, batchsize=batchsize)
        self.dae2.to_cpu()
        # compute third inputs for third dAE
        tmp1 = self.dae2.compute_hidden(second_inputs[0])
        tmp2 = self.dae2.compute_hidden(second_inputs[1])
        if self.gpu >= 0:
            self.dae2.to_gpu()
        third_inputs = [tmp1, tmp2]

        # initialize third dAE
        self.dae3 = DA(self.rng,
                       data=third_inputs,
                       n_inputs=self.n_hidden[1],
                       n_hidden=self.n_hidden[2],
                       corruption_level=self.corruption_levels[2],
                       gpu=self.gpu)
        # train third dAE
        logging.info("--------Third DA training has started!--------")
        self.dae3.train_and_test(n_epoch=n_epoch, batchsize=batchsize)

        # update model parameters with the pre-trained encoders
        self.model.l1 = self.dae1.encoder()
        self.model.l2 = self.dae2.encoder()
        self.model.l3 = self.dae3.encoder()

        self.setup_optimizer()

    def forward(self, x_data, y_data, train=True):
        x, t = Variable(x_data), Variable(y_data)
        h1 = F.dropout(F.relu(self.model.l1(x)), train=train)
        h2 = F.dropout(F.relu(self.model.l2(h1)), train=train)
        h3 = F.dropout(F.relu(self.model.l3(h2)), train=train)
        y = self.model.l4(h3)
        return F.softmax_cross_entropy(y, t), F.accuracy(y, t)

    def fine_tune(self, n_epoch=20, batchsize=100):
        for epoch in xrange(1, n_epoch + 1):
            logging.info('fine tuning epoch {}'.format(epoch))

            perm = self.rng.permutation(self.n_train)
            sum_accuracy = 0
            sum_loss = 0
            for i in xrange(0, self.n_train, batchsize):
                x_batch = self.xp.asarray(self.x_train[perm[i:i + batchsize]])
                y_batch = self.xp.asarray(self.y_train[perm[i:i + batchsize]])
                real_batchsize = len(x_batch)

                self.optimizer.zero_grads()
                loss, acc = self.forward(x_batch, y_batch)
                loss.backward()
                self.optimizer.update()

                sum_loss += float(cuda.to_cpu(loss.data)) * real_batchsize
                sum_accuracy += float(cuda.to_cpu(acc.data)) * real_batchsize

            logging.info(
                'fine tuning train mean loss={}, accuracy={}'.format(
                    sum_loss / self.n_train, sum_accuracy / self.n_train
                )
            )
            self.train_accuracies.append(sum_accuracy / self.n_train)
            self.train_losses.append(sum_loss / self.n_train)

            # evaluation
            sum_accuracy = 0
            sum_loss = 0
            for i in xrange(0, self.n_test, batchsize):
                x_batch = self.xp.asarray(self.x_test[i:i + batchsize])
                y_batch = self.xp.asarray(self.y_test[i:i + batchsize])
                real_batchsize = len(x_batch)

                loss, acc = self.forward(x_batch, y_batch, train=False)

                sum_loss += float(cuda.to_cpu(loss.data)) * real_batchsize
                sum_accuracy += float(cuda.to_cpu(acc.data)) * real_batchsize

            logging.info(
                'fine tuning test mean loss={}, accuracy={}'.format(
                    sum_loss / self.n_test, sum_accuracy / self.n_test
                )
            )
            self.test_accuracies.append(sum_accuracy / self.n_test)
            self.test_losses.append(sum_loss / self.n_test)

        return self.train_accuracies, self.test_accuracies
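# Typical use of the SDA above: greedy layer-wise pre-training of the three
# denoising autoencoders, then supervised fine-tuning of the whole stack. The
# DA class and the (x_train, x_test, y_train, y_test) arrays are assumed to
# be defined elsewhere in the document.
rng = numpy.random.RandomState(1)
sda = SDA(rng, data=(x_train, x_test), target=(y_train, y_test), gpu=-1)
sda.pre_train(n_epoch=15, batchsize=100)
sda.fine_tune(n_epoch=20, batchsize=100)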
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 100  # 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."
        # Initialization of Chainer 1.1.0 or older.
        # print "CUDA init"
        # cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=convlstm_link.CONVLSTM(7056, 7056),
            l4=F.Linear(7056, 512, wscale=np.sqrt(2)),
            q_value=F.Linear(512, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 512),
                                               dtype=np.float32))
        ).to_gpu()

        self.model_target = copy.deepcopy(self.model)

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95,
                                                  momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def reset_state(self):
        self.model.l1.reset_state()

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size,
                                                  self.num_of_actions),
                                                 dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay,
                                   s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    '''
    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        # h2 = F.relu(self.model.l2(h1))
        # h3 = F.relu(self.model.l3(h2))
        h4 = F.relu(self.model.l4(h1))
        Q = self.model.q_value(h4)
        return Q

    def Q_func_target(self, state):
        h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        # h2 = F.relu(self.model_target.l2(h1))
        # h3 = F.relu(self.model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h1))
        Q = self.model_target.q_value(h4)
        return Q
    '''

    def Q_func(self, state):
        self.model.l1.reset_state()
        for i in range(4):
            h1 = F.relu(self.model.l1(state / 254.0))
        h4 = F.relu(self.model.l4(h1))
        Q = self.model.q_value(h4)
        return Q

    def Q_func_target(self, state):
        self.model_target.l1.reset_state()
        for i in range(4):
            h1 = F.relu(self.model_target.l1(state / 254.0))
        h4 = F.relu(self.model_target.l4(h1))
        Q = self.model_target.q_value(h4)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        # Q = self.Q_func(state)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
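# The TD-error clipping trick in forward() above is easier to read in plain
# numpy: errors with |td| <= 1 are kept as-is, larger ones are replaced by
# their sign, so the squared loss behaves like the Huber loss outside [-1, 1].
# A minimal demonstration:
import numpy as np

td = np.array([-3.0, -0.5, 0.2, 2.5], dtype=np.float32)
td_tmp = td + 1000.0 * (np.abs(td) <= 1)  # avoid division by zero below
td_clip = td * (np.abs(td) <= 1) + td / np.abs(td_tmp) * (np.abs(td) > 1)
print td_clip  # -> [-1.  -0.5  0.2  1. ]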
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."
        print "CUDA init"
        cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 16, ksize=8, stride=4, wscale=np.sqrt(2)),
            l2=F.Convolution2D(16, 32, ksize=4, stride=2, wscale=np.sqrt(2)),
            l3=F.Linear(2592, 256),
            q_value=F.Linear(256, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 256),
                                               dtype=np.float32))
        ).to_gpu()

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.0002, alpha=0.3, momentum=0.2)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        max_Q_dash_ = self.Q_func(s_dash)
        tmp = list(map(np.max, max_Q_dash_.data.get()))
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            target[i, self.action_to_index(action[i])] = tmp_

        loss = F.mean_squared_error(Variable(cuda.to_gpu(target)), Q)
        return loss, Q

    def stockExperience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay,
                                   s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        Q = self.model.q_value(h3)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
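# How the pieces above fit together in an agent loop. This is a sketch: the
# emulator object `env`, its reset()/step() API, and the epsilon schedule are
# assumptions, not part of this excerpt.
dqn = DQN_class()
epsilon = 1.0
state = env.reset()  # hypothetical emulator returning a (4, 84, 84) uint8 stack
for time in xrange(10**6):
    s_gpu = cuda.to_gpu(np.asarray([state], dtype=np.float32))
    action, Q = dqn.e_greedy(s_gpu, epsilon)
    state_dash, reward, episode_end = env.step(action)  # hypothetical step API
    dqn.stockExperience(time, state, action, reward, state_dash, episode_end)
    dqn.experienceReplay(time)
    epsilon = max(0.1, epsilon - 0.9 / 10**6)  # linear annealing (assumed)
    state = env.reset() if episode_end else state_dash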
print y.data

y.grad = np.ones((2, 2), dtype=np.float32)  # have to set initial grad values
y.backward()
print f.gW, f.gb

y.backward()
print f.gW, f.gb  # gradients accumulate across backward() calls

f.gW.fill(0), f.gb.fill(0)  # have to fill (zero) the grads before reuse
y.backward()
print f.gW, f.gb

# function set
print "# function set"
model = FunctionSet(
    l1=F.Linear(4, 3),
    l2=F.Linear(3, 2),
)
print model

model.l3 = F.Linear(2, 2)

x = Variable(np.array([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=np.float32))
h1 = model.l1(x)
h2 = model.l2(h1)
h3 = model.l3(h2)

# have to set initial grad values and fill (zero) the grads,
# which will be done using an optimizer
print model.parameters
print model.gradients
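# The manual grad bookkeeping above is exactly what optimizers automate. A
# minimal sketch with the Chainer 1.x FunctionSet/optimizer API used
# throughout this document (very old versions may need
# optimizer.setup(model.collect_parameters()) instead):
optimizer = optimizers.SGD()
optimizer.setup(model)
optimizer.zero_grads()  # fill all gradients with zero
h = model.l3(model.l2(model.l1(x)))
h.grad = np.ones((2, 2), dtype=np.float32)
h.backward()
optimizer.update()  # apply the accumulated gradients to the parameters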
plt.legend()
plt.show()

# output loss
f = open('loss.txt', 'w')
for i in range(1, n_epoch + 1):
    strs = '{0:05.0f} {1:.2f} {2:.2f}\n'.format(i, log_loss_train[i - 1], log_loss_test[i - 1])
    f.write(strs)
f.close()

plt.clf()
raw_data = np.array(create_data_with_fourier_basis(), dtype=np.float32)
data = raw_data.reshape((1, n_sampling_rate))
x, t = Variable(data), Variable(data)
h1 = fixed_model.l1(x)
# h1 = model.l1(x)
y = model.r1(h1)
print(h1.data)

x_range = np.arange(0, n_sampling_rate, 1)
plt.plot(x_range, data[0], label='source')
plt.plot(x_range, y.data[0], label='result')
plt.legend()
plt.show()

for i in range(0, n_units):
    plt.clf()
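# create_data_with_fourier_basis() is referenced above but not defined in this
# excerpt. A plausible minimal sketch, purely an assumption about its shape:
# it sums a few randomly weighted sine/cosine terms over one period sampled
# at n_sampling_rate points.
def create_data_with_fourier_basis(n_basis=5):
    t = np.linspace(0, 2 * np.pi, n_sampling_rate, endpoint=False)
    wave = np.zeros_like(t)
    for k in range(1, n_basis + 1):
        wave += np.random.randn() * np.sin(k * t) + np.random.randn() * np.cos(k * t)
    return wave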
class ChainerAgent(Agent):
    def __init__(self, epsilon=1.0, frames_per_action=4):
        super(ChainerAgent, self).__init__()
        cuda.init()
        self.epsilon = epsilon
        self.gamma = 0.99
        self.iterations = 0
        self.model = FunctionSet(
            l1=F.Linear(9 * frames_per_action, 256),
            l2=F.Linear(256, 256),
            l3=F.Linear(256, 256),
            l4=F.Linear(256, 2),
        ).to_gpu()

        self.optimizer = optimizers.RMSprop(lr=1e-5)
        self.optimizer.setup(self.model)
        self.update_target()

        self.num_frames = 0
        self.frames_per_action = frames_per_action
        self.prev_reward = 0.0

        self.history = ChainHistory(state_len=(9 * frames_per_action))

    def forward(self, state, action, reward, new_state, is_terminal):
        q = self.get_q(Variable(state))
        q_target = self.get_target_q(Variable(new_state))

        max_target_q = cp.max(q_target.data, axis=1)

        target = cp.copy(q.data)

        for i in xrange(target.shape[0]):
            curr_action = int(action[i])
            if is_terminal[i]:
                target[i, curr_action] = reward[i]
            else:
                target[i, curr_action] = reward[i] + self.gamma * max_target_q[i]

        loss = F.mean_squared_error(Variable(target), q)
        return loss, 0.0  # cp.mean(q.data[:, action[i]])

    def get_q(self, state):
        h1 = F.relu(self.model.l1(state))
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        return self.model.l4(h3)

    def get_target_q(self, state):
        h1 = F.relu(self.target_model.l1(state))
        h2 = F.relu(self.target_model.l2(h1))
        h3 = F.relu(self.target_model.l3(h2))
        return self.target_model.l4(h3)

    def accept_reward(self, state, action, reward, new_state, is_terminal):
        self.prev_reward += reward
        if not (is_terminal or self.num_frames % self.frames_per_action == 0):
            return

        if self.num_frames == self.frames_per_action:
            self.prev_reward = 0.0
            self.prev_action = action
            return

        self.history.add((self.prev_state, self.prev_action,
                          self.prev_reward, self.curr_state, is_terminal))
        self.prev_reward = 0.0
        self.prev_action = action

        self.iterations += 1
        if self.iterations % 10000 == 0:
            print '*** UPDATING TARGET NETWORK ***'
            self.update_target()

        state, action, reward, new_state, is_terminal = self.history.get(num=32)

        state = cuda.to_gpu(state)
        action = cuda.to_gpu(action)
        new_state = cuda.to_gpu(new_state)
        reward = cuda.to_gpu(reward)

        loss, q = self.forward(state, action, reward, new_state, is_terminal)
        self.optimizer.zero_grads()
        loss.backward()
        self.optimizer.update()

    def update_state_vector(self, state):
        if self.num_frames < self.frames_per_action:
            if self.num_frames == 0:
                self.curr_state = state
            else:
                self.curr_state = np.hstack((self.curr_state, state))
        else:
            if self.num_frames < 2 * self.frames_per_action:
                if self.num_frames == self.frames_per_action:
                    self.prev_state = np.copy(self.curr_state[:, :9])
                else:
                    self.prev_state = np.hstack((self.prev_state,
                                                 self.curr_state[:, :9]))
            else:
                self.prev_state[:, :-9] = self.prev_state[:, 9:]
                self.prev_state[:, -9:] = self.curr_state[:, :9]

            self.curr_state[:, :-9] = self.curr_state[:, 9:]
            self.curr_state[:, -9:] = state

        self.num_frames += 1

    def act(self, state):
        self.update_state_vector(state)

        if (self.num_frames < self.frames_per_action - 1 or
                self.num_frames % self.frames_per_action != 0):
            return None

        if self.epsilon > 0.05:
            self.epsilon -= (0.95 / 1000000)

        if random.random() < 0.0001:
            print 'Epsilon greedy strategy current epsilon: {}'.format(self.epsilon)

        if random.random() < self.epsilon:
            return random.random() > 0.375

        q = self.get_q(Variable(cuda.to_gpu(self.curr_state)))
        if random.random() < 0.01:
            if q.data[0, 1] > q.data[0, 0]:
                print 'On: {}'.format(q.data)
            else:
                print 'Off: {}'.format(q.data)
        return q.data[0, 1] > q.data[0, 0]

    def save(self, file_name):
        with open(file_name, 'wb') as out_file:
            pickle.dump(self.model, out_file)

    def load(self, file_name):
        self.epsilon = 0.0
        with open(file_name, 'rb') as in_file:
            model = pickle.load(in_file)
        self.model.copy_parameters_from(model.parameters)

    def update_target(self):
        self.target_model = copy.deepcopy(self.model)
        self.target_model = self.target_model.to_gpu()

    def start_new_game(self):
        self.num_frames = 0
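# ChainHistory is not defined in this excerpt. From its use above it stores
# (prev_state, prev_action, prev_reward, curr_state, is_terminal) tuples and
# returns a random minibatch as parallel arrays. A minimal ring-buffer sketch
# under those assumptions (capacity is an invented parameter):
class ChainHistory(object):
    def __init__(self, state_len, capacity=10**5):
        self.items = []
        self.capacity = capacity
        self.state_len = state_len

    def add(self, item):
        self.items.append(item)
        if len(self.items) > self.capacity:
            self.items.pop(0)  # drop the oldest transition

    def get(self, num=32):
        batch = [random.choice(self.items) for _ in xrange(num)]
        states, actions, rewards, new_states, terminals = zip(*batch)
        return (np.vstack(states).astype(np.float32),
                np.asarray(actions, dtype=np.int32),
                np.asarray(rewards, dtype=np.float32),
                np.vstack(new_states).astype(np.float32),
                np.asarray(terminals, dtype=np.bool))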
class DN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 100  # 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 1, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Breakout"

        print "Initializing DN..."
        # Initialization of Chainer 1.1.0 or older.
        # print "CUDA init"
        # cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
            l4=F.Linear(3136, 256, wscale=np.sqrt(2)),
            l5=F.Linear(3136, 256, wscale=np.sqrt(2)),
            l6=F.Linear(256, 1, initialW=np.zeros((1, 256), dtype=np.float32)),
            l7=F.Linear(256, self.num_of_actions,
                        initialW=np.zeros((self.num_of_actions, 256),
                                          dtype=np.float32)),
            q_value=DN_out.DN_out(1, self.num_of_actions, self.num_of_actions, nobias=True)
        ).to_gpu()

        if args.resumemodel:
            # load a saved model
            serializers.load_npz(args.resumemodel, self.model)
            print "load model from resume.model"

        self.model_target = copy.deepcopy(self.model)

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95,
                                                  momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        if args.resumeD1 and args.resumeD2:
            # load saved D1 and D2
            npz_tmp1 = np.load(args.resumeD1)
            print "finished loading half of D data"
            npz_tmp2 = np.load(args.resumeD2)
            self.D = [npz_tmp1['D0'],
                      npz_tmp1['D1'],
                      npz_tmp1['D2'],
                      npz_tmp2['D3'],
                      npz_tmp2['D4']]
            npz_tmp1.close()
            npz_tmp2.close()
            print "loaded all stored D data"
        else:
            self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                      np.zeros(self.data_size, dtype=np.uint8),
                      np.zeros((self.data_size, 1), dtype=np.int8),
                      np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                      np.zeros((self.data_size, 1), dtype=np.bool)]
            print "initialized D data"

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals (Double-DQN style: select the action with
        # the online network, evaluate it with the target network)
        tmp2 = self.Q_func(s_dash)
        tmp2 = list(map(np.argmax, tmp2.data.get()))  # argmax_a Q(s',a)
        tmp = self.Q_func_target(s_dash)  # Q'(s',*)
        tmp = list(tmp.data.get())
        # select Q'(s',a) at a = argmax_a Q(s',a)
        res1 = []
        for i in range(num_of_batch):
            res1.append(tmp[i][tmp2[i]])

        max_Q_dash = np.asanyarray(res1, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size,
                                                  self.num_of_actions),
                                                 dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay,
                                   s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        print 'now Q_func is implemented'
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        h4 = F.relu(self.model.l4(h3))  # left side, connected to the state value
        h5 = F.relu(self.model.l5(h3))  # right side, connected to the advantage
        h6 = self.model.l6(h4)  # state value V(s)
        h7 = self.model.l7(h5)  # advantage A(s, a)
        Q = self.model.q_value(h6, h7)  # Q value
        return Q

    def Q_func_target(self, state):
        print 'now Q_func_target is implemented'
        h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model_target.l2(h1))
        h3 = F.relu(self.model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3))  # left side, connected to the state value
        h5 = F.relu(self.model_target.l5(h3))  # right side, connected to the advantage
        h6 = self.model_target.l6(h4)  # state value V(s)
        h7 = self.model_target.l7(h5)  # advantage A(s, a)
        Q = self.model_target.q_value(h6, h7)  # Q value
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
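# DN_out is a custom link not shown in this excerpt. Given that l6 emits a
# scalar state value V(s) and l7 an advantage vector A(s, .), DN_out
# presumably implements the dueling-network aggregation
#   Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a').
# The math in plain numpy, as a sketch of what the link is assumed to compute
# in its forward pass:
def dueling_q(v, a):
    # v: (batch, 1) state values, a: (batch, num_of_actions) advantages
    return v + a - a.mean(axis=1, keepdims=True)

v = np.array([[1.0]], dtype=np.float32)
a = np.array([[0.5, -0.5, 0.0]], dtype=np.float32)
print dueling_q(v, a)  # -> [[ 1.5  0.5  1. ]]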
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 50000  # 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 5 * (10**5)  # Data size of history. original: 10^6
    field_num = 7
    field_size = 17

    def __init__(self, control_size=10, field_num=7, field_size=17):
        self.num_of_actions = control_size
        self.field_size = field_size
        # self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."
        # Initialization of Chainer 1.1.0 or older.
        # print "CUDA init"
        # cuda.init()

        self.field_num = field_num
        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(self.field_num * 4, 16, ksize=5, stride=1,
                               nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(16, 24, ksize=4, stride=1,
                               nobias=False, wscale=np.sqrt(2)),
            l3=F.Linear(2400, 512, wscale=np.sqrt(2)),
            q_value=F.Linear(512, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 512),
                                               dtype=np.float32))
        ).to_gpu()

        self.model_target = copy.deepcopy(self.model)

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95,
                                                  momentum=0.95, eps=0.0001)
        # self.optimizer.setup(self.model.collect_parameters())
        self.optimizer.setup(self.model)

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, self.field_num * 4,
                            self.field_size, self.field_size), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.float32),
                  np.zeros((self.data_size, self.field_num * 4,
                            self.field_size, self.field_size), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            # action_index = self.action_to_index(action[i])
            target[i, action[i]] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        # td = Variable(target) - Q  # TD error (CPU variant)
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)
        # print "td_data " + str(td_clip.data)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size,
                                                  self.num_of_actions),
                                                 dtype=np.float32)))
        # zero_val = Variable(np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.field_num * 4,
                                         self.field_size, self.field_size),
                                  dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.field_num * 4,
                                              self.field_size, self.field_size),
                                       dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)
            # no changes here

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay,
                                   s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        Q = self.model.q_value(h3)
        return Q

    def Q_func_target(self, state):
        h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model_target.l2(h1))
        h3 = F.relu(self.model_target.l3(h2))
        Q = self.model_target.q_value(h3)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return index_action, Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def save_model(self, model_name, opt_name):
        serializers.save_hdf5(model_name, self.model)
        serializers.save_hdf5(opt_name, self.optimizer)

    def read_model(self, model_name, opt_name):
        serializers.load_hdf5(model_name, self.model)
        serializers.load_hdf5(opt_name, self.optimizer)
        self.model_target = copy.deepcopy(self.model)
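# Persisting and restoring a trained network with the methods above. The file
# names are illustrative; serializers.save_hdf5/load_hdf5 require h5py.
dqn = DQN_class(control_size=10)
# ... training ...
dqn.save_model('dqn.model', 'dqn.state')
# later, or in a separate evaluation process:
dqn_eval = DQN_class(control_size=10)
dqn_eval.read_model('dqn.model', 'dqn.state')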