def __init__(self, n_inputs, n_hidden, n_output, activation='sigmoid'):
    """
    Set up the three linear maps and the non-linearity of the cell.

    :param n_inputs: dimension of inputs
    :param n_hidden: dimension of hidden layer
    :param n_output: dimension of output (token)
    :param activation: either 'sigmoid' or 'tanh'
    :raises Exception: if ``activation`` is neither 'sigmoid' nor 'tanh'
    """
    super().__init__()

    self.n_inputs, self.n_hidden, self.n_output = n_inputs, n_hidden, n_output

    # Reject unsupported names up front, then pick the non-linearity.
    if activation not in ('sigmoid', 'tanh'):
        raise Exception("Non-linearity not found")
    self.activation = Sigmoid() if activation == 'sigmoid' else Tanh()

    # input -> hidden, hidden -> hidden, hidden -> output
    self.w_ih = Linear(n_inputs, n_hidden)
    self.w_hh = Linear(n_hidden, n_hidden)
    self.w_ho = Linear(n_hidden, n_output)

    # Collect the parameters of every sub-layer on this layer, in the
    # same order the sub-layers were created.
    for sub_layer in (self.w_ih, self.w_hh, self.w_ho):
        self.parameters += sub_layer.get_parameters()
def _init_params(self):
    """
    Create the sub-layers of the block and reset the derived-variable cache.

    Every component receives its own ``Affine(slope=1, intercept=0)``
    activation instance; the tanh / sigmoid gates are kept as separate
    activation objects.
    """

    def passthrough():
        # A fresh Affine with slope 1 and intercept 0 for each component.
        return Affine(slope=1, intercept=0)

    # Keyword arguments common to both convolutions.
    conv_common = {"stride": 1, "init": self.init, "optimizer": self.optimizer}

    self._dv = {}

    self.conv_dilation = Conv1D(
        pad="causal",
        kernel_width=2,
        dilation=self.dilation,
        out_ch=self.ch_dilation,
        act_fn=passthrough(),
        **conv_common
    )

    self.tanh = Tanh()
    self.sigm = Sigmoid()
    self.multiply_gate = Multiply(act_fn=passthrough())

    self.conv_1x1 = Conv1D(
        pad="same",
        kernel_width=1,
        dilation=0,
        out_ch=self.ch_residual,
        act_fn=passthrough(),
        **conv_common
    )

    self.add_residual = Add(act_fn=passthrough())
    self.add_skip = Add(act_fn=passthrough())
def set_params(self, summary_dict):
    """
    Restore parameters and hyperparameters from a summary dictionary.

    Parameters
    ----------
    summary_dict : dict
        Must contain the keys ``"parameters"`` and ``"hyperparameters"``,
        each mapping names to values. A ``"components"`` entry in either
        one maps component ids to nested dictionaries, which are forwarded
        to the ``set_params`` method of the attribute with the same name,
        provided the id is listed in
        ``self.hyperparameters["component_ids"]``.

    Notes
    -----
    Entries whose key is not already present in ``self.parameters`` /
    ``self.hyperparameters`` are ignored. For the ``"act_fn"``
    hyperparameter, the strings ``"ReLU"``, ``"Sigmoid"``, ``"Tanh"``,
    ``"Affine(slope=..., intercept=...)"`` and ``"Leaky ReLU(alpha=...)"``
    are converted into the corresponding activation objects; any other
    value is stored unchanged.
    """
    cids = self.hyperparameters["component_ids"]

    for k, v in summary_dict["parameters"].items():
        if k == "components":
            for c, cd in v.items():
                if c in cids:
                    getattr(self, c).set_params(cd)
        elif k in self.parameters:
            self.parameters[k] = v

    for k, v in summary_dict["hyperparameters"].items():
        if k == "components":
            for c, cd in v.items():
                if c in cids:
                    getattr(self, c).set_params(cd)

        if k not in self.hyperparameters:
            continue

        new_value = v
        # The original compared the *value* with "act_fn" in every branch
        # but the first, so only "ReLU" was ever converted. The key is what
        # identifies the activation hyperparameter.
        if k == "act_fn" and isinstance(v, str):
            if v == "ReLU":
                new_value = ReLU()
            elif v == "Sigmoid":
                new_value = Sigmoid()
            elif v == "Tanh":
                new_value = Tanh()
            elif "Affine" in v:
                match = re.match(r"Affine\(slope=(.*), intercept=(.*)\)", v)
                # A string that does not parse is stored as-is instead of
                # crashing on ``None.groups()``.
                if match is not None:
                    slope, intercept = match.groups()
                    new_value = Affine(float(slope), float(intercept))
            elif "Leaky ReLU" in v:
                match = re.match(r"Leaky ReLU\(alpha=(.*)\)", v)
                if match is not None:
                    new_value = LeakyReLU(float(match.groups()[0]))

        self.hyperparameters[k] = new_value
def test_tanh_grad(N=None):
    """
    Compare ``Tanh.grad`` against a torch-based reference on random inputs.

    :param N: number of random trials to run. With ``None`` the limit
        becomes ``np.inf`` and the loop never terminates on its own.
    """
    from activations import Tanh

    n_trials = np.inf if N is None else N

    mine = Tanh()
    gold = torch_gradient_generator(torch.tanh)

    trial = 0
    while trial < n_trials:
        # Random (n_examples, n_dims) shape, each drawn from [1, 100).
        shape = (np.random.randint(1, 100), np.random.randint(1, 100))
        z = random_tensor(shape)

        assert_almost_equal(mine.grad(z), gold(z))
        print("PASSED")
        trial += 1
def build_model():
    """
    Assemble the 2 -> 25 -> 25 -> 25 -> 2 network used with an MSE loss.

    :returns: a ``Sequential`` model holding, in order, Linear/ReLU,
        Linear/ReLU, Linear/Tanh and a final Linear layer
    """
    model = Sequential(MSE(), input_size=2)

    # (layer class, constructor arguments); each layer is instantiated right
    # before it is added, exactly as in a sequence of add_layer(...) calls.
    layer_specs = (
        (Linear, (2, 25)),
        (ReLU, (25,)),
        (Linear, (25, 25)),
        (ReLU, (25,)),
        (Linear, (25, 25)),
        (Tanh, (25,)),
        (Linear, (25, 2)),
    )
    for layer_cls, layer_args in layer_specs:
        model.add_layer(layer_cls(*layer_args))

    return model
def __init__(self) -> None:
    """Create the Tanh activation and the 784 -> 16 -> 16 -> 10 linear stack."""
    super().__init__()

    self.activation = Tanh()

    # Disabled convolutional front-end, kept for reference:
    #self.layer1 = self.Conv2D((1, 28, 28), (8, 3, 3), 1)
    #self.layer2 = self.MaxPool()
    #self.layer3 = self.Conv2D((2, 3, 3), 2)

    # Consecutive (in, out) pairs of the dense stack.
    widths = (784, 16, 16, 10)
    self.layer4, self.layer5, self.layer6 = (
        Linear(n_in, n_out) for n_in, n_out in zip(widths, widths[1:])
    )
def main():
    """
    Train a small dense network on the ``ParityBits(8)`` data, print a
    per-sample result table and plot the loss of every epoch.
    """
    # Load the data first, then build the model.
    # The input dimension is the length of one sample; because the data set
    # is small, the batch size is simply the total number of samples.
    x, y = ParityBits(8).load_data()
    batch_size, input_dim = x.shape

    layers = [
        Dense(64, activation=ReLU()),
        Dense(32, activation=Tanh()),
        Dense(16, activation=Tanh()),
        Dense(4, activation=None),
        Dense(1, activation=Sigmoid()),
    ]
    model = Sequential(
        layers,
        input_dim=input_dim,
        # Gradient descent as the optimizer, MSE as the loss function.
        optimizer=GradientDescent(learning_rate=0.01, momentum=0.0),
        loss=MeanSquaredError(),
    )

    # Train for the configured number of epochs; this yields the predictions
    # and the loss value of every epoch.
    y_pred, losses = model.train(
        x, y, batch_size=batch_size, epochs=200, verbose_step=10)

    # The answers are the integers 0 or 1, so the final result is the
    # prediction rounded to the nearest integer.
    result = np.around(y_pred).astype(int)

    # Element-wise difference between the answers and the rounded result.
    diff = np.subtract(y, result)

    columns = {
        # Each input row such as `[0 0 0 0 0 0 0 0]` has to be turned into a
        # string, because a DataFrame cell cannot hold an array.
        "Data": list(map(np.array_str, x)),
        "Answer": y[:, 0],
        "Prediction": [f'{v:.8f}' for v in y_pred[:, 0]],
        "Result": result[:, 0],
        # A difference of 0 means the prediction was correct.
        "Correct": [bool(v == 0) for v in diff[:, 0]],
    }
    table = pd.DataFrame(columns, index=np.arange(1, len(x) + 1))
    print(table.to_string())

    # Report the final loss and the number of mismatches, then plot the loss
    # against the epoch index.
    print(f'loss: {losses[-1]:.8f}, difference: {np.count_nonzero(diff)}')

    plt.figure(figsize=(8, 4))
    plt.plot(losses)
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.show()
def main():
    """
    Load the MNIST CSV data, train an MLP on it, plot, test and save weights.
    """
    print('Loading data')
    dset = Dataset("mnist_train.csv", "mnist_test.csv", 0.10, 10).create_dataset()
    print('Data loaded')

    side = 28        # width and height of an MNIST image
    n_classes = 10   # digits 0..9
    n_pixels = side * side

    # One entry per hidden layer; each number is that layer's neuron count.
    # This gives three hidden layers of size 128, 128 and 64.
    hidden_sizes = [128, 128, 64]

    # One activation per layer, i.e. len(hidden_sizes) + 1 entries. Since
    # cross entropy is used, the last activation should be the identity.
    layer_activations = [Sigmoid(), Tanh(), Sigmoid(), Identity()]

    learning_rate = 0.1
    n_epochs = 100
    minibatch = 784

    # Build the MLP model.
    mlp = MLP(
        input_size=n_pixels,
        output_size=n_classes,
        hiddens=hidden_sizes,
        activations=layer_activations,
        criterion=SoftmaxCrossEntropy(),
        lr=learning_rate,
    )

    runner = train_test(mlp, dset, n_epochs, minibatch)
    runner.train_network()        # train the neural network
    runner.plots()                # create plots
    runner.test_network()         # test network
    runner.save('weights.npz')    # save trained weights
class RNNCell(Layer):
    """
    Vanilla RNN implementation

    Hidden(t) = Activation(Linear(Hidden(t-1)) + Linear(Input(t)))
    Output(t) = Linear(Hidden(t))
    """

    def __init__(self, n_inputs, n_hidden, n_output, activation='sigmoid'):
        """
        Set up the three linear maps and the non-linearity of the cell.

        :param n_inputs: dimension of inputs
        :param n_hidden: dimension of hidden layer
        :param n_output: dimension of output (token)
        :param activation: either 'sigmoid' or 'tanh'
        :raises Exception: if ``activation`` is neither 'sigmoid' nor 'tanh'
        """
        super().__init__()

        self.n_inputs, self.n_hidden, self.n_output = n_inputs, n_hidden, n_output

        # Reject unsupported names up front, then pick the non-linearity.
        if activation not in ('sigmoid', 'tanh'):
            raise Exception("Non-linearity not found")
        self.activation = Sigmoid() if activation == 'sigmoid' else Tanh()

        # input -> hidden, hidden -> hidden, hidden -> output
        self.w_ih = Linear(n_inputs, n_hidden)
        self.w_hh = Linear(n_hidden, n_hidden)
        self.w_ho = Linear(n_hidden, n_output)

        # Collect the parameters of every sub-layer on this layer.
        for sub_layer in (self.w_ih, self.w_hh, self.w_ho):
            self.parameters += sub_layer.get_parameters()

    def forward(self, input_tensor, hidden):
        """
        Run one time step.

        :param input_tensor: input for the current step
        :param hidden: hidden state from the previous step
        :returns: tuple ``(output, new_hidden)``
        """
        recurrent_term = self.w_hh.forward(hidden)
        input_term = self.w_ih.forward(input_tensor)
        new_hidden = self.activation.forward(input_term + recurrent_term)
        return self.w_ho.forward(new_hidden), new_hidden

    def init_hidden(self, batch_size=1):
        """
        Return the initial hidden state: a ``(batch_size, n_hidden)`` tensor
        of zeros with autograd enabled.
        """
        zeros = np.zeros((batch_size, self.n_hidden))
        return Tensor(zeros, autograd=True)
def __init__(
    self,
    n_out,
    act_fn=None,
    gate_fn=None,
    merge_mode="concat",
    init="glorot_uniform",
    optimizer=None,
):
    """
    A single bidirectional long short-term memory (LSTM) layer.

    Parameters
    ----------
    n_out : int
        The dimension of a single hidden state / output on a given
        timestep
    act_fn : `activations.Activation` instance (default: None)
        The activation function for computing A[t]. A new `Tanh` is used
        when this is None.
    gate_fn : `activations.Activation` instance (default: None)
        The gate function for computing the update, forget, and output
        gates. A new `Sigmoid` is used when this is None.
    merge_mode : str (default: "concat")
        Mode by which outputs of the forward and backward LSTMs will be
        combined. Valid values are {"sum", "multiply", "concat",
        "average"}.
    init : str (default: 'glorot_uniform')
        The weight initialization strategy. Valid entries are
        {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
    optimizer : str or `OptimizerBase` instance (default: None)
        The optimization strategy to use when performing gradient updates
        within the `update` method. If `None`, use the `SGD` optimizer
        with default parameters.
    """
    super().__init__()

    # Fall back to the default non-linearities when none are supplied.
    if act_fn is None:
        act_fn = Tanh()
    if gate_fn is None:
        gate_fn = Sigmoid()

    self.init = init
    self.n_in = None  # no input dimension is known at construction time
    self.n_out = n_out
    self.optimizer = optimizer
    self.merge_mode = merge_mode
    self.act_fn = act_fn
    self.gate_fn = gate_fn

    self._init_params()
def init_from_str(self, act_str):
    """
    Build an activation object from its string description.

    Parameters
    ----------
    act_str : str
        Case-insensitive description. Accepted forms are ``"relu"``,
        ``"tanh"``, ``"sigmoid"``, ``"affine(slope=<float>,
        intercept=<float>)"`` and ``"leaky relu(alpha=<float>)"``.

    Returns
    -------
    act_fn
        A new ``ReLU``, ``Tanh``, ``Sigmoid``, ``Affine`` or ``LeakyReLU``
        instance.

    Raises
    ------
    ValueError
        If the name is unknown, if an "affine" / "leaky relu" string does
        not follow the expected pattern, or if its numeric fields cannot be
        parsed as floats.
    """
    act_str = act_str.lower()

    if act_str == "relu":
        act_fn = ReLU()
    elif act_str == "tanh":
        act_fn = Tanh()
    elif act_str == "sigmoid":
        act_fn = Sigmoid()
    elif "affine" in act_str:
        match = re.match(r"affine\(slope=(.*), intercept=(.*)\)", act_str)
        # re.match returns None on failure; report it as a bad activation
        # string instead of an AttributeError on ``None.groups()``.
        if match is None:
            raise ValueError("Malformed affine activation: {}".format(act_str))
        slope, intercept = match.groups()
        act_fn = Affine(float(slope), float(intercept))
    elif "leaky relu" in act_str:
        match = re.match(r"leaky relu\(alpha=(.*)\)", act_str)
        if match is None:
            raise ValueError(
                "Malformed leaky relu activation: {}".format(act_str)
            )
        act_fn = LeakyReLU(float(match.groups()[0]))
    else:
        raise ValueError("Unknown activation: {}".format(act_str))
    return act_fn
def plot_activations():
    """
    Plot six activation functions with their first and second derivatives
    on a 2x3 grid over [-3, 3] and save the figure to ``plot.png``.
    """
    fig, axes = plt.subplots(2, 3, sharex=True, sharey=True)
    activations = [Affine(), Tanh(), Sigmoid(), ReLU(), LeakyReLU(), ELU()]

    for ax, fn in zip(axes.flatten(), activations):
        X = np.linspace(-3, 3, 100).astype(float).reshape(100, 1)

        # Value, first derivative and second derivative, in drawing order.
        curves = (
            (fn, r"$y$"),
            (fn.grad, r"$\frac{dy}{dx}$"),
            (fn.grad2, r"$\frac{d^2 y}{dx^2}$"),
        )
        for curve, label in curves:
            ax.plot(X, curve(X), label=label, alpha=0.7)

        # Dashed reference axes through the origin.
        ax.hlines(0, -3, 3, lw=1, linestyles="dashed", color="k")
        ax.vlines(0, -1.2, 1.2, lw=1, linestyles="dashed", color="k")

        ax.set_ylim(-1.1, 1.1)
        ax.set_xlim(-3, 3)
        ax.set_xticks([])
        ax.set_yticks([-1, 0, 1])
        ax.xaxis.set_visible(False)
        # ax.yaxis.set_visible(False)

        ax.set_title("{}".format(fn))
        ax.legend(frameon=False)
        sns.despine(left=True, bottom=True)

    fig.set_size_inches(8, 5)
    plt.tight_layout()
    plt.savefig("plot.png", dpi=300)
    plt.close("all")
def cross_val_results(verbose=True):
    """
    Function for generating the accuracy results of four models presented in the report with their best parameters,
    averaged over 10 runs and using different combinations of the available optimizers and loss

    :param verbose: whether to print average results for each (Model, Optimizer, Loss) combination,
                    boolean, optional, default is True

    :returns: list of tuples containing (mean, std) of each (Model, Optimizer, Loss) combination,
              each tuple in [0, 1]^2
    """
    # Ten independent (train, test) pairs, shared by every combination.
    datasets = [(generate_disc_set(1000), generate_disc_set(1000)) for _ in range(10)]

    relu_model = Sequential(Linear(2, 25), ReLU(),
                            Linear(25, 25), ReLU(),
                            Linear(25, 25), ReLU(),
                            Linear(25, 2), xavier_init=True)

    leaky_relu_model = Sequential(Linear(2, 25), LeakyReLU(),
                                  Linear(25, 25), LeakyReLU(),
                                  Linear(25, 25), LeakyReLU(),
                                  Linear(25, 2), xavier_init=True)

    tanh_model = Sequential(Linear(2, 25), Tanh(),
                            Linear(25, 25), Tanh(),
                            Linear(25, 25), Tanh(),
                            Linear(25, 2), xavier_init=True)

    # NOTE(review): unlike the other three, this model does not pass
    # ``xavier_init`` and so relies on Sequential's default.
    sigmoid_model = Sequential(Linear(2, 25), Sigmoid(),
                               Linear(25, 25), Sigmoid(),
                               Linear(25, 25), Sigmoid(),
                               Linear(25, 2))

    models = [relu_model, leaky_relu_model, tanh_model, sigmoid_model]
    models_names = ["ReLU", "Leaky", "Tanh", "Sigmoid"]

    optimizers_names = ["SGD", "Adam"]
    losses_names = ["MSE", "CrossEntropy"]
    losses = [LossMSE(), LossCrossEntropy()]

    adam_params = {"ReLU": {"lr": 0.001, "b1": 0.9, "b2": 0.999, "epsilon": 1e-08},
                   "Leaky": {"lr": 0.001, "b1": 0.9, "b2": 0.999, "epsilon": 1e-08},
                   "Tanh": {"lr": 0.001, "b1": 0.9, "b2": 0.999, "epsilon": 1e-08},
                   "Sigmoid": {"lr": 0.001, "b1": 0.9, "b2": 0.999, "epsilon": 1e-08}}

    sgd_params = {"ReLU": {"lr": 0.001},
                  "Leaky": {"lr": 0.001},
                  "Tanh": {"lr": 0.001},
                  "Sigmoid": {"lr": 0.01}}

    final_scores = []

    for optim_name in optimizers_names:
        for loss_name, loss in zip(losses_names, losses):
            for model_name, model in zip(models_names, models):
                if verbose:
                    print("Validating model {} with {} and {} loss...".format(model_name, optim_name, loss_name),
                          end='')

                if optim_name == "Adam":
                    params = adam_params[model_name]
                    optim = Adam(model, criterion=loss, nb_epochs=50, mini_batch_size=10,
                                 lr=params["lr"], b1=params["b1"], b2=params["b2"], epsilon=params["epsilon"])
                else:
                    params = sgd_params[model_name]
                    # Build the optimizer for the model under evaluation
                    # (the original always passed ``relu_model`` here).
                    optim = SGD(model, criterion=loss, nb_epochs=50, mini_batch_size=10,
                                lr=params["lr"])

                scores = []
                for (train_input, train_target), (test_input, test_target) in datasets:
                    # Train a fresh copy on every data set so that runs do
                    # not share trained weights.
                    optim.model = copy.deepcopy(model)
                    optim.train(train_input, train_target, verbose=False)

                    evaluator = Evaluator(optim.model)
                    scores.append(evaluator.compute_accuracy(test_input, test_target))

                scores = torch.FloatTensor(scores)
                scores_mean = torch.mean(scores).item()
                scores_std = torch.std(scores).item()

                if verbose:
                    print("Score : {0:.3f} (+/- {1:.3f}) ".format(scores_mean, scores_std))

                final_scores.append((scores_mean, scores_std))

    return final_scores
# Scale the training pixels from [0, 255] to [0, 1] as float32.
x_train = x_train.astype('float32')
x_train /= 255
# encode output which is a number in range [0,9] into a vector of size 10
# e.g. number 3 will become [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
y_train = np_utils.to_categorical(y_train)
# one (10, 1) column vector per sample
y_train = y_train.reshape(y_train.shape[0], 10, 1)

# same for test data: 10000 samples
# each image is flattened to a (28 * 28, 1) column vector
x_test = x_test.reshape(x_test.shape[0], 28 * 28, 1)
x_test = x_test.astype('float32')
x_test /= 255
y_test = np_utils.to_categorical(y_test)
y_test = y_test.reshape(y_test.shape[0], 10, 1)

# neural network: 784 -> 40 -> 10, with Tanh after each dense layer
network = [Dense(28 * 28, 40), Tanh(), Dense(40, 10), Tanh()]

epochs = 100
learning_rate = 0.1

# train
for e in range(epochs):
    # error accumulator, reset at the start of every epoch
    error = 0
    # train on 1000 samples, since we're not training on GPU...
    for x, y in zip(x_train[:1000], y_train[:1000]):
        # forward: feed the sample through every layer in order
        output = x
        for layer in network:
            output = layer.forward(output)

        # error
import matplotlib.pyplot as plt from activations import Sigmoid, ReLU, Tanh, Exponential from fullnetwork import onelayer, fullnetwork from mpl_toolkits.mplot3d import Axes3D #number of hidden layers# L = 3 #network size for each hidden layer n[0]=n_1, ..., n[L-1]=n_L# n = np.random.randint(1, 10, size=L) #training set size# training_size = 1 #(N, N) meshgrid# N = 3 #activation function# sigma = Tanh() #set the network# network = fullnetwork(L=L, n=n, activation=sigma) #set the initial weight and bias# weight, bias = network.setparameter() #choose one layer from from [1, L-1]# weightindex_startlayer = np.random.randint(1, L, size=None) #its next layer# weightindex_nextlayer = weightindex_startlayer + 1 #the two weights taken from randomly sample two neurons from each of the above layers, from [1, width of that layer]# weightindex_neuron_startlayer = np.random.randint( 1, n[weightindex_startlayer - 1] + 1, size=2) weightindex_neuron_nextlayer = np.random.randint(1, n[weightindex_nextlayer - 1] +
] #trainingIn = [trainingIn0] trainingOut0 = (np.cumsum(trainingIn0, axis=0) % 2)[:, 0, :] trainingOut1 = (np.cumsum(trainingIn1, axis=0) % 2)[:, 0, :] trainingOut2 = (np.cumsum(trainingIn2, axis=0) % 2)[:, 0, :] trainingOut3 = (np.cumsum(trainingIn3, axis=0) % 2)[:, 0, :] trainingOut4 = (np.cumsum(trainingIn4, axis=0) % 2)[:, 0, :] trainingOut5 = (np.cumsum(trainingIn5, axis=0) % 2)[:, 0, :] trainingOut = [ trainingOut1, trainingOut2, trainingOut3, trainingOut4, trainingOut5 ] #trainingOut = [trainingOut0] f, g, h = Logistic(), Logistic(), Tanh() lstm_layer1 = LSTMLayer(2, 4, f, g, h) lstm_layer2 = NNLayer(4, 4, Tanh(), usebias=False) lstm_layer3 = NNLayer(4, 1, Tanh(), usebias=False) #lstm_layer1 = LSTMLayerWeights(2, 4, f, g, h) #lstm_layer2 = NNLayer(4, 1, h) #d_weight1 = [np.zeros(w.shape) for w in lstm_layer1.to_weights_array()] #d_weight2 = [np.zeros(w.shape) for w in lstm_layer2.to_weights_array()] #d_weights = [d_weight1, d_weight2] #d_weights = [d_weight1] lstm = LSTMNetwork([lstm_layer1, lstm_layer2, lstm_layer3], loss=Squared()) #lstm = LSTMNetwork([lstm_layer1, nn_layer1]) """ for trial in range(200): #import pdb; pdb.set_trace()
# Validation split: every row of df that is not part of the training split.
val = df.drop(train.index)

# Column 0 holds the label, the remaining columns hold the pixel values.
yr = train.iloc[:, 0].to_numpy()
# Pixels are scaled by 1/255; labels go through onehotcode().
X_train, y_train = train.iloc[:, 1:].to_numpy() / 255.0, onehotcode(yr)
# Add a trailing axis of length 1 to every sample / label vector.
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
y_train = y_train.reshape((y_train.shape[0], y_train.shape[1], 1))
print(X_train.shape, y_train.shape)

# Validation labels are kept as raw values (no one-hot encoding here).
X_val, y_val = val.iloc[:, 1:].to_numpy() / 255.0, val.iloc[:, 0].to_numpy()
X_val = X_val.reshape((X_val.shape[0], X_val.shape[1], 1))
print(X_val.shape, y_val.shape)

# 784 -> 128 -> 10 network with Tanh then Softmax, cross-entropy loss and
# a Ridge regulariser sized by the number of training samples.
nn = NN()
nn.sequential(network=[784, 128, 10],
              activation=[Tanh(), Softmax()],
              loss=Cross_entropy(),
              regu=Ridge(n=X_train.shape[0], lmda=5),
              weight_type='glorot_normal')
# Load a previously saved model instead of training; the fit / save calls
# below are disabled.
nn.load_model('tanL2128')
#nn.fit(X_train,y_train,X_val,y_val,32,2)
#nn.save_model('tanL2128')
nn.weight_heatmap()

# Test set: same column layout as the training CSV.
df2 = pd.read_csv('mnist_test.csv')
X_test, y_test = df2.iloc[:, 1:].to_numpy() / 255.0, df2.iloc[:, 0].to_numpy()
# Test images are reshaped to 28 x 28 (not to column vectors).
X_test = X_test.reshape((X_test.shape[0], 28, 28))
print(X_test.shape, y_test.shape)
# Pass the first 100 test images to annote_test.
# NOTE(review): the two 10s are presumably a 10 x 10 display grid - confirm.
nn.annote_test(X_test[:100], 10, 10)
test_input.sub_(mean).div_(std) #Convert to Labels so that we can train train_target_hot = conv_to_one_hot(train_target) test_target_hot = conv_to_one_hot(test_target) ### Build the Network hidden_layers = 3 layers = [] linear = Linear(2, 25, bias_init=True) layers.append(linear) layers.append(Relu()) for i in range(hidden_layers - 1): layers.append(Linear(25, 25, bias_init=True)) layers.append(Relu()) layers.append(Tanh()) layers.append(Linear(25, 2, bias_init=True)) model = Sequential(layers) #print model summary print("Model Summary:") print(model) ### Select Parameters to train the model criterion = MSE() lr = 0.05 nb_epochs = 250 print_step = 25 mini_batch_size = 100 loss_at_print = []
activation=LeakyReLU(0.2), optimizer=optimizer)) generator.addLayer( Dense(inputDim=256, outputDim=512, activation=LeakyReLU(0.2), optimizer=optimizer)) generator.addLayer( Dense(inputDim=512, outputDim=1024, activation=LeakyReLU(0.2), optimizer=optimizer)) generator.addLayer( Dense(inputDim=1024, outputDim=28 * 28, activation=Tanh(), optimizer=optimizer)) discriminator = MLP() discriminator.addLayer( Dense(inputDim=28 * 28, outputDim=1024, activation=LeakyReLU(0.2), optimizer=optimizer)) discriminator.addLayer( Dense(inputDim=1024, outputDim=512, activation=LeakyReLU(0.2), optimizer=optimizer)) discriminator.addLayer( Dense(inputDim=512,
from dense import Dense from activations import Tanh from losses import mse, mse_prime import numpy as np import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D X = np.reshape([[0, 0], [0, 1], [1, 0], [1, 1]], (4, 2, 1)) Y = np.reshape([[0], [1], [1], [0]], (4, 1, 1)) epochs = 10000 learning_rate = 0.1 network = [Dense(2, 3), Tanh(), Dense(3, 1), Tanh()] # train for e in range(epochs): error = 0 for x, y in zip(X, Y): # forward output = x for layer in network: output = layer.forward(output) # error error += mse(y, output) # backward grad = mse_prime(y, output) for layer in reversed(network):
class WavenetResidualModule(ModuleBase):
    def __init__(
        self,
        ch_residual,
        ch_dilation,
        dilation,
        kernel_width,
        optimizer=None,
        init="glorot_uniform",
    ):
        """
        A WaveNet-like residual block with causal dilated convolutions.

        *Skip path in* >-------------------------------------------> + --------> *Skip path out*
                          Causal      |--> Tanh --|                  |
        *Main    |--> Dilated Conv1D -|           * --> 1x1 Conv1D --|
         path >--|                    |--> Sigm --|                  |
         in*     |-------------------------------------------------> + --------> *Main path out*
                                     *Residual path*

        On the final block, the output of the skip path is further
        processed to produce the network predictions.

        See van den Oord et al. (2016) at https://arxiv.org/pdf/1609.03499.pdf
        for further details.

        Parameters
        ----------
        ch_residual : int
            The number of output channels for the 1x1 Conv1D layer in the
            main path
        ch_dilation : int
            The number of output channels for the causal dilated Conv1D
            layer in the main path
        dilation : int
            The dilation rate for the causal dilated Conv1D layer in the
            main path
        kernel_width : int
            The width of the causal dilated Conv1D kernel in the main path
        init : str (default: 'glorot_uniform')
            The weight initialization strategy. Valid entries are
            {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
        optimizer : str or `OptimizerBase` instance (default: None)
            The optimization strategy to use when performing gradient
            updates within the `update` method. If `None`, use the `SGD`
            optimizer with default parameters.
        """
        super().__init__()

        self.init = init
        self.dilation = dilation
        self.optimizer = optimizer
        self.ch_residual = ch_residual
        self.ch_dilation = ch_dilation
        self.kernel_width = kernel_width

        self._init_params()

    def _init_params(self):
        """Create the sub-layers and reset the derived-variable cache."""
        self._dv = {}

        # NOTE(review): the dilated convolution is built with a hard-coded
        # kernel_width=2; ``self.kernel_width`` is stored and reported in
        # ``hyperparameters`` but not used here. Left unchanged so existing
        # callers and saved weights keep working - confirm the intent.
        self.conv_dilation = Conv1D(
            stride=1,
            pad="causal",
            init=self.init,
            kernel_width=2,
            dilation=self.dilation,
            out_ch=self.ch_dilation,
            optimizer=self.optimizer,
            act_fn=Affine(slope=1, intercept=0),
        )

        self.tanh = Tanh()
        self.sigm = Sigmoid()
        self.multiply_gate = Multiply(act_fn=Affine(slope=1, intercept=0))

        self.conv_1x1 = Conv1D(
            stride=1,
            pad="same",
            dilation=0,
            init=self.init,
            kernel_width=1,
            out_ch=self.ch_residual,
            optimizer=self.optimizer,
            act_fn=Affine(slope=1, intercept=0),
        )

        self.add_residual = Add(act_fn=Affine(slope=1, intercept=0))
        self.add_skip = Add(act_fn=Affine(slope=1, intercept=0))

    @property
    def parameters(self):
        """Parameters of every component, keyed by component id."""
        return {
            "components": {
                "conv_1x1": self.conv_1x1.parameters,
                "add_skip": self.add_skip.parameters,
                "add_residual": self.add_residual.parameters,
                "conv_dilation": self.conv_dilation.parameters,
                "multiply_gate": self.multiply_gate.parameters,
            }
        }

    @property
    def hyperparameters(self):
        """Hyperparameters of the module and of every component."""
        return {
            "layer": "WavenetResidualModule",
            "init": self.init,
            "dilation": self.dilation,
            "optimizer": self.optimizer,
            "ch_residual": self.ch_residual,
            "ch_dilation": self.ch_dilation,
            "kernel_width": self.kernel_width,
            "component_ids": [
                "conv_1x1",
                "add_skip",
                "add_residual",
                "conv_dilation",
                "multiply_gate",
            ],
            "components": {
                "conv_1x1": self.conv_1x1.hyperparameters,
                "add_skip": self.add_skip.hyperparameters,
                "add_residual": self.add_residual.hyperparameters,
                "conv_dilation": self.conv_dilation.hyperparameters,
                "multiply_gate": self.multiply_gate.hyperparameters,
            },
        }

    @property
    def derived_variables(self):
        """
        Intermediate values of the last forward / backward pass.

        The three ``*_out`` entries are ``None`` until ``forward`` has run;
        anything stored in ``self._dv`` overrides the defaults.
        """
        dv = {
            "conv_1x1_out": None,
            "conv_dilation_out": None,
            "multiply_gate_out": None,
            "components": {
                "conv_1x1": self.conv_1x1.derived_variables,
                "add_skip": self.add_skip.derived_variables,
                "add_residual": self.add_residual.derived_variables,
                "conv_dilation": self.conv_dilation.derived_variables,
                "multiply_gate": self.multiply_gate.derived_variables,
            },
        }
        dv.update(self._dv)
        return dv

    @property
    def gradients(self):
        """Gradients of every component, keyed by component id."""
        return {
            "components": {
                "conv_1x1": self.conv_1x1.gradients,
                "add_skip": self.add_skip.gradients,
                "add_residual": self.add_residual.gradients,
                "conv_dilation": self.conv_dilation.gradients,
                "multiply_gate": self.multiply_gate.gradients,
            }
        }

    def forward(self, X_main, X_skip=None):
        """
        Compute the main-path and skip-path outputs of the block.

        Parameters
        ----------
        X_main : array
            Input on the main path.
        X_skip : array or None (default: None)
            Running sum of the skip path from the previous blocks. For the
            first block pass ``None``; a zero array shaped like the 1x1
            convolution output is used instead.

        Returns
        -------
        Y_main, Y_skip
            ``X_main`` plus the 1x1 convolution output, and the skip input
            plus the 1x1 convolution output, each produced by the
            corresponding ``Add`` layer.
        """
        self.X_main = X_main

        conv_dilation_out = self.conv_dilation.forward(X_main)

        # Gated activation: tanh(.) * sigmoid(.) of the same pre-activation.
        tanh_gate = self.tanh.fn(conv_dilation_out)
        sigm_gate = self.sigm.fn(conv_dilation_out)
        multiply_gate_out = self.multiply_gate.forward([tanh_gate, sigm_gate])

        conv_1x1_out = self.conv_1x1.forward(multiply_gate_out)

        # if this is the first wavenet block, initialize the "previous" skip
        # connection sum to 0. The zero array itself must be fed to the Add
        # layer: the original stored it on self but still passed ``None``.
        if X_skip is None:
            X_skip = np.zeros_like(conv_1x1_out)
        self.X_skip = X_skip

        Y_skip = self.add_skip.forward([X_skip, conv_1x1_out])
        Y_main = self.add_residual.forward([X_main, conv_1x1_out])

        self._dv["tanh_out"] = tanh_gate
        self._dv["sigm_out"] = sigm_gate
        self._dv["conv_dilation_out"] = conv_dilation_out
        self._dv["multiply_gate_out"] = multiply_gate_out
        self._dv["conv_1x1_out"] = conv_1x1_out
        return Y_main, Y_skip

    def backward(self, dY_skip, dY_main=None):
        """
        Backpropagate through the block.

        Parameters
        ----------
        dY_skip : array
            Gradient of the loss with respect to the skip-path output.
        dY_main : array or None (default: None)
            Gradient of the loss with respect to the main-path output;
            ``None`` for the last block.

        Returns
        -------
        dX_main, dX_skip
            Gradients with respect to the main-path and skip-path inputs.
        """
        dX_skip, dConv_1x1_out = self.add_skip.backward(dY_skip)

        # if this is the last wavenet block, dY_main will be None. if not,
        # calculate the error contribution from dY_main and add it to the
        # contribution from the skip path
        dX_main = np.zeros_like(self.X_main)
        if dY_main is not None:
            dX_main, dConv_1x1_main = self.add_residual.backward(dY_main)
            # Out-of-place add: the array returned by the Add layer may be
            # shared with the caller's gradient and must not be mutated.
            dConv_1x1_out = dConv_1x1_out + dConv_1x1_main

        dMultiply_out = self.conv_1x1.backward(dConv_1x1_out)
        dTanh_out, dSigm_out = self.multiply_gate.backward(dMultiply_out)

        # Chain rule through both gates, which share one pre-activation.
        conv_dilation_out = self.derived_variables["conv_dilation_out"]
        dTanh_in = dTanh_out * self.tanh.grad(conv_dilation_out)
        dSigm_in = dSigm_out * self.sigm.grad(conv_dilation_out)
        dDilation_out = dTanh_in + dSigm_in

        conv_back = self.conv_dilation.backward(dDilation_out)
        dX_main = dX_main + conv_back

        self._dv["dLdTanh"] = dTanh_out
        self._dv["dLdSigmoid"] = dSigm_out
        self._dv["dLdConv_1x1"] = dConv_1x1_out
        self._dv["dLdMultiply"] = dMultiply_out
        self._dv["dLdConv_dilation"] = dDilation_out
        return dX_main, dX_skip
def main():
    """
    Entry point of the demo: show every feature the framework provides.

    Builds four networks that differ only in their activation, trains the
    LeakyReLU one with default SGD settings and prints its accuracy, then
    runs the optimizer cross-validation selected by the command-line
    ``args`` (``CE``, ``Adam``, ``cross_val``) for every model.
    """
    # Different activation functions and setting of automatic Xavier parameter initialization
    relu_model = Sequential(Linear(2, 25), ReLU(),
                            Linear(25, 25), ReLU(),
                            Linear(25, 25), ReLU(),
                            Linear(25, 2), xavier_init=True)

    leaky_relu_model = Sequential(Linear(2, 25), LeakyReLU(),
                                  Linear(25, 25), LeakyReLU(),
                                  Linear(25, 25), LeakyReLU(),
                                  Linear(25, 2), xavier_init=True)

    tanh_model = Sequential(Linear(2, 25), Tanh(),
                            Linear(25, 25), Tanh(),
                            Linear(25, 25), Tanh(),
                            Linear(25, 2), xavier_init=True)

    sigmoid_model = Sequential(Linear(2, 25), Sigmoid(),
                               Linear(25, 25), Sigmoid(),
                               Linear(25, 25), Sigmoid(),
                               Linear(25, 2), xavier_init=False)

    model_names = ["ReLU", "Leaky", "Tanh", "Sigmoid"]

    train_input, train_target = generate_disc_set(1000)
    test_input, test_target = generate_disc_set(1000)

    # Model training without cross-validation of the optimizer parameters
    optimizer = SGDCV(leaky_relu_model, nb_epochs=25)
    optimizer.train(train_input, train_target)

    evaluator = Evaluator(leaky_relu_model)
    train_accuracy = evaluator.compute_accuracy(train_input, train_target)
    print("Train accuracy using LeakyReLU: {:.1f}%".format((train_accuracy * 100).item()))
    test_accuracy = evaluator.compute_accuracy(test_input, test_target)
    print("Test accuracy using LeakyReLU: {:.1f}%".format((test_accuracy * 100).item()))

    models = (relu_model, leaky_relu_model, tanh_model, sigmoid_model)

    # Full search grids, used when cross-validation is requested.
    learning_rates = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
    sgd_cross_val_param_grid = {"lr": list(learning_rates)}
    adam_cross_val_param_grid = {"lr": list(learning_rates),
                                 "b1": [0.9, 0.8],
                                 "b2": [0.999, 0.888],
                                 "epsilon": [1e-8, 1e-7, 1e-6]}

    # Single-value grids per model, used otherwise.
    adam_params = {name: {"lr": [0.001], "b1": [0.9], "b2": [0.999], "epsilon": [1e-08]}
                   for name in model_names}
    sgd_params = {name: {"lr": [0.01 if name == "Sigmoid" else 0.001]}
                  for name in model_names}

    mse_loss = not args.CE
    optimizer_sgd = not args.Adam
    cross_validate = args.cross_val

    # Different loss functions
    criterion = LossMSE() if mse_loss else LossCrossEntropy()

    for name, model in zip(model_names, models):
        if optimizer_sgd:
            # SGD optimizer parameter cross-validation
            optimizer = SGDCV(model, mini_batch_size=10, criterion=criterion)
            params = sgd_cross_val_param_grid if cross_validate else sgd_params[name]

            _, best_params_score = optimizer.cross_validate(values=params)

            print("Best params for model using {} : (lr={:.3f})".format(name, best_params_score["lr"]))
        else:
            # Adam optimizer parameter cross-validation
            optimizer = AdamCV(model, mini_batch_size=10, criterion=criterion)
            params = adam_cross_val_param_grid if cross_validate else adam_params[name]

            _, best_params_score = optimizer.cross_validate(values=params)

            print("Best params for model using {} : (lr={:.3f}, b1={:.3f}, b2={:.3f}, epsilon={:.1e})".format(
                name,
                best_params_score["lr"],
                best_params_score["b1"],
                best_params_score["b2"],
                best_params_score["epsilon"]))

        print("Best score for model using {} : {:.3f} (+/- {:.3f})".format(
            name, best_params_score["mean"], best_params_score["std"]))
import numpy as np
from tensor import Tensor
from layers import Sequential, Linear
from activations import Tanh, Sigmoid
from optimizers import SGD
from losses import MSELoss

# Fix the NumPy seed so that runs are repeatable.
np.random.seed(0)

# Four two-bit inputs and one target value per input, wrapped as autograd
# tensors.
data = Tensor(np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), autograd=True)
target = Tensor(np.array([[0], [1], [0], [1]]), autograd=True)

# 2 -> 3 -> 1 network: Tanh on the hidden layer, Sigmoid on the output.
model = Sequential([Linear(2, 3), Tanh(), Linear(3, 1), Sigmoid()])
criterion = MSELoss()
optim = SGD(parameters=model.get_parameters(), alpha=1)

# Ten full-batch training steps.
# NOTE(review): print(loss) is assumed to belong to the loop body (one line
# of output per step) - confirm against the original layout.
for i in range(10):
    pred = model.forward(data)
    loss = criterion.forward(pred, target)
    loss.backward()
    optim.step()
    print(loss)