def _init_params(self):
    """
    Instantiate the sub-layers of the block and reset the derived-variable
    cache.

    Reads `self.init`, `self.dilation`, `self.optimizer`, `self.ch_dilation`
    and `self.ch_residual`. Every layer created here is given an `Affine`
    activation with slope 1 and intercept 0.
    """
    self._dv = {}

    # Causal, dilated convolution.
    # NOTE(review): the kernel width is fixed at 2 here -- confirm that this
    # is intended.
    dilated_kwargs = dict(
        stride=1,
        pad="causal",
        init=self.init,
        kernel_width=2,
        dilation=self.dilation,
        out_ch=self.ch_dilation,
        optimizer=self.optimizer,
        act_fn=Affine(slope=1, intercept=0),
    )
    self.conv_dilation = Conv1D(**dilated_kwargs)

    # Gate activations and the `Multiply` layer.
    self.tanh = Tanh()
    self.sigm = Sigmoid()
    self.multiply_gate = Multiply(act_fn=Affine(slope=1, intercept=0))

    # Width-1 ("1x1") convolution with `ch_residual` output channels.
    pointwise_kwargs = dict(
        stride=1,
        pad="same",
        dilation=0,
        init=self.init,
        kernel_width=1,
        out_ch=self.ch_residual,
        optimizer=self.optimizer,
        act_fn=Affine(slope=1, intercept=0),
    )
    self.conv_1x1 = Conv1D(**pointwise_kwargs)

    # Addition layers for the residual and the skip connections.
    self.add_residual = Add(act_fn=Affine(slope=1, intercept=0))
    self.add_skip = Add(act_fn=Affine(slope=1, intercept=0))
def test_sigmoid_activation(N=None):
    """
    Check `Sigmoid.fn` against the reference `expit` on random row vectors.

    Parameters
    ----------
    N : number or None
        Number of random trials. With `None` the bound is `np.inf`, so the
        loop only stops when an assertion fails.
    """
    from activations import Sigmoid

    n_trials = np.inf if N is None else N
    candidate = Sigmoid()
    reference = expit

    trial = 0
    while trial < n_trials:
        # a single example with a random number of features
        width = np.random.randint(1, 100)
        z = random_tensor((1, width))

        assert_almost_equal(candidate.fn(z), reference(z))
        print("PASSED")
        trial += 1
def set_params(self, summary_dict):
    """
    Restore parameters and hyperparameters from a summary dictionary.

    Parameters
    ----------
    summary_dict : dict
        Must provide the keys "parameters" and "hyperparameters". Under
        either key, a "components" entry maps component ids to
        per-component dicts; each dict whose id is listed in
        `self.hyperparameters["component_ids"]` is forwarded to that
        component's own `set_params`. Any other entry is copied only if
        its key already exists in `self.parameters` /
        `self.hyperparameters`.

    Notes
    -----
    A string-valued "act_fn" hyperparameter is decoded into the matching
    activation object ("ReLU", "Sigmoid", "Tanh",
    "Affine(slope=..., intercept=...)", "Leaky ReLU(alpha=...)"). Any other
    value is stored unchanged.
    """
    cids = self.hyperparameters["component_ids"]

    for k, v in summary_dict["parameters"].items():
        if k == "components":
            for c, cd in v.items():
                if c in cids:
                    getattr(self, c).set_params(cd)
        elif k in self.parameters:
            self.parameters[k] = v

    for k, v in summary_dict["hyperparameters"].items():
        if k == "components":
            for c, cd in v.items():
                if c in cids:
                    getattr(self, c).set_params(cd)

        if k in self.hyperparameters:
            # Only a *string* stored under "act_fn" needs decoding. The
            # isinstance guard keeps the substring tests below from being
            # applied to non-string values such as activation objects.
            is_act_str = k == "act_fn" and isinstance(v, str)

            if is_act_str and v == "ReLU":
                self.hyperparameters[k] = ReLU()
            elif is_act_str and v == "Sigmoid":
                self.hyperparameters[k] = Sigmoid()
            elif is_act_str and v == "Tanh":
                self.hyperparameters[k] = Tanh()
            elif is_act_str and "Affine" in v:
                r = r"Affine\(slope=(.*), intercept=(.*)\)"
                slope, intercept = re.match(r, v).groups()
                self.hyperparameters[k] = Affine(float(slope), float(intercept))
            elif is_act_str and "Leaky ReLU" in v:
                r = r"Leaky ReLU\(alpha=(.*)\)"
                alpha = re.match(r, v).groups()[0]
                self.hyperparameters[k] = LeakyReLU(float(alpha))
            else:
                self.hyperparameters[k] = v
def test_sigmoid_grad(N=None):
    """
    Check `Sigmoid.grad` against a reference built from `torch.sigmoid` via
    `torch_gradient_generator`, using random 2-D inputs.

    Parameters
    ----------
    N : number or None
        Number of random trials. With `None` the bound is `np.inf`, so the
        loop only stops when an assertion fails.
    """
    from activations import Sigmoid

    n_trials = np.inf if N is None else N
    candidate = Sigmoid()
    reference = torch_gradient_generator(torch.sigmoid)

    trial = 0
    while trial < n_trials:
        # random number of examples, then random number of features
        shape = (np.random.randint(1, 100), np.random.randint(1, 100))
        z = random_tensor(shape)

        assert_almost_equal(candidate.grad(z), reference(z))
        print("PASSED")
        trial += 1
def __init__(self, n_inputs, n_hidden, n_output, activation='sigmoid'):
    """
    Set up the three linear maps (input->hidden, hidden->hidden,
    hidden->output) and the hidden non-linearity.

    :param n_inputs: dimension of inputs
    :param n_hidden: dimension of hidden layer
    :param n_output: dimension of output (token)
    :param activation: either 'sigmoid' or 'tanh'; any other value raises
        an Exception
    """
    super().__init__()

    self.n_inputs = n_inputs
    self.n_hidden = n_hidden
    self.n_output = n_output

    # resolve the non-linearity by name
    if activation == 'sigmoid':
        nonlinearity = Sigmoid()
    elif activation == 'tanh':
        nonlinearity = Tanh()
    else:
        raise Exception("Non-linearity not found")
    self.activation = nonlinearity

    self.w_ih = Linear(n_inputs, n_hidden)
    self.w_hh = Linear(n_hidden, n_hidden)
    self.w_ho = Linear(n_hidden, n_output)

    # register the parameters of every linear map, in creation order
    for linear in (self.w_ih, self.w_hh, self.w_ho):
        self.parameters += linear.get_parameters()
def main():
    """Load the MNIST CSV data, then train, plot, test and save an MLP."""
    print('Loading data')
    loader = Dataset("mnist_train.csv", "mnist_test.csv", 0.10, 10)
    dataset = loader.create_dataset()
    print('Data loaded')

    side = 28  # width and height of an MNIST image
    n_classes = 10  # digits 0, 1, ..., 9
    n_pixels = side * side

    # One entry per hidden layer; each number is that layer's neuron count.
    # This gives 3 hidden layers of size 128, 128 and 64.
    hidden_sizes = [128, 128, 64]

    # One activation per layer, i.e. len(hidden_sizes) + 1 entries. Because
    # cross entropy is used, the last activation is the identity.
    layer_activations = [Sigmoid(), Tanh(), Sigmoid(), Identity()]

    learning_rate = 0.1
    epochs = 100
    samples_per_batch = 784

    # build the MLP model
    network = MLP(
        input_size=n_pixels,
        output_size=n_classes,
        hiddens=hidden_sizes,
        activations=layer_activations,
        criterion=SoftmaxCrossEntropy(),
        lr=learning_rate,
    )

    # train, plot, evaluate, then persist the trained weights
    runner = train_test(network, dataset, epochs, samples_per_batch)
    runner.train_network()
    runner.plots()
    runner.test_network()
    runner.save('weights.npz')
def __init__(
        self,
        L=1,  # number of hidden layers
        n=None,  # network size for each hidden layer: n[0]=n_1, ..., n[L-1]=n_L
        activation=None):
    """
    Store the network depth, the per-layer widths and the activation.

    Parameters
    ----------
    L : int
        Number of hidden layers.
    n : array-like or None
        Width of each hidden layer. When omitted, `L` widths are drawn
        with `np.random.randint(1, 6, size=L)` for this instance.
    activation : activation object or None
        Activation function. When omitted, a new `Sigmoid()` is created
        for this instance.

    Notes
    -----
    The defaults used to be written directly in the signature. Python
    evaluates default expressions once, when the function is defined, so
    every default-constructed object shared one random `n` array (always
    of length 1, whatever `L` was) and one `Sigmoid` instance. They are
    now created per call.
    """
    self.L = L
    self.n = np.random.randint(1, 6, size=L) if n is None else n
    self.activation = Sigmoid() if activation is None else activation
def __init__(self, n_trees=100, classification=True, learning_rate=0.01, max_depth=10):
    """
    Record the hyperparameters and initialise empty training state.

    The arguments are stored on attributes of the same name. `X` and `y`
    start as None, `weak_learners` and `preds` start as empty lists, and a
    `Sigmoid` instance is kept on `self.Sigmoid`.
    """
    # hyperparameters
    self.n_trees, self.max_depth = n_trees, max_depth
    self.learning_rate = learning_rate
    self.classification = classification

    # state that starts empty
    self.X, self.y = None, None
    self.weak_learners, self.preds = [], []

    self.Sigmoid = Sigmoid()
def __init__(
        self,
        inputvector=None,  # input vector, has to be array type
        activation=None,  # activation function, will be applied termwise
        weight=None,  # weights from input to output layer, array of size outputsize x inputsize
        bias=None,  # bias vector of this layer, array of length outputsize
):
    """
    Store the input, activation, weight and bias of a single layer.

    Parameters
    ----------
    inputvector : array-like or None
        Input vector. Defaults to a new empty list.
    activation : activation object or None
        Activation function. Defaults to a new `Sigmoid()`.
    weight : array-like or None
        Weight matrix. Defaults to a new empty list.
    bias : array-like or None
        Bias vector. Defaults to a new empty list.

    Notes
    -----
    The defaults used to be `[]` and `Sigmoid()` written in the signature.
    Those objects are created once at definition time and were therefore
    shared by every instance relying on them, so mutating one layer's
    default list changed all the others. Each instance now gets its own.
    """
    self.inputvector = [] if inputvector is None else inputvector
    self.activation = Sigmoid() if activation is None else activation
    self.weight = [] if weight is None else weight
    self.bias = [] if bias is None else bias
def __init__(self, out_channels: int, param_size: int, dropout: float = 1.0, weight_init: str = 'normal', activation: Operation = Sigmoid(), flatten: bool = False):
    """
    Record the layer configuration.

    `out_channels` is passed to the parent constructor and also kept on
    the instance; the remaining arguments are stored on attributes of the
    same name.
    """
    super().__init__(out_channels)

    # shape-related settings
    self.out_channels, self.param_size = out_channels, param_size
    self.flatten = flatten

    # behaviour-related settings
    self.activation = activation
    self.weight_init = weight_init
    self.dropout = dropout
def __init__(self, in_size, h_size, o_size):
    """
    Build an Affine -> Sigmoid -> Affine stack with weights and biases
    drawn from `np.random.randn`, and gather every layer's `params` into
    the flat list `self.params`.
    """
    # The draw order (W1, b1, W2, b2) is kept so the random stream is
    # consumed exactly as before.
    hidden_w = np.random.randn(in_size, h_size)
    hidden_b = np.random.randn(h_size)
    out_w = np.random.randn(h_size, o_size)
    out_b = np.random.randn(o_size)

    self.layers = [
        Affine(hidden_w, hidden_b),
        Sigmoid(),
        Affine(out_w, out_b),
    ]

    # flatten the per-layer parameter lists, preserving layer order
    self.params = [p for layer in self.layers for p in layer.params]
def _build_decoder(self):
    """
    Create the MLP decoder as an ordered mapping of two layers:

        FC1 -> ReLU -> FC2 -> Sigmoid
    """
    self.decoder = OrderedDict()

    hidden_layer = FullyConnected(
        act_fn=ReLU(),
        init=self.init,
        n_out=self.latent_dim,
        optimizer=self.optimizer,
    )
    self.decoder["FC1"] = hidden_layer

    # NB. the output width depends on the dimensionality of X, so `n_out`
    # is only a placeholder here and is updated within the `forward` method
    output_layer = FullyConnected(
        n_out=None, act_fn=Sigmoid(), optimizer=self.optimizer, init=self.init
    )
    self.decoder["FC2"] = output_layer
def __init__(
        self,
        L=1,  # number of hidden layers
        n=None,  # network size for each hidden layer: n[0]=n_1, ..., n[L-1]=n_L
        activation=None,
        weight=None,
        bias=None,
        outputsequence=None,
        preoutputsequence=None):
    """
    Store the network configuration and its (initially empty) state.

    Parameters
    ----------
    L : int
        Number of hidden layers.
    n : array-like or None
        Width of each hidden layer. When omitted, `L` widths are drawn
        with `np.random.randint(1, 6, size=L)` for this instance.
    activation : activation object or None
        Activation function. Defaults to a new `Sigmoid()`.
    weight, bias, outputsequence, preoutputsequence : list or None
        Each defaults to a new empty list.

    Notes
    -----
    The defaults used to be written in the signature (`[]`, `Sigmoid()`,
    one `np.random.randint` draw). Python evaluates them once at
    definition time, so all default-constructed networks shared the same
    lists, the same activation object and the same length-1 `n`. Each
    instance now receives its own objects.
    """
    self.L = L
    self.n = np.random.randint(1, 6, size=L) if n is None else n
    self.activation = Sigmoid() if activation is None else activation
    self.weight = [] if weight is None else weight
    self.bias = [] if bias is None else bias
    self.outputsequence = [] if outputsequence is None else outputsequence
    self.preoutputsequence = (
        [] if preoutputsequence is None else preoutputsequence
    )
def __init__(self, inputDim = 1, outputDim = 1, activation = Sigmoid(), optimizer = Adam()):
    """
    Create a layer with an (outputDim, inputDim) weight matrix and a zero
    bias vector of length outputDim.

    The weight and the bias each receive their own shallow copy of
    `optimizer`.
    """
    self.inputDim, self.outputDim = inputDim, outputDim

    # activation function of this layer
    self.activation = activation

    # separate optimizer copies for the weight and for the bias
    self.weightOptimizer = copy.copy(optimizer)
    self.biasOptimizer = copy.copy(optimizer)

    # xavier uniform initializer: weights are drawn from [-limit, limit)
    fan_sum = inputDim + outputDim
    limit = np.sqrt(6 / fan_sum)
    self.weight = np.random.uniform(low=-limit, high=limit, size=(outputDim, inputDim))
    self.bias = np.zeros(outputDim)

    # decides whether weight and bias are trained in the backward pass;
    # set to False to freeze the layer
    self.trainable = True
def __init__(
    self,
    n_out,
    act_fn=None,
    gate_fn=None,
    merge_mode="concat",
    init="glorot_uniform",
    optimizer=None,
):
    """
    A single bidirectional long short-term memory (LSTM) layer.

    Parameters
    ----------
    n_out : int
        Dimension of a single hidden state / output on a given timestep.
    act_fn : `activations.Activation` instance (default: None)
        Activation used to compute A[t]. Falls back to `Tanh` when None.
    gate_fn : `activations.Activation` instance (default: None)
        Gate function for the update, forget, and output gates. Falls
        back to `Sigmoid` when None.
    merge_mode : str (default: "concat")
        How the outputs of the forward and backward LSTMs are combined.
        Valid values are {"sum", "multiply", "concat", "average"}.
    init : str (default: 'glorot_uniform')
        The weight initialization strategy. Valid entries are
        {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}.
    optimizer : str or `OptimizerBase` instance (default: None)
        The optimization strategy to use for gradient updates within the
        `update` method. If `None`, the `SGD` optimizer with default
        parameters is used.
    """
    super().__init__()

    # fall back to the default non-linearities when none were supplied
    if act_fn is None:
        act_fn = Tanh()
    if gate_fn is None:
        gate_fn = Sigmoid()

    self.n_in = None  # not known at construction time
    self.n_out = n_out
    self.init = init
    self.merge_mode = merge_mode
    self.optimizer = optimizer
    self.act_fn = act_fn
    self.gate_fn = gate_fn

    self._init_params()
def init_from_str(self, act_str):
    """
    Build an activation object from its string description.

    Parameters
    ----------
    act_str : str
        Case-insensitive description. Recognised forms are "relu",
        "tanh", "sigmoid", "affine(slope=<float>, intercept=<float>)" and
        "leaky relu(alpha=<float>)".

    Returns
    -------
    act_fn
        A `ReLU`, `Tanh`, `Sigmoid`, `Affine` or `LeakyReLU` instance.

    Raises
    ------
    ValueError
        If `act_str` names no known activation, or names "affine" /
        "leaky relu" without the parameter list shown above. The latter
        used to surface as an `AttributeError` on `None`, because
        `re.match` returns None when the pattern does not match.
    """
    act_str = act_str.lower()

    if act_str == "relu":
        return ReLU()
    if act_str == "tanh":
        return Tanh()
    if act_str == "sigmoid":
        return Sigmoid()

    if "affine" in act_str:
        r = r"affine\(slope=(.*), intercept=(.*)\)"
        match = re.match(r, act_str)
        if match is None:
            raise ValueError("Unknown activation: {}".format(act_str))
        slope, intercept = match.groups()
        return Affine(float(slope), float(intercept))

    if "leaky relu" in act_str:
        r = r"leaky relu\(alpha=(.*)\)"
        match = re.match(r, act_str)
        if match is None:
            raise ValueError("Unknown activation: {}".format(act_str))
        alpha = match.groups()[0]
        return LeakyReLU(float(alpha))

    raise ValueError("Unknown activation: {}".format(act_str))
def __init__(self, n_iters=10000, hidden_activation=Sigmoid(), output_activation=Linear(), learning_rate=1e-2, n_hidden=10, loss=MSE(), mini_batch=10):
    """
    Record the training configuration and initialise unset model state.

    The arguments are stored on attributes of the same name. The
    attributes `W`, `W0`, `V`, `V0`, `X` and `y` all start as None.
    """
    # training configuration
    self.n_iters = n_iters
    self.learning_rate = learning_rate
    self.mini_batch = mini_batch
    self.n_hidden = n_hidden
    self.loss = loss

    # activations
    self.hidden_activation = hidden_activation
    self.output_activation = output_activation

    # model and data attributes, not set yet
    self.W = self.W0 = None
    self.V = self.V0 = None
    self.X = self.y = None
def main():
    """Train a small dense network on 8-bit parity data and report results."""
    # Load the data first, then build the model.
    # The input dimension is the length of one sample; because the data set
    # is small, the batch size is simply the total number of samples.
    x, y = ParityBits(8).load_data()
    batch_size, input_dim = x.shape

    layers = [
        Dense(64, activation=ReLU()),
        Dense(32, activation=Tanh()),
        Dense(16, activation=Tanh()),
        Dense(4, activation=None),
        Dense(1, activation=Sigmoid()),
    ]
    model = Sequential(
        layers,
        input_dim=input_dim,
        # Gradient descent as the optimizer, MSE as the loss function.
        optimizer=GradientDescent(learning_rate=0.01, momentum=0.0),
        loss=MeanSquaredError())

    # Train for the configured number of epochs, then collect the
    # predictions and the loss value of every epoch.
    y_pred, losses = model.train(
        x, y, batch_size=batch_size, epochs=200, verbose_step=10)

    # The answers are all the integers 0 or 1, so the training result is
    # the model prediction rounded to an integer.
    result = np.around(y_pred).astype(int)

    # Subtract the training result from the answers.
    diff = np.subtract(y, result)

    table = {
        # Each input row such as `[0 0 0 0 0 0 0 0]` must be converted to a
        # string for printing, because a pandas DataFrame cell cannot hold
        # an array.
        "Data": [np.array_str(row) for row in x],
        "Answer": y[:, 0],
        "Prediction": [f'{v:.8f}' for v in y_pred[:, 0]],
        "Result": result[:, 0],
        # A difference of 0 means the prediction is correct, otherwise it
        # failed.
        "Correct": [bool(v == 0) for v in diff[:, 0]]
    }
    row_numbers = np.arange(1, len(x) + 1)
    print(pd.DataFrame(table, index=row_numbers).to_string())

    # Print the final loss and the number of mismatches, then plot the loss
    # of every epoch.
    print(f'loss: {losses[-1]:.8f}, difference: {np.count_nonzero(diff)}')

    plt.figure(figsize=(8, 4))
    plt.plot(losses)
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.show()
def plot_activations():
    """
    Draw each activation together with its first and second derivative on
    a 2 x 3 grid of panels and save the figure to "plot.png".
    """
    fig, axes = plt.subplots(2, 3, sharex=True, sharey=True)
    activations = [Affine(), Tanh(), Sigmoid(), ReLU(), LeakyReLU(), ELU()]

    for panel, act in zip(axes.flatten(), activations):
        # a fresh input column is built for every panel
        X = np.linspace(-3, 3, 100).astype(float).reshape(100, 1)

        curves = (
            (act, r"$y$"),
            (act.grad, r"$\frac{dy}{dx}$"),
            (act.grad2, r"$\frac{d^2 y}{dx^2}$"),
        )
        for curve, label in curves:
            panel.plot(X, curve(X), label=label, alpha=0.7)

        # dashed reference lines through the origin
        panel.hlines(0, -3, 3, lw=1, linestyles="dashed", color="k")
        panel.vlines(0, -1.2, 1.2, lw=1, linestyles="dashed", color="k")

        panel.set_xlim(-3, 3)
        panel.set_ylim(-1.1, 1.1)
        panel.set_xticks([])
        panel.set_yticks([-1, 0, 1])
        panel.xaxis.set_visible(False)

        panel.set_title("{}".format(act))
        panel.legend(frameon=False)
        sns.despine(left=True, bottom=True)

    fig.set_size_inches(8, 5)
    plt.tight_layout()
    plt.savefig("plot.png", dpi=300)
    plt.close("all")
def cross_val_results(verbose=True):
    """
    Function for generating the accuracy results of four models presented in
    the report with their best parameters, averaged over 10 runs and using
    different combinations of the available optimizers and loss

    :param verbose: whether to print average results for each
        (Model, Optimizer, Loss) combination, boolean, optional, default is True

    :returns: list of tuples containing (mean, std) of each
        (Model, Optimizer, Loss) combination, each tuple in [0, 1]^2
    """
    # Ten (train, test) pairs of 1000 samples each; the same pairs are
    # reused for every (Model, Optimizer, Loss) combination.
    datasets = [
        (generate_disc_set(1000), generate_disc_set(1000)) for _ in range(10)
    ]

    relu_model = Sequential(Linear(2, 25), ReLU(),
                            Linear(25, 25), ReLU(),
                            Linear(25, 25), ReLU(),
                            Linear(25, 2), xavier_init=True)

    leaky_relu_model = Sequential(Linear(2, 25), LeakyReLU(),
                                  Linear(25, 25), LeakyReLU(),
                                  Linear(25, 25), LeakyReLU(),
                                  Linear(25, 2), xavier_init=True)

    tanh_model = Sequential(Linear(2, 25), Tanh(),
                            Linear(25, 25), Tanh(),
                            Linear(25, 25), Tanh(),
                            Linear(25, 2), xavier_init=True)

    sigmoid_model = Sequential(Linear(2, 25), Sigmoid(),
                               Linear(25, 25), Sigmoid(),
                               Linear(25, 25), Sigmoid(),
                               Linear(25, 2))

    models = [relu_model, leaky_relu_model, tanh_model, sigmoid_model]

    final_scores = []

    optimizers_names = ["SGD", "Adam"]
    models_names = ["ReLU", "Leaky", "Tanh", "Sigmoid"]
    losses_names = ["MSE", "CrossEntropy"]
    losses = [LossMSE(), LossCrossEntropy()]

    # Per-model optimizer settings, keyed by the entries of `models_names`.
    adam_params = {"ReLU": {"lr": 0.001, "b1": 0.9, "b2": 0.999, "epsilon": 1e-08},
                   "Leaky": {"lr": 0.001, "b1": 0.9, "b2": 0.999, "epsilon": 1e-08},
                   "Tanh": {"lr": 0.001, "b1": 0.9, "b2": 0.999, "epsilon": 1e-08},
                   "Sigmoid": {"lr": 0.001, "b1": 0.9, "b2": 0.999, "epsilon": 1e-08}}

    sgd_params = {"ReLU": {"lr": 0.001},
                  "Leaky": {"lr": 0.001},
                  "Tanh": {"lr": 0.001},
                  "Sigmoid": {"lr": 0.01}}

    for optim_name in optimizers_names:
        for loss_name, loss in zip(losses_names, losses):
            for model_name, model in zip(models_names, models):
                if verbose:
                    print("Validating model {} with {} and {} loss...".format(model_name, optim_name, loss_name), end='')

                scores = []

                if optim_name == "Adam":
                    params = adam_params[model_name]
                    optim = Adam(model, criterion=loss, nb_epochs=50, mini_batch_size=10,
                                 lr=params["lr"], b1=params["b1"], b2=params["b2"],
                                 epsilon=params["epsilon"])
                else:
                    params = sgd_params[model_name]
                    # Build the optimizer around the model being evaluated.
                    # This used to be hard-wired to `relu_model`, unlike the
                    # Adam branch above.
                    optim = SGD(model, criterion=loss, nb_epochs=50, mini_batch_size=10,
                                lr=params["lr"])

                for (train_input, train_target), (test_input, test_target) in datasets:
                    # every run trains a fresh copy, so `model` itself is
                    # never modified by training
                    optim.model = copy.deepcopy(model)
                    optim.train(train_input, train_target, verbose=False)

                    evaluator = Evaluator(optim.model)
                    accuracy = evaluator.compute_accuracy(test_input, test_target)
                    scores.append(accuracy)

                scores = torch.FloatTensor(scores)
                scores_mean = torch.mean(scores).item()
                scores_std = torch.std(scores).item()

                if verbose:
                    print("Score : {0:.3f} (+/- {1:.3f}) ".format(scores_mean, scores_std))

                final_scores.append((scores_mean, scores_std))

    return final_scores
if __name__ == "__main__":
    # Hand-checkable example: one input row, two Dense layers with fixed
    # weights and scalar biases, each followed by a sigmoid activation.
    x = np.array([[0.05, .1]])
    W1 = np.array([[.15, .20], [.25, .30]])
    W2 = np.array([[.40, .45], [.50, .55]])
    b1 = .35
    b2 = 0.60
    # Target values; not used in the statements visible here.
    y_true = np.array([[.01, .99]])

    # Layer creation
    dense = Dense(2, W1, b1)
    dense2 = Dense(2, W2, b2)
    activation1 = Sigmoid()
    # The second activation is created through `Activation("sigmoid")`
    # rather than directly:
    # activation2=Sigmoid()
    activation2 = Activation("sigmoid")
    # Loss object; not used in the statements visible here.
    loss_func = MSE()

    # Forward pass
    # Dense -> Activation -> Dense -> Activation -> y_pred
    z1 = dense.forward(x)
    a1 = activation1.forward(z1)
    print("Activation Value:", a1)
    z2 = dense2.forward(a1)
    a2 = activation2.forward(z2)
    y_pred = a2
def __init__(self, n_iter=1000, learning_rate=1e-4, intercept=False):
    """
    Record the training settings and create the `Sigmoid` helper.

    `weights` starts as None. The three arguments are stored on attributes
    of the same name.
    """
    # training settings
    self.n_iter, self.learning_rate = n_iter, learning_rate
    self.intercept = intercept

    # no weights yet
    self.weights = None

    self.sigmoid = Sigmoid()
x = x.reshape(len(x), 1, 28, 28) x = x.astype("float32") / 255 y = np_utils.to_categorical(y) y = y.reshape(len(y), 2, 1) return x, y # load MNIST from server, limit to 100 images per class since we're not training on GPU (x_train, y_train), (x_test, y_test) = mnist.load_data() x_train, y_train = preprocess_data(x_train, y_train, 100) x_test, y_test = preprocess_data(x_test, y_test, 100) # neural network network = [ Convolutional((1, 28, 28), 3, 5), Sigmoid(), Reshape((5, 26, 26), (5 * 26 * 26, 1)), Dense(5 * 26 * 26, 100), Sigmoid(), Dense(100, 2), Sigmoid() ] epochs = 20 learning_rate = 0.1 # train for e in range(epochs): error = 0 for x, y in zip(x_train, y_train): # forward
import numpy as np
import matplotlib.pyplot as plt
from activations import Sigmoid, ReLU, Tanh, Exponential
from fullnetwork import onelayer, fullnetwork
from backpropagation import backpropagation
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import animation

# Number of hidden layers.
L = 3
# Width of each hidden layer: n[0]=n_1, ..., n[L-1]=n_L.
# Each width is drawn from [1, 4] (the upper bound of randint is exclusive).
n = np.random.randint(1, 5, size=L)
# Activation function.
sigma = Sigmoid()
# Number of iterations.
N = 100
# Set up the network.
network = fullnetwork(L=L, n=n, activation=sigma)
# Set the initial weight and bias.
weight, bias = network.setparameter()
# Choose one layer index from [1, L-1].
weightindex_startlayer = np.random.randint(1, L, size=None)
# The layer that follows it.
weightindex_nextlayer = weightindex_startlayer + 1
# Two neuron indices sampled at random from the start layer, each from
# [1, width of that layer].
weightindex_neuron_startlayer = np.random.randint(
    1, n[weightindex_startlayer - 1] + 1, size=2)
def __init__(self):
    """Create the `Sigmoid` instance kept on `self.sigmoid`."""
    sigmoid = Sigmoid()
    self.sigmoid = sigmoid
loss = self.loss_func.forward(l, o) losses.append(self.loss_func.backward(l, o)) self.total_loss += loss layer['out_losses'] = losses if __name__ == '__main__': from estimators import MSELoss as MSE from activations import PRelu, Sigmoid from initors import GaussInitor import os TRAIN = True model_file = './nnmode.ckpt' relu = PRelu(0) sigmoid = Sigmoid(0) mse = MSE() initor = GaussInitor() activations = {'prelu': relu} NET = NeuralNetwork(input_num=2, lr=0.01, activations=activations, loss_obj=mse, initor_obj=initor) if os.path.isfile(model_file) and not TRAIN: NET.load_model(model_file) else: NET.add_layer( 'hidden', 2, activation='prelu',
def __init__(self, neurons: int, activation: Operation = Sigmoid(), dropout: float = 1.0, weight_init: str = 'standard'):
    """
    Record the layer configuration.

    `neurons` is passed to the parent constructor; `activation`, `dropout`
    and `weight_init` are stored on attributes of the same name.
    """
    super().__init__(neurons)

    self.weight_init = weight_init
    self.activation = activation
    self.dropout = dropout
outputDim=1024, activation=LeakyReLU(0.2), optimizer=optimizer)) discriminator.addLayer( Dense(inputDim=1024, outputDim=512, activation=LeakyReLU(0.2), optimizer=optimizer)) discriminator.addLayer( Dense(inputDim=512, outputDim=256, activation=LeakyReLU(0.2), optimizer=optimizer)) discriminator.addLayer( Dense(inputDim=256, outputDim=1, activation=Sigmoid(), optimizer=optimizer)) gan = GAN(generator, discriminator) print(gan) gan.train(dataset, loss=MSE(), epochs=50, metrics=["generator_loss", "discriminator_loss"], tensorboard=True, callbacks=[]) gan.save("tryout_gan")
def main():
    """
    Demonstrate the functionality provided by the framework.

    Builds four 2-25-25-25-2 models that differ only in activation, trains
    the LeakyReLU model once with `SGDCV` and prints its train and test
    accuracy, then cross-validates the optimizer parameters of every model
    and prints the best parameters and score.

    Reads the module-level `args` object: `args.CE` selects cross-entropy
    instead of MSE loss, `args.Adam` selects `AdamCV` instead of `SGDCV`,
    and `args.cross_val` selects the full parameter grids instead of the
    fixed per-model parameters.
    """
    # Different activation functions and setting of automatic Xavier parameter initialization
    relu_model = Sequential(Linear(2, 25), ReLU(), Linear(25, 25), ReLU(),
                            Linear(25, 25), ReLU(), Linear(25, 2),
                            xavier_init=True)
    leaky_relu_model = Sequential(Linear(2, 25), LeakyReLU(), Linear(25, 25),
                                  LeakyReLU(), Linear(25, 25), LeakyReLU(),
                                  Linear(25, 2), xavier_init=True)
    tanh_model = Sequential(Linear(2, 25), Tanh(), Linear(25, 25), Tanh(),
                            Linear(25, 25), Tanh(), Linear(25, 2),
                            xavier_init=True)
    # The sigmoid model is the only one built with Xavier initialization
    # switched off.
    sigmoid_model = Sequential(Linear(2, 25), Sigmoid(), Linear(25, 25),
                               Sigmoid(), Linear(25, 25), Sigmoid(),
                               Linear(25, 2), xavier_init=False)

    # Keys into the per-model parameter dictionaries below; the order
    # matches `models`.
    model_names = ["ReLU", "Leaky", "Tanh", "Sigmoid"]

    train_input, train_target = generate_disc_set(1000)
    test_input, test_target = generate_disc_set(1000)

    # Model training without cross-validation of the optimizer parameters
    optimizer = SGDCV(leaky_relu_model, nb_epochs=25)
    optimizer.train(train_input, train_target)

    evaluator = Evaluator(leaky_relu_model)

    print("Train accuracy using LeakyReLU: {:.1f}%".format(
        (evaluator.compute_accuracy(train_input, train_target) * 100).item()))
    print("Test accuracy using LeakyReLU: {:.1f}%".format(
        (evaluator.compute_accuracy(test_input, test_target) * 100).item()))

    models = (relu_model, leaky_relu_model, tanh_model, sigmoid_model)

    # Full search grids, used when `args.cross_val` is set.
    sgd_cross_val_param_grid = {"lr": [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]}
    adam_cross_val_param_grid = {
        "lr": [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
        "b1": [0.9, 0.8],
        "b2": [0.999, 0.888],
        "epsilon": [1e-8, 1e-7, 1e-6]
    }

    # Fixed single-value "grids" per model, used when `args.cross_val` is
    # not set.
    adam_params = {
        "ReLU": {
            "lr": [0.001],
            "b1": [0.9],
            "b2": [0.999],
            "epsilon": [1e-08]
        },
        "Leaky": {
            "lr": [0.001],
            "b1": [0.9],
            "b2": [0.999],
            "epsilon": [1e-08]
        },
        "Tanh": {
            "lr": [0.001],
            "b1": [0.9],
            "b2": [0.999],
            "epsilon": [1e-08]
        },
        "Sigmoid": {
            "lr": [0.001],
            "b1": [0.9],
            "b2": [0.999],
            "epsilon": [1e-08]
        }
    }
    sgd_params = {
        "ReLU": {
            "lr": [0.001]
        },
        "Leaky": {
            "lr": [0.001]
        },
        "Tanh": {
            "lr": [0.001]
        },
        "Sigmoid": {
            "lr": [0.01]
        }
    }

    # Command-line switches: MSE and SGD are used unless `args.CE` /
    # `args.Adam` are set.
    mse_loss = not args.CE
    optimizer_sgd = not args.Adam
    cross_validate = args.cross_val

    # Different loss functions
    if mse_loss:
        criterion = LossMSE()
    else:
        criterion = LossCrossEntropy()

    for name, model in zip(model_names, models):
        if optimizer_sgd:
            # SGD optimizer parameter cross-validation
            optimizer = SGDCV(model, mini_batch_size=10, criterion=criterion)

            if cross_validate:
                params = sgd_cross_val_param_grid
            else:
                params = sgd_params[name]

            cross_val_results, best_params_score = optimizer.cross_validate(
                values=params)

            print("Best params for model using {} : (lr={:.3f})".format(
                name, best_params_score["lr"]))
        else:
            # Adam optimizer parameter cross-validation
            optimizer = AdamCV(model, mini_batch_size=10, criterion=criterion)

            if cross_validate:
                params = adam_cross_val_param_grid
            else:
                params = adam_params[name]

            cross_val_results, best_params_score = optimizer.cross_validate(
                values=params)

            print(
                "Best params for model using {} : (lr={:.3f}, b1={:.3f}, b2={:.3f}, epsilon={:.1e})"
                .format(name, best_params_score["lr"], best_params_score["b1"],
                        best_params_score["b2"], best_params_score["epsilon"]))

        # `best_params_score` also carries the "mean" and "std" of the best
        # configuration.
        print("Best score for model using {} : {:.3f} (+/- {:.3f})".format(
            name, best_params_score["mean"], best_params_score["std"]))
class WavenetResidualModule(ModuleBase):
    def __init__(
        self,
        ch_residual,
        ch_dilation,
        dilation,
        kernel_width,
        optimizer=None,
        init="glorot_uniform",
    ):
        """
        A WaveNet-like residual block with causal dilated convolutions.

        *Skip path in* >-------------------------------------------> + --------> *Skip path out*
                          Causal      |--> Tanh --|                  |
        *Main    |--> Dilated Conv1D -|           * --> 1x1 Conv1D --|
         path >--|                    |--> Sigm --|                  |
         in*     |-------------------------------------------------> + --------> *Main path out*
                                     *Residual path*

        On the final block, the output of the skip path is further processed to
        produce the network predictions.

        See van den Oord et al. (2016) at https://arxiv.org/pdf/1609.03499.pdf
        for further details.

        Parameters
        ----------
        ch_residual : int
            The number of output channels for the 1x1 Conv1D layer in the main
            path
        ch_dilation : int
            The number of output channels for the causal dilated Conv1D layer
            in the main path
        dilation : int
            The dilation rate for the causal dilated Conv1D layer in the main
            path
        kernel_width : int
            The width of the causal dilated Conv1D kernel in the main path.
            NOTE(review): this value is stored and reported in
            `hyperparameters`, but `_init_params` builds the dilated
            convolution with a fixed kernel width of 2 -- confirm intent.
        init : str (default: 'glorot_uniform')
            The weight initialization strategy. Valid entries are
            {'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'}
        optimizer : str or `OptimizerBase` instance (default: None)
            The optimization strategy to use when performing gradient updates
            within the `update` method.  If `None`, use the `SGD` optimizer
            with default parameters.
        """
        super().__init__()

        self.init = init
        self.dilation = dilation
        self.optimizer = optimizer
        self.ch_residual = ch_residual
        self.ch_dilation = ch_dilation
        self.kernel_width = kernel_width

        self._init_params()

    def _init_params(self):
        """Create the sub-layers and reset the cache of derived variables."""
        self._dv = {}

        self.conv_dilation = Conv1D(
            stride=1,
            pad="causal",
            init=self.init,
            kernel_width=2,
            dilation=self.dilation,
            out_ch=self.ch_dilation,
            optimizer=self.optimizer,
            act_fn=Affine(slope=1, intercept=0),
        )

        self.tanh = Tanh()
        self.sigm = Sigmoid()
        self.multiply_gate = Multiply(act_fn=Affine(slope=1, intercept=0))

        self.conv_1x1 = Conv1D(
            stride=1,
            pad="same",
            dilation=0,
            init=self.init,
            kernel_width=1,
            out_ch=self.ch_residual,
            optimizer=self.optimizer,
            act_fn=Affine(slope=1, intercept=0),
        )

        self.add_residual = Add(act_fn=Affine(slope=1, intercept=0))
        self.add_skip = Add(act_fn=Affine(slope=1, intercept=0))

    @property
    def parameters(self):
        """Parameters of every component, keyed by component id."""
        return {
            "components": {
                "conv_1x1": self.conv_1x1.parameters,
                "add_skip": self.add_skip.parameters,
                "add_residual": self.add_residual.parameters,
                "conv_dilation": self.conv_dilation.parameters,
                "multiply_gate": self.multiply_gate.parameters,
            }
        }

    @property
    def hyperparameters(self):
        """Hyperparameters of the module and of each of its components."""
        return {
            "layer": "WavenetResidualModule",
            "init": self.init,
            "dilation": self.dilation,
            "optimizer": self.optimizer,
            "ch_residual": self.ch_residual,
            "ch_dilation": self.ch_dilation,
            "kernel_width": self.kernel_width,
            "component_ids": [
                "conv_1x1",
                "add_skip",
                "add_residual",
                "conv_dilation",
                "multiply_gate",
            ],
            "components": {
                "conv_1x1": self.conv_1x1.hyperparameters,
                "add_skip": self.add_skip.hyperparameters,
                "add_residual": self.add_residual.hyperparameters,
                "conv_dilation": self.conv_dilation.hyperparameters,
                "multiply_gate": self.multiply_gate.hyperparameters,
            },
        }

    @property
    def derived_variables(self):
        """
        Intermediate values cached by `forward` / `backward`, plus the
        derived variables of each component. Entries that have not been
        computed yet are None.
        """
        dv = {
            "conv_1x1_out": None,
            "conv_dilation_out": None,
            "multiply_gate_out": None,
            "components": {
                "conv_1x1": self.conv_1x1.derived_variables,
                "add_skip": self.add_skip.derived_variables,
                "add_residual": self.add_residual.derived_variables,
                "conv_dilation": self.conv_dilation.derived_variables,
                "multiply_gate": self.multiply_gate.derived_variables,
            },
        }
        dv.update(self._dv)
        return dv

    @property
    def gradients(self):
        """Gradients of every component, keyed by component id."""
        return {
            "components": {
                "conv_1x1": self.conv_1x1.gradients,
                "add_skip": self.add_skip.gradients,
                "add_residual": self.add_residual.gradients,
                "conv_dilation": self.conv_dilation.gradients,
                "multiply_gate": self.multiply_gate.gradients,
            }
        }

    def forward(self, X_main, X_skip=None):
        """
        Compute the block outputs for the main and the skip path.

        Parameters
        ----------
        X_main : array
            Input on the main path.
        X_skip : array or None
            Running sum of the skip path from the previous block. `None`
            marks the first block; the sum then starts at zero.

        Returns
        -------
        Y_main : array
            `X_main` plus the 1x1 convolution output (residual path).
        Y_skip : array
            The skip-path sum plus the 1x1 convolution output.
        """
        self.X_main, self.X_skip = X_main, X_skip
        conv_dilation_out = self.conv_dilation.forward(X_main)

        tanh_gate = self.tanh.fn(conv_dilation_out)
        sigm_gate = self.sigm.fn(conv_dilation_out)

        multiply_gate_out = self.multiply_gate.forward([tanh_gate, sigm_gate])
        conv_1x1_out = self.conv_1x1.forward(multiply_gate_out)

        # if this is the first wavenet block, initialize the "previous" skip
        # connection sum to 0
        self.X_skip = np.zeros_like(conv_1x1_out) if X_skip is None else X_skip

        # Use the initialised `self.X_skip`, not the raw argument: passing
        # `X_skip` here handed `None` to the Add layer on the first block.
        Y_skip = self.add_skip.forward([self.X_skip, conv_1x1_out])
        Y_main = self.add_residual.forward([X_main, conv_1x1_out])

        self._dv["tanh_out"] = tanh_gate
        self._dv["sigm_out"] = sigm_gate
        self._dv["conv_dilation_out"] = conv_dilation_out
        self._dv["multiply_gate_out"] = multiply_gate_out
        self._dv["conv_1x1_out"] = conv_1x1_out
        return Y_main, Y_skip

    def backward(self, dY_skip, dY_main=None):
        """
        Backpropagate through the block.

        Parameters
        ----------
        dY_skip : array
            Gradient with respect to the skip-path output.
        dY_main : array or None
            Gradient with respect to the main-path output. `None` on the
            last block, where the main path has no downstream consumer.

        Returns
        -------
        dX_main : array
            Gradient with respect to `X_main`.
        dX_skip : array
            Gradient with respect to the skip-path input.
        """
        dX_skip, dConv_1x1_out = self.add_skip.backward(dY_skip)

        # if this is the last wavenet block, dY_main will be None. if not,
        # calculate the error contribution from dY_main and add it to the
        # contribution from the skip path
        dX_main = np.zeros_like(self.X_main)
        if dY_main is not None:
            dX_main, dConv_1x1_main = self.add_residual.backward(dY_main)
            dConv_1x1_out += dConv_1x1_main

        dMultiply_out = self.conv_1x1.backward(dConv_1x1_out)
        dTanh_out, dSigm_out = self.multiply_gate.backward(dMultiply_out)

        # both gates read the same dilated-convolution output, so their
        # input gradients are summed before entering the convolution
        conv_dilation_out = self.derived_variables["conv_dilation_out"]
        dTanh_in = dTanh_out * self.tanh.grad(conv_dilation_out)
        dSigm_in = dSigm_out * self.sigm.grad(conv_dilation_out)
        dDilation_out = dTanh_in + dSigm_in

        conv_back = self.conv_dilation.backward(dDilation_out)
        dX_main += conv_back

        self._dv["dLdTanh"] = dTanh_out
        self._dv["dLdSigmoid"] = dSigm_out
        self._dv["dLdConv_1x1"] = dConv_1x1_out
        self._dv["dLdMultiply"] = dMultiply_out
        self._dv["dLdConv_dilation"] = dDilation_out
        return dX_main, dX_skip