def __train(weight_init_std):
    bn_network = MultiLayerNet(input_size=784, hidden_size_list=[100, 100, 100, 100, 100],
                               output_size=10, weight_init_std=weight_init_std, use_batchnorm=True)
    network = MultiLayerNet(input_size=784, hidden_size_list=[100, 100, 100, 100, 100],
                            output_size=10, weight_init_std=weight_init_std)
    optimizer = SGD(lr=learning_rate)

    train_acc_list = []
    bn_train_acc_list = []

    iter_per_epoch = max(train_size / batch_size, 1)
    epoch_cnt = 0

    for i in range(1000000000):
        batch_mask = np.random.choice(train_size, batch_size)
        x_batch = x_train[batch_mask]
        t_batch = t_train[batch_mask]

        for _network in (bn_network, network):
            grads = _network.gradient(x_batch, t_batch)
            optimizer.update(_network.params, grads)

        if i % iter_per_epoch == 0:
            train_acc = network.accuracy(x_train, t_train)
            bn_train_acc = bn_network.accuracy(x_train, t_train)
            train_acc_list.append(train_acc)
            bn_train_acc_list.append(bn_train_acc)

            print("epoch:" + str(epoch_cnt) + " | " + str(train_acc) + " - " + str(bn_train_acc))

            epoch_cnt += 1
            if epoch_cnt >= max_epochs:
                break

    return train_acc_list, bn_train_acc_list
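# A minimal driver sketch for the batch-norm comparison above (illustrative only).
# It assumes the globals that __train already relies on -- x_train, t_train,
# train_size, batch_size, learning_rate, max_epochs -- are defined as in the
# surrounding script; the list of weight scales below is an arbitrary example.
weight_scale_list = np.logspace(0, -4, num=16)
results = {}
for w in weight_scale_list:
    results[w] = __train(w)  # (train_acc_list, bn_train_acc_list) per weight scale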
def __init__(self, input_dims, layers_info, opts):
    self.layers_info = layers_info
    self.num_layers = len(layers_info)
    self.params = {}
    self.save_prefix = opts.save_prefix

    for ix in xrange(len(layers_info)):
        if ix == 0:
            input_dim = input_dims
        else:
            input_dim = layers_info[ix - 1][1]
        output_dim = layers_info[ix][1]

        if layers_info[ix][0] != "batchnorm":
            layer_object = DenseLayer(input_dim, output_dim, layers_info[ix][2],
                                      dropout=layers_info[ix][3])
        else:
            layer_object = BatchNormLayer(input_dim)

        self.params[layers_info[ix][0] + "_{}".format(ix)] = layer_object.params
        setattr(self, 'layer_{}'.format(ix), layer_object)

    self.optimizer = SGD(self.params, 'categorical_cross_entropy', lr=opts.lr,
                         l2_penalty=opts.l2, momentum=opts.momentum)
def bn_2_layer_test(epochs=2, reg=0.0, lr=0.01, momentum=0.7):
    trainingData, trainingLabels, \
        validationData, validationLabels, \
        testingData, testingLabels = loadAllData("Datasets/cifar-10-batches-mat/", valsplit=0.20)

    timestamp = datetime.now().strftime('%Y-%b-%d--%H-%M-%S')

    network = Model(name="2-layer(NO BN)")
    network.addLayer(Linear(32*32*3, 50, regularization=reg, initializer="he"))
    network.addLayer(Relu())
    network.addLayer(Linear(50, 10, regularization=reg, initializer="he"))
    network.addLayer(Softmax())

    sgd = SGD(lr=lr, lr_decay=1.00, momentum=momentum, shuffle=True, lr_min=1e-5)
    network.compile(sgd, "cce")
    network.fit(trainingData, trainingLabels, epochs=epochs, batch_size=64,
                validationData=(validationData, validationLabels))

    networkBN = Model(name="2-layer(WITH BN)")
    networkBN.addLayer(Linear(32*32*3, 50, regularization=reg, initializer="he"))
    networkBN.addLayer(BatchNormalization(50, trainable=True, alpha=0.90))
    networkBN.addLayer(Relu())
    networkBN.addLayer(Linear(50, 10, regularization=reg, initializer="he"))
    networkBN.addLayer(Softmax())

    sgd2 = SGD(lr=lr, lr_decay=1.00, momentum=momentum, shuffle=True, lr_min=1e-5)
    networkBN.compile(sgd2, "cce")
    networkBN.fit(trainingData, trainingLabels, epochs=epochs, batch_size=64,
                  validationData=(validationData, validationLabels))

    #plotAccuracy(network, "plots/", timestamp)
    #plotLoss(network, "plots/", timestamp)
    #loss, acc = network.evaluate(testingData, testingLabels)
    #print("Test loss: {} , Test acc: {}".format(loss, acc))
    #plotAccuracy(network, "plots/", timestamp, title="2-layer(NO BN) accuracy over epochs", fileName="nobnacc")
    #plotLoss(network, "plots/", timestamp, title="2-layer(NO BN) loss over epochs", fileName="nobnloss")
    #plotAccuracy(networkBN, "plots/", timestamp, title="2-layer(WITH BN) accuracy over epochs", fileName="bnacc")
    #plotLoss(networkBN, "plots/", timestamp, title="2-layer(WITH BN) loss over epochs", fileName="bnloss")

    multiPlotLoss((network, networkBN), "plots/", timestamp,
                  title="2-layer network loss over epochs, eta:{}, lambda:{}".format(lr, reg))
    multiPlotAccuracy((network, networkBN), "plots/", timestamp,
                      title="2-layer network accuracy over epochs, eta:{}, lambda:{}".format(lr, reg))
def compile(self, optimizer=None, loss=None, regularize=None):
    if optimizer is None:
        optimizer = SGD()
    if loss is None:
        loss = MSE()
    if regularize is None:
        # regularize = L2()
        regularize = lambda parameters: 0

    target = self.output.clone()
    lossValue = loss(self.output, target) + regularize(self.parameters)
    self.objectiveFunction = optimizer.getFunction(lossValue, self.input, target, self.parameters)
def run_GNN_SGD(train_data, valid_data, W, A, b, B, alpha=0.0001, eps=0.001,
                n_vector=8, gnn_steps=2, n_epochs=100):
    params = []
    for epoch in range(n_epochs):
        W, A, b, loss_train = SGD(train_data, n_vector, B, W, A, b, gnn_steps, alpha, eps)
        precision_train = mean_precision(train_data, W, A, b, n_vector, gnn_steps)
        precision_val = mean_precision(valid_data, W, A, b, n_vector, gnn_steps)
        loss_val = valid_loss(valid_data, W, A, b, n_vector, gnn_steps)
        print('epoch: {}, train loss: {}, train precision: {}, valid loss: {}, valid precision: {}'
              .format(epoch + 1, loss_train, precision_train, loss_val, precision_val))
        params.append((loss_train, precision_train, loss_val, precision_val))
    return params
def test1layergradients(samples=1, dimensions=3072):
    print("\n\nTesting 1-layer gradients (NO BN, NO REG) using a batch size of {}".format(samples))

    trainingData, trainingLabels, encodedTrainingLabels = loadData("Datasets/cifar-10-batches-mat/data_batch_1.mat")

    trainingData = trainingData[0:dimensions, 0:samples]
    trainingLabels = trainingLabels[0:dimensions, 0:samples]
    encodedTrainingLabels = encodedTrainingLabels[0:dimensions, 0:samples]

    network = Model()
    linear = Linear(dimensions, 10, regularization=0.00)
    network.addLayer(linear)
    network.addLayer(Softmax())

    sgd = SGD(lr=0.001, lr_decay=1.0, momentum=0.0, shuffle=True)
    network.compile(sgd, "cce")

    network.predict(trainingData)
    network.backpropagate(encodedTrainingLabels)

    timestamp = datetime.now().strftime('%Y-%b-%d--%H-%M-%S')

    numerical_gradW = compute_grads(1e-6, linear.W, trainingData, encodedTrainingLabels, network)
    numerical_gradb = compute_grads(1e-6, linear.b, trainingData, encodedTrainingLabels, network)

    print("W")
    relative_errorW = grad_difference(linear.gradW, numerical_gradW)

    print("b")
    relative_errorb = grad_difference(linear.gradb, numerical_gradb)

    return (relative_errorW, linear.gradW, numerical_gradW), (relative_errorb, linear.gradb, numerical_gradb)
def test3layergradients(samples=1, dimensions=3072):
    print("\n\nTesting 3-layer gradients using a batch size of {}".format(samples))

    trainingData, trainingLabels, encodedTrainingLabels = loadData("Datasets/cifar-10-batches-mat/data_batch_1.mat")

    trainingData = trainingData[0:dimensions, 0:samples]
    trainingLabels = trainingLabels[0:dimensions, 0:samples]
    encodedTrainingLabels = encodedTrainingLabels[0:dimensions, 0:samples]

    network = Model()

    linear = Linear(dimensions, 50, regularization=0.00, initializer="he")
    network.addLayer(linear)
    network.addLayer(Relu())

    linear2 = Linear(50, 30, regularization=0.00, initializer="he")
    network.addLayer(linear2)
    network.addLayer(Relu())

    linear3 = Linear(30, 10, regularization=0.00, initializer="he")
    network.addLayer(linear3)
    network.addLayer(Softmax())

    sgd = SGD(lr=0.001, lr_decay=1.0, momentum=0.0, shuffle=True)
    network.compile(sgd, "cce")

    network.predict(trainingData, updateInternal=True)
    network.backpropagate(encodedTrainingLabels)

    timestamp = datetime.now().strftime('%Y-%b-%d--%H-%M-%S')

    numerical_gradW1 = compute_grads_w_BN(1e-4, linear.W, trainingData, encodedTrainingLabels, network)
    numerical_gradb1 = compute_grads_w_BN(1e-4, linear.b, trainingData, encodedTrainingLabels, network)

    numerical_gradW2 = compute_grads_w_BN(1e-4, linear2.W, trainingData, encodedTrainingLabels, network)
    numerical_gradb2 = compute_grads_w_BN(1e-4, linear2.b, trainingData, encodedTrainingLabels, network)

    numerical_gradW3 = compute_grads_w_BN(1e-4, linear3.W, trainingData, encodedTrainingLabels, network)
    numerical_gradb3 = compute_grads_w_BN(1e-4, linear3.b, trainingData, encodedTrainingLabels, network)

    print("W1")
    relative_errorW = grad_difference(linear.gradW, numerical_gradW1)
    print("b1")
    relative_errorb = grad_difference(linear.gradb, numerical_gradb1)

    print("W2")
    relative_errorW2 = grad_difference(linear2.gradW, numerical_gradW2)
    print("b2")
    relative_errorb2 = grad_difference(linear2.gradb, numerical_gradb2)

    print("W3")
    relative_errorW3 = grad_difference(linear3.gradW, numerical_gradW3)
    print("b3")
    relative_errorb3 = grad_difference(linear3.gradb, numerical_gradb3)

    print("\n")
def __train(lr, weight_decay, epochs=50, verbose=False):  # reduced number of epochs
    network = MultiLayerNet(input_size=784, hidden_size_list=[100, 100, 100, 100, 100, 100],
                            output_size=10, weight_decay_lambda=weight_decay)
    optimizer = SGD(lr)

    iter_per_epoch = max(train_size / mini_batch_size, 1)
    current_iter = 0
    current_epoch = 0

    train_loss_list = []
    train_acc_list = []
    val_acc_list = []

    for i in range(int(epochs * iter_per_epoch)):
        batch_mask = np.random.choice(train_size, mini_batch_size)
        x_batch = x_train[batch_mask]
        t_batch = t_train[batch_mask]

        grads = network.gradient(x_batch, t_batch)
        optimizer.update(network.params, grads)

        loss = network.loss(x_batch, t_batch)
        train_loss_list.append(loss)
        if verbose:
            print("train loss:" + str(loss))

        if current_iter % iter_per_epoch == 0:
            current_epoch += 1
            train_acc = network.accuracy(x_train, t_train)
            val_acc = network.accuracy(x_val, t_val)
            train_acc_list.append(train_acc)
            val_acc_list.append(val_acc)
            if verbose:
                print("=== epoch:" + str(current_epoch) + ", train acc:" + str(train_acc) +
                      ", validation acc:" + str(val_acc) + " ===")

        current_iter += 1

    return val_acc_list, train_acc_list
def compile(self, optimizer="SGD", loss="cce"):
    if type(optimizer) is str:
        if optimizer == "SGD":
            self.optimizer = SGD()
        else:
            raise NameError("Unrecognized optimizer")
    else:
        self.optimizer = copy.deepcopy(optimizer)

    # Adds reference for the optimizer to the model
    self.optimizer.model = self

    if loss == "cce" or loss == "categorical_cross_entropy":
        self.loss = "categorical_cross_entropy"
    else:
        raise NameError("Unrecognized loss function.")

    self.history = self.optimizer.history
def __call__(self):
    param = self.param
    if param is None:
        opt = SGD()
    elif isinstance(param, OptimizerBase):
        opt = param
    elif isinstance(param, str):
        opt = self.init_from_str()
    elif isinstance(param, dict):
        opt = self.init_from_dict()
    return opt
def fit(self, loader: DataLoader, optimizer=None, loss_function=None) -> None:
    """
    Fits the model to the data. If no optimizer is passed in, the default
    optimizer is SGD. If no loss function is passed in, the default loss
    function is MSE.

    :returns: None; self.params are fit to the data.
    """
    if optimizer is None:
        optimizer = SGD(0.01)
    if loss_function is None:
        loss_function = mean_squared_error

    for X, y in loader:
        if self.params is None:
            self.params = Matrix([[Variable(random.random())] for _ in range(len(X[0]))])
            self.bias = Matrix([[Variable(random.random())]])

        output = self._evaluate(X)
        loss = loss_function(output, y)
        loss += self._regularize()

        self.params = optimizer.step(self.params, loss.get_grad(self.params))
        self.bias = optimizer.step(self.bias, loss.get_grad(self.bias))
def configure(self, loss, optimizer=SGD(learning_rate=0.01), metrics=None):
    """
    Configure the model for training or evaluation.

    `metrics` should be a dictionary with the name of each metric as the key
    and the function that computes the metric as the value.
    """
    self.loss = loss
    self.optimizer = optimizer
    self.metrics = metrics
    if self.metrics is None:
        self.metrics = {}
def __init__(self, num_layers, units_list=None, initializer=None, optimizer='adam'):
    self.weight_num = num_layers - 1
    # Initialize parameters with the given method; this experiment only
    # implements Xavier and all-zero initialization.
    self.params = xavier(num_layers, units_list) if initializer == 'xavier' else zero(num_layers, units_list)
    self.optimizer = Adam(weights=self.params, weight_num=self.weight_num) if optimizer == 'adam' else SGD()
    self.bn_param = {}
def make_DNN(lr=0.1, exp_decay=0.001, optimizer='SGD'):
    dnn = DNN()
    dnn.set_input_size(2)
    dnn.add_layer(4, tanh_activation)
    dnn.add_layer(3, tanh_activation)
    dnn.add_layer(1, linear_activation)

    if optimizer == 'SGD':
        dnn.compile(loss=mse_loss, optimizer=SGD(lr=lr, exp_decay=exp_decay))
    elif optimizer == 'Ada':
        dnn.compile(loss=mse_loss, optimizer=AdaGrad(lr=lr, exp_decay=exp_decay))
    elif optimizer == 'BFGS':
        dnn.compile(loss=mse_loss, optimizer=BFGS)

    return dnn
def train(model, train_dataset, test_dataset):
    (x_train, y_train) = train_dataset
    (x_test, y_test) = test_dataset

    lr = 0.1
    momentum_coef = 0
    weight_decay = 0

    print(model)

    opt = SGD(lr=lr, momentum_coef=momentum_coef, weight_decay=weight_decay)
    print('Optimizer: {} with (lr: {} -- momentum_coef: {} -- weight_decay: {})'
          .format(opt.__class__.__name__, lr, momentum_coef, weight_decay))

    num_of_epochs = 1000
    batch_size = 256
    val_split = 0.1
    print('Validation Split: {} -- BatchSize: {} -- Epochs: {}'.format(val_split, batch_size, num_of_epochs))
    print('Training is about to start with epochs: {}, batch_size: {}, validation_split: {}'
          .format(num_of_epochs, batch_size, val_split))

    opt.train(model, x_train, y_train, num_of_epochs=num_of_epochs, batch_size=batch_size,
              val_split=val_split, verbose=1)

    print('\nEvaluating with test dataset !..')
    test_acc, test_loss = model.evaluate(x_test, y_test, return_pred=False)
    train_acc, train_loss = model.evaluate(x_train, y_train, return_pred=False)
    print("train_acc: {} -- train_loss: {}".format(train_acc, train_loss))
    print("test_acc: {} -- test_loss: {}".format(test_acc, test_loss))

    print('For complete use case of the framework please refer to guide.ipynb')
def init_from_str(self):
    r = r"([a-zA-Z]*)=([^,)]*)"
    opt_str = self.param.lower()
    kwargs = dict([(i, eval(j)) for (i, j) in re.findall(r, opt_str)])

    if "sgd" in opt_str:
        optimizer = SGD(**kwargs)
    elif "adagrad" in opt_str:
        optimizer = AdaGrad(**kwargs)
    elif "rmsprop" in opt_str:
        optimizer = RMSProp(**kwargs)
    elif "adam" in opt_str:
        optimizer = Adam(**kwargs)
    else:
        raise NotImplementedError("{}".format(opt_str))
    return optimizer
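# Illustrative only: what the regex in init_from_str extracts from a hypothetical
# spec string. The keyword names below are assumptions; only the parsing pattern
# comes from the method above.
import re
demo_spec = "SGD(lr=0.01, momentum=0.9)".lower()
demo_kwargs = dict((k, eval(v)) for k, v in re.findall(r"([a-zA-Z]*)=([^,)]*)", demo_spec))
# demo_kwargs == {'lr': 0.01, 'momentum': 0.9}; "sgd" in demo_spec then selects SGD(**demo_kwargs)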
def regularizationSearch():
    trainingData, trainingLabels, \
        validationData, validationLabels, \
        testingData, testingLabels = loadAllData("Datasets/cifar-10-batches-mat/", valsplit=0.10)

    bestLambda = 0.0
    bestValAcc = 0.0
    bestLoss = 0.0

    for lambdaValue in np.arange(0, 0.2, 0.005):
        network = Model()
        network.addLayer(Linear(32*32*3, 50, regularization=lambdaValue, initializer="he"))
        network.addLayer(BatchNormalization(50, trainable=True))
        network.addLayer(Relu())

        network.addLayer(Linear(50, 30, regularization=lambdaValue, initializer="he"))
        network.addLayer(BatchNormalization(30, trainable=True))
        network.addLayer(Relu())

        network.addLayer(Linear(30, 10, regularization=lambdaValue, initializer="he"))
        network.addLayer(Softmax())

        sgd = SGD(lr=0.01, lr_decay=0.95, momentum=0.7, shuffle=True, lr_min=1e-5)
        network.compile(sgd, "cce")

        timestamp = datetime.now().strftime('%Y-%b-%d--%H-%M-%S')

        network.fit(trainingData, trainingLabels, epochs=20,
                    validationData=(validationData, validationLabels), batch_size=64)

        #plotAccuracy(network, "plots/", timestamp)
        #plotLoss(network, "plots/", timestamp)

        print("Lambda:{}".format(lambdaValue))
        loss, acc = network.evaluate(validationData, validationLabels)
        print("Val loss: {} , Val acc: {}".format(loss, acc))
        print("\n\n")

        if acc > bestValAcc:
            bestLambda = lambdaValue
            bestValAcc = acc
            bestLoss = loss

    return bestLambda, bestValAcc, bestLoss
def init_from_dict(self):
    O = self.param
    cc = O["cache"] if "cache" in O else None
    op = O["hyperparameters"] if "hyperparameters" in O else None

    if op is None:
        raise ValueError("Must have `hyperparameters` key: {}".format(O))

    if op and op["id"] == "SGD":
        optimizer = SGD().set_params(op, cc)
    elif op and op["id"] == "RMSProp":
        optimizer = RMSProp().set_params(op, cc)
    elif op and op["id"] == "AdaGrad":
        optimizer = AdaGrad().set_params(op, cc)
    elif op and op["id"] == "Adam":
        optimizer = Adam().set_params(op, cc)
    elif op:
        raise NotImplementedError("{}".format(op["id"]))
    return optimizer
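# Illustrative only: the rough shape of the dict init_from_dict expects. The
# hyperparameter fields shown are assumptions; the method itself only requires a
# "hyperparameters" entry whose "id" names the optimizer, plus an optional "cache".
#
#   param = {
#       "cache": {},
#       "hyperparameters": {"id": "SGD", "lr": 0.01, "momentum": 0.9},
#   }
#   # -> dispatches to SGD().set_params(param["hyperparameters"], param["cache"])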
def train(net: NeuralNet,
          inputs: Tensor,
          targets: Tensor,
          num_epochs: int = 5000,
          iterator: DataIterator = BatchIterator(),
          loss=MSE(),
          optimizer: Optimizer = SGD()) -> None:
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for batch in iterator(inputs, targets):
            predicted = net.forward(batch.inputs)
            epoch_loss += loss.loss(predicted, batch.targets)
            grad = loss.grad(predicted, batch.targets)
            net.backward(grad)
            optimizer.step(net)
        print(epoch, epoch_loss)
def train():
    global args
    args = parser.parse_args()
    print(args)

    train_videos = UCF101Flows(frames_path='data/UCF101/train/frames/',
                               batch_size=args.batch_size)
    valid_videos = UCF101Flows(frames_path='data/UCF101/validation/frames',
                               batch_size=args.batch_size, shuffle=False)

    lr_scheduler = LearningRateScheduler(schedule=schedule)
    save_best = ModelCheckpoint(args.filepath, monitor='val_acc', verbose=1,
                                save_best_only=True, mode='max')
    callbacks = [save_best, lr_scheduler]

    lr_multipliers = {}
    lr_multipliers['block1_conv1/kernel:0'] = 10

    if os.path.exists(args.filepath):
        model = load_model(args.filepath)
    else:
        model = TSNs_MotionStream(input_shape=(299, 299, 20), dropout_prob=0.7,
                                  classes=len(train_videos.labels))
        model.compile(optimizer=SGD(lr=args.train_lr, momentum=0.9, multipliers=lr_multipliers),
                      loss='categorical_crossentropy', metrics=['acc'])

    model.fit_generator(generator=train_videos,
                        epochs=args.epochs,
                        callbacks=callbacks,
                        workers=args.num_workers,
                        validation_data=valid_videos)
def experiment(method, kappa, epochs, batch_size, lr, momentum, zero_init):
    train_loader, test_loader = utils.load(batch_size=batch_size)
    model = MLPNet(zero_init=zero_init)

    if method == 'SGD':
        optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    elif method == 'PSGDl1':
        optimizer = PSGDl1(model.parameters(), kappa_l1=kappa, lr=lr, momentum=momentum)
    elif method == 'SGDFWl1':
        optimizer = SGDFWl1(model.parameters(), kappa_l1=kappa)
    else:
        raise ValueError('Invalid choice of method: ' + str(method))

    criterion = nn.CrossEntropyLoss(size_average=False)
    model, metrics = train_model(model, optimizer, criterion, epochs, train_loader, test_loader)

    fname = get_exp_name(method, kappa, epochs, batch_size, zero_init)
    with open('results/' + fname + '.pkl', 'wb+') as f:
        pickle.dump(metrics, f)
def create_optimizer(lr):
    if name == 'SGD':
        return SGD(model_params.values(), lr,
                   momentum=momentum, weight_decay=weight_decay)
    elif name == 'NoisySGD':
        noise_factor = other_params['noise_factor']
        return NoisySGD(model_params.values(), lr, noise_factor, momentum, weight_decay)
    elif name == 'ReservoirSGD':
        scale = other_params['scale']
        is_distributed = other_params['distributed']
        max_reservoir_size = other_params['max_reservoir_size']
        num_gradients_to_sample = other_params['num_gradients_to_sample']
        # TODO make not distributed version of this? maybe
        if not is_distributed:
            raise ValueError('ReservoirSGD only supports distributed mode right now!')
        return ReservoirSGD(model_params.values(), lr, scale, num_gradients_to_sample,
                            max_reservoir_size, momentum, weight_decay)
    elif name == 'HessianVecSGD':
        noise_factor = other_params['noise_factor']
        return HessianVecSGD(model_params.values(), lr, noise_factor, momentum, weight_decay)
def main():
    config = {
        "optimizer": "rnn",
        "problem": "mnist",
        "rollout_length": 100,  # This is 100 in the paper
        "learning_rate": 0.1,
        "decay_rate": 0.9,
        "meta_layers": 2,
        "meta_hidden_size": 20,
        "layers": 2,
        "hidden_size": 100,
        "activation": 'relu',
        "preprocess": True,
        "max_to_keep": 3,
        "retrain": False,
        "dim": 10,
        "range_of_means": 10,
        "range_of_stds": 10,
        "summary_dir": "summary",
        "checkpoint_dir": "data_ckpt",
        "batch_size": 10000,
        "training_iters": 4000,
        "log_iters": 100
    }

    # create the experiments dirs
    create_dirs([config["summary_dir"], config["checkpoint_dir"]])

    # create tensorflow session
    sess = tf.Session()

    # create your data generator
    # create an instance of the model you want
    if config["problem"] == "simple":
        data = SimpleDG(config)
        model = LinearRegressionModel(config)
    elif config["problem"] == "mnist":
        data = MNISTDG(config)
        model = MNISTModel(config)
    else:
        raise ValueError("{} is not a valid problem".format(config["problem"]))

    # create tensorboard logger
    # logger = Logger(sess, config)
    # create trainer and pass all the previous components to it
    # trainer = LinearRegressionTrainer(sess, model, data, config, logger)

    sess.run(tf.global_variables_initializer())

    if config["optimizer"] == "sgd":
        optim = SGD(config)
        losses = learn(optim, model, config["rollout_length"])
    elif config["optimizer"] == "rms":
        optim = RMSprop(config)
        losses = learn(optim, model, config["rollout_length"])
    elif config["optimizer"] == "rnn":
        optim = RNNOptimizer(config)
        losses = learn(optim, model, config["rollout_length"])
        if config["retrain"]:
            optim.train(losses, sess, data)
        else:
            optim.load(sess)
    else:
        raise ValueError("{} is not a valid optimizer".format(config["optimizer"]))

    # initialize variables in optimizee
    # (can't initialize all here because it would potentially overwrite the trained optimizer)
    sess.run(tf.variables_initializer([
        var for var in tf.trainable_variables(scope=optim.__class__.__name__)
    ]))

    x = np.arange(config["rollout_length"] + 1)
    for i in range(3):
        sess.run(tf.variables_initializer([
            var for var in tf.trainable_variables(scope=optim.__class__.__name__)
        ]))
        data.refresh_parameters(seed=i)
        data_x, data_y = next(data.next_batch(config["batch_size"]))
        l = sess.run([losses], feed_dict={"input:0": data_x, "label:0": data_y})
        print(l)
        p1, = plt.semilogy(x, l[0], label=config["optimizer"])
        plt.legend(handles=[p1])

    plt.title('Losses')
    plt.show()

    # TODO compare different optimizers
    data.refresh_parameters()
    data_x, data_y = next(data.next_batch(100, mode="train"))
    pred = sess.run(model.prediction, feed_dict={"input:0": data_x, "label:0": data_y})
    print(list(zip(pred, np.argmax(data_y, axis=1), pred == np.argmax(data_y, axis=1))))

    # calculate accuracy on test data
    seed = np.random.randint(low=0, high=1e6)
    data.refresh_parameters(seed=seed)

    data_x, data_y = next(data.next_batch(5000, mode="train"))
    acc = sess.run(model.accuracy, feed_dict={"input:0": data_x, "label:0": data_y})
    print("Train accuracy: {}".format(acc))

    data_x, data_y = next(data.next_batch(5000, mode="test"))
    acc = sess.run(model.accuracy, feed_dict={"input:0": data_x, "label:0": data_y})
    print("Test accuracy: {}".format(acc))
xs = corpus[:-1]  # inputs
ts = corpus[1:]   # targets
data_size = len(xs)
print('corpus size: {0}, vocabulary size: {1}'.format(corpus_size, vocab_size))

# max_iters = data_size // (batch_size * time_size)
time_idx = 0
total_loss = 0
loss_count = 0
ppl_list = []

# model
model = SimpleRnnlm(vocab_size, wordvec_size, hidden_size)
optimizer = SGD(lr)

# compute the read start position of each sample in a mini-batch
jump = (corpus_size - 1) // batch_size
offsets = [i * jump for i in range(batch_size)]

for epoch in range(max_epoch):
    for iter in range(max_iters):
        # fetch a mini-batch
        batch_x = np.empty((batch_size, time_size), dtype='i')
        batch_t = np.empty((batch_size, time_size), dtype='i')
        for t in range(time_size):
            for i, offset in enumerate(offsets):
                batch_x[i, t] = xs[(offset + time_idx) % data_size]
                batch_t[i, t] = ts[(offset + time_idx) % data_size]
def main():
    trainingData, trainingLabels, \
        validationData, validationLabels, \
        testingData, testingLabels = loadAllData("Datasets/cifar-10-batches-mat/", valsplit=.10)

    #Settings 1
    #reg = 0.065
    #lr = 0.002

    #Settings 2
    #reg = 0.0021162
    #lr = 0.061474

    #Settings 3
    #reg = 0.0010781
    #lr = 0.069686

    #Settings 4
    #reg = 0.0049132
    #lr = 0.07112

    #Settings 5
    reg = 0.005
    lr = 0.007

    network = Model()
    network.addLayer(Linear(32 * 32 * 3, 50, regularization=reg, initializer="he"))
    network.addLayer(BatchNormalization(50, trainable=True))
    network.addLayer(Relu())

    network.addLayer(Linear(50, 30, regularization=reg, initializer="he"))
    network.addLayer(BatchNormalization(30, trainable=True))
    network.addLayer(Relu())

    network.addLayer(Linear(30, 10, regularization=reg, initializer="he"))
    network.addLayer(Softmax())

    sgd = SGD(lr=lr, lr_decay=0.95, momentum=0.7, shuffle=True, lr_min=1e-5)
    network.compile(sgd, "cce")

    timestamp = datetime.now().strftime('%Y-%b-%d--%H-%M-%S')

    network.fit(trainingData, trainingLabels, epochs=30, batch_size=100,
                validationData=(validationData, validationLabels))

    plotAccuracy(network, "plots/", timestamp,
                 title="3-layer network accuracy over epochs, eta:{}, lambda:{}".format(lr, reg))
    plotLoss(network, "plots/", timestamp,
             title="3-layer network loss over epochs, eta:{}, lambda:{}".format(lr, reg))

    loss, acc = network.evaluate(testingData, testingLabels)
    print("Test loss: {} , Test acc: {}".format(loss, acc))
#!/usr/bin/env python3
import numpy as np

from lib.mnist import load_mnist
from chap6.multi_layer_net import MultiLayerNet
from chap6.overfit_weight_decay import train
from optimizers import SGD

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
x_train = x_train[:300]
t_train = t_train[:300]

optimizer = SGD(lr=0.01)
max_epochs = 201
train_size = x_train.shape[0]
batch_size = 100

# train
network = MultiLayerNet(input_size=784,
                        hidden_size_list=[100, 100, 100, 100, 100, 100],
                        output_size=10)
(train_loss_list, train_acc_list, test_acc_list) = train(network)

network = MultiLayerNet(input_size=784,
                        hidden_size_list=[100, 100, 100, 100, 100, 100],
                        output_size=10,
                        use_dropout=True, dropout_ratio=0.2)
(train_loss_list_decay, train_acc_list_decay, test_acc_list_decay) = train(network)

# draw out
from optimizers import SGD

with open('data/shakespear.txt', 'r') as f:
    raw = f.read()

vocab = list(set(raw))
word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i
indices = np.array(list(map(lambda x: word2index[x], raw)))

embed = Embedding(vocab_size=len(vocab), dim=512)
model = RNNCell(n_inputs=512, n_hidden=512, n_output=len(vocab))
criterion = CrossEntropyLoss()
optim = SGD(parameters=model.get_parameters() + embed.get_parameters(), alpha=0.01)

batch_size = 32
bptt = 16
n_batches = int((indices.shape[0] / batch_size))

trimmed_indices = indices[:n_batches * batch_size]
# batch_indices: each column represents a sub-sequence from indices -> continuous
batched_indices = trimmed_indices.reshape(batch_size, n_batches)
batched_indices = batched_indices.transpose()

input_batched_indices = batched_indices[:-1]
target_batched_indices = batched_indices[1:]

n_bptt = int((n_batches - 1) / bptt)
input_batches = input_batched_indices[:n_bptt * bptt]
input_batches = input_batches.reshape(n_bptt, bptt, batch_size)
def network_grad_test():
    # Load data
    if args.data_set == 'SwissRollData':
        Xtrain, Xtest, Ytrain, Ytest = loadSwissRollData()
    elif args.data_set == 'GMMData':
        Xtrain, Xtest, Ytrain, Ytest = loadGMMData()
    else:
        Xtrain, Xtest, Ytrain, Ytest = loadPeaksData()

    # preprocess data - shuffle and split into different batch sizes (using batch_size list)
    Xtrain, Ytrain, test_sets, train_sets = preprocess_data(Xtest, Xtrain, Ytest, Ytrain)

    # hyper params
    n_layer = args.n_layers
    neurons = args.neurons
    dim_in = Xtrain.shape[0]
    dim_out = Ytrain.shape[0]
    lr = args.lr
    opt = SGD(lr=lr)

    # init model
    model = Net(n_layer, dim_in, dim_out, opt, neurons)

    all_batches, all_labels = train_sets
    batch = all_batches[0]
    labels = all_labels[0].T

    outputs = model(batch, labels)
    f_x = model.cross_entropy(outputs, labels)  # compute f(x)

    # get the d vectors to perturb the weights
    d_vecs = get_pertrubazia(model)

    model.backward()  # compute grad(x) per layer

    # concatenate grad per layer
    grad_x = get_raveled_grads_per_layer(model)

    # save weights of each layer, for testing:
    weights_list = get_weights_from_layers(model)

    # save the initial weights
    w_ce = model.cross_entropy.W
    w_li = model.linear_inp.W

    first_order_l, second_order_l = [], []
    eps_vals = np.geomspace(0.5, 0.5**20, 20)

    for eps in eps_vals:
        eps_d = eps * d_vecs[0]
        eps_ds = [eps_d.ravel()]
        model.linear_inp.W = np.add(w_li, eps_d)

        for d, ll, w in zip(d_vecs[1:-1], model.layers, weights_list[1:-1]):
            eps_d = eps * d
            ll.W = np.add(w, eps_d)
            eps_ds.append(eps_d.ravel())

        eps_d = eps * d_vecs[-1]
        model.cross_entropy.W = np.add(w_ce, eps_d)
        eps_ds.append(eps_d.ravel())

        eps_ds = np.concatenate(eps_ds, axis=0)

        output_d = model(batch, labels)
        fx_d = model.cross_entropy(output_d, labels)

        first_order = abs(fx_d - f_x)
        second_order = abs(fx_d - f_x - eps_ds.ravel().T @ grad_x.ravel())

        print(first_order)
        print(second_order)

        first_order_l.append(first_order)
        second_order_l.append(second_order)

    l = range(20)
    plt.title('Network gradient test')
    plt.plot(l, first_order_l, label='First Order')
    plt.plot(l, second_order_l, label='Second Order')
    plt.yscale('log')
    plt.legend()
    plt.savefig('./Test_Figures/grad_test_net.png', transparent=True,
                bbox_inches='tight', pad_inches=0)
    plt.show()
class Model():
    def __init__(self, name="Model"):
        self.layers = []
        self.name = name
        self.loss = None
        self.optimizer = None

    # Method for adding a layer
    def addLayer(self, layer):
        self.layers.append(layer)

    # Performs the forward pass and evaluates the network
    # Returns the loss value & metrics values
    def evaluate(self, inputs, targets, updateInternal=False):
        predictions = self.predict(inputs, updateInternal)
        cost = self.computeCost(predictions, targets)
        accuracy = self.computeAccuracy(predictions, targets)
        return cost, accuracy

    # Performs a forward pass without training the network
    def predict(self, inputs, updateInternal=False):
        prediction = inputs
        for layer in self.layers:
            if type(layer) != BatchNormalization:
                prediction = layer.forward(prediction)
            else:
                prediction = layer.forward(prediction, updateInternal)
        return prediction

    # Propagates the targets (one-hot encoded) back through the network
    def backpropagate(self, targets):
        grad = self.layers[-1].backward(targets)
        for layer in self.layers[-2::-1]:
            grad = layer.backward(grad)
        return grad

    # Computes the cost
    def computeCost(self, predictions, targets):
        totaltCost = 0
        ## Maybe dont need to use the probabilities. We have the predictions...
        if self.loss == "categorical_cross_entropy":
            assert self.layers[-1].type == "Softmax", "Loss is cross-entropy but last layer is not softmax"
            yhat = targets * np.log(self.layers[-1].probabilities)
            entropy = -np.sum(yhat) / targets.shape[1]
            totaltCost = totaltCost + entropy
            for layer in self.layers[0:-1]:
                totaltCost = totaltCost + layer.cost()

        # NOT TESTED YET
        elif self.loss == "binary_cross_entropy":
            m = predictions.shape[0]
            binaryEntropy = -1 / m * (np.dot(targets, np.log(predictions).T) +
                                      np.dot(1 - targets, np.log(1 - predictions).T))
            totaltCost = totaltCost + np.squeeze(binaryEntropy)
            for layer in self.layers[0:-1]:
                totaltCost = totaltCost + layer.cost()

        # NOT TESTED YET
        elif self.loss == "mse":
            totaltCost = totaltCost + np.mean((predictions - targets) ** 2)
            for layer in self.layers[0:-1]:
                totaltCost = totaltCost + layer.cost()

        elif self.loss == "None":
            for layer in self.layers:
                totaltCost = totaltCost + layer.cost()

        return totaltCost

    # Computes the accuracy of the predictions given the targets
    def computeAccuracy(self, predictions, targets):
        assert predictions.shape == targets.shape
        accuracy = np.sum(np.argmax(predictions, axis=0) == np.argmax(targets, axis=0)) / predictions.shape[1]
        return accuracy

    # Initializes the attributes for the optimizer and the loss function.
    # Also adds a reference for the optimizer to the current model (for access to the
    # forward and backward pass of the network)
    def compile(self, optimizer="SGD", loss="cce"):
        if type(optimizer) is str:
            if optimizer == "SGD":
                self.optimizer = SGD()
            else:
                raise NameError("Unrecognized optimizer")
        else:
            self.optimizer = copy.deepcopy(optimizer)

        # Adds reference for the optimizer to the model
        self.optimizer.model = self

        if loss == "cce" or loss == "categorical_cross_entropy":
            self.loss = "categorical_cross_entropy"
        else:
            raise NameError("Unrecognized loss function.")

        self.history = self.optimizer.history

    # Fits the model to the data using the optimizer and loss function specified during compile
    def fit(self, inputs, targets, epochs=1, validationData=None, batch_size=None, verbose=True):
        if self.loss is None or self.optimizer is None:
            raise ValueError("Model not compiled")

        self.optimizer.train(x_train=inputs, y_train=targets,
                             validationData=validationData,
                             epochs=epochs, batch_size=batch_size, verbose=verbose)

    def __str__(self):
        strrep = "Sequential Model: " + self.name + "\n"
        for i in range(len(self.layers)):
            strrep = strrep + " Layer " + str(i) + ": Type:" + " " + str(self.layers[i]) + "\n"
        return strrep
# D_in is input dimension
# D_out is output dimension.
N, D_in, D_out = 64, 1, 1

# Add some noise to the observations
noise_var = 0.5

# Create random input and output data
X = lhs(D_in, N)
y = 5 * X + noise_var * np.random.randn(N, D_out)

# Define the model
model = LinearRegression(X, y)

# Define an optimizer
optimizer = SGD(model.num_params, lr=1e-3, momentum=0.9)
# optimizer = Adam(model.num_params, lr = 1e-3)
# optimizer = RMSprop(model.num_params, lr = 1e-3)

# Train the model
model.train(10000, optimizer)

# Print the learned parameters
print('w = %e, sigma_sq = %e' % (model.theta[:-1], np.exp(model.theta[-1])))

# Make predictions
y_pred = model.predict(X)

# Plot
plt.figure(1)
def sgd_test():
    # Load data
    if args.data_set == 'SwissRollData':
        Xtrain, Xtest, Ytrain, Ytest = loadSwissRollData()
    elif args.data_set == 'GMMData':
        Xtrain, Xtest, Ytrain, Ytest = loadGMMData()
    else:
        Xtrain, Xtest, Ytrain, Ytest = loadPeaksData()

    # Define set of learning rate and batch size (use only for testing)
    batch_size = np.geomspace(2, 2**8, 8)
    batch_size = [round_(i) for i in batch_size]

    # preprocess data - shuffle and split into different batch sizes (using batch_size list)
    Xtrain, Ytrain, test_sets, train_sets = preprocess_data(Xtest, Xtrain, Ytest, Ytrain)

    # train loop
    all_batches, all_labels = train_sets

    softmax = Softmax(Xtrain.shape[0] + 1, Ytrain.shape[0])
    loss_func = CrossEntropy(softmax.W)
    opt = SGD(lr=args.lr)

    accs_hyper_params_train = []
    accs_hyper_params_test = []

    for e in range(args.iter):
        acc_train = []
        loss_l = []
        for batch, labels in tqdm(zip(all_batches, all_labels), total=len(all_batches), file=sys.stdout):
            labels = labels.T

            ones = np.ones((1, batch.shape[-1]), dtype=int)
            batch = np.concatenate((batch, ones), axis=0)

            loss = loss_func(batch, labels)
            loss_l.append(loss)

            loss_func.grad_w(batch, labels)
            softmax.W = opt.step(loss_func.grad_W, softmax.W)
            loss_func.W = softmax.W

            output = softmax(batch)

            # calculate train error
            labels = get_index(labels)
            prediction = predict(output)
            acc_train = np.append(acc_train, prediction == labels, axis=0)

        print('Epoch {} train acc: {} train loss: {}'.format(e, np.mean(acc_train), np.mean(loss_l)))

        accs_hyper_params_train.append(np.mean(acc_train))
        accs_hyper_params_test.append(np.mean(test_accuracy(softmax, test_sets)))

    plt.plot(range(args.iter), accs_hyper_params_train, label='Train Accuracy')
    plt.plot(range(args.iter), accs_hyper_params_test, label='Validation Accuracy')
    plt.title('SGD test: {} Set, Acc of lr={} and batch size={}'.format(args.data_set, args.lr, args.batch_size))
    plt.legend()
    plt.savefig('./Test_Figures/{} Set, Acc of lr={} and batch size={}.png'.format(
        args.data_set, args.lr, args.batch_size),
        transparent=True, bbox_inches='tight', pad_inches=0)
    plt.show()