def main():
    # setup the model and run for num_epochs, saving the last state only
    # this is at the top so that the backend (be) is generated
    mlp = gen_model(args.backend)

    # setup data iterators
    (X_train, y_train), (X_test, y_test), nclass = load_mnist(path=args.data_dir)
    if args.backend == 'nervanacpu' or args.backend == 'cpu':
        # limit data since cpu backend runs slower
        train = DataIterator(X_train[:1000], y_train[:1000], nclass=nclass, lshape=(1, 28, 28))
        valid = DataIterator(X_test[:1000], y_test[:1000], nclass=nclass, lshape=(1, 28, 28))
    else:
        train = DataIterator(X_train, y_train, nclass=nclass, lshape=(1, 28, 28))
        valid = DataIterator(X_test, y_test, nclass=nclass, lshape=(1, 28, 28))

    # serialization related
    cost = GeneralizedCost(costfunc=CrossEntropyBinary())
    opt_gdm = GradientDescentMomentum(learning_rate=0.1, momentum_coef=0.9)

    checkpoint_model_path = os.path.join('./', 'test_oneshot.pkl')
    checkpoint_schedule = 1  # save at every step

    callbacks = Callbacks(mlp, train)
    callbacks.add_serialize_callback(checkpoint_schedule, checkpoint_model_path, history=2)

    # run the fit all the way through, saving a checkpoint at each scheduled step
    mlp.fit(train, optimizer=opt_gdm, num_epochs=num_epochs, cost=cost, callbacks=callbacks)

    # setup model with same random seed, run epoch by epoch,
    # serializing and deserializing at each step
    mlp = gen_model(args.backend)
    cost = GeneralizedCost(costfunc=CrossEntropyBinary())
    opt_gdm = GradientDescentMomentum(learning_rate=0.1, momentum_coef=0.9)

    # reset data iterators
    train.reset()
    valid.reset()

    checkpoint_model_path = os.path.join('./', 'test_manyshot.pkl')
    checkpoint_schedule = 1  # save at every step
    callbacks = Callbacks(mlp, train)
    callbacks.add_serialize_callback(checkpoint_schedule, checkpoint_model_path,
                                     history=num_epochs)
    for epoch in range(num_epochs):
        # _0 points to state at end of epoch 0
        mlp.fit(train, optimizer=opt_gdm, num_epochs=epoch + 1, cost=cost, callbacks=callbacks)

        # load saved file
        prts = os.path.splitext(checkpoint_model_path)
        fn = prts[0] + '_%d' % epoch + prts[1]
        mlp.load_weights(fn)  # load the saved weights

    # compare test_oneshot_<num_epochs>.pkl to test_manyshot_<num_epochs>.pkl
    try:
        compare_model_pickles('test_oneshot_%d.pkl' % (num_epochs - 1),
                              'test_manyshot_%d.pkl' % (num_epochs - 1))
    except Exception:
        print('test failed....')
        sys.exit(1)
def main(args):
    # load up the mnist data set
    dataset = MNIST(path=args.data_dir)

    # initialize model object
    mlp = Model(layers=[
        Affine(nout=100, init=Gaussian(loc=0.0, scale=0.01), activation=Rectlin()),
        Affine(nout=10, init=Gaussian(loc=0.0, scale=0.01),
               activation=Logistic(shortcut=True))])

    # setup optimizer
    optimizer = GradientDescentMomentum(0.1, momentum_coef=0.9,
                                        stochastic_round=args.rounding)

    # configure callbacks
    callbacks = Callbacks(mlp, eval_set=dataset.valid_iter, **args.callback_args)

    # setup cost function as CrossEntropy and run fit
    mlp.fit(dataset.train_iter, optimizer=optimizer, num_epochs=args.epochs,
            cost=GeneralizedCost(costfunc=CrossEntropyBinary()), callbacks=callbacks)

    error_rate = mlp.eval(dataset.valid_iter, metric=Misclassification())
    neon_logger.display('Classification accuracy = %.4f' % (1 - error_rate))
def build_model(self):
    # setup weight initialization function
    init_norm = Gaussian(loc=0.0, scale=0.01)

    # setup model layers
    layers = [Affine(nout=100, init=init_norm, bias=Uniform(), activation=Rectlin()),
              Affine(nout=10, init=init_norm, bias=Uniform(),
                     activation=Logistic(shortcut=True))]

    # setup cost function as CrossEntropy
    self.cost = GeneralizedCost(costfunc=CrossEntropyBinary())

    # setup optimizer
    self.optimizer = GradientDescentMomentum(
        0.1, momentum_coef=0.9, stochastic_round=self.args.rounding)

    # initialize model object
    self.model = ModelDist(layers=layers)
def test_gdm_nesterov(backend_default):
    lrate, mom, wdecay = 0.1, 0.9, 0.005
    gdm = GradientDescentMomentum(learning_rate=lrate, momentum_coef=mom,
                                  wdecay=wdecay, nesterov=True)
    data_shape = (200, 128)

    # params to be updated using GDM
    np_param = np.random.rand(*data_shape)
    param = wrap(np_param)

    # Optimizer states
    velocity = 0.01 * np.random.rand(*data_shape)
    states = [wrap(velocity)]

    # Check a few iterations in a row
    for ii in range(20):
        # Choose a gradient
        np_grad = 0.01 * np.random.rand(*data_shape)
        grad = wrap(np_grad)

        # Update manually
        np_grad = np_grad / data_shape[1]
        velocity[:] = mom * velocity - lrate * (np_grad + wdecay * np_param)
        np_param[:] = np_param + mom * velocity - lrate * (np_grad + wdecay * np_param)
        param_list = [((param, grad), states)]
        compare_tensors(gdm, param_list, np_param, tol=1e-6)
def train(self, dataset, model=None):
    """Trains the passed model on the given dataset.

    If no model is passed, `generate_default_model` is used.
    """
    print("[%s] Starting training..." % self.model_name)
    start = time.time()

    # The training will be run on the CPU. If a GPU is available it should be used instead.
    backend = gen_backend(backend='cpu',
                          batch_size=self.batch_size,
                          rng_seed=self.random_seed,
                          stochastic_round=False)

    cost = GeneralizedCost(name='cost', costfunc=CrossEntropyMulti())

    optimizer = GradientDescentMomentum(learning_rate=self.lrate, momentum_coef=0.9)

    # set up the model and experiment
    if not model:
        model = self.generate_default_model(dataset.num_labels)

    args = NeonCallbackParameters()
    args.output_file = os.path.join(self.root_path, self.Callback_Store_Filename)
    args.evaluation_freq = 1
    args.progress_bar = False
    args.epochs = self.max_epochs
    args.save_path = os.path.join(self.root_path, self.Intermediate_Model_Filename)
    args.serialize = 1
    args.history = 100
    args.model_file = None

    callbacks = Callbacks(model, dataset.train(), args, eval_set=dataset.test())

    # add a callback that saves the best model state
    callbacks.add_save_best_state_callback(self.model_path)

    # Uncomment line below to run on GPU using cudanet backend
    # backend = gen_backend(rng_seed=0, gpu='cudanet')
    model.fit(dataset.train(), optimizer=optimizer, num_epochs=self.max_epochs,
              cost=cost, callbacks=callbacks)

    print("[%s] Misclassification error = %.1f%%" % (
        self.model_name, model.eval(dataset.test(), metric=Misclassification()) * 100))
    print("[%s] Finished training!" % self.model_name)
    end = time.time()
    print("[%s] Duration in seconds: %.1f" % (self.model_name, end - start))
    return model
def __init__(self, rounding, callback_args, epochs):
    # setup weight initialization function
    self.init = Gaussian(loc=0.0, scale=0.01)

    # setup optimizer
    self.optimizer = GradientDescentMomentum(learning_rate=0.1, momentum_coef=0.9,
                                             stochastic_round=rounding)

    # setup cost function as SumSquared
    self.cost = GeneralizedCost(costfunc=SumSquared())

    self.epochs = epochs
    self.model = None
    self.callback_args = callback_args
def test_gdm(backend):
    lrate, mom, wdecay = 0.1, 0.9, 0.005
    gdm = GradientDescentMomentum(learning_rate=lrate, momentum_coef=mom, wdecay=wdecay)
    param = np.random.rand(200, 128)
    param2 = copy.deepcopy(param)
    grad = 0.01 * np.random.rand(200, 128)
    states = [0.01 * np.random.rand(200, 128)]
    velocity = states[0]
    param2[:] = param2 + velocity * mom - grad * lrate - wdecay * lrate * param
    param_list = [((wrap(param), wrap(grad)), [wrap(states[0])])]
    compare_tensors(gdm, param_list, param2, tol=1e-7)
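For reference, the manual update in the test above corresponds to the standard momentum rule with L2 weight decay. The sketch below is plain NumPy with a hypothetical helper name `gdm_step` (not part of neon); it spells out the two-step form that the single-line expression in `test_gdm` expands:

import numpy as np

def gdm_step(param, grad, velocity, lrate=0.1, mom=0.9, wdecay=0.005):
    # velocity update: decayed previous velocity minus the (weight-decayed) gradient step
    velocity[:] = mom * velocity - lrate * (grad + wdecay * param)
    # parameter update: apply the new velocity
    param[:] = param + velocity
    return param, velocity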
def test_multi_optimizer(backend_default):
    opt_gdm = GradientDescentMomentum(learning_rate=0.001, momentum_coef=0.9, wdecay=0.005)
    opt_ada = Adadelta()
    opt_adam = Adam()
    opt_rms = RMSProp()
    opt_rms_1 = RMSProp(gradient_clip_value=5)
    init_one = Gaussian(scale=0.01)

    l1 = Conv((11, 11, 64), strides=4, padding=3,
              init=init_one, bias=Constant(0), activation=Rectlin())
    l2 = Affine(nout=4096, init=init_one, bias=Constant(1), activation=Rectlin())
    l3 = LSTM(output_size=1000, init=init_one, activation=Logistic(), gate_activation=Tanh())
    l4 = GRU(output_size=100, init=init_one, activation=Logistic(), gate_activation=Tanh())
    layers = [l1, l2, l3, l4]
    layer_list = []
    for layer in layers:
        if isinstance(layer, list):
            layer_list.extend(layer)
        else:
            layer_list.append(layer)

    opt = MultiOptimizer({'default': opt_gdm,
                          'Bias': opt_ada,
                          'Convolution': opt_adam,
                          'Linear': opt_rms,
                          'LSTM': opt_rms_1,
                          'GRU': opt_rms_1})

    map_list = opt._map_optimizers(layer_list)
    assert map_list[opt_adam][0].__class__.__name__ == 'Convolution'
    assert map_list[opt_ada][0].__class__.__name__ == 'Bias'
    assert map_list[opt_rms][0].__class__.__name__ == 'Linear'
    assert map_list[opt_gdm][0].__class__.__name__ == 'Activation'
    assert map_list[opt_rms_1][0].__class__.__name__ == 'LSTM'
    assert map_list[opt_rms_1][1].__class__.__name__ == 'GRU'
def test_gdm_wclip(backend_default):
    lrate, mom, wdecay, wclip = 0.1, 0.9, 0.005, 0.5
    gdm = GradientDescentMomentum(learning_rate=lrate, momentum_coef=mom,
                                  wdecay=wdecay, param_clip_value=wclip)
    param = np.random.rand(200, 128)
    param2 = copy.deepcopy(param)
    grad = 0.01 * np.random.rand(200, 128)
    grad2 = grad / 128.
    states = [0.01 * np.random.rand(200, 128)]
    velocity = states[0]
    param2[:] = param2 + velocity * mom - grad2 * lrate - wdecay * lrate * param
    np.clip(param2, -wclip, wclip, param2)
    param_list = [((wrap(param), wrap(grad)), [wrap(states[0])])]
    compare_tensors(gdm, param_list, param2, tol=1e-7)
def run(be, fake_dilation, fsz, stride, pad, dilation):
    K = 8
    strides = stride
    padding = pad
    be.rng = be.gen_rng(be.rng_seed)
    in_shape = 16
    while out_shape(in_shape, fsz, stride, dilation, pad) < 3:
        in_shape *= 2
    train_shape = (1, in_shape, in_shape)

    inp = be.array(be.rng.randn(np.prod(train_shape), be.bsz))
    init = Gaussian()

    layers = [Conv((5, 5, K), init=init),
              Conv((fsz, fsz, K), strides=strides, padding=padding, init=init,
                   dilation=dict(dil_d=1, dil_h=dilation, dil_w=dilation)),
              Conv((3, 3, K), init=init),
              Affine(nout=1, init=init)]

    model = Model(layers=layers)
    cost = GeneralizedCost(costfunc=CrossEntropyBinary())
    model.initialize(train_shape, cost)

    if fake_dilation:
        # Perform regular convolution with an expanded filter.
        weights = save(model)
        new_layers = layers
        # Replace the middle layer.
        new_fsz = dilated_fsz(fsz, dilation)
        new_layers[1] = Conv((new_fsz, new_fsz, K), strides=strides, padding=padding, init=init)
        model = Model(layers=new_layers)
        cost = GeneralizedCost(costfunc=CrossEntropyBinary())
        model.initialize(train_shape, cost)
        load(weights, model, K, fsz, dilation)

    print(model)
    model.optimizer = GradientDescentMomentum(learning_rate=0.01, momentum_coef=0.9)
    outputs = fprop(model, inp)
    weights = bprop(model, outputs)
    model.optimizer.optimize(model.layers_to_optimize, epoch=0)
    return outputs.get(), weights.get()
def test_multi_optimizer(backend_default_mkl):
    """
    A test for MultiOptimizer.
    """
    opt_gdm = GradientDescentMomentum(learning_rate=0.001, momentum_coef=0.9, wdecay=0.005)
    opt_ada = Adadelta()
    opt_adam = Adam()
    opt_rms = RMSProp()
    opt_rms_1 = RMSProp(gradient_clip_value=5)
    init_one = Gaussian(scale=0.01)

    l1 = Conv((11, 11, 64), strides=4, padding=3,
              init=init_one, bias=Constant(0), activation=Rectlin())
    l2 = Affine(nout=4096, init=init_one, bias=Constant(1), activation=Rectlin())
    l3 = LSTM(output_size=1000, init=init_one, activation=Logistic(), gate_activation=Tanh())
    l4 = GRU(output_size=100, init=init_one, activation=Logistic(), gate_activation=Tanh())
    layers = [l1, l2, l3, l4]
    layer_list = []
    for layer in layers:
        if isinstance(layer, list):
            layer_list.extend(layer)
        else:
            layer_list.append(layer)
    for l in layer_list:
        l.configure(in_obj=(16, 28, 28))
        l.allocate()
    # separate layer_list into two: the last two recurrent layers and the rest
    layer_list1, layer_list2 = layer_list[:-2], layer_list[-2:]

    opt = MultiOptimizer({'default': opt_gdm,
                          'Bias': opt_ada,
                          'Convolution': opt_adam,
                          'Convolution_bias': opt_adam,
                          'Linear': opt_rms,
                          'LSTM': opt_rms_1,
                          'GRU': opt_rms_1})

    layers_to_optimize1 = [l for l in layer_list1 if isinstance(l, ParameterLayer)]
    layers_to_optimize2 = [l for l in layer_list2 if isinstance(l, ParameterLayer)]

    opt.optimize(layers_to_optimize1, 0)
    assert opt.map_list[opt_adam][0].__class__.__name__ == 'Convolution_bias'
    assert opt.map_list[opt_rms][0].__class__.__name__ == 'Linear'

    opt.optimize(layers_to_optimize2, 0)
    assert opt.map_list[opt_rms_1][0].__class__.__name__ == 'LSTM'
    assert opt.map_list[opt_rms_1][1].__class__.__name__ == 'GRU'
def train_regressor(orig_wordvecs, w2v_W, w2v_vocab):
    """
    Return regressor to map word2vec to RNN word space.

    Function modified from:
    https://github.com/ryankiros/skip-thoughts/blob/master/training/tools.py
    """
    # Gather all words from word2vec that appear in wordvecs
    d = defaultdict(lambda: 0)
    for w in w2v_vocab.keys():
        d[w] = 1

    shared = OrderedDict()
    count = 0
    for w in list(orig_wordvecs.keys())[:-2]:
        if d[w] > 0:
            shared[w] = count
            count += 1

    # Get the vectors for all words in 'shared'
    w2v = np.zeros((len(shared), 300), dtype='float32')
    sg = np.zeros((len(shared), 620), dtype='float32')
    for w in shared.keys():
        w2v[shared[w]] = w2v_W[w2v_vocab[w]]
        sg[shared[w]] = orig_wordvecs[w]

    train_set = ArrayIterator(X=w2v, y=sg, make_onehot=False)

    layers = [Linear(nout=620, init=Gaussian(loc=0.0, scale=0.1)),
              Bias(init=Constant(0.0))]
    clf = Model(layers=layers)

    # regression model is trained using default global batch size
    cost = GeneralizedCost(costfunc=SumSquared())
    opt = GradientDescentMomentum(0.1, 0.9, gradient_clip_value=5.0)
    callbacks = Callbacks(clf)

    clf.fit(train_set, num_epochs=20, optimizer=opt, cost=cost, callbacks=callbacks)
    return clf
def run(args, train, test):
    init_uni = Uniform(low=-0.1, high=0.1)
    opt_gdm = GradientDescentMomentum(learning_rate=0.01, momentum_coef=0.9,
                                      stochastic_round=args.rounding)
    layers = [Conv((5, 5, 16), init=init_uni, activation=Rectlin(), batch_norm=True),
              Pooling((2, 2)),
              Conv((5, 5, 32), init=init_uni, activation=Rectlin(), batch_norm=True),
              Pooling((2, 2)),
              Affine(nout=500, init=init_uni, activation=Rectlin(), batch_norm=True),
              Affine(nout=10, init=init_uni, activation=Softmax())]
    cost = GeneralizedCost(costfunc=CrossEntropyMulti())
    mlp = Model(layers=layers)
    callbacks = Callbacks(mlp, eval_set=test, **args.callback_args)
    mlp.fit(train, optimizer=opt_gdm, num_epochs=args.epochs, cost=cost, callbacks=callbacks)
    err = mlp.eval(test, metric=Misclassification()) * 100
    print('Misclassification error = %.2f%%' % err)
    return err
def __init__(self, num_epochs, callback_args,
             optimizer=GradientDescentMomentum(0.07, momentum_coef=0.9)):
    """
    Args:
        num_epochs (int): number of epochs to train the model
        callback_args (dict): keyword arguments used to initialize the model's Callbacks
        optimizer (:obj:`neon.optimizers`): the model's optimizer. Default is
            `neon.optimizers.GradientDescentMomentum(0.07, momentum_coef=0.9)`

    The cost function defaults to `neon.transforms.CrossEntropyBinary`.
    """
    self.model = None
    self.cost = GeneralizedCost(costfunc=CrossEntropyBinary())
    self.optimizer = optimizer
    self.epochs = num_epochs
    self.callback_args = callback_args
def test_gdm_nesterov(backend_default):
    lrate, mom, wdecay = 0.1, 0.9, 0.005
    gdm = GradientDescentMomentum(learning_rate=lrate, momentum_coef=mom,
                                  wdecay=wdecay, nesterov=True)

    # params to be updated using GDM
    param = np.random.rand(200, 128)
    grad = 0.01 * np.random.rand(200, 128)

    # params to be updated manually
    param2 = copy.deepcopy(param)
    grad2 = grad / 128.
    states = [0.01 * np.random.rand(200, 128), 0.01 * np.zeros_like(grad)]
    velocity = states[0]
    velocity_backup = states[1]
    velocity_backup[:] = velocity
    param2[:] = (param2 +
                 (1 + mom) * (velocity * mom - grad2 * lrate - wdecay * lrate * param) -
                 mom * velocity_backup)

    param_list = [((wrap(param), wrap(grad)), [wrap(states[0]), wrap(states[1])])]
    compare_tensors(gdm, param_list, param2, tol=1e-7)
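Similarly, the Nesterov variant checked above can be written as a two-step update built on the previous velocity. A minimal NumPy sketch (illustrative only, with a hypothetical helper name `nesterov_gdm_step`; gradients are assumed already normalized by batch size, as `grad2` is above):

import numpy as np

def nesterov_gdm_step(param, grad, velocity, lrate=0.1, mom=0.9, wdecay=0.005):
    v_prev = velocity.copy()  # keep the previous velocity (velocity_backup above)
    velocity[:] = mom * velocity - lrate * (grad + wdecay * param)
    # look-ahead step: equivalent to the (1 + mom) * v_new - mom * v_prev expansion above
    param[:] = param + (1 + mom) * velocity - mom * v_prev
    return param, velocity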
class Trainer(BaseTrainer):
    def __init__(self, model, ngpu, options, data_options=None, time_options=None):
        self.model = model
        # self.model.set_batch_size(data_options['batch_size'])
        self.ngpu = ngpu
        self.gpu_mode = True if ngpu >= 1 else False
        self.time_options = time_options
        self.data_options = data_options

        if self.gpu_mode:
            try:
                self.be = gen_backend(backend='nervanagpu',
                                      batch_size=data_options['batch_size'])
                print("Backend: nervanagpu")
            except Exception:
                self.be = gen_backend(backend='gpu',
                                      batch_size=data_options['batch_size'])
                print("Backend: gpu")
        else:
            self.be = gen_backend(backend='mkl',
                                  batch_size=data_options['batch_size'])

        self.loss = L.GeneralizedCost(costfunc=TF.CrossEntropyMulti())
        B = self.data_options['batch_size']
        self.model.bsz(B)
        C, W, H = self.data_options['image_shape']
        self.model.initialize(((C, H, W), B), self.loss)

    def set_optimizer(self, opt_type, opt_conf):
        if opt_type == 'SGD':
            self.optimizer = GradientDescentMomentum(opt_conf['lr'],
                                                     momentum_coef=opt_conf['momentum'])
        else:
            raise NotImplementedError

    def run(self, iterator, mode='train'):
        report = dict()
        time_series = []
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        total_s = time.perf_counter()
        for idx, (x, t) in enumerate(iterator):
            if self.time_options == 'total':
                start_event.record()
            x = self.be.array(x)
            t = self.be.array(t)
            self.be.begin(Block.minibatch, idx)
            if self.time_options == 'forward':
                with self._record(start_event, end_event):
                    x = self.model(x)
            else:
                x = self.model(x)
            self.total_cost[:] = self.total_cost + self.loss.get_cost(x, t)
            # deltas back propagate through layers
            # for every layer in reverse except the 0th one
            loss = self.loss.get_errors(x, t)
            if self.time_options == 'backward':
                with self._record(start_event, end_event):
                    self.model.backward(loss)
            else:
                self.model.backward(loss)
            self.optimizer.optimize(self.model.layers_to_optimize, epoch=0)
            if self.time_options == 'total':
                end_event.record()
                torch.cuda.synchronize()
                self._elapsed_time = start_event.elapsed_time(end_event) / 1000
            if isinstance(iterator, tqdm):
                iterator.set_description('{:>10s} :{:10.7f}s/it'.format(
                    self.time_options, self._elapsed_time))
            time_series.append(self._elapsed_time)
        torch.cuda.synchronize()
        total_e = time.perf_counter()
        report = dict(time_series=time_series,
                      total=total_e - total_s)
        return report
# hyperparameters
num_epochs = args.epochs

(X_train, y_train), (X_test, y_test), nclass = load_mnist(path=args.data_dir)
train_set = ArrayIterator([X_train, X_train], y_train, nclass=nclass, lshape=(1, 28, 28))
valid_set = ArrayIterator([X_test, X_test], y_test, nclass=nclass, lshape=(1, 28, 28))

# weight initialization
init_norm = Gaussian(loc=0.0, scale=0.01)

# initialize model
path1 = Sequential(layers=[Affine(nout=100, init=init_norm, activation=Rectlin()),
                           Affine(nout=100, init=init_norm, activation=Rectlin())])

path2 = Sequential(layers=[Affine(nout=100, init=init_norm, activation=Rectlin()),
                           Affine(nout=100, init=init_norm, activation=Rectlin())])

layers = [MergeMultistream(layers=[path1, path2], merge="stack"),
          Affine(nout=10, init=init_norm, activation=Logistic(shortcut=True))]

model = Model(layers=layers)
cost = GeneralizedCost(costfunc=CrossEntropyBinary())

# fit and validate
optimizer = GradientDescentMomentum(learning_rate=0.1, momentum_coef=0.9)

# configure callbacks
callbacks = Callbacks(model, eval_set=valid_set, **args.callback_args)

model.fit(train_set, cost=cost, optimizer=optimizer, num_epochs=num_epochs,
          callbacks=callbacks)
def set_optimizer(self, opt_type, opt_conf):
    if opt_type == 'SGD':
        self.optimizer = GradientDescentMomentum(opt_conf['lr'],
                                                 momentum_coef=opt_conf['momentum'])
    else:
        raise NotImplementedError
parser = NeonArgparser(__doc__)
args = parser.parse_args()

NervanaObject.be.enable_winograd = 4

# setup data provider
X_train = np.random.uniform(-1, 1, (128, 3 * 224 * 224))
y_train = np.random.uniform(-1, 1, (128, 1000))
train = ArrayIterator(X_train, y_train, nclass=1000, lshape=(3, 224, 224))

layers = [Conv((11, 11, 64), init=Gaussian(scale=0.01), activation=Rectlin(),
               padding=3, strides=4),
          Pooling(3, strides=2),
          Conv((5, 5, 192), init=Gaussian(scale=0.01), activation=Rectlin(), padding=2),
          Pooling(3, strides=2),
          Conv((3, 3, 384), init=Gaussian(scale=0.03), activation=Rectlin(), padding=1),
          Conv((3, 3, 256), init=Gaussian(scale=0.03), activation=Rectlin(), padding=1),
          Conv((3, 3, 256), init=Gaussian(scale=0.03), activation=Rectlin(), padding=1),
          Pooling(3, strides=2),
          Affine(nout=4096, init=Gaussian(scale=0.01), activation=Rectlin()),
          Affine(nout=4096, init=Gaussian(scale=0.01), activation=Rectlin()),
          Affine(nout=1000, init=Gaussian(scale=0.01), activation=Softmax())]
model = Model(layers=layers)

weight_sched = Schedule([22, 44, 65], (1 / 250.)**(1 / 3.))
opt_gdm = GradientDescentMomentum(0.01, 0.0, wdecay=0.0005, schedule=weight_sched)
opt = MultiOptimizer({'default': opt_gdm})
cost = GeneralizedCost(costfunc=CrossEntropyMulti())

model.benchmark(train, cost=cost, optimizer=opt, niterations=10, nskip=5)
from neon.optimizers import GradientDescentMomentum
from neon.transforms import Misclassification, CrossEntropyBinary, Logistic, Rectlin
from neon.callbacks.callbacks import Callbacks
from neon.util.argparser import NeonArgparser

# parse the command line arguments
parser = NeonArgparser(__doc__)
args = parser.parse_args()

(X_train, y_train), (X_test, y_test), nclass = load_cifar10(path=args.data_dir)

train = ArrayIterator(X_train, y_train, nclass=nclass, lshape=(3, 32, 32))
test = ArrayIterator(X_test, y_test, nclass=nclass, lshape=(3, 32, 32))

init_uni = Uniform(low=-0.1, high=0.1)
opt_gdm = GradientDescentMomentum(learning_rate=0.01, momentum_coef=0.9)

# set up the model layers
layers = [Affine(nout=200, init=init_uni, activation=Rectlin()),
          Affine(nout=10, init=init_uni, activation=Logistic(shortcut=True))]

cost = GeneralizedCost(costfunc=CrossEntropyBinary())

mlp = Model(layers=layers)

# configure callbacks
callbacks = Callbacks(mlp, eval_set=test, **args.callback_args)

mlp.fit(train,
def test_model_serialize(backend):
    (X_train, y_train), (X_test, y_test), nclass = load_mnist()
    train_set = DataIterator([X_train, X_train], y_train, nclass=nclass, lshape=(1, 28, 28))

    init_norm = Gaussian(loc=0.0, scale=0.01)

    # initialize model
    path1 = [Conv((5, 5, 16), init=init_norm, bias=Constant(0), activation=Rectlin()),
             Pooling(2),
             Affine(nout=20, init=init_norm, bias=init_norm, activation=Rectlin())]
    path2 = [Dropout(keep=0.5),
             Affine(nout=20, init=init_norm, bias=init_norm, activation=Rectlin())]
    layers = [MergeConcat([path1, path2]),
              Affine(nout=20, init=init_norm, bias=init_norm, activation=Rectlin()),
              BatchNorm(),
              Affine(nout=10, init=init_norm, activation=Logistic(shortcut=True))]

    tmp_save = 'test_model_serialize_tmp_save.pickle'
    mlp = Model(layers=layers)
    mlp.optimizer = GradientDescentMomentum(learning_rate=0.1, momentum_coef=0.9)
    mlp.cost = GeneralizedCost(costfunc=CrossEntropyBinary())

    n_test = 3
    num_epochs = 3
    # Train model for num_epochs and n_test batches
    for epoch in range(num_epochs):
        for i, (x, t) in enumerate(train_set):
            x = mlp.fprop(x)
            delta = mlp.cost.get_errors(x, t)
            mlp.bprop(delta)
            mlp.optimizer.optimize(mlp.layers_to_optimize, epoch=epoch)
            if i > n_test:
                break

    # Get expected outputs of n_test batches and states of all layers
    outputs_exp = []
    pdicts_exp = [l.get_params_serialize() for l in mlp.layers_to_optimize]
    for i, (x, t) in enumerate(train_set):
        outputs_exp.append(mlp.fprop(x, inference=True))
        if i > n_test:
            break

    # Serialize model
    save_obj(mlp.serialize(keep_states=True), tmp_save)

    # Load model
    mlp = Model(layers=layers)
    mlp.load_weights(tmp_save)

    outputs = []
    pdicts = [l.get_params_serialize() for l in mlp.layers_to_optimize]
    for i, (x, t) in enumerate(train_set):
        outputs.append(mlp.fprop(x, inference=True))
        if i > n_test:
            break

    # Check outputs, states, and params are the same
    for output, output_exp in zip(outputs, outputs_exp):
        assert np.allclose(output.get(), output_exp.get())

    for pd, pd_exp in zip(pdicts, pdicts_exp):
        for s, s_e in zip(pd['states'], pd_exp['states']):
            if isinstance(s, list):  # this is the batch norm case
                for _s, _s_e in zip(s, s_e):
                    assert np.allclose(_s, _s_e)
            else:
                assert np.allclose(s, s_e)
        for p, p_e in zip(pd['params'], pd_exp['params']):
            if isinstance(p, list):  # this is the batch norm case
                for _p, _p_e in zip(p, p_e):
                    assert np.allclose(_p, _p_e)
            else:
                assert np.allclose(p, p_e)

    os.remove(tmp_save)
               activation=Rectlin(), padding=1),
          Conv((3, 3, 256), init=Gaussian(scale=0.03), bias=Constant(1),
               activation=Rectlin(), padding=1),
          Conv((3, 3, 256), init=Gaussian(scale=0.03), bias=Constant(1),
               activation=Rectlin(), padding=1),
          Pooling(3, strides=2),
          Affine(nout=4096, init=Gaussian(scale=0.01), bias=Constant(1),
                 activation=Rectlin()),
          Dropout(keep=0.5),
          Affine(nout=4096, init=Gaussian(scale=0.01), bias=Constant(1),
                 activation=Rectlin()),
          Dropout(keep=0.5),
          Affine(nout=1000, init=Gaussian(scale=0.01), bias=Constant(-7),
                 activation=Softmax())]
model = Model(layers=layers)

# drop weights LR by 1/250**(1/3) at epochs (23, 45, 66), drop bias LR by 1/10 at epoch 45
weight_sched = Schedule([22, 44, 65], (1 / 250.)**(1 / 3.))
opt_gdm = GradientDescentMomentum(0.01, 0.9, wdecay=0.0005, schedule=weight_sched,
                                  stochastic_round=args.rounding)
opt_biases = GradientDescentMomentum(0.02, 0.9, schedule=Schedule([44], 0.1),
                                     stochastic_round=args.rounding)
opt = MultiOptimizer({'default': opt_gdm, 'Bias': opt_biases})

# configure callbacks
valmetric = TopKMisclassification(k=5)
callbacks = Callbacks(model, eval_set=test, metric=valmetric, **args.callback_args)

if args.model_file is not None:
    model.load_params(args.model_file)

if not args.test_only:
    cost = GeneralizedCost(costfunc=CrossEntropyMulti())
    model.fit(train, optimizer=opt, num_epochs=args.epochs, cost=cost, callbacks=callbacks)

mets = model.eval(test, metric=valmetric)
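As a quick sanity check on the schedule above (illustrative arithmetic, not part of the original script): each drop multiplies the weight learning rate by (1/250)**(1/3) ≈ 0.159, so the three scheduled drops take it from 0.01 down to 0.01/250.

base_lr, change = 0.01, (1 / 250.) ** (1 / 3.)  # change ~= 0.1587
for n_drops in range(4):
    # prints ~0.01, 0.00159, 0.00025, 0.00004 (= 0.01 / 250)
    print(n_drops, base_lr * change ** n_drops)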
# layers = [Conv(fshape=(5,5,16), init=init_uni, activation=Rectlin()),
#           Pooling(fshape=2, strides=2),
#           Conv(fshape=(5,5,32), init=init_uni, activation=Rectlin()),
#           Pooling(fshape=2, strides=2),
#           Affine(nout=500, init=init_uni, activation=Rectlin()),
#           Affine(nout=10, init=init_uni, activation=Softmax())]
# learning_rate = 0.005
# momentum = 0.9

cnn = Model(layers=layers)

# - cost function
cost = GeneralizedCost(costfunc=CrossEntropyMulti())

# - learning rule
optimizer = GradientDescentMomentum(learning_rate, momentum_coef=momentum)

# Progress bar for each epoch - what's an epoch again? by default 10
# Crazy magic - don't even go here!
callbacks = Callbacks(cnn, eval_set=test_set, **args.callback_args)

# put everything together!
cnn.fit(train_set, optimizer=optimizer, num_epochs=epochs, cost=cost, callbacks=callbacks)

# # Calculate test set results
# results = cnn.get_outputs(test_set)
# dump(cnn, "cnn_0_005.jbl")
                     name='fc7'))
layers.append(Dropout(keep=0.5, name='drop7'))
layers.append(Affine(nout=1000, init=init_g1, bias=Constant(0.0),
                     activation=Softmax(), name='fc8'))

model = Model(layers=layers)

# scale LR by 0.1 every 20 epochs (this assumes batch_size = 256)
weight_sched = Schedule(20, 0.1)
opt_gdm = GradientDescentMomentum(0.01, 0.9, wdecay=0.0005, schedule=weight_sched)
opt_biases = GradientDescentMomentum(0.02, 0.9, schedule=weight_sched)
opt = MultiOptimizer({'default': opt_gdm, 'Bias': opt_biases})

# configure callbacks
valmetric = TopKMisclassification(k=5)
callbacks = Callbacks(model, eval_set=test, metric=valmetric, **args.callback_args)

if args.model_file is not None:
    model.load_params(args.model_file)

if not args.test_only:
    cost = GeneralizedCost(costfunc=CrossEntropyMulti())
def test_gdm(args, transformer_factory):
    """
    Test the ngraph GradientDescentMomentum against the neon version
    across 20 update steps.
    """
    # set up parameters
    C = ng.make_axis(20, name="C")
    N = ng.make_axis(32, name="N", batch=True)
    be = gen_backend(backend='cpu', batch_size=N.length)

    # restrict to numpy transformer for now
    factory = ngt.make_transformer_factory('numpy')
    ngt.set_transformer_factory(factory)
    ngt.make_transformer()

    # generate dummy data (to initialize values)
    w_init = np.random.rand(C.length).astype('float32')

    # set up nervana graph
    X = ng.placeholder([C, N]).named('X')
    Y = ng.placeholder([N]).named('Y')
    W = ng.variable([C - 1], initial_value=w_init).named('W')

    ex = ExecutorFactory()
    transformer = ex.transformer

    lrate, mom, wdecay = args
    gdm = GradientDescentMomentum(learning_rate=lrate, momentum_coef=mom, wdecay=wdecay)
    cost = ng.sum(Y - ng.dot(W, X), out_axis=())

    # to call ngraph gdm, use (ngraph_W, _) = ngraph_optimize(x, y)
    # where (x, y) are nparrays that fill the placeholders X and Y
    updates = gdm(cost)
    ngraph_optimize = transformer.computation([W, updates], X, Y)
    transformer.initialize()

    # set up the neon gdm
    neon_gdm = NeonGradientDescentMomentum(learning_rate=lrate, momentum_coef=mom,
                                           wdecay=wdecay)
    # dev_v0 = be.zeros((C.length, 1))  # velocities are zero at the beginning
    dev_dw = be.zeros((C.length, 1))  # we fill in the gradient info below
    dev_w_init = be.array(w_init)  # copy w_init to device
    param_list = [((dev_w_init, dev_dw), [])]

    # store the weights for each minibatch for debugging
    ng_Ws = []
    be_Ws = []

    # run for 20 minibatches
    for i, (x, y) in enumerate([generate_data(C.length, N.length) for _ in range(20)]):
        # obtain ngraph results
        (ng_W, _) = ngraph_optimize(x, y)
        ng_Ws.append(copy.deepcopy(ng_W))

        # obtain neon results
        dw = -1 * x.sum(axis=1)  # the gradients we compute analytically
        param_list[0][0][1].set(dw)  # fill the gradient
        neon_gdm.optimize([DummyLayer(param_list)], epoch=0)
        (param, grad), states = param_list[0]
        be_W = param.get()[:, 0]
        be_Ws.append(be_W)

        np.testing.assert_allclose(be_W, ng_W, rtol=1e-3)
else:
    rlayer1, rlayer2 = GRU(**rlayer_params), GRU(**rlayer_params)

layers = [LookupTable(vocab_size=len(train_set.vocab), embedding_dim=hidden_size, init=init),
          rlayer1,
          rlayer2,
          Affine(len(train_set.vocab), init, bias=init, activation=Softmax())]

cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True))
model = Model(layers=layers)

# vanilla gradient descent with decay schedule on learning rate and gradient scaling
learning_rate_sched = Schedule(list(range(5, args.epochs)), .5)
optimizer = GradientDescentMomentum(1, 0, gradient_clip_norm=gradient_clip_norm,
                                    schedule=learning_rate_sched)

# configure callbacks
callbacks = Callbacks(model, eval_set=valid_set, **args.callback_args)

# train model
model.fit(train_set, optimizer=optimizer, num_epochs=args.epochs, cost=cost,
          callbacks=callbacks)
args = parser.parse_args()

# hyperparameters
if args.datatype in [np.float16]:
    cost_scale = 10.

num_epochs = args.epochs

(X_train, y_train), (X_test, y_test), nclass = load_cifar10(path=args.data_dir)

train = ArrayIterator(X_train, y_train, nclass=nclass, lshape=(3, 32, 32))
test = ArrayIterator(X_test, y_test, nclass=nclass, lshape=(3, 32, 32))

init_uni = Uniform(low=-0.1, high=0.1)

if args.datatype in [np.float32, np.float64]:
    opt_gdm = GradientDescentMomentum(learning_rate=0.01,
                                      momentum_coef=0.9,
                                      stochastic_round=args.rounding)
elif args.datatype in [np.float16]:
    opt_gdm = GradientDescentMomentum(learning_rate=0.01 / cost_scale,
                                      momentum_coef=0.9,
                                      stochastic_round=args.rounding)

bn = True
layers = [Conv((5, 5, 16), init=init_uni, activation=Rectlin(), batch_norm=bn),
          Pooling((2, 2)),
          Conv((5, 5, 32), init=init_uni, activation=Rectlin(), batch_norm=bn),
          Pooling((2, 2)),
          Affine(nout=500, init=init_uni, activation=Rectlin(), batch_norm=bn),
          Affine(nout=10, init=init_uni, activation=Softmax())]
                            batch_norm=True,
                            activation=Rectlin()))
    layers.append(Affine(1, init=Kaiming(local=False), activation=Logistic()))

    return Model(layers=layers), GeneralizedCost(costfunc=CrossEntropyBinary())


lunaModel, cost = create_network(args.depth)

modelFileName = 'LUNA16_resnet.prm'

# If the model file exists, then load it and start from there.
# if (os.path.isfile(modelFileName)):
#     lunaModel = Model(modelFileName)

weight_sched = Schedule([30, 60], 0.1)
opt = GradientDescentMomentum(0.1, 0.9, wdecay=0.0001, schedule=weight_sched)

# configure callbacks
if args.callback_args['eval_freq'] is None:
    args.callback_args['eval_freq'] = 1

callbacks = Callbacks(lunaModel, eval_set=valid_set, **args.callback_args)
# add a callback that saves the best model state
callbacks.add_save_best_state_callback(modelFileName)

lunaModel.fit(train_set, optimizer=opt, num_epochs=num_epochs, cost=cost,
              callbacks=callbacks)
Affine(nout=16, linear_name="b1_l1", **normrelu), Affine(nout=10, linear_name="b1_l2", **normsigm)] p3 = [b2, Affine(nout=16, linear_name="b2_l1", **normrelu), Affine(nout=10, linear_name="b2_l2", **normsigm)] # setup cost function as CrossEntropy cost = Multicost(costs=[GeneralizedCost(costfunc=CrossEntropyMulti()), GeneralizedCost(costfunc=CrossEntropyBinary()), GeneralizedCost(costfunc=CrossEntropyBinary())], weights=[1, 0., 0.]) # setup optimizer optimizer = GradientDescentMomentum(0.1, momentum_coef=0.9, stochastic_round=args.rounding) # initialize model object alphas = [1, 0.25, 0.25] mlp = Model(layers=Tree([p1, p2, p3], alphas=alphas)) # setup standard fit callbacks callbacks = Callbacks(mlp, train_set, eval_set=valid_set, **args.callback_args) # run fit mlp.fit(train_set, optimizer=optimizer, num_epochs=args.epochs, cost=cost, callbacks=callbacks) logging.getLogger('neon').info("Misclassification error = %.1f%%", (mlp.eval(valid_set, metric=Misclassification())*100)) print('Misclassification error = %.1f%%' % (mlp.eval(valid_set, metric=Misclassification())*100))
                    help='subset of training dataset to use (percentage)')
args = parser.parse_args()

model, cost = create_network()
rseed = 0 if args.rng_seed is None else args.rng_seed

# setup data provider
assert 'train' in args.manifest, "Missing train manifest"
assert 'val' in args.manifest, "Missing validation manifest"
train = make_alexnet_train_loader(args.manifest['train'], args.manifest_root,
                                  model.be, args.subset_pct, rseed)
valid = make_validation_loader(args.manifest['val'], args.manifest_root,
                               model.be, args.subset_pct)

sched_weight = Schedule([10], change=0.1)
opt = GradientDescentMomentum(0.01, 0.9, wdecay=0.0005, schedule=sched_weight)

# configure callbacks
valmetric = TopKMisclassification(k=5)
callbacks = Callbacks(model, eval_set=valid, metric=valmetric, **args.callback_args)

if args.deconv:
    callbacks.add_deconv_callback(train, valid)

model.fit(train, optimizer=opt, num_epochs=args.epochs, cost=cost,
img_set_options = dict(repo_dir=args.data_dir, inner_size=224,
                       dtype=args.datatype, subset_pct=100)
train = img_provider(set_name='train', **img_set_options)
test = img_provider(set_name='validation', do_transforms=False, **img_set_options)
train.init_batch_provider()
test.init_batch_provider()

relu = Rectlin()
init_uni = GlorotUniform()

# The parameters below are straight out of [Springenberg2014]
opt_gdm = GradientDescentMomentum(learning_rate=0.01,
                                  schedule=Schedule(step_config=[10], change=0.1),
                                  momentum_coef=0.9, wdecay=.0005)

# set up model layers
layers = []
layers.append(DataTransform(transform=Normalizer(divisor=128.)))

layers.append(Conv((11, 11, 96), init=init_uni, activation=relu, strides=4, padding=1))
layers.append(Conv((1, 1, 96), init=init_uni, activation=relu, strides=1))
layers.append(Conv((3, 3, 96), init=init_uni, activation=relu, strides=2, padding=1))  # 54->27
layers.append(Conv((5, 5, 256), init=init_uni, activation=relu, strides=1))  # 27->23
layers.append(Conv((1, 1, 256), init=init_uni, activation=relu, strides=1))
layers.append(Conv((3, 3, 256), init=init_uni, activation=relu, strides=2, padding=1))  # 23->12
# setup training dataset
train_set = PASCALVOCTrain('trainval', '2007', path=args.data_dir, n_mb=n_mb,
                           img_per_batch=img_per_batch, rois_per_img=rois_per_img,
                           rois_random_sample=True,
                           add_flipped=False, subset_pct=args.subset_pct)
test_set = PASCALVOCTrain('test', '2007', path=args.data_dir, n_mb=n_mb,
                          img_per_batch=img_per_batch, rois_per_img=rois_per_img,
                          rois_random_sample=True, add_flipped=False)

# setup model
model = create_frcn_model(frcn_fine_tune)

# setup optimizer
opt_w = GradientDescentMomentum(0.001 * learning_rate_scale, 0.9, wdecay=0.0005)
opt_b = GradientDescentMomentum(0.002 * learning_rate_scale, 0.9)
optimizer = MultiOptimizer({'default': opt_w, 'Bias': opt_b})

# if training a new model, seed the image model conv layers with pre-trained weights
# otherwise, just load the model file
if args.model_file is None:
    load_vgg_weights(model, args.data_dir)

cost = Multicost(costs=[GeneralizedCostMask(costfunc=CrossEntropyMulti()),
                        GeneralizedCostMask(costfunc=SmoothL1Loss())],
                 weights=[1, 1])

callbacks = Callbacks(model, eval_set=test_set, **args.callback_args)