def create_loss_and_learner(model, labels, learning_rate,
                            momentum_coef=0.0, wdecay=0.0, nesterov=False,
                            gradient_clip_norm=None, gradient_clip_value=None):
    """
    Auxiliary function to create loss function (cross entropy and softmax)
    and trainer using stochastic gradient descent with momentum.

    Arguments:
        model - imported model
        labels - placeholder for one-hot labels array
        learning_rate - learning rate for trainer
        momentum_coef - coefficient of momentum (default 0.0)
        wdecay - amount of weight decay (default 0.0)
        nesterov - use nesterov accelerated gradient (default False)
        gradient_clip_norm - target gradient norm (default None)
        gradient_clip_value - value to element-wise clip gradients (default None)

    Returns:
        Loss function (mean for batch)
    """
    # Align the label tensor with the model output: transpose if the axis
    # lengths disagree, then cast model axes so both operands share axes.
    if model.axes.lengths != labels.axes.lengths:
        labels = ng.Transpose(labels)
    assert model.axes.lengths == labels.axes.lengths
    model = ng.cast_axes(model, axes=labels.axes)

    # Softmax + cross-entropy loss over the (now axis-compatible) outputs.
    loss = ng.cross_entropy_multi(ng.softmax(model), labels)
    optimizer = GradientDescentMomentum(learning_rate, momentum_coef, wdecay,
                                        gradient_clip_norm, gradient_clip_value,
                                        nesterov)
    # Sequential op: apply the optimizer update, then return the mean loss
    # (out_axes=() reduces over all axes to a scalar).
    return ng.sequential([optimizer(loss), ng.mean(loss, out_axes=())])
def make_optimizer(name=None):
    """Build a plain SGD optimizer (momentum and weight decay disabled).

    The learning rate is selected from the module-level
    `minibatch_discrimination` flag: 0.005 when it is truthy, 0.03 otherwise.

    Arguments:
        name: optional name forwarded to the optimizer.

    Returns:
        A GradientDescentMomentum instance.
    """
    if minibatch_discrimination:
        lr = 0.005
    else:
        lr = 0.03
    return GradientDescentMomentum(lr,
                                   momentum_coef=0.0,
                                   wdecay=0.0,
                                   gradient_clip_norm=None,
                                   gradient_clip_value=None,
                                   name=name)
def run_mini_ds2_benchmark(args, **kwargs):
    """Benchmark a mini DeepSpeech2 model in fprop-only or full-bprop mode.

    Builds the model from synthetic data, then times either the training
    step (CTC loss + SGD-with-momentum update) or the bare forward pass,
    depending on args.bprop. Results are printed via Benchmark.

    Arguments:
        args - parsed command-line namespace (sizes, device, iteration counts)
        kwargs - may carry 'device_id' for multi-device (HeTr) placement
    """
    device_id = kwargs.get('device_id')
    # Synthetic dataset sized from the command-line arguments.
    inputs, train_set, eval_set = generate_ds2_data(args.max_length, args.str_w, args.nout,
                                                    args.nbands, args.batch_size,
                                                    args.num_iterations)
    model_out = get_mini_ds2(inputs, args.nfilters, args.filter_width, args.str_w,
                             args.nbands, args.depth, args.hidden_size,
                             args.batch_norm, args.hetr_device, device_id)

    if args.bprop:
        # Training benchmark: CTC loss + optimizer update, timed end to end.
        with ng.metadata(device=args.hetr_device, device_id=device_id, parallel=ax.N):
            # NOTE(review): audio_length/trans_length are passed unflattened
            # here, while other ctc call sites flatten them — confirm ng.ctc
            # accepts both forms.
            loss = ng.ctc(model_out, ng.flatten(inputs["char_map"]),
                          inputs["audio_length"], inputs["trans_length"])

            optimizer = GradientDescentMomentum(learning_rate=2e-5,
                                                momentum_coef=0.99,
                                                gradient_clip_norm=400,
                                                nesterov=args.nesterov)

            updates = optimizer(loss)
            # Apply updates first, then report the scalar mean loss.
            mean_cost = ng.sequential([updates, ng.mean(loss, out_axes=())])
            bprop_computation_op = ng.computation(mean_cost, "all")

            benchmark = Benchmark(bprop_computation_op, train_set, inputs,
                                  args.backend, args.hetr_device)
            Benchmark.print_benchmark_results(
                benchmark.time(args.num_iterations, args.skip_iter,
                               'ds2_bprop', args.visualize, preprocess=True))
    else:
        # Forward-only benchmark.
        fprop_computation_op = ng.computation(model_out, "all")

        benchmark_fprop = Benchmark(fprop_computation_op, train_set, inputs,
                                    args.backend, args.hetr_device)
        Benchmark.print_benchmark_results(
            benchmark_fprop.time(args.num_iterations, args.skip_iter,
                                 'ds2_fprop', args.visualize, preprocess=True))
def test_gdm(random_learning_rate, random_momentum_coef, wdecay, nesterov):
    """Compare GradientDescentMomentum against the reference implementation.

    Both optimizers are constructed with identical (randomized) hyperparameters
    and then handed to compare_optimizer for the actual check.
    """
    hyperparams = dict(learning_rate=random_learning_rate,
                       momentum_coef=random_momentum_coef,
                       wdecay=wdecay,
                       nesterov=nesterov)

    reference = GDMReference(**hyperparams)
    under_test = GradientDescentMomentum(**hyperparams)

    # test baseline against reference
    compare_optimizer(under_test, reference)
def test_gdm(random_learning_rate, random_momentum_coef, wdecay, nesterov,
             transformer_factory):
    """Run 20 GDM updates through a transformer and compare weights against
    a pure-numpy reference optimizer after every step.
    """
    # Setup the baseline and reference optimizers to be tested
    gdm_args = {'learning_rate': random_learning_rate,
                'momentum_coef': random_momentum_coef,
                'wdecay': wdecay,
                'nesterov': nesterov}

    gdm_reference = GDMReference(**gdm_args)
    gdm = GradientDescentMomentum(**gdm_args)

    # Set up data placeholders
    C = ng.make_axis(20)
    N = ng.make_axis(32, name='N')

    data = ng.placeholder([C, N])
    target = ng.placeholder([N])

    # params to be updated using GDM
    np_W = np.random.rand(C.length)
    W = ng.variable([C], initial_value=np_W)

    # Set up op graph: linear cost so the gradient w.r.t. W is analytic.
    # NOTE(review): keyword is `out_axis=()` here while other blocks use
    # `out_axes=()` — confirm against the ng.sum signature in use.
    cost = ng.sum(target - ng.dot(W, data), out_axis=())

    # Evaluating updated_weights applies the update, then yields W.
    updated_weights = ng.sequential([gdm(cost), W])

    def data_generator(iteration_count):
        # Random (data, target) pairs shaped to fill the placeholders.
        for i in range(iteration_count):
            yield (np.random.rand(C.length, N.length).astype('float32'),
                   np.random.rand(N.length).astype('float32'))

    # Set up the computation and run the "train" loop
    with ExecutorFactory() as ex:
        gdm_baseline = ex.transformer.computation(updated_weights, data, target)

        mock_dataset = data_generator(20)

        for x, y in mock_dataset:
            ng_W = gdm_baseline(x, y)  # updated weights for ngraph optimizer
            np_W = gdm_reference(x, np_W)  # updated weights for reference optimizer

            ng.testing.assert_allclose(np_W, ng_W, rtol=1e-3)
def run_resnet_benchmark(dataset, num_iterations, n_skip, batch_size, device_id, transformer_type, device, bprop=True, batch_norm=False, visualize=False, stage_depth=1): inputs, data, train_set = get_fake_data(dataset, batch_size, num_iterations) # Running forward propagation model_out = get_mini_resnet(inputs, dataset, device, device_id, batch_norm=batch_norm, stage_depth=stage_depth) # Running back propagation if bprop: with ng.metadata(device=device, device_id=device_id, parallel=ax.N): optimizer = GradientDescentMomentum(0.01, 0.9) train_loss = ng.cross_entropy_multi( model_out, ng.one_hot(inputs['label'], axis=ax.Y)) batch_cost = ng.sequential( [optimizer(train_loss), ng.mean(train_loss, out_axes=())]) batch_cost_computation_op = ng.computation(batch_cost, "all") benchmark = Benchmark(batch_cost_computation_op, train_set, inputs, transformer_type, device) Benchmark.print_benchmark_results( benchmark.time(num_iterations, n_skip, dataset + '_msra_bprop', visualize, 'device_id')) else: fprop_computation_op = ng.computation(model_out, 'all') benchmark = Benchmark(fprop_computation_op, train_set, inputs, transformer_type, device) Benchmark.print_benchmark_results( benchmark.time(num_iterations, n_skip, dataset + '_msra_fprop', visualize))
def run_cifar_benchmark(n_iter=10, n_skip=5, batch_size=4,
                        transformer_type='cpu'):
    """Benchmark one CIFAR mini-ResNet training step on fake data.

    The timed computation includes the optimizer update (bprop), not just the
    forward pass, so the result is labeled '_msra_bprop' to match the naming
    used by the other resnet benchmark helpers in this codebase.

    Arguments:
        n_iter - number of timed iterations
        n_skip - warm-up iterations excluded from the stats
        batch_size - minibatch size for the fake data
        transformer_type - transformer backend name (e.g. 'cpu')
    """
    inputs, data, train_set = get_fake_cifar(batch_size, n_iter)
    model = get_mini_resnet(inputs)
    optimizer = GradientDescentMomentum(0.01, 0.9)

    train_loss = ng.cross_entropy_multi(model(inputs['image']),
                                        ng.one_hot(inputs['label'], axis=ax.Y))

    # Apply the optimizer update, then report the scalar mean loss — this is
    # a full training step, i.e. fprop + bprop + weight update.
    batch_cost = ng.sequential(
        [optimizer(train_loss), ng.mean(train_loss, out_axes=())])
    batch_cost_computation_op = ng.computation(batch_cost, "all")

    feed_dict = fill_feed_dict(train_set, inputs)
    benchmarks = dict()
    # Fixed mislabel: this computation performs backprop, so it was wrongly
    # reported under a '..._fprop' key.
    benchmarks['cifar_msra_bprop'] = run_benchmark(batch_cost_computation_op,
                                                   transformer_type, feed_dict,
                                                   n_skip, n_iter)
    print_benchmark_results(benchmarks)
def run_resnet_benchmark(dataset, n_iter, n_skip, batch_size, device_id,
                         transformer_type, device, bprop=False, visualize=False):
    """Benchmark a mini-ResNet on fake data.

    Always times the forward pass; additionally times the full training step
    (loss + optimizer update) when bprop is True.
    """
    inputs, data, train_set = get_fake_data(dataset, batch_size, n_iter)
    model_out = get_mini_resnet(inputs, dataset, device_id)

    # Running forward propagation
    fprop_computation_op = ng.computation(model_out, 'all')

    benchmark_fprop = Benchmark(fprop_computation_op, train_set, inputs,
                                transformer_type, device)
    Benchmark.print_benchmark_results(benchmark_fprop.time(
        n_iter, n_skip, dataset + '_msra_fprop', visualize))

    # Running back propagation
    if bprop:
        optimizer = GradientDescentMomentum(0.01, 0.9)
        train_loss = ng.cross_entropy_multi(
            model_out, ng.one_hot(inputs['label'], axis=ax.Y))

        # Update weights, then report the scalar mean loss.
        batch_cost = ng.sequential(
            [optimizer(train_loss), ng.mean(train_loss, out_axes=())])
        batch_cost_computation_op = ng.computation(batch_cost, "all")

        benchmark = Benchmark(batch_cost_computation_op, train_set, inputs,
                              transformer_type, device)
        Benchmark.print_benchmark_results(benchmark.time(
            n_iter, n_skip, dataset + '_msra_bprop', visualize))
# we need to ask the dataset to create an iteration # placeholder for our learning rate schedule inputs = train_set.make_placeholders(include_iteration=True) ax.Y.length = 10 resnet = residual_network(args.stage_depth) learning_rate_policy = { 'name': 'schedule', 'schedule': [32000, 48000], 'gamma': 0.1, 'base_lr': 0.1 } optimizer = GradientDescentMomentum(learning_rate=learning_rate_policy, momentum_coef=0.9, wdecay=0.0001, iteration=inputs['iteration']) label_indices = inputs['label'] train_loss = ng.cross_entropy_multi(resnet(inputs['image']), ng.one_hot(label_indices, axis=ax.Y)) batch_cost = ng.sequential( [optimizer(train_loss), ng.mean(train_loss, out_axes=())]) train_computation = ng.computation(batch_cost, "all") with Layer.inference_mode_on(): inference_prob = resnet(inputs['image']) errors = ng.not_equal(ng.argmax(inference_prob, out_axes=[ax.N]), label_indices) eval_loss = ng.cross_entropy_multi( inference_prob, ng.one_hot(label_indices, axis=ax.Y))
def train_mnist_mlp(transformer_name, data_dir=None, rng_seed=12,
                    batch_size=128, train_iter=10, eval_iter=10):
    """Train a small MLP on MNIST and evaluate it.

    Arguments:
        transformer_name - 'cpu' or 'hetr' backend
        data_dir - optional MNIST download/cache directory
        rng_seed - numpy RNG seed for reproducibility
        batch_size - minibatch size
        train_iter - number of training minibatches
        eval_iter - number of evaluation minibatches

    Returns:
        (train_costs, ce_loss): per-step training costs and per-step mean
        cross-entropy losses on the validation set.
    """
    assert transformer_name in ['cpu', 'hetr']
    assert isinstance(rng_seed, int)

    # Apply this metadata to graph regardless of transformer,
    # but it is ignored for non-HeTr case
    hetr_device_ids = (0, 1)

    # use consistent rng seed between runs
    np.random.seed(rng_seed)

    # Data
    train_data, valid_data = MNIST(path=data_dir).load_data()
    train_set = ArrayIterator(train_data, batch_size,
                              total_iterations=train_iter)
    valid_set = ArrayIterator(valid_data, batch_size)
    inputs = train_set.make_placeholders()
    ax.Y.length = 10

    # Model
    with ng.metadata(device_id=hetr_device_ids, parallel=ax.N):
        seq1 = Sequential([Preprocess(functor=lambda x: x / 255.),
                           Affine(nout=100,
                                  weight_init=GaussianInit(),
                                  activation=Rectlin()),
                           Affine(axes=ax.Y,
                                  weight_init=GaussianInit(),
                                  activation=Logistic())])

        train_prob = seq1(inputs['image'])
        train_loss = ng.cross_entropy_binary(
            train_prob, ng.one_hot(inputs['label'], axis=ax.Y))

        optimizer = GradientDescentMomentum(0.1, 0.9)
        # Apply update, then yield scalar mean loss.
        batch_cost = ng.sequential(
            [optimizer(train_loss), ng.mean(train_loss, out_axes=())])
        train_outputs = dict(batch_cost=batch_cost)

    # Evaluation graph in inference mode (no training-only layer behavior).
    with Layer.inference_mode_on():
        inference_prob = seq1(inputs['image'])
        errors = ng.not_equal(ng.argmax(inference_prob, out_axes=[ax.N]),
                              inputs['label'])
        eval_loss = ng.cross_entropy_binary(
            inference_prob, ng.one_hot(inputs['label'], axis=ax.Y))
        eval_outputs = dict(cross_ent_loss=eval_loss, misclass_pct=errors)

    # Runtime
    with closing(ngt.make_transformer_factory(transformer_name)()) as transformer:
        train_computation = make_bound_computation(transformer,
                                                   train_outputs, inputs)
        loss_computation = make_bound_computation(transformer,
                                                  eval_outputs, inputs)

        # Training loop: record the per-minibatch cost.
        train_costs = list()
        for step in range(train_iter):
            out = train_computation(next(train_set))
            train_costs.append(float(out['batch_cost']))

        # Evaluation loop: record mean cross-entropy per minibatch.
        ce_loss = list()
        for step in range(eval_iter):
            out = loss_computation(next(valid_set))
            ce_loss.append(np.mean(out['cross_ent_loss']))

    return train_costs, ce_loss
def test_gdm(args, transformer_factory):
    """
    Test the ngraph GradientDescentMomentum against the neon version across
    20 update steps (one per generated minibatch).
    """
    # set up parameters
    C = ng.make_axis(20, name="C")
    N = ng.make_axis(32, name="N", batch=True)

    be = gen_backend(backend='cpu', batch_size=N.length)

    # restrict to numpy transformer for now
    factory = ngt.make_transformer_factory('numpy')
    ngt.set_transformer_factory(factory)
    ngt.make_transformer()

    # generate dummy data (to initialize values)
    w_init = np.random.rand(C.length).astype('float32')

    # set up nervana graph
    X = ng.placeholder([C, N]).named('X')
    Y = ng.placeholder([N]).named('Y')
    # C - 1 gives the dual axis required for the ng.dot contraction below.
    W = ng.variable([C - 1], initial_value=w_init).named('W')

    ex = ExecutorFactory()
    transformer = ex.transformer

    lrate, mom, wdecay = args
    gdm = GradientDescentMomentum(learning_rate=lrate,
                                  momentum_coef=mom,
                                  wdecay=wdecay)
    # Linear cost so the gradient w.r.t. W is analytic (see `dw` below).
    cost = ng.sum(Y - ng.dot(W, X), out_axis=())

    # to call ngraph gdm, use (ngraph_W, _) = ngraph_optimize(x, y)
    # where (x, y) are nparrays that fill the placeholders X and Y
    updates = gdm(cost)
    ngraph_optimize = transformer.computation([W, updates], X, Y)
    transformer.initialize()

    # set up the neon gdm
    neon_gdm = NeonGradientDescentMomentum(learning_rate=lrate,
                                           momentum_coef=mom,
                                           wdecay=wdecay)
    # dev_v0 = be.zeros((C.length, 1))  # velocities are zero at the beginning
    dev_dw = be.zeros((C.length, 1))  # we fill the gradient info in the below
    dev_w_init = be.array(w_init)  # copy w_init to device
    # neon expects [((param, grad), states), ...]
    param_list = [((dev_w_init, dev_dw), [])]

    # store the weights with each minibatch for debugging
    ng_Ws = []
    be_Ws = []

    # run for 20 minibatches
    for i, (x, y) in enumerate([generate_data(C.length, N.length)
                                for _ in range(20)]):
        # obtain ngraph results
        (ng_W, _) = ngraph_optimize(x, y)
        ng_Ws.append(copy.deepcopy(ng_W))

        # obtain neon results
        dw = -1 * x.sum(axis=1)  # the gradients we compute analytically
        param_list[0][0][1].set(dw)  # fill the gradient

        neon_gdm.optimize([DummyLayer(param_list)], epoch=0)
        (param, grad), states = param_list[0]
        be_W = param.get()[:, 0]
        be_Ws.append(be_W)

        # weights must match step-by-step, within float tolerance
        np.testing.assert_allclose(be_W, ng_W, rtol=1e-3)
Affine(nout=500, weight_init=init_uni, activation=Rectlin()), Affine(axes=ax.Y, weight_init=init_uni, activation=Softmax()) ]) ###################### # Input specification ax.C.length, ax.H.length, ax.W.length = train_set.shapes['image'] ax.D.length = 1 ax.N.length = args.batch_size ax.Y.length = 10 # placeholders with descriptive names inputs = dict(image=ng.placeholder([ax.C, ax.H, ax.W, ax.N]), label=ng.placeholder([ax.N])) optimizer = GradientDescentMomentum(0.01, 0.9) output_prob = seq1.train_outputs(inputs['image']) errors = ng.not_equal(ng.argmax(output_prob, out_axes=[ax.N]), inputs['label']) loss = ng.cross_entropy_multi(output_prob, ng.one_hot(inputs['label'], axis=ax.Y)) mean_cost = ng.mean(loss, out_axes=()) updates = optimizer(loss) train_outputs = dict(batch_cost=mean_cost, updates=updates) loss_outputs = dict(cross_ent_loss=loss, misclass_pct=errors) # Now bind the computations we are interested in transformer = ngt.make_transformer() train_computation = make_bound_computation(transformer, train_outputs, inputs) loss_computation = make_bound_computation(transformer, loss_outputs, inputs)
activation=Rectlin()), Affine(axes=ax.Y, weight_init=GaussianInit(var=0.01), bias_init=init, activation=Softmax()) ]) # Learning rate change based on schedule from learning_rate_policies.py lr_schedule = { 'name': 'schedule', 'base_lr': 0.01, 'gamma': (1 / 250.)**(1 / 3.), 'schedule': [22, 44, 65] } optimizer = GradientDescentMomentum(lr_schedule, 0.0, wdecay=0.0005, iteration=inputs['iteration']) train_prob = seq1(inputs['image']) train_loss = ng.cross_entropy_multi(train_prob, ng.one_hot(inputs['label'], axis=ax.Y)) batch_cost = ng.sequential( [optimizer(train_loss), ng.mean(train_loss, out_axes=())]) train_computation = ng.computation(batch_cost, "all") with closing(ngt.make_transformer()) as transformer: train_function = transformer.add_computation(train_computation) if args.no_progress_bar: ncols = 0 else:
train = seq1(input_ops_train['image'])
tb = TensorBoard("/tmp/")
tb.add_graph(train)
# NOTE(review): this exit() terminates the process after dumping the graph
# to TensorBoard — everything below is unreachable. Looks like leftover
# debugging; confirm whether it should be removed.
exit()

# Learning Rate Placeholder
lr_ph = ng.placeholder(axes=(), initial_value=base_lr)

# Optimizer
# Provided learning policy takes learning rate as input to graph using a placeholder.
# This allows you to control learning rate based on various factors of network
learning_rate_policy = {'name': 'provided', 'lr_placeholder': lr_ph}

optimizer = GradientDescentMomentum(learning_rate=learning_rate_policy,
                                    momentum_coef=momentum_coef,
                                    wdecay=wdecay,
                                    nesterov=False,
                                    iteration=input_ops_train['iteration'])

# Make a prediction
prediction = resnet(input_ops_train['image'])

# Calculate loss
train_loss = ng.cross_entropy_multi(
    prediction, ng.one_hot(input_ops_train['label'], axis=ax.Y))

# Average loss over the batch
batch_cost = ng.sequential(
    [optimizer(train_loss), ng.mean(train_loss, out_axes=())])
train_computation = ng.computation(batch_cost, "all")

# Instantiate the Saver object to save weights
weight_saver = Saver()
args.filter_width, args.str_w, nbands, args.depth, args.hidden_size, batch_norm=args.batch_norm) output = ds2(inputs["audio"], spatial_axes={"H": "frequency", "W": "time"}) # set up ctc loss loss = ng.ctc(output, ng.flatten(inputs["char_map"]), ng.flatten(inputs["audio_length"]), ng.flatten(inputs["char_map_length"])) optimizer = GradientDescentMomentum( args.lr, momentum_coef=args.momentum, gradient_clip_norm=args.gradient_clip_norm, nesterov=args.nesterov) start = time.time() updates = optimizer(loss) stop = time.time() logger.debug("Optimizer graph creation took {} seconds".format(stop - start)) mean_cost = ng.sequential([updates, ng.mean(loss, out_axes=())]) # Create computation and initialize the transformer to allocate weights train_computation = ng.computation([mean_cost, output], "all") if inference is True: with Layer.inference_mode_on(): eval_output = ds2(inputs["audio"],
'W': 'REC' })] + [affine_layer]) elif args.modeltype == "LSTM": model = Sequential( recurrent_model.define_model(out_axis, celltype=args.modeltype, recurrent_units=hidden_sizes, return_sequence=True).layers + [Logistic()]) # Optimizer if args.modeltype == "TCN": optimizer = Adam(learning_rate=args.lr, gradient_clip_value=args.grad_clip_value) else: optimizer = GradientDescentMomentum( learning_rate=args.lr, gradient_clip_value=args.grad_clip_value) # Define the loss function (categorical cross entropy, since each musical key on the piano is encoded as a binary value) fwd_prop = model(inputs['X']) fwd_prop = ng.axes_with_order(fwd_prop, out_axes) train_loss = ng.cross_entropy_binary(fwd_prop, inputs['y']) with Layer.inference_mode_on(): preds = model(inputs['X']) preds = ng.axes_with_order(preds, out_axes) eval_loss = ng.mean(ng.cross_entropy_binary(preds, inputs['y']), out_axes=()) eval_computation = ng.computation([eval_loss], "all") predict_computation = ng.computation([preds], "all") # Cost calculation batch_cost = ng.sequential(
# Optimizer # Initial learning rate is 0.01 (base_lr) # At iteration (num_iterations // 75), lr is multiplied by gamma (new lr = .95 * .01) # At iteration (num_iterations * 2 // 75), it is reduced by gamma again # So on.. no_steps = 75 step = num_iterations // no_steps schedule = list(np.arange(step, num_iterations, step)) learning_rate_policy = { 'name': 'schedule', 'schedule': schedule, 'gamma': 0.95, 'base_lr': 0.01 } optimizer = GradientDescentMomentum(learning_rate=learning_rate_policy, iteration=inputs['iteration']) # Define the loss function (Cross entropy loss) # Note that we convert the integer values of input['y'] to one hot here fwd_prop = seq1(inputs['X']) train_loss = ng.cross_entropy_multi(fwd_prop, ng.one_hot(inputs['y'], axis=out_axis), usebits=True) # Train cost computation batch_cost = ng.sequential( [optimizer(train_loss), ng.mean(train_loss, out_axes=())]) train_computation = ng.computation([batch_cost, fwd_prop], "all") train_outputs = dict(batch_cost=batch_cost) # Forward prop of evaluation set
def train_network(model, train_set, valid_set, batch_size, epochs, log_file):
    '''
    Trains the predefined network. Trains the model and saves the progress in
    the log file that is defined in the arguments

    model(object): Defines the model in Neon
    train_set(object): Defines the training set
    valid_set(object): Defines the validation set
    batch_size(int): Minibatch size
    epochs(int): Number of training epochs
    log_file(string): File name to store training logs for plotting
    '''

    # Form placeholders for inputs to the network
    # Iterations needed for learning rate schedule
    inputs = train_set.make_placeholders(include_iteration=True)

    # Convert labels into one-hot vectors
    one_hot_label = ng.one_hot(inputs['label'], axis=ax.Y)

    # LR decays by `gamma` every 2 epochs (the 'iteration' fed below is the
    # epoch index, so the schedule is epoch-based).
    learning_rate_policy = {'name': 'schedule',
                            'schedule': list(np.arange(2, epochs, 2)),
                            'gamma': 0.6,
                            'base_lr': 0.001}

    optimizer = GradientDescentMomentum(learning_rate=learning_rate_policy,
                                        momentum_coef=0.9,
                                        wdecay=0.005,
                                        iteration=inputs['iteration'])

    # Define graph for training
    train_prob = model(inputs['video'])
    train_loss = ng.cross_entropy_multi(train_prob, one_hot_label)
    batch_cost = ng.sequential(
        [optimizer(train_loss), ng.mean(train_loss, out_axes=())])

    with closing(ngt.make_transformer()) as transformer:

        # Define graph for calculating validation set error and misclassification rate
        # Use inference mode for validation to avoid dropout in forward pass
        with Layer.inference_mode_on():
            inference_prob = model(inputs['video'])
            errors = ng.not_equal(ng.argmax(inference_prob), inputs['label'])
            eval_loss = ng.cross_entropy_multi(inference_prob, one_hot_label)
            eval_outputs = {'cross_ent_loss': eval_loss, 'misclass': errors}

            eval_computation = make_bound_computation(transformer,
                                                      eval_outputs, inputs)

        train_outputs = {'batch_cost': batch_cost}
        train_computation = make_bound_computation(transformer,
                                                   train_outputs, inputs)

        interval_cost = 0.0

        # Train in epochs
        logs = {'train': [], 'validation': [], 'misclass': []}
        for epoch in trange(epochs, desc='Epochs'):

            # Setup the training bar
            numBatches = train_set.ndata // batch_size
            tpbar = tqdm(unit='batches', ncols=100, total=numBatches,
                         leave=False)

            train_set.reset()
            valid_set.reset()

            train_log = []
            for step, data in enumerate(train_set):
                data = dict(data)
                data['iteration'] = epoch  # learning schedule based on epochs
                output = train_computation(data)
                train_log.append(float(output['batch_cost']))

                tpbar.update(1)
                tpbar.set_description("Training {:0.4f}".format(
                    float(output['batch_cost'])))
                interval_cost += float(output['batch_cost'])
            # NOTE(review): averages by the last enumerate index, not the
            # batch count — off by one, and raises ZeroDivisionError if the
            # epoch has exactly one batch. Confirm intent.
            tqdm.write("Epoch {epch} complete. "
                       "Avg Train Cost {cost:0.4f}".format(
                           epch=epoch,
                           cost=interval_cost / step))
            interval_cost = 0.0
            tpbar.close()

            validation_loss = run_validation(valid_set, eval_computation)
            tqdm.write("Avg losses: {}".format(validation_loss))
            logs['train'].append(train_log)
            logs['validation'].append(validation_loss['cross_ent_loss'])
            logs['misclass'].append(validation_loss['misclass'])

            # Save log data and plot at the end of each epoch
            with open(log_file, 'wb') as f:
                pickle.dump(logs, f)
            plot_logs(logs=logs)
inputs = train_set.make_placeholders(include_iteration=args.use_lr_decay) ax.Y.length = args.num_classes layers = make_layers(args.use_large, dbpedia_dataset.vocab_size) seq = Sequential(layers) if args.use_lr_decay: lr_schedule = [(i + 1) * 3 * train_set.nbatches for i in range(10)] lr_policy = { 'name': 'schedule', 'base_lr': args.lr, 'schedule': lr_schedule, 'gamma': 0.5 } optimizer = GradientDescentMomentum(lr_policy, momentum_coef=args.momentum, iteration=inputs['iteration'], wdecay=args.weight_decay) else: optimizer = GradientDescentMomentum(args.lr, momentum_coef=args.momentum, wdecay=args.weight_decay) train_prob = seq(inputs['text']) train_loss = ng.cross_entropy_multi(train_prob, ng.one_hot(inputs['label'], axis=ax.Y)) batch_cost = ng.sequential( [optimizer(train_loss), ng.mean(train_loss, out_axes=())]) train_outputs = dict(batch_cost=batch_cost) with Layer.inference_mode_on():