def main():
    """Train a Penn Treebank RNN language model with SGD + gradient clipping."""
    import configurations
    from stream import DStream
    logger = logging.getLogger(__name__)
    cfig = getattr(configurations, 'get_config_penn')()

    rnnlm = Rnnlm(cfig['vocabsize'], cfig['nemb'], cfig['nhids'])
    rnnlm.weights_init = IsotropicGaussian(0.1)
    rnnlm.biases_init = Constant(0.)
    # Push the top-level initialization config down to children first, then
    # override the recurrent transition to use orthogonal weights.
    rnnlm.push_initialization_config()
    rnnlm.generator.transition.weights_init = Orthogonal()

    sentence = tensor.lmatrix('sentence')
    sentence_mask = tensor.matrix('sentence_mask')
    # Sum of per-token costs over the whole batch, then averaged per
    # sequence via the aggregation scheme (batch is the second axis).
    batch_cost = rnnlm.cost(sentence, sentence_mask).sum()
    batch_size = sentence.shape[1].copy(name='batch_size')
    cost = aggregation.mean(batch_cost, batch_size)
    cost.name = "sequence_log_likelihood"
    logger.info("Cost graph is built")

    model = Model(cost)
    parameters = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, value.get_value().shape)
                     for key, value in parameters.items()],
                    width=120))
    # Initialization must happen after the bricks are fully configured.
    for brick in model.get_top_bricks():
        brick.initialize()

    cg = ComputationGraph(cost)
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    monitored_vars = [cost, gradient_norm, step_norm]

    train_monitor = TrainingDataMonitoring(variables=monitored_vars,
                                           after_batch=True,
                                           before_first_epoch=True,
                                           prefix='tra')
    extensions = [train_monitor, Timing(), Printing(after_batch=True),
                  FinishAfter(after_n_epochs=1000),
                  Printing(every_n_batches=1)]

    train_stream = DStream(datatype='train', config=cfig)
    main_loop = MainLoop(model=model, data_stream=train_stream,
                         algorithm=algorithm, extensions=extensions)
    main_loop.run()
def attach_aggregation_schemes(variables):
    # Aggregation specification has to be factored out as a separate
    # function as it has to be applied at the very last stage
    # separately to training and validation observables.
    aggregated = []
    for variable in variables:
        name = variable.name
        if name == 'weights_penalty':
            aggregated.append(named_copy(
                aggregation.mean(variable, batch_size),
                'weights_penalty_per_recording'))
        elif name == 'weights_entropy':
            aggregated.append(named_copy(
                aggregation.mean(variable, recognizer.labels_mask.sum()),
                'weights_entropy_per_label'))
        else:
            # Anything else is monitored as-is.
            aggregated.append(variable)
    return aggregated
def apply(self, input_, application_call):
    """Add the brick's parameter V to the input.

    Also attaches auxiliary variables (for monitoring only; they do not
    change the returned value).
    """
    V = self.parameters[0]
    application_call.add_auxiliary_variable((V ** 2).sum(),
                                            name="V_squared")
    # Mean over rows, summed, normalized by the leading dimension.
    application_call.add_auxiliary_variable(
        mean(input_.mean(axis=1).sum(), input_.shape[0]),
        name="mean_row_mean")
    application_call.add_auxiliary_variable(input_.mean(),
                                            name="mean_batch_element")
    return input_ + V
def train(self):
    # Build the training stream, compile the Theano functions, and run the
    # Blocks MainLoop for EPOCHS epochs.
    print "Loading data"
    datafile = self.get_datafile()
    nbexamples = datafile.num_examples
    # Truncate so the example count divides evenly into
    # (sequence_dim * time_dim)-sized chunks.
    nbexamples -= nbexamples%(self.sequence_dim*self.time_dim)

    train_stream = ReshapeTransformer(
        DataStream(
            dataset=datafile,
            iteration_scheme=ShuffledBatchChunkScheme(
                nbexamples, self.sequence_dim*self.time_dim)),
        self.sequence_dim,
        self.time_dim)

    if self.image_size is not None:
        # Optionally add a derived 'spectrogram' source to the stream.
        train_stream = Mapping(train_stream, spec_mapping,
                               add_sources=['spectrogram'])

    print "Building Theano Graph"
    algorithm, self.fprop = self.build_theano_functions()

    main_loop = MainLoop(
        algorithm=algorithm,
        data_stream=train_stream,
        model=self.model,
        extensions=[
            FinishAfter(after_n_epochs=EPOCHS),
            TrainingDataMonitoring(
                [aggregation.mean(self.model.outputs[0])],
                prefix="train",
                after_epoch=True),
            Printing(),
            SaveParams(EXP_PATH+NAME, after_epoch=True)
        ])

    main_loop.run()
def build_model(alphabet_size, config):
    """Assemble a stacked-LSTM character generator and its cost.

    Returns ``(generator, cost)`` where the cost is expressed in bits
    per character.
    """
    num_layers = config['lstm_layers']
    init_width = config['lstm_init_width']
    dims = [config['lstm_dim_' + str(i)] for i in range(num_layers)]

    # One LSTM per configured layer; forget-gate biases start at 1.
    lstms = [LSTM(dim=d, use_bias=True,
                  weights_init=Uniform(width=init_width),
                  forget_init=Constant(1.))
             for d in dims]
    transition = RecurrentStack(lstms, name='transition')

    # The readout is fed by the topmost layer's states.
    readout = Readout(
        readout_dim=alphabet_size,
        source_names=['states#' + str(num_layers - 1)],
        emitter=SoftmaxEmitter(name='emitter'),
        feedback_brick=LookupFeedback(alphabet_size,
                                      feedback_dim=alphabet_size,
                                      name='feedback'),
        name='readout')
    generator = SequenceGenerator(readout=readout,
                                  transition=transition,
                                  weights_init=Uniform(width=init_width),
                                  biases_init=Constant(0),
                                  name='generator')
    generator.push_initialization_config()
    generator.initialize()

    x = tensor.lmatrix('features')
    mask = tensor.fmatrix('features_mask')
    cost_matrix = generator.cost_matrix(x, mask=mask)

    # Factor converting nats to bits.
    log2e = math.log(math.e, 2)
    if 'batch_length' in config:
        # Score only the non-overlapping tail of each batch window.
        length = config['batch_length'] - config['batch_overlap']
        cost = log2e * aggregation.mean(cost_matrix[:, -length:].sum(),
                                        mask[:, -length:].sum())
    else:
        cost = log2e * aggregation.mean(cost_matrix[:, :].sum(),
                                        mask[:, :].sum())
    cost.name = 'bits_per_character'
    return generator, cost
def cost(self, image_vects, chars):
    """Average generation cost of `chars` conditioned on image features."""
    # image_vects: shape (batch, features)
    context = self.image_embedding.apply(image_vects)
    total = self.generator.cost_matrix(chars, cnn_context=context).sum()
    # Normalize by chars.shape[1] — presumably the batch axis; confirm
    # against the caller's layout.
    return aggregation.mean(total, chars.shape[1])
def main(save_to, num_epochs, bokeh=False):
    """Train a 784-100-10 MLP on MNIST with plain SGD and L2 weight decay."""
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()

    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(x)
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost])
    # L2-penalize both weight matrices of the two-layer MLP.
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + .00005 * (W1 ** 2).sum() + .00005 * (W2 ** 2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST("train")
    mnist_test = MNIST("test")

    algorithm = GradientDescent(cost=cost, params=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))

    test_monitor = DataStreamMonitoring(
        [cost, error_rate],
        DataStream(mnist_test,
                   iteration_scheme=SequentialScheme(
                       mnist_test.num_examples, 500)),
        prefix="test")
    train_monitor = TrainingDataMonitoring(
        [cost, error_rate,
         aggregation.mean(algorithm.total_gradient_norm)],
        prefix="train", after_epoch=True)

    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs),
                  test_monitor,
                  train_monitor,
                  Checkpoint(save_to),
                  Printing()]
    if bokeh:
        # Live plotting against a running Bokeh server.
        extensions.append(Plot(
            'MNIST example',
            channels=[
                ['test_final_cost',
                 'test_misclassificationrate_apply_error_rate'],
                ['train_total_gradient_norm']]))

    main_loop = MainLoop(
        algorithm,
        DataStream(mnist_train,
                   iteration_scheme=SequentialScheme(
                       mnist_train.num_examples, 50)),
        model=Model(cost),
        extensions=extensions)
    main_loop.run()
def test_training_data_monitoring():
    """Check TrainingDataMonitoring's per-batch and per-epoch channels."""
    weights = numpy.array([-1, 1], dtype=theano.config.floatX)
    features = [numpy.array(f, dtype=theano.config.floatX)
                for f in [[1, 2], [3, 4], [5, 6]]]
    targets = [(weights * f).sum() for f in features]
    n_batches = 3
    dataset = IterableDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    V = shared_floatx(7, name='V')
    W_sum = named_copy(W.sum(), 'W_sum')
    cost = ((x * W).sum() - y) ** 2
    cost.name = 'cost'

    class TrueCostExtension(TrainingExtension):
        # Records the cost computed from W *before* the gradient step —
        # the value the monitoring extension should report for the batch.
        def before_batch(self, data):
            self.main_loop.log.current_row['true_cost'] = (
                ((W.get_value() * data["features"]).sum() -
                 data["targets"]) ** 2)

    main_loop = MainLoop(
        model=None, data_stream=dataset.get_example_stream(),
        algorithm=GradientDescent(cost=cost, params=[W],
                                  step_rule=Scale(0.001)),
        extensions=[
            FinishAfter(after_n_epochs=1),
            TrainingDataMonitoring([W_sum, cost, V], prefix="train1",
                                   after_batch=True),
            TrainingDataMonitoring([aggregation.mean(W_sum), cost],
                                   prefix="train2", after_epoch=True),
            TrueCostExtension()])
    main_loop.run()

    # Check monitoring of a shared variable
    assert_allclose(main_loop.log.current_row['train1_V'], 7.0)

    for i in range(n_batches):
        # The ground truth is written to the log before the batch is
        # processed, whereas the extension writes after the batch is
        # processed. This is why the iteration numbers differ here.
        assert_allclose(main_loop.log[i]['true_cost'],
                        main_loop.log[i + 1]['train1_cost'])

    # The per-epoch channels should equal the mean of per-batch values.
    assert_allclose(
        main_loop.log[n_batches]['train2_cost'],
        sum([main_loop.log[i]['true_cost']
             for i in range(n_batches)]) / n_batches)
    assert_allclose(
        main_loop.log[n_batches]['train2_W_sum'],
        sum([main_loop.log[i]['train1_W_sum']
             for i in range(1, n_batches + 1)]) / n_batches)
def main(save_to, num_epochs, batch_size):
    """Train a deep Tanh MLP on CIFAR-10 (flattened images) with Adam."""
    mlp = MLP([Tanh(), Tanh(), Tanh(), Softmax()],
              [3072, 4096, 1024, 512, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()

    x = tt.tensor4('features', dtype='float32')
    y = tt.vector('label', dtype='int32')

    # Flatten each image to a 3072-dim vector before the MLP.
    probs = mlp.apply(x.reshape((-1, 3072)))
    cost = CategoricalCrossEntropy().apply(y, probs)
    error_rate = MisclassificationRate().apply(y, probs)

    cg = ComputationGraph([cost])
    # Small L2 penalty applied to every weight matrix.
    ws = VariableFilter(roles=[WEIGHT])(cg.variables)
    weight_decay = sum((w ** 2).sum() for w in ws)
    cost = cost + .00005 * weight_decay
    cost.name = 'final_cost'

    train_dataset = Cifar10Dataset(
        data_dir='/home/belohlavek/data/cifar10', is_train=True)
    valid_dataset = Cifar10Dataset(
        data_dir='/home/belohlavek/data/cifar10', is_train=False)
    train_stream = train_dataset.get_stream(batch_size)
    valid_stream = valid_dataset.get_stream(batch_size)

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Adam(learning_rate=0.001))

    extensions = [
        Timing(),
        LogExtension('/home/belohlavek/ALI/mlp.log'),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate], valid_stream,
                             prefix="test"),
        TrainingDataMonitoring(
            [cost, error_rate,
             aggregation.mean(algorithm.total_gradient_norm)],
            prefix="train", after_epoch=True),
        Checkpoint(save_to),
        Printing(),
    ]

    main_loop = MainLoop(algorithm, train_stream,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
def build_and_run(label, config):
    # Build a convnet + MLP classifier from `config`, add L1/L2
    # regularization, train with Adam, and monitor/plot via Bokeh.
    ############## CREATE THE NETWORK ###############
    # Define the parameters
    num_epochs, num_batches, num_channels, image_shape, filter_size, \
        num_filter, pooling_sizes, mlp_hiddens, output_size, batch_size, \
        activation, mlp_activation = config['num_epochs'], \
        config['num_batches'], config['num_channels'], \
        config['image_shape'], config['filter_size'], \
        config['num_filter'], config['pooling_sizes'], \
        config['mlp_hiddens'], config['output_size'], \
        config['batch_size'], config['activation'], \
        config['mlp_activation']
    # print(num_epochs, num_channels, image_shape, filter_size, num_filter, pooling_sizes, mlp_hiddens, output_size, batch_size, activation, mlp_activation)
    lambda_l1 = 0.000025
    lambda_l2 = 0.000025
    print("Building model")
    # Create the symbolics variable
    x = T.tensor4('image_features')
    y = T.lmatrix('targets')
    # Get the parameters
    conv_parameters = zip(filter_size, num_filter)
    # Create the convolutions layers: conv -> activation -> pool, repeated.
    conv_layers = list(
        interleave([(Convolutional(filter_size=filter_size,
                                   num_filters=num_filter,
                                   name='conv_{}'.format(i))
                     for i, (filter_size, num_filter)
                     in enumerate(conv_parameters)),
                    (activation),
                    (MaxPooling(size, name='pool_{}'.format(i))
                     for i, size in enumerate(pooling_sizes))]))
    # (AveragePooling(size, name='pool_{}'.format(i)) for i, size in enumerate(pooling_sizes))]))
    # Create the sequence
    conv_sequence = ConvolutionalSequence(conv_layers, num_channels,
                                          image_size=image_shape,
                                          weights_init=Uniform(width=0.2),
                                          biases_init=Constant(0.))
    # Initialize the convnet
    conv_sequence.initialize()
    # Add the MLP
    top_mlp_dims = [np.prod(conv_sequence.get_dim('output'))
                    ] + mlp_hiddens + [output_size]
    out = Flattener().apply(conv_sequence.apply(x))
    mlp = MLP(mlp_activation, top_mlp_dims,
              weights_init=Uniform(0, 0.2),
              biases_init=Constant(0.))
    # Initialize the MLP
    mlp.initialize()
    # Get the output
    predict = mlp.apply(out)
    cost = CategoricalCrossEntropy().apply(y.flatten(),
                                           predict).copy(name='cost')
    error = MisclassificationRate().apply(y.flatten(), predict)
    # Little trick to plot the error rate in two different plots (We can't
    # use two time the same data in the plot for a unknow reason)
    error_rate = error.copy(name='error_rate')
    error_rate2 = error.copy(name='error_rate2')

    ########### REGULARIZATION ##################
    cg = ComputationGraph([cost])
    weights = VariableFilter(roles=[WEIGHT])(cg.variables)
    biases = VariableFilter(roles=[BIAS])(cg.variables)
    # l2_penalty_weights = T.sum([i*lambda_l2/len(weights) * (W ** 2).sum() for i,W in enumerate(weights)]) # Gradually increase penalty for layer
    l2_penalty = T.sum([
        lambda_l2 * (W**2).sum() for i, W in enumerate(weights + biases)
    ])  # Gradually increase penalty for layer
    # #l2_penalty_bias = T.sum([lambda_l2*(B **2).sum() for B in biases])
    # #l2_penalty = l2_penalty_weights + l2_penalty_bias
    l2_penalty.name = 'l2_penalty'
    l1_penalty = T.sum([lambda_l1 * T.abs_(z).sum()
                        for z in weights + biases])
    # l1_penalty_weights = T.sum([i*lambda_l1/len(weights) * T.abs_(W).sum() for i,W in enumerate(weights)]) # Gradually increase penalty for layer
    # l1_penalty_biases = T.sum([lambda_l1 * T.abs_(B).sum() for B in biases])
    # l1_penalty = l1_penalty_biases + l1_penalty_weights
    l1_penalty.name = 'l1_penalty'
    costreg = cost + l2_penalty + l1_penalty
    costreg.name = 'costreg'

    ########### DEFINE THE ALGORITHM #############
    # algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Momentum())
    algorithm = GradientDescent(cost=costreg,
                                parameters=cg.parameters,
                                step_rule=Adam())

    ########### GET THE DATA #####################
    istest = 'test' in config.keys()
    train_stream, valid_stream, test_stream = get_stream(batch_size,
                                                         image_shape,
                                                         test=istest)

    ########### INITIALIZING EXTENSIONS ##########
    # Checkpoint only when the validation error reaches a new best.
    checkpoint = Checkpoint('models/best_' + label + '.tar')
    checkpoint.add_condition(
        ['after_epoch'],
        predicate=OnLogRecord('valid_error_rate_best_so_far'))
    # Adding a live plot with the bokeh server
    plot = Plot(
        label,
        channels=[
            ['train_error_rate', 'valid_error_rate'],
            ['valid_cost', 'valid_error_rate2'],
            # ['train_costreg','train_grad_norm']], #
            [
                'train_costreg', 'train_total_gradient_norm',
                'train_l2_penalty', 'train_l1_penalty'
            ]
        ],
        server_url="http://hades.calculquebec.ca:5042")

    grad_norm = aggregation.mean(algorithm.total_gradient_norm)
    grad_norm.name = 'grad_norm'

    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs,
                    after_n_batches=num_batches),
        DataStreamMonitoring([cost, error_rate, error_rate2],
                             valid_stream,
                             prefix="valid"),
        TrainingDataMonitoring([
            costreg, error_rate, error_rate2, grad_norm, l2_penalty,
            l1_penalty
        ],
                               prefix="train",
                               after_epoch=True),
        plot,
        ProgressBar(),
        Printing(),
        TrackTheBest('valid_error_rate', min),  # Keep best
        checkpoint,  # Save best
        FinishIfNoImprovementAfter('valid_error_rate_best_so_far',
                                   epochs=4)
    ]  # Early-stopping

    model = Model(cost)
    main_loop = MainLoop(algorithm,
                         data_stream=train_stream,
                         model=model,
                         extensions=extensions)
    main_loop.run()
def main(port_data): mlp_hiddens = [500] filter_sizes = [(3, 3), (3, 3)] feature_maps = [20, 20] pooling_sizes = [(3, 3), (2, 2)] save_to = "DvC.pkl" image_size = (128, 128) output_size = 2 learningRate = 0.1 num_epochs = 300 num_batches = None if socket.gethostname() == 'tim-X550JX': host_plot = 'http://*****:*****@ %s' % ('CNN ', datetime.datetime.now(), socket.gethostname()), channels=[['train_error_rate', 'valid_error_rate'], ['train_total_gradient_norm']], after_epoch=True, server_url=host_plot)) model = Model(cost) main_loop = MainLoop(algorithm, stream_data_train, model=model, extensions=extensions) main_loop.run()
def main(save_to, num_epochs, feature_maps=None, mlp_hiddens=None,
         conv_sizes=None, pool_sizes=None, batch_size=500,
         num_batches=None):
    # Train a LeNet-style convnet on the Dogs-vs-Cats dataset.
    # NOTE(review): batch_size, num_epochs and num_batches are overwritten
    # by the hard-coded assignments just below, so the corresponding
    # keyword arguments are effectively ignored — confirm this is intended.
    if feature_maps is None:
        feature_maps = [20, 50]
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    if conv_sizes is None:
        conv_sizes = [5, 5]
    if pool_sizes is None:
        pool_sizes = [2, 2]
    image_size = (32, 23)
    batch_size = 50
    output_size = 2
    learningRate = 0.1
    num_epochs = 10
    num_batches = None

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations, 3, image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = \
        Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = \
        Uniform(width=.11)
    convnet.initialize()
    logging.info(
        "Input dim: {} {} {}".format(
            *convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(
                i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))

    x = tensor.tensor4('image_features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cost = (CategoricalCrossEntropy().apply(y.flatten(),
                                            probs).copy(name='cost'))
    error_rate = (MisclassificationRate().apply(
        y.flatten(), probs).copy(name='error_rate'))

    cg = ComputationGraph([cost, error_rate])

    ########### Loading images #####################
    from fuel.datasets.dogs_vs_cats import DogsVsCats
    from fuel.streams import DataStream, ServerDataStream
    from fuel.schemes import ShuffledScheme
    from fuel.transformers.image import RandomFixedSizeCrop, \
        MinimumImageDimensions, Random2DRotation
    from fuel.transformers import Flatten, Cast, ScaleAndShift

    def create_data(data):
        # Shuffle, upscale to at least `image_size`, resize, rescale pixel
        # values to [0, 1], and cast to float32.
        stream = DataStream.default_stream(
            data,
            iteration_scheme=ShuffledScheme(data.num_examples, batch_size))
        stream_downscale = MinimumImageDimensions(
            stream, image_size, which_sources=('image_features', ))
        #stream_rotate = Random2DRotation(stream_downscale, which_sources=('image_features',))
        stream_max = ScikitResize(stream_downscale, image_size,
                                  which_sources=('image_features', ))
        stream_scale = ScaleAndShift(stream_max, 1. / 255, 0,
                                     which_sources=('image_features', ))
        stream_cast = Cast(stream_scale, dtype='float32',
                           which_sources=('image_features', ))
        #stream_flat = Flatten(stream_scale, which_sources=('image_features',))
        return stream_cast

    stream_data_train = create_data(
        DogsVsCats(('train', ), subset=slice(0, 20000)))
    stream_data_test = create_data(
        DogsVsCats(('train', ), subset=slice(20000, 25000)))

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=learningRate))

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = []
    extensions.append(Timing())
    extensions.append(
        FinishAfter(after_n_epochs=num_epochs,
                    after_n_batches=num_batches))
    extensions.append(
        DataStreamMonitoring([cost, error_rate], stream_data_test,
                             prefix="valid"))
    extensions.append(
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm)
        ],
                               prefix="train",
                               after_epoch=True))
    extensions.append(Checkpoint(save_to))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    model = Model(cost)
    ########### Loading images #####################
    main_loop = MainLoop(algorithm, stream_data_train, model=model,
                         extensions=extensions)
    main_loop.run()
def train(model, batch_size=100, num_epochs=1000):
    # Train `model` with Adam + gradient clipping, logging to a
    # timestamped results directory, and monitoring per-parameter
    # gradient/weight norms.
    cost = model.cost
    monitorings = model.monitorings

    # Set up logging to a file under the results directory.
    timestr = time.strftime("%Y_%m_%d_at_%H_%M")
    save_path = 'results/CMV_V2_' + timestr
    log_path = os.path.join(save_path, 'log.txt')
    os.makedirs(save_path)
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    # Training
    blocks_model = Model(cost)
    all_params = blocks_model.parameters
    print "Number of found parameters:" + str(len(all_params))
    print all_params

    clipping = StepClipping(threshold=np.cast[floatX](10))
    adam = Adam(learning_rate=model.lr_var)
    step_rule = CompositeRule([adam, clipping])
    training_algorithm = GradientDescent(cost=cost,
                                         parameters=all_params,
                                         step_rule=step_rule)

    monitored_variables = [
        model.lr_var, cost,
        aggregation.mean(training_algorithm.total_gradient_norm)
    ] + monitorings

    blocks_model = Model(cost)
    params_dicts = blocks_model.get_parameter_dict()
    for name, param in params_dicts.iteritems():
        # Monitor the gradient norm and the value norm of every parameter.
        to_monitor = training_algorithm.gradients[param].norm(2)
        to_monitor.name = name + "_grad_norm"
        monitored_variables.append(to_monitor)
        to_monitor = param.norm(2)
        to_monitor.name = name + "_norm"
        monitored_variables.append(to_monitor)

    train_data_stream, valid_data_stream = get_cmv_v2_streams(batch_size)

    train_monitoring = TrainingDataMonitoring(
        variables=monitored_variables,
        prefix="train",
        after_epoch=True)
    valid_monitoring = DataStreamMonitoring(
        variables=monitored_variables,
        data_stream=valid_data_stream,
        prefix="valid",
        after_epoch=True)

    main_loop = MainLoop(
        algorithm=training_algorithm,
        data_stream=train_data_stream,
        model=blocks_model,
        extensions=[
            train_monitoring, valid_monitoring,
            FinishAfter(after_n_epochs=num_epochs),
            SaveParams('valid_misclassificationrate_apply_error_rate',
                       blocks_model, save_path),
            SaveLog(save_path, after_epoch=True),
            ProgressBar(),
            # Step down the learning rate at the given epoch milestones.
            LRDecay(model.lr_var, [0.001, 0.0001, 0.00001, 0.000001],
                    [8, 15, 30, 1000],
                    after_epoch=True),
            Printing()
        ])
    main_loop.run()
def main(args):
    """Run experiment.

    Builds the model selected by ``args.method`` ('vae', 'rws', 'bihm',
    or 'continue' to resume from a pickled main loop), attaches the
    monitoring channels, and trains until ``args.max_epochs`` or
    early stopping.
    """
    lr_tag = float_tag(args.learning_rate)

    x_dim, train_stream, valid_stream, test_stream = datasets.get_streams(
        args.data, args.batch_size)

    #------------------------------------------------------------
    # Setup model
    deterministic_act = Tanh
    deterministic_size = 1.

    if args.method == 'vae':
        sizes_tag = args.layer_spec.replace(",", "-")
        layer_sizes = [int(i) for i in args.layer_spec.split(",")]
        # Last entry of the layer spec is the latent dimensionality.
        layer_sizes, z_dim = layer_sizes[:-1], layer_sizes[-1]

        name = "%s-%s-%s-lr%s-spl%d-%s" % \
            (args.data, args.method, args.name, lr_tag, args.n_samples,
             sizes_tag)

        if args.activation == "tanh":
            hidden_act = Tanh()
        elif args.activation == "logistic":
            hidden_act = Logistic()
        elif args.activation == "relu":
            hidden_act = Rectifier()
        else:
            # BUG FIX: was `raise "..."` — raising a plain string is a
            # TypeError on Python >= 2.6. Also report args.activation,
            # the attribute actually checked (args.hidden_act does not
            # exist).
            raise ValueError("Unknown hidden nonlinearity %s"
                             % args.activation)

        model = VAE(x_dim=x_dim, hidden_layers=layer_sizes,
                    hidden_act=hidden_act, z_dim=z_dim,
                    batch_norm=args.batch_normalization)
        model.initialize()
    elif args.method == 'rws':
        sizes_tag = args.layer_spec.replace(",", "-")
        name = "%s-%s-%s-lr%s-dl%d-spl%d-%s" % \
            (args.data, args.method, args.name, lr_tag,
             args.deterministic_layers, args.n_samples, sizes_tag)

        p_layers, q_layers = create_layers(
            args.layer_spec, x_dim,
            args.deterministic_layers, deterministic_act,
            deterministic_size)

        model = ReweightedWakeSleep(
            p_layers,
            q_layers,
        )
        model.initialize()
    elif args.method == 'bihm':
        sizes_tag = args.layer_spec.replace(",", "-")
        name = "%s-%s-%s-lr%s-dl%d-spl%d-%s" % \
            (args.data, args.method, args.name, lr_tag,
             args.deterministic_layers, args.n_samples, sizes_tag)

        p_layers, q_layers = create_layers(
            args.layer_spec, x_dim,
            args.deterministic_layers, deterministic_act,
            deterministic_size)

        model = BiHM(
            p_layers,
            q_layers,
            l1reg=args.l1reg,
            l2reg=args.l2reg,
        )
        model.initialize()
    elif args.method == 'continue':
        import cPickle as pickle
        from os.path import basename, splitext

        with open(args.model_file, 'rb') as f:
            m = pickle.load(f)

        if isinstance(m, MainLoop):
            m = m.model

        # Walk up to the root brick of the pickled model.
        model = m.get_top_bricks()[0]
        while len(model.parents) > 0:
            model = model.parents[0]

        assert isinstance(model, (BiHM, ReweightedWakeSleep, VAE))

        mname, _, _ = basename(args.model_file).rpartition("_model.pkl")
        name = "%s-cont-%s-lr%s-spl%s" % (mname, args.name, lr_tag,
                                          args.n_samples)
    else:
        raise ValueError("Unknown training method '%s'" % args.method)

    #------------------------------------------------------------
    x = tensor.matrix('features')

    #------------------------------------------------------------
    # Testset monitoring: estimate log-likelihood at several sample counts.
    train_monitors = []
    valid_monitors = []
    test_monitors = []
    for s in [1, 10, 100, 1000, ]:
        log_p, log_ph = model.log_likelihood(x, s)
        log_p = -log_p.mean()
        log_ph = -log_ph.mean()
        log_p.name = "log_p_%d" % s
        log_ph.name = "log_ph_%d" % s

        #valid_monitors += [log_p, log_ph]
        test_monitors += [log_p, log_ph]

    #------------------------------------------------------------
    # Z estimation
    #for s in [100000]:
    #    z2 = tensor.exp(model.estimate_log_z2(s)) / s
    #    z2.name = "z2_%d" % s
    #    valid_monitors += [z2]
    #    test_monitors += [z2]

    #------------------------------------------------------------
    # Gradient and training monitoring
    if args.method in ['vae', 'dvae']:
        # VAE: optimize the (negated) variational lower bound; let the
        # algorithm derive gradients from the cost.
        log_p_bound = model.log_likelihood_bound(x, args.n_samples)
        gradients = None
        log_p_bound = -log_p_bound.mean()
        log_p_bound.name = "log_p_bound"
        cost = log_p_bound

        train_monitors += [log_p_bound,
                           named(model.kl_term.mean(), 'kl_term'),
                           named(model.recons_term.mean(), 'recons_term')]
        valid_monitors += [log_p_bound,
                           named(model.kl_term.mean(), 'kl_term'),
                           named(model.recons_term.mean(), 'recons_term')]
        test_monitors += [log_p_bound,
                          named(model.kl_term.mean(), 'kl_term'),
                          named(model.recons_term.mean(), 'recons_term')]
    else:
        # RWS/BiHM: the model supplies its own gradients.
        log_p, log_ph, gradients = model.get_gradients(x, args.n_samples)
        log_p_bound = named(-model.log_p_bound.mean(), "log_p_bound")
        log_p = named(-log_p.mean(), "log_p")
        log_ph = named(-log_ph.mean(), "log_ph")
        cost = log_p

        train_monitors += [log_p_bound, log_p, log_ph]
        valid_monitors += [log_p_bound, log_p, log_ph]

    #------------------------------------------------------------
    cg = ComputationGraph([cost])

    if args.step_rule == "momentum":
        step_rule = Momentum(args.learning_rate, 0.95)
    elif args.step_rule == "rmsprop":
        step_rule = RMSProp(args.learning_rate)
    elif args.step_rule == "adam":
        step_rule = Adam(args.learning_rate)
    else:
        # BUG FIX: was `raise "..."` — raising a plain string is a
        # TypeError on Python >= 2.6.
        raise ValueError("Unknown step_rule %s" % args.step_rule)

    parameters = cg.parameters

    algorithm = GradientDescent(
        cost=cost,
        parameters=parameters,
        gradients=gradients,
        step_rule=CompositeRule([
            step_rule,
        ])
    )

    #------------------------------------------------------------
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm),
                       aggregation.mean(algorithm.total_step_norm)]

    #------------------------------------------------------------
    # Live plotting?
    plotting_extensions = []
    if args.live_plotting:
        plotting_extensions = [
            PlotManager(
                name,
                [Plotter(channels=[
                    ["valid_%s" % cost.name, "valid_log_p"],
                    ["train_total_gradient_norm",
                     "train_total_step_norm"]],
                    titles=[
                        "validation cost",
                        "norm of training gradient and step"
                    ]),
                 DisplayImage([
                     WeightDisplay(
                         model.p_layers[0].mlp
                              .linear_transformations[0].W,
                         n_weights=100, image_shape=(28, 28))]
                     #ImageDataStreamDisplay(test_stream, image_shape=(28,28))]
                 )]
            )
        ]

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[Timing(),
                    ProgressBar(),
                    TrainingDataMonitoring(
                        train_monitors,
                        prefix="train",
                        after_epoch=False,
                        after_batch=True),
                    DataStreamMonitoring(
                        valid_monitors,
                        data_stream=valid_stream,
                        prefix="valid"),
                    DataStreamMonitoring(
                        test_monitors,
                        data_stream=test_stream,
                        prefix="test",
                        after_epoch=False,
                        after_training=True,
                        every_n_epochs=10),
                    TrackTheBest('valid_%s' % cost.name),
                    Checkpoint(name+".pkl",
                               save_separately=['log', 'model']),
                    FinishIfNoImprovementAfter(
                        'valid_%s_best_so_far' % cost.name,
                        epochs=args.patience),
                    FinishAfter(after_n_epochs=args.max_epochs),
                    Printing()] + plotting_extensions)
    main_loop.run()
def train(step_rule, input_dim, state_dim, label_dim, layers, epochs, seed, pretrain_alignment, uniform_alignment, dropout, beam_search, test_cost, experiment_path, window_features, features, pool_size, maximum_frames, initialization, weight_noise, to_watch, patience, plot, write_predictions, static_mask, drop_prob, drop_prob_states, drop_prob_cells, drop_prob_igates, ogates_zoneout, batch_size, stoch_depth, share_mask, gaussian_drop, rnn_type, num_layers, norm_cost_coeff, penalty, seq_len, input_drop, augment, **kwargs): print '.. PTB experiment' print '.. arguments:', ' '.join(sys.argv) t0 = time.time() ########################################### # # LOAD DATA # ########################################### def numpy_rng(random_seed=None): if random_seed == None: random_seed = 1223 return numpy.random.RandomState(random_seed) #from utilities import onehot, unhot, vec2chars # from http://www.iro.umontreal.ca/~memisevr/code/logreg.py #def onehot(x,numclasses=None): #""" Convert integer encoding for class-labels (starting with 0 !) #to one-hot encoding. #The output is an array who's shape is the shape of the input array plus #an extra dimension, containing the 'one-hot'-encoded labels. 
#""" #if x.shape==(): #x = x[None] #if numclasses is None: #numclasses = x.max() + 1 #result = numpy.zeros(list(x.shape) + [numclasses], dtype="int") #z = numpy.zeros(x.shape, dtype="int") #for c in range(numclasses): #z *= 0 #z[numpy.where(x==c)] = 1 #result[...,c] += z #return result.astype(theano.config.floatX) #framelen = 1 #50 = 50 ##data = np.load(os.path.join(os.environ['FUEL_DATA_PATH'], 'PennTreebankCorpus/char_level_penntree.npz'))#pentree_char_and_word.npz') #data = np.load('char_level_penntree.npz') #trainset = data['train'] #validset = data['valid'] #allletters = " etanoisrhludcmfpkgybw<>\nvN.'xj$-qz&0193#285\\764/*" #dictionary = dict(zip(list(set(allletters)), range(50))) #invdict = {v: k for k, v in dictionary.items()} #numtrain = len(trainset) / seq_len * seq_len #numvalid = len(validset) / seq_len * seq_len #trainset = trainset[:numtrain] #validset = validset[:numvalid] ##if testing: ## train_features_numpy = train_features_numpy[:32 * 5] ## valid_features_numpy = valid_features_numpy[:100] #train_targets = trainset.reshape(-1, seq_len*framelen)[:,1:] #valid_targets = validset.reshape(-1, seq_len*framelen)[:,1:] ## still only 2d (b, t*n) #train_features_numpy = onehot(trainset).reshape(-1, 50*seq_len*framelen)[:,:-50] #valid_features_numpy = onehot(validset).reshape(-1, 50*seq_len*framelen)[:,:-50] #del trainset, validset #data_loaded = True #print '... done' #test_value = train_features_numpy[:32] #################### ########################################### # # MAKE STREAMS # ########################################### rng = np.random.RandomState(seed) stream_args = dict(rng=rng, pool_size=pool_size, maximum_frames=maximum_frames, pretrain_alignment=pretrain_alignment, uniform_alignment=uniform_alignment, window_features=window_features) if share_mask: drop_prob_cells = drop_prob # we don't want to actually use these masks, so this is to debug drop_prob_states = None # the threes in here are because the number of layers is hardcoded to 3 atm. 
NIPS! print '.. initializing iterators' # train_stream, valid_stream = get_seq_mnist_streams( # h_dim, batch_size, update_prob) if static_mask: train_stream = get_static_mask_ptb_stream('train', batch_size, seq_len, drop_prob_states, drop_prob_cells, drop_prob_igates, state_dim, False, augment=augment) train_stream_evaluation = get_static_mask_ptb_stream('train', batch_size, seq_len, drop_prob_states, drop_prob_cells, drop_prob_igates, state_dim, True, augment=augment) dev_stream = get_static_mask_ptb_stream('valid', batch_size, seq_len, drop_prob_states, drop_prob_cells, drop_prob_igates, state_dim, True, augment=augment) else: train_stream = get_ptb_stream('train', batch_size, seq_len, drop_prob_states, drop_prob_cells, drop_prob_igates, state_dim, False, augment=augment) train_stream_evaluation = get_ptb_stream('train', batch_size, seq_len, drop_prob_states, drop_prob_cells, drop_prob_igates, state_dim, True, augment=augment) dev_stream = get_ptb_stream('valid', batch_size, seq_len, drop_prob_states, drop_prob_cells, drop_prob_igates, state_dim, True, augment=augment) #train_dataset = Timit('train', features=features) # assert (train_features_numpy[:,-50:].sum(axis=-2)==1).all() #train_features_numpy = train_features_numpy.reshape(-1, seq_len-1, 50)#BTN for shuffled dataset? 
#train_dataset = IndexableDataset(indexables=OrderedDict( #[('features', train_features_numpy), #('outputs', train_targets)])) #train_stream = construct_stream_np(train_dataset, state_dim, batch_size, len(train_targets), #drop_prob_states, drop_prob_cells, drop_prob_igates, #num_layers=num_layers, #is_for_test=False, stoch_depth=stoch_depth, share_mask=share_mask, #gaussian_drop=gaussian_drop, input_drop=input_drop, **stream_args) ##dev_dataset = Timit('dev', features=features) #valid_features_numpy = valid_features_numpy.reshape(-1, seq_len-1, 50) #dev_dataset = IndexableDataset(indexables=OrderedDict( #[('features', valid_features_numpy), #('outputs', valid_targets)])) #dev_stream = construct_stream_np(dev_dataset, state_dim, batch_size, len(valid_targets), #drop_prob_states, drop_prob_cells, drop_prob_igates, #num_layers=num_layers, #is_for_test=True, stoch_depth=stoch_depth, share_mask=share_mask, #gaussian_drop=gaussian_drop, input_drop=input_drop, **stream_args) ##test_dataset = Timit('test', features=features) ##test_stream = construct_stream(test_dataset, state_dim, drop_prob_states, drop_prob_cells, drop_prob_igates, 3, ## is_for_test=True, stoch_depth=stoch_depth, share_mask=share_mask, ## gaussian_drop=gaussian_drop, **stream_args) data = train_stream.get_epoch_iterator(as_dict=True).next() #import ipdb; ipdb.set_trace() #phone_dict = train_dataset.get_phoneme_dict() #phoneme_dict = {k: phone_to_phoneme_dict[v] # if v in phone_to_phoneme_dict else v # for k, v in phone_dict.iteritems()} #ind_to_phoneme = {v: k for k, v in phoneme_dict.iteritems()} #eol_symbol = ind_to_phoneme['<STOP>'] #################### ########################################### # # BUILD MODEL # ########################################### print '.. 
building model' x = T.tensor3('features', dtype=floatX) x, y = x[:-1], x[1:] #T.lmatrix('outputs')# phonemes') drops_states = T.tensor3('drops_states') drops_cells = T.tensor3('drops_cells') drops_igates = T.tensor3('drops_igates') x.tag.test_value = data['features'] #y.tag.test_value = data['outputs'] drops_states.tag.test_value = data['drops_states'] drops_cells.tag.test_value = data['drops_cells'] drops_igates.tag.test_value = data['drops_igates'] if initialization == 'glorot': weights_init = NormalizedInitialization() elif initialization == 'uniform': weights_init = Uniform(width=.2) elif initialization == 'ortho': weights_init = OrthogonalInitialization() else: raise ValueError('No such initialization') if rnn_type.lower() == 'lstm': in_to_hid = Linear(50, state_dim * 4, name='in_to_hid', weights_init=weights_init, biases_init=Constant(0.0)) recurrent_layer = DropLSTM(dim=state_dim, weights_init=weights_init, activation=Tanh(), model_type=6, name='rnn', ogates_zoneout=ogates_zoneout) elif rnn_type.lower() == 'gru': in_to_hid = Linear(50, state_dim * 3, name='in_to_hid', weights_init=weights_init, biases_init=Constant(0.0)) recurrent_layer = DropGRU(dim=state_dim, weights_init=weights_init, activation=Tanh(), name='rnn') elif rnn_type.lower() == 'srnn': #FIXME!!! 
make ReLU in_to_hid = Linear(50, state_dim, name='in_to_hid', weights_init=weights_init, biases_init=Constant(0.0)) recurrent_layer = DropSimpleRecurrent(dim=state_dim, weights_init=weights_init, activation=Rectifier(), name='rnn') else: raise NotImplementedError #lstm2 = DropLSTM(dim=state_dim, activation=Tanh(), model_type=6) #lstm3 = DropLSTM(dim=state_dim, activation=Tanh(), model_type=6) #encoder = DropMultiLayerEncoder(weights_init=weights_init, #biases_init=Constant(.0), #networks=[lstm1, lstm2, bidir3], #dims=[input_dim * window_features, #state_dim, #state_dim, #state_dim, #label_dim + 1]) #encoder.initialize() #drops_states = [drops_forw_states, drops_back_states] #drops_cells = [drops_forw_cells, drops_back_cells] #drops_igates = [drops_forw_igates, drops_back_igates] hid_to_out = Linear(state_dim, 50, name='hid_to_out', weights_init=weights_init, biases_init=Constant(0.0)) in_to_hid.initialize() recurrent_layer.initialize() hid_to_out.initialize() h = in_to_hid.apply(x) if rnn_type.lower() == 'lstm': yh = recurrent_layer.apply(h, drops_states, drops_cells, drops_igates)[0] else: yh = recurrent_layer.apply(h, drops_states, drops_cells, drops_igates) y_hat_pre_softmax = hid_to_out.apply(yh) shape_ = y_hat_pre_softmax.shape # y_hat = Softmax().apply( # y_hat_pre_softmax.reshape((-1, shape_[-1])))# .reshape(shape_) #################### ########################################### # # SET UP COSTS AND MONITORS # ########################################### # cost = CategoricalCrossEntropy().apply(y.flatten().astype('int64'), y_hat) def crossentropy_lastaxes(yhat, y): # for sequence of distributions/targets return -(y * T.log(yhat)).sum(axis=yhat.ndim - 1) def softmax_lastaxis(x): # for sequence of distributions return T.nnet.softmax(x.reshape((-1, x.shape[-1]))).reshape(x.shape) yhat = softmax_lastaxis(y_hat_pre_softmax) cross_entropies = crossentropy_lastaxes(yhat, y) cross_entropy = cross_entropies.mean().copy(name="cross_entropy") cost = 
cross_entropy.copy(name="cost") batch_cost = cost.copy(name='batch_cost') nll_cost = cost.copy(name='nll_cost') bpc = (nll_cost / np.log(2.0)).copy(name='bpr') #nll_cost = aggregation.mean(batch_cost, batch_size).copy(name='nll_cost') cost_monitor = aggregation.mean( batch_cost, batch_size).copy(name='sequence_cost_monitor') cost_per_character = aggregation.mean( batch_cost, (seq_len - 1) * batch_size).copy(name='character_cost') cost_train = cost.copy(name='train_batch_cost') cost_train_monitor = cost_monitor.copy('train_batch_cost_monitor') cg_train = ComputationGraph([cost_train, cost_train_monitor]) ##################### DK ADD COST ######################## ##################### DK ADD COST ######################## ##################### DK ADD COST ######################## ##################### DK ADD COST ######################## ##################### DK ADD COST ######################## ##################### DK ADD COST ######################## ##################### DK ADD COST ######################## ##################### DK ADD COST ######################## norm_cost = 0. 
def _magnitude(x, axis=-1): return T.sqrt( T.maximum(T.sqr(x).sum(axis=axis), numpy.finfo(x.dtype).tiny)) if penalty == 'cells': assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables) for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables): norms = _magnitude(cell) norm_cost += T.mean( T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1)) ## debugging nans stuff #gr = T.grad(norm_cost, cg_train.parameters, disconnected_inputs='ignore') #grf = theano.function([x, input_mask], gr) #grz = grf(x.tag.test_value, input_mask.tag.test_value) #params = cg_train.parameters #mynanz = [(pp, np.sum(gg)) for pp,gg in zip(params, grz) if np.isnan(np.sum(gg))] #for mm in mynanz: print mm ##import ipdb; ipdb.set_trace() elif penalty == 'hids': assert 'rnn_apply_states' in [ o.name for o in VariableFilter(roles=[OUTPUT])(cg_train.variables) ] for output in VariableFilter(roles=[OUTPUT])(cg_train.variables): if output.name == 'rnn_apply_states': norms = _magnitude(output) norm_cost += T.mean( T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1)) ## debugging nans stuff #gr = T.grad(norm_cost, cg_train.parameters, disconnected_inputs='ignore') #grf = theano.function([x, input_mask], gr) #grz = grf(x.tag.test_value, input_mask.tag.test_value) #params = cg_train.parameters #mynanz = [(pp, np.sum(gg)) for pp,gg in zip(params, grz) if np.isnan(np.sum(gg))] #for mm in mynanz: print mm ##import ipdb; ipdb.set_trace() norm_cost.name = 'norm_cost' #cost_valid = cost_train cost_train += norm_cost_coeff * norm_cost cost_train = cost_train.copy( 'cost_train') #should this be cost_train.outputs[0]? 
cg_train = ComputationGraph([cost_train, cost_train_monitor]) #, norm_cost]) ##################### DK ADD COST ######################## ##################### DK ADD COST ######################## ##################### DK ADD COST ######################## ##################### DK ADD COST ######################## if weight_noise > 0: weights = VariableFilter(roles=[WEIGHT])(cg_train.variables) cg_train = apply_noise(cg_train, weights, weight_noise) cost_train = cg_train.outputs[0].copy(name='cost_train') cost_train_monitor = cg_train.outputs[1].copy( 'train_batch_cost_monitor') # if 'l2regularization' in kwargs: # weights = VariableFilter(roles=[WEIGHT])(cg_train.variables) # cost_train += kwargs['l2regularization'] * sum([ # (weight ** 2).sum() for weight in weights]) # cost_train.name = 'cost_train' # cg_train = ComputationGraph(cost_train) model = Model(cost_train) train_cost_per_character = aggregation.mean( cost_train_monitor, (seq_len - 1) * batch_size).copy(name='train_character_cost') algorithm = GradientDescent(step_rule=step_rule, cost=cost_train, parameters=cg_train.parameters) observed_vars = [ cost_train, cost_train_monitor, train_cost_per_character, aggregation.mean(algorithm.total_gradient_norm) ] # parameters = model.get_parameter_dict() # for name, param in parameters.iteritems(): # observed_vars.append(param.norm(2).copy(name=name + "_norm")) # observed_vars.append( # algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm")) train_monitor = TrainingDataMonitoring(variables=observed_vars, prefix="train", after_epoch=True) dev_monitor = DataStreamMonitoring(variables=[nll_cost, bpc], data_stream=dev_stream, prefix="dev") #train_ctc_monitor = CTCMonitoring( #x, input_mask, #drops_forw_states, drops_forw_cells, drops_forw_igates, #drops_back_states, drops_back_cells, drops_back_igates, #y_hat, eol_symbol, train_stream, #prefix='train', every_n_epochs=1, #before_training=True, #phoneme_dict=phoneme_dict, #black_list=black_list, train=True) 
#dev_ctc_monitor = CTCMonitoring( #x, input_mask, #drops_forw_states, drops_forw_cells, drops_forw_igates, #drops_back_states, drops_back_cells, drops_back_igates, #y_hat, eol_symbol, dev_stream, #prefix='dev', every_n_epochs=1, #phoneme_dict=phoneme_dict, #black_list=black_list) extensions = [] # /u/pezeshki/speech_project/five_layer_timit/trained_params_best.npz if 'load_path' in kwargs: with open(kwargs['load_path']) as f: loaded = np.load(f) model = Model(cost_train) params_dicts = model.get_parameter_dict() params_names = params_dicts.keys() for param_name in params_names: param = params_dicts[param_name] # '/f_6_.W' --> 'f_6_.W' slash_index = param_name.find('/') param_name = param_name[slash_index + 1:] if param.get_value().shape == loaded[param_name].shape: print 'Found: ' + param_name param.set_value(loaded[param_name]) else: print 'Not found: ' + param_name #_evaluator = CTCEvaluator(eol_symbol, x, input_mask, y_hat, #phoneme_dict=phoneme_dict, #black_list=black_list) #logger.info("CTC monitoring on TEST data started") #value_dict = _evaluator.evaluate(test_stream, False) #print value_dict.items() #logger.info("CTC monitoring on TEST data finished") #logger.info("CTC monitoring on TRAIN data started") #value_dict = _evaluator.evaluate(train_stream, True) #print value_dict.items() #logger.info("CTC monitoring on TRAIN data finished") #logger.info("CTC monitoring on DEV data started") #value_dict = _evaluator.evaluate(dev_stream, False) #print value_dict.items() #logger.info("CTC monitoring on DEV data finished") extensions.extend( [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor]) #train_ctc_monitor, #dev_ctc_monitor]) if test_cost: test_monitor = DataStreamMonitoring( variables=[cost_monitor, cost_per_character], data_stream=test_stream, prefix="test") extensions.append(test_monitor) if not os.path.exists(experiment_path): os.makedirs(experiment_path) log_path = os.path.join(experiment_path, 'log.txt') fh = 
logging.FileHandler(filename=log_path) fh.setLevel(logging.DEBUG) logger.addHandler(fh) extensions.append( SaveParams('dev_nll_cost', model, experiment_path, every_n_epochs=1)) extensions.append(SaveLog(every_n_epochs=1)) extensions.append(ProgressBar()) extensions.append(Printing()) main_loop = MainLoop(model=model, data_stream=train_stream, algorithm=algorithm, extensions=extensions) t1 = time.time() print "Building time: %f" % (t1 - t0) # if write_predictions: # with open('predicted.txt', 'w') as f_pred: # with open('targets.txt', 'w') as f_targets: # evaluator = CTCEvaluator( # eol_symbol, x, input_mask, y_hat, phoneme_dict, black_list) # evaluator.evaluate(dev_stream, file_pred=f_pred, # file_targets=f_targets) # return main_loop.run() print "Execution time: %f" % (time.time() - t1)
def main(mode, save_path, num_batches, data_path=None):
    """Train or sample from a character-level "reverse words" sequence model.

    Built on the Blocks/Theano framework.  In ``mode == "train"`` the function
    constructs a bidirectional encoder + attention-based sequence generator,
    builds the cost graph and runs a Blocks ``MainLoop``.  In ``mode == "test"``
    it loads saved parameters from ``save_path`` and drops into an interactive
    sampling loop on stdin.

    Parameters
    ----------
    mode : str
        Either ``"train"`` or ``"test"``; any other value does nothing.
    save_path : str
        Path used to serialize the main loop (train) / load parameters (test).
    num_batches : int
        Training stops after this many batches (via ``FinishAfter``).
    data_path : str, optional
        If given, read training text from this file; otherwise use the
        One Billion Word dataset.

    NOTE(review): relies on module-level names (``char2code``, ``code2char``,
    ``_lower``, ``_transpose``, ``reverse_words``, ``_filter_long``,
    ``_is_nan``, ``logger``) defined elsewhere in this file.
    """
    # Experiment configuration
    dimension = 100
    readout_dimension = len(char2code)

    # Build bricks
    encoder = Bidirectional(SimpleRecurrent(dim=dimension, activation=Tanh()),
                            weights_init=Orthogonal())
    # Fork feeds every sequence input of the recurrent transition except the mask
    fork = Fork(
        [name for name in encoder.prototype.apply.sequences if name != 'mask'],
        weights_init=IsotropicGaussian(0.1),
        biases_init=Constant(0))
    fork.input_dim = dimension
    fork.output_dims = {name: dimension for name in fork.input_names}
    lookup = LookupTable(readout_dimension, dimension,
                         weights_init=IsotropicGaussian(0.1))
    transition = SimpleRecurrent(activation=Tanh(), dim=dimension,
                                 name="transition")
    # Content-based attention over the bidirectional encoder states
    # (hence sequence_dim = 2 * dimension)
    attention = SequenceContentAttention(state_names=transition.apply.states,
                                         sequence_dim=2 * dimension,
                                         match_dim=dimension,
                                         name="attention")
    readout = LinearReadout(readout_dim=readout_dimension,
                            source_names=["states"],
                            emitter=SoftmaxEmitter(name="emitter"),
                            feedbacker=LookupFeedback(readout_dimension,
                                                      dimension),
                            name="readout")
    generator = SequenceGenerator(readout=readout,
                                  transition=transition,
                                  attention=attention,
                                  weights_init=IsotropicGaussian(0.1),
                                  biases_init=Constant(0),
                                  name="generator")
    # Push defaults down to children first, then override the transition's
    # recurrent weights with an orthogonal init.
    generator.push_initialization_config()
    transition.weights_init = Orthogonal()

    if mode == "train":
        # Data processing pipeline: filter long lines -> add reversed-word
        # targets -> batch(10) -> pad -> transpose to time-major
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = DataStreamMapping(
            mapping=_transpose,
            data_stream=PaddingDataStream(
                BatchDataStream(
                    iteration_scheme=ConstantScheme(10),
                    data_stream=DataStreamMapping(
                        mapping=reverse_words,
                        add_sources=("targets", ),
                        data_stream=DataStreamFilter(
                            predicate=_filter_long,
                            data_stream=dataset.get_default_stream())))))

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = generator.cost(
            targets, targets_mask,
            attended=encoder.apply(
                **dict_union(fork.apply(lookup.lookup(chars),
                                        return_dict=True),
                             mask=chars_mask)),
            attended_mask=chars_mask).sum()
        # chars is time-major, so shape[1] is the batch size
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        params = model.get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape)
                         for key, value in params.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Fetch variables useful for debugging
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        cg = ComputationGraph(cost)
        (energies, ) = VariableFilter(application=readout.readout,
                                      name="output")(cg.variables)
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        (activations, ) = VariableFilter(
            application=generator.transition.apply,
            name="states")(cg.variables)
        mean_activation = named_copy(abs(activations).mean(),
                                     "mean_activation")

        # Define the training algorithm.
        # NOTE(review): no explicit parameters= argument — presumably an older
        # Blocks API where GradientDescent discovers parameters from the cost.
        algorithm = GradientDescent(cost=cost,
                                    step_rule=CompositeRule(
                                        [StepClipping(10.0), Scale(0.01)]))

        # More variables for debugging
        observables = [
            cost, min_energy, max_energy, mean_activation, batch_size,
            max_length, cost_per_character, algorithm.total_step_norm,
            algorithm.total_gradient_norm
        ]
        for name, param in params.items():
            observables.append(named_copy(param.norm(2), name + "_norm"))
            observables.append(
                named_copy(algorithm.gradients[param].norm(2),
                           name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(observables,
                                                    prefix="average",
                                                    every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_every_batch=True),
                average_monitoring,
                # Stop early if the cost goes NaN
                FinishAfter(after_n_batches=num_batches).add_condition(
                    "after_batch", _is_nan),
                Plot(os.path.basename(save_path),
                     [[average_monitoring.record_name(cost)],
                      [average_monitoring.record_name(cost_per_character)]],
                     every_n_batches=10),
                SerializeMainLoop(save_path, every_n_batches=500,
                                  save_separately=["model", "log"]),
                Printing(every_n_batches=1)
            ])
        main_loop.run()
    elif mode == "test":
        logger.info("Model is loaded")
        chars = tensor.lmatrix("features")
        # Generate up to 3x the input length per sample
        generated = generator.generate(
            n_steps=3 * chars.shape[0],
            batch_size=chars.shape[1],
            attended=encoder.apply(**dict_union(
                fork.apply(lookup.lookup(chars), return_dict=True))),
            attended_mask=tensor.ones(chars.shape))
        model = Model(generated)
        model.set_param_values(load_parameter_values(save_path))
        sample_function = model.get_theano_function()
        logging.info("Sampling function is compiled")
        # Interactive sampling loop: read a sentence, sample reversals,
        # print them ordered by cost.
        while True:
            # Python 2-3 compatibility
            line = input("Enter a sentence\n")
            batch_size = int(input("Enter a number of samples\n"))
            encoded_input = [
                char2code.get(char, char2code["<UNK>"])
                for char in line.lower().strip()
            ]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input, ))[0]
            print("Target: ", target)
            # Replicate the encoded input batch_size times (column-wise)
            states, samples, glimpses, weights, costs = sample_function(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for i in range(samples.shape[1]):
                sample = list(samples[:, i])
                # Truncate each sample at its end-of-sequence marker, if any
                try:
                    true_length = sample.index(char2code['</S>']) + 1
                except ValueError:
                    true_length = len(sample)
                sample = sample[:true_length]
                cost = costs[:true_length, i].sum()
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
def main(name, dataset, epochs, batch_size, learning_rate, attention,
         n_iter, enc_dim, dec_dim, z_dim, oldmodel, live_plotting):
    """Train a DRAW model (recurrent variational autoencoder) on a dataset.

    Builds reader/writer (optionally attention-based), LSTM encoder/decoder,
    a Q-sampler, assembles the variational bound (reconstruction + KL), and
    runs a Blocks ``MainLoop`` with monitoring and checkpointing.

    Parameters
    ----------
    name : str or None
        Experiment name; defaults to the dataset name when ``None``.
    dataset : str
        Dataset identifier understood by ``datasets.get_data``.
    epochs, batch_size, learning_rate
        Usual optimization settings (Adam with gradient clipping).
    attention : str
        ``""`` for no attention, or ``"readN,writeN"`` window sizes.
    n_iter : int
        Number of DRAW glimpse iterations.
    enc_dim, dec_dim, z_dim : int
        Encoder LSTM, decoder LSTM and latent dimensionalities.
    oldmodel : str or None
        Optional pickle file whose parameter values seed this model.
    live_plotting : bool
        Whether to attach a Bokeh ``Plot`` extension.
    """
    image_size, channels, data_train, data_valid, data_test = \
        datasets.get_data(dataset)

    # Flattened, sequentially-scanned streams over the three splits.
    # NOTE(review): valid_stream is built but only test_stream is monitored
    # below (the valid DataStreamMonitoring is commented out).
    train_stream = Flatten(DataStream.default_stream(
        data_train,
        iteration_scheme=SequentialScheme(data_train.num_examples,
                                          batch_size)))
    valid_stream = Flatten(DataStream.default_stream(
        data_valid,
        iteration_scheme=SequentialScheme(data_valid.num_examples,
                                          batch_size)))
    test_stream = Flatten(DataStream.default_stream(
        data_test,
        iteration_scheme=SequentialScheme(data_test.num_examples,
                                          batch_size)))

    if name is None:
        name = dataset

    img_height, img_width = image_size
    x_dim = channels * img_height * img_width

    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # Configure attention mechanism: "" selects full (non-attentive)
    # read/write, otherwise attention is "readN,writeN" window sizes.
    if attention != "":
        read_N, write_N = attention.split(',')
        read_N = int(read_N)
        write_N = int(write_N)
        # Attention read returns both x and error-image glimpses -> factor 2
        read_dim = 2 * channels * read_N ** 2
        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 channels=channels, width=img_width,
                                 height=img_height, N=read_N, **inits)
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 channels=channels, width=img_width,
                                 height=img_height, N=write_N, **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)
    else:
        read_dim = 2*x_dim
        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)
        attention_tag = "full"

    #----------------------------------------------------------------------
    # NOTE(review): duplicate of the identical guard above — dead code here.
    if name is None:
        name = dataset

    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation. E.g.:
            0.1 -> 11
            0.01 -> 12
            0.001 -> 13
            0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e"%value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)

    subdir = name + "-" + time.strftime("%Y%m%d-%H%M%S");
    longname = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (
        dataset, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)
    # NOTE(review): pickle_file is computed but never used below;
    # checkpoints go through PartsOnlyCheckpoint instead.
    pickle_file = subdir + "/" + longname + ".pkl"

    print("\nRunning experiment %s" % longname)
    print("               dataset: %s" % dataset)
    print("          subdirectory: %s" % subdir)
    print("         learning rate: %g" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print("            batch size: %d" % batch_size)
    print("                epochs: %d" % epochs)
    print()

    #----------------------------------------------------------------------
    # LSTMs take 4x their dim as input (gates), hence the 4* factors.
    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Identity()], [(read_dim+dec_dim), 4*enc_dim],
                      name="MLP_enc", **inits)
    decoder_mlp = MLP([Identity()], [             z_dim, 4*dec_dim],
                      name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(
        n_iter,
        reader=reader,
        encoder_mlp=encoder_mlp,
        encoder_rnn=encoder_rnn,
        sampler=q_sampler,
        decoder_mlp=decoder_mlp,
        decoder_rnn=decoder_rnn,
        writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------
    # Variational bound: reconstruction cross-entropy + sum of per-iteration
    # KL terms, averaged over the batch.
    x = tensor.matrix('features')

    x_recons, kl_terms = draw.reconstruct(x)

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        parameters=params,
        step_rule=CompositeRule([
            StepClipping(10.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )

    #------------------------------------------------------------------------
    # Setup monitors: the bound plus each iteration's KL contribution.
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t,:].mean()
        kl_term_t.name = "kl_term_%d" % t

        #x_recons_t = T.nnet.sigmoid(c[t,:,:])
        #recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        #recons_term_t = recons_term_t.mean()
        #recons_term_t.name = "recons_term_%d" % t

        monitors +=[kl_term_t]
    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        #["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------
    if not os.path.exists(subdir):
        os.makedirs(subdir)

    plotting_extensions = []
    if live_plotting:
        plotting_extensions = [
            Plot(name, channels=plot_channels)
        ]

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(
                train_monitors,
                prefix="train",
                after_epoch=True),
            # DataStreamMonitoring(
            #     monitors,
            #     valid_stream,
            ##     updates=scan_updates,
            #     prefix="valid"),
            DataStreamMonitoring(
                monitors,
                test_stream,
                # updates=scan_updates,
                prefix="test"),
            #Checkpoint(name, before_training=False, after_epoch=True, save_separately=['log', 'model']),
            PartsOnlyCheckpoint("{}/{}".format(subdir,name),
                                before_training=True,
                                after_epoch=True,
                                save_separately=['log', 'model']),
            SampleCheckpoint(image_size=image_size[0],
                             channels=channels,
                             save_subdir=subdir,
                             before_training=True,
                             after_epoch=True),
            ProgressBar(),
            Printing()] + plotting_extensions)

    # Optionally warm-start parameters from a previously pickled model.
    if oldmodel is not None:
        print("Initializing parameters with old model %s"%oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_param_values(oldmodel.get_param_values())
        del oldmodel

    main_loop.run()
def _pokemon_wgan_gp():
    """Train a WGAN with gradient penalty (WGAN-GP) on 56x56 Pokemon sprites.

    Builds generator/discriminator MLP bricks, forms the Wasserstein losses
    with a gradient-penalty term (coefficient 10), applies light input
    dropout to the generator's graph, and runs adversarial training
    (5 discriminator iterations per generator step) with RMSProp.

    No parameters; reads data via the ``PokemonGenYellowNormal`` Fuel dataset
    and writes checkpoints/samples under ``./exp/pokemon-wgan-gp-<timestamp>``.
    """
    import os
    os.environ["FUEL_DATA_PATH"] = os.getcwd() + "/data/"
    batch_size = 20
    data_train = PokemonGenYellowNormal(which_sets=['train'],
                                        sources=['features'])

    train_stream = Flatten(DataStream.default_stream(
        data_train, iteration_scheme=SequentialScheme(
            data_train.num_examples, batch_size)))

    features_size = 56 * 56 * 1

    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.)
    }

    # print train_stream.get_epoch_iterator(as_dict=True).next()
    # raise

    # Rescale pixel values from [0, 255] to [-1, 1]
    inputs = T.matrix('features')
    inputs = ((inputs / 255.) * 2. - 1.)

    rng = MRG_RandomStreams(123)

    prior = Z_prior(dim=512)
    gen = Generator(input_dim=512, dims=[512, 512, 512, 512, features_size],
                    alpha=0.1, **inits)

    dis = Discriminator(dims=[features_size, 512, 512 , 512, 512],
                        alpha=0.1, **inits)

    gan = GAN(dis=dis, gen=gen, prior=prior)
    gan.initialize()

    # Gradient penalty: evaluate the discriminator gradient norm at random
    # interpolations between real and generated samples and penalize its
    # deviation from 1 (the WGAN-GP Lipschitz constraint).
    fake_samples, _ = gan.sampling(inputs.shape[0])
    e = rng.uniform(size=(inputs.shape[0], 1))
    mixed_input = (e * fake_samples) + (1 - e) * inputs
    output_d_mixed = gan._dis.apply(mixed_input)

    grad_mixed = T.grad(T.sum(output_d_mixed), mixed_input)

    norm_grad_mixed = T.sqrt(T.sum(T.square(grad_mixed), axis=1))
    grad_penalty = T.mean(T.square(norm_grad_mixed -1))

    # Wasserstein losses; penalty coefficient 10 as in the WGAN-GP objective
    y_hat1, y_hat0, z = gan.apply(inputs)

    d_loss_real = y_hat1.mean()
    d_loss_fake = y_hat0.mean()
    d_loss = - d_loss_real + d_loss_fake + 10 * grad_penalty
    g_loss = - d_loss_fake

    dis_obj = d_loss
    gen_obj = g_loss

    model = Model([y_hat0, y_hat1])

    em_loss = -d_loss_real + d_loss_fake

    em_loss.name = "Earth Move loss"
    dis_obj.name = 'Discriminator loss'
    gen_obj.name = 'Generator loss'

    cg = ComputationGraph([gen_obj, dis_obj])

    # Split parameters between generator and discriminator for the
    # alternating updates.
    gen_filter = VariableFilter(roles=[PARAMETER],
                                bricks=gen.linear_transformations)

    dis_filter = VariableFilter(roles=[PARAMETER],
                                bricks=dis.linear_transformations)

    gen_params = gen_filter(cg.variables)
    dis_params = dis_filter(cg.variables)

    # Prepare the dropout: only applied to the generator's linear inputs.
    _inputs = []
    for brick_ in [gen]:
        _inputs.extend(VariableFilter(roles=[INPUT],
                                      bricks=brick_.linear_transformations)(cg.variables))

    cg_dropout = apply_dropout(cg, _inputs, 0.02)

    gen_obj = cg_dropout.outputs[0]
    dis_obj = cg_dropout.outputs[1]

    gan.dis_params = dis_params
    gan.gen_params = gen_params

    # gradient penalty
    # z is held constant for the generator update (gen_consider_constant)
    algo = AdverserialTraning(gen_obj=gen_obj, dis_obj=dis_obj,
                              model=gan, dis_iter=5,
                              gradient_clip=None,
                              step_rule=RMSProp(learning_rate=1e-4),
                              gen_consider_constant=z)

    neg_sample = gan.sampling(size=25)

    from blocks.monitoring.aggregation import mean

    monitor = TrainingDataMonitoring(variables=[mean(gen_obj),
                                                mean(dis_obj),
                                                mean(em_loss)],
                                     prefix="train", after_batch=True)

    subdir = './exp/' + 'pokemon-wgan-gp' + "-" + time.strftime("%Y%m%d-%H%M%S")

    check_point = Checkpoint("{}/{}".format(subdir, 'CIFAR10'),
                             every_n_epochs=100,
                             save_separately=['log', 'model'])

    neg_sampling = GenerateNegtiveSample(neg_sample,
                                         img_size=(25, 56, 56),
                                         every_n_epochs=10)

    if not os.path.exists(subdir):
        os.makedirs(subdir)

    main_loop = MainLoop(algorithm=algo,
                         model=model,
                         data_stream=train_stream,
                         extensions=[Printing(), ProgressBar(), monitor,
                                     check_point, neg_sampling])

    main_loop.run()
def main(name, epochs, batch_size, learning_rate):
    """Train an attention read/write autoencoder on binarized MNIST.

    The pipeline is: two MLPs predict attention parameters from the input
    image; a zoomable attention window reads a 12x12 glimpse; an MLP maps
    the read glimpse to a 14x14 write patch; the written canvas is squashed
    with a sigmoid and scored against the input with binary cross-entropy.
    Trained with Adam + step clipping in a Blocks ``MainLoop``.

    Parameters
    ----------
    name : str or None
        Experiment name (defaults to ``"att-rw"``); also the pickle filename.
    epochs : int
        Number of training epochs before ``FinishAfter`` stops the loop.
    batch_size : int
        Minibatch size for both train and test streams.
    learning_rate : float
        Adam learning rate.
    """
    if name is None:
        name = "att-rw"

    print("\nRunning experiment %s" % name)
    print("         learning rate: %5.3f" % learning_rate)
    print()

    #------------------------------------------------------------------------
    img_height, img_width = 28, 28

    read_N = 12
    write_N = 14

    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.001),
        'biases_init': Constant(0.),
    }

    x_dim = img_height * img_width

    reader = ZoomableAttentionWindow(img_height, img_width, read_N)
    writer = ZoomableAttentionWindow(img_height, img_width, write_N)

    # Parameterize the attention reader and writer:
    # each MLP emits 5 attention parameters (center_y, center_x, delta,
    # sigma, gamma) from the raw image.
    mlpr = MLP(activations=[Tanh(), Identity()],
               dims=[x_dim, 50, 5],
               name="RMLP",
               **inits)
    mlpw = MLP(activations=[Tanh(), Identity()],
               dims=[x_dim, 50, 5],
               name="WMLP",
               **inits)

    # MLP between the reader and writer
    mlp = MLP(activations=[Tanh(), Identity()],
              dims=[read_N**2, 300, write_N**2],
              name="MLP",
              **inits)

    for brick in [mlpr, mlpw, mlp]:
        brick.allocate()
        brick.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    hr = mlpr.apply(x)
    hw = mlpw.apply(x)

    center_y, center_x, delta, sigma, gamma = reader.nn2att(hr)
    r = reader.read(x, center_y, center_x, delta, sigma)

    h = mlp.apply(r)

    center_y, center_x, delta, sigma, gamma = writer.nn2att(hw)
    c = writer.write(h, center_y, center_x, delta, sigma) / gamma
    x_recons = T.nnet.sigmoid(c)

    cost = BinaryCrossEntropy().apply(x, x_recons)
    cost.name = "cost"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    # RemoveNotFinite guards against NaN/Inf steps before clipping
    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            RemoveNotFinite(),
            Adam(learning_rate),
            StepClipping(3.),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    #for v in [center_y, center_x, log_delta, log_sigma, log_gamma]:
    #    v_mean = v.mean()
    #    v_mean.name = v.name
    #    monitors += [v_mean]
    #    monitors += [aggregation.mean(v)]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["cost"],
    ]

    #------------------------------------------------------------
    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])
    #mnist_train = MNIST("train", binary=True, sources=['features'])
    #mnist_test = MNIST("test", binary=True, sources=['features'])

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=ForceFloatX(
            DataStream(mnist_train,
                       iteration_scheme=SequentialScheme(
                           mnist_train.num_examples, batch_size))),
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            DataStreamMonitoring(
                monitors,
                ForceFloatX(
                    DataStream(mnist_test,
                               iteration_scheme=SequentialScheme(
                                   mnist_test.num_examples, batch_size))),
                prefix="test"),
            TrainingDataMonitoring(
                train_monitors,
                prefix="train",
                after_every_epoch=True),
            SerializeMainLoop(name + ".pkl"),
            #Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()
        ])
    main_loop.run()
def add_norm_grads_vars(self):
    """Monitor mean gradient norm, mean step norm, and their ratio.

    Registers three aggregated channels with the monitoring machinery:
    the batch-averaged total gradient norm, the batch-averaged total
    step norm, and ``grad_over_step`` (gradient norm / step norm).
    """
    algo = self.algorithm
    avg_grad_norm = aggregation.mean(algo.total_gradient_norm)
    avg_step_norm = aggregation.mean(algo.total_step_norm)
    ratio = avg_grad_norm / avg_step_norm
    ratio.name = 'grad_over_step'
    self.add_monitored_vars([avg_grad_norm, avg_step_norm, ratio])
def run_pretrain(model, hyper_params, cost, train_data, valid_data=None, extra_costs=None): """ generic training method for neural networks; works with any network structure :return: """ from fuel.streams import DataStream from fuel.schemes import SequentialScheme, ShuffledScheme from blocks.filter import VariableFilter from blocks.graph import ComputationGraph from blocks.roles import WEIGHT from blocks.algorithms import GradientDescent, Adam, RMSProp, Scale from blocks.extensions import FinishAfter, Timing, Printing, ProgressBar from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring from blocks.extensions.predicates import OnLogRecord from blocks.monitoring import aggregation from blocks.main_loop import MainLoop from blocks.extensions.training import TrackTheBest from deepthought.extensions.parameters import BestParams if extra_costs is None: extra_costs = [] cg = ComputationGraph([cost]) # TODO: more hyper-params for regularization # L1 regularization if hyper_params['l1wdecay'] > 0: weights = VariableFilter(roles=[WEIGHT])(cg.variables) cost = cost + hyper_params['l1wdecay'] * sum([abs(W).sum() for W in weights]) cost.name = 'cost' # set up step_rule if hyper_params['step_rule'] == 'Adam': step_rule = Adam(learning_rate=hyper_params['learning_rate']) elif hyper_params['step_rule'] == 'RMSProp': step_rule = RMSProp(learning_rate=hyper_params['learning_rate']) #, decay_rate=0.9, max_scaling=1e5) else: step_rule = Scale(learning_rate=hyper_params['learning_rate']) algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=step_rule) if 'blocks_print_variable_names' in hyper_params and hyper_params['blocks_print_variable_names']: print 'cg.variables:', cg.variables train_monitoring_vars = [cost] + extra_costs + [aggregation.mean(algorithm.total_gradient_norm)] for var_name in hyper_params['blocks_extensions_train_monitoring_channels']: for v in cg.variables: if v.name == var_name: print 'Monitoring variable:', v 
train_monitoring_vars.append(v) # default extensions extensions = [Timing(), FinishAfter(after_n_epochs=hyper_params['max_epochs']), TrainingDataMonitoring( train_monitoring_vars, suffix="train", after_epoch=True) ] # additional stuff if validation set is used if valid_data is not None: valid_monitoring_vars = [cost] + extra_costs for var_name in hyper_params['blocks_extensions_valid_monitoring_channels']: for v in cg.variables: if v.name == var_name: print 'Monitoring variable:', v valid_monitoring_vars.append(v) extensions.append( DataStreamMonitoring( valid_monitoring_vars, DataStream.default_stream( valid_data, iteration_scheme=SequentialScheme( valid_data.num_examples, hyper_params['batch_size'])), suffix="valid")) best_channel = 'cost_valid' print '#train:', train_data.num_examples, '#valid:', valid_data.num_examples else: best_channel = 'cost_train' print '#train:', train_data.num_examples # tracking of the best best_params = BestParams() best_params.add_condition(['after_epoch'], predicate=OnLogRecord(best_channel + '_best_so_far')) extensions.append(TrackTheBest(best_channel)) extensions.append(best_params) # after TrackTheBest! 
# printing and plotting if hyper_params['blocks_extensions_printing'] is True: extensions.append(Printing()) # optional if hyper_params['blocks_extensions_progressbar'] is True: extensions.append(ProgressBar()) if hyper_params['blocks_extensions_bokeh'] is True: try: from blocks_extras.extensions.plot import Plot bokeh_available = True except: bokeh_available = False print 'bokeh available: ', bokeh_available if bokeh_available: extensions.append(Plot( hyper_params['blocks_extensions_bokeh_plot_title'], channels=hyper_params['blocks_extensions_bokeh_channels'], )) main_loop = MainLoop( algorithm, DataStream.default_stream( train_data, iteration_scheme=ShuffledScheme( train_data.num_examples, hyper_params['batch_size'])), model=model, extensions=extensions) main_loop.run() return best_params.values, main_loop.status['best_' + best_channel]
def training(self, fea2obj, batch_size, learning_rate=0.005, steprule='adagrad', wait_epochs=5, kl_weight_init=None, klw_ep=50, klw_inc_rate=0, num_epochs=None):
    """Train the model built by `build_model_new` with early stopping on dev cost.

    :param fea2obj: mapping from feature name to feature object; must contain
        a 'targets' entry exposing `t2idx`
    :param batch_size: training minibatch size (dev stream is unbatched)
    :param learning_rate: learning rate for the chosen step rule
    :param steprule: substring-matched step rule name
        ('adagrad', 'adadelta', 'decay', 'momentum', 'adam')
    :param wait_epochs: patience for FinishIfNoImprovementAfter on dev cost
    :param kl_weight_init: optional explicit initial KL weight; otherwise
        taken from config 'kld_weight' (default 1)
    :param klw_ep, klw_inc_rate: KL-weight schedule knobs (currently only
        used by the commented-out SharedVariableModifier extensions)
    :param num_epochs: overrides config 'nepochs' when given
    :raises ValueError: if `steprule` matches no known rule
    """
    networkfile = self._config['net']
    n_epochs = num_epochs or int(self._config['nepochs'])
    reg_weight = float(self._config['loss_weight'])
    reg_type = self._config['loss_reg']
    numtrain = int(
        self._config['num_train']) if 'num_train' in self._config else None
    train_stream, num_samples_train = get_comb_stream(
        fea2obj, 'train', batch_size, shuffle=True, num_examples=numtrain)
    dev_stream, num_samples_dev = get_comb_stream(fea2obj, 'dev',
                                                  batch_size=None,
                                                  shuffle=False)
    logger.info('sources: %s -- number of train/dev samples: %d/%d',
                train_stream.sources, num_samples_train, num_samples_dev)
    t2idx = fea2obj['targets'].t2idx

    # FIX: the original expression
    #   kl_weight_init or float(...) if 'kld_weight' in self._config else 1
    # parsed as (kl_weight_init or float(...)) if ... else 1, which silently
    # discarded an explicit kl_weight_init whenever the config key was absent.
    if kl_weight_init is not None:
        klw_init = kl_weight_init
    elif 'kld_weight' in self._config:
        klw_init = float(self._config['kld_weight'])
    else:
        klw_init = 1
    logger.info('kl_weight_init: %d', klw_init)
    kl_weight = shared_floatx(klw_init, 'kl_weight')
    entropy_weight = shared_floatx(1., 'entropy_weight')

    cost, p_at_1, _, KLD, logpy_xz, pat1_recog, misclassify_rate = build_model_new(
        fea2obj, len(t2idx), self._config, kl_weight, entropy_weight)

    cg = ComputationGraph(cost)
    weights = VariableFilter(roles=[WEIGHT])(cg.parameters)
    logger.info('Model weights are: %s', weights)
    if 'L2' in reg_type:
        cost += reg_weight * l2_norm(weights)
        logger.info('applying %s with weight: %f ', reg_type, reg_weight)

    # Dropout is deliberately disabled (negative value never passes the check).
    dropout = -0.1
    if dropout > 0:
        cg = apply_dropout(cg, weights, dropout)
        cost = cg.outputs[0]

    cost.name = 'cost'
    logger.info('Our Algorithm is : %s, and learning_rate: %f', steprule,
                learning_rate)
    if 'adagrad' in steprule:
        cnf_step_rule = AdaGrad(learning_rate)
    elif 'adadelta' in steprule:
        cnf_step_rule = AdaDelta(decay_rate=0.95)
    elif 'decay' in steprule:
        cnf_step_rule = RMSProp(learning_rate=learning_rate, decay_rate=0.90)
        cnf_step_rule = CompositeRule([cnf_step_rule, StepClipping(1)])
    elif 'momentum' in steprule:
        cnf_step_rule = Momentum(learning_rate=learning_rate, momentum=0.9)
    elif 'adam' in steprule:
        cnf_step_rule = Adam(learning_rate=learning_rate)
    else:
        # FIX: the original only logged here and fell through, producing a
        # confusing NameError on cnf_step_rule below; fail fast instead.
        logger.info('The steprule param is wrong! which is: %s', steprule)
        raise ValueError('unknown steprule: %s' % steprule)

    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=cnf_step_rule,
                                on_unused_sources='warn')
    #algorithm.add_updates(updates)
    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    monitored_vars = [
        cost, gradient_norm, step_norm, p_at_1, KLD, logpy_xz, kl_weight,
        pat1_recog
    ]
    train_monitor = TrainingDataMonitoring(variables=monitored_vars,
                                           after_batch=True,
                                           before_first_epoch=True,
                                           prefix='tra')

    dev_monitor = DataStreamMonitoring(variables=[
        cost, p_at_1, KLD, logpy_xz, pat1_recog, misclassify_rate
    ],
                                       after_epoch=True,
                                       before_first_epoch=True,
                                       data_stream=dev_stream,
                                       prefix="dev")

    extensions = [
        dev_monitor,
        train_monitor,
        Timing(),
        TrackTheBest('dev_cost'),
        FinishIfNoImprovementAfter('dev_cost_best_so_far',
                                   epochs=wait_epochs),
        Printing(after_batch=False),  #, ProgressBar()
        FinishAfter(after_n_epochs=n_epochs),
        saveload.Load(networkfile + '.toload.pkl'),
    ] + track_best('dev_cost', networkfile + '.best.pkl')
    #extensions.append(SharedVariableModifier(kl_weight,
    #                  lambda n, klw: numpy.cast[theano.config.floatX] (klw_inc_rate + klw), after_epoch=False, every_n_epochs=klw_ep, after_batch=False))
    # extensions.append(SharedVariableModifier(entropy_weight,
    #                  lambda n, crw: numpy.cast[theano.config.floatX](crw - klw_inc_rate), after_epoch=False, every_n_epochs=klw_ep, after_batch=False))

    logger.info('number of parameters in the model: %d',
                tensor.sum([p.size for p in cg.parameters]).eval())
    logger.info('Lookup table sizes: %s',
                [p.size.eval() for p in cg.parameters if 'lt' in p.name])

    main_loop = MainLoop(data_stream=train_stream,
                         algorithm=algorithm,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
def train(model, get_streams, save_path, num_epochs, batch_size, lrs, until_which_epoch, grad_clipping): monitorings = model.monitorings # Training blocks_model = Model(model.cost) all_params = blocks_model.parameters print "Number of found parameters:" + str(len(all_params)) print all_params default_lr = np.float32(1e-4) lr_var = theano.shared(default_lr, name="learning_rate") clipping = StepClipping(threshold=np.cast[floatX](grad_clipping)) # sgd_momentum = Momentum( # learning_rate=0.0001, # momentum=0.95) # step_rule = CompositeRule([clipping, sgd_momentum]) adam = Adam(learning_rate=lr_var) step_rule = CompositeRule([clipping, adam]) training_algorithm = GradientDescent( cost=model.cost, parameters=all_params, step_rule=step_rule) monitored_variables = [ lr_var, aggregation.mean(training_algorithm.total_gradient_norm)] + monitorings for param in all_params: name = param.name to_monitor = training_algorithm.gradients[param].norm(2) to_monitor.name = name + "_grad_norm" monitored_variables.append(to_monitor) to_monitor = param.norm(2) to_monitor.name = name + "_norm" monitored_variables.append(to_monitor) train_data_stream, valid_data_stream = get_streams(batch_size) train_monitoring = TrainingDataMonitoring( variables=monitored_variables, prefix="train", after_epoch=True) valid_monitoring = DataStreamMonitoring( variables=monitored_variables, data_stream=valid_data_stream, prefix="valid", after_epoch=True) main_loop = MainLoop( algorithm=training_algorithm, data_stream=train_data_stream, model=blocks_model, extensions=[ train_monitoring, valid_monitoring, FinishAfter(after_n_epochs=num_epochs), SaveParams('valid_CE', blocks_model, save_path), SaveLog(save_path, after_epoch=True), ProgressBar(), LRDecay(lr_var, lrs, until_which_epoch, after_epoch=True), Printing(after_epoch=True)]) main_loop.run()
def main(mode, save_path, steps, num_batches):
    """Train or sample a GatedRecurrent sequence generator on a Markov chain.

    :param mode: "train" to fit the generator, "sample" to load a pickled
        main loop from `save_path` and generate from it
    :param save_path: checkpoint path (written in train mode, read in sample mode)
    :param steps: number of sampling steps (sample mode only)
    :param num_batches: number of training batches (train mode only)
    :raises ValueError: if `mode` is neither "train" nor "sample"
    """
    num_states = MarkovChainDataset.num_states

    if mode == "train":
        # Experiment configuration
        rng = numpy.random.RandomState(1)
        batch_size = 50
        seq_len = 100
        dim = 10
        feedback_dim = 8

        # Build the bricks and initialize them
        transition = GatedRecurrent(name="transition", dim=dim,
                                    activation=Tanh())
        generator = SequenceGenerator(
            Readout(readout_dim=num_states, source_names=["states"],
                    emitter=SoftmaxEmitter(name="emitter"),
                    feedback_brick=LookupFeedback(
                        num_states, feedback_dim, name='feedback'),
                    name="readout"),
            transition,
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
            name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        # Give an idea of what's going on.
        logger.info("Parameters:\n" + pprint.pformat(
            [(key, value.get_value().shape) for key, value
             in Selector(generator).get_params().items()],
            width=120))
        logger.info("Markov chain entropy: {}".format(
            MarkovChainDataset.entropy))
        logger.info("Expected min error: {}".format(
            -MarkovChainDataset.entropy * seq_len))

        # Build the cost computation graph.
        x = tensor.lmatrix('data')
        # Mean over the batch axis (x.shape[1]) of the per-sequence cost.
        cost = aggregation.mean(
            generator.cost_matrix(x[:, :]).sum(), x.shape[1])
        cost.name = "sequence_log_likelihood"

        algorithm = GradientDescent(
            cost=cost,
            params=list(Selector(generator).get_params().values()),
            step_rule=Scale(0.001))
        main_loop = MainLoop(
            algorithm=algorithm,
            data_stream=DataStream(
                MarkovChainDataset(rng, seq_len),
                iteration_scheme=ConstantScheme(batch_size)),
            model=Model(cost),
            extensions=[
                FinishAfter(after_n_batches=num_batches),
                TrainingDataMonitoring([cost], prefix="this_step",
                                       after_batch=True),
                TrainingDataMonitoring([cost], prefix="average",
                                       every_n_batches=100),
                Checkpoint(save_path, every_n_batches=500),
                Printing(every_n_batches=100)
            ])
        main_loop.run()
    elif mode == "sample":
        # FIX: close the checkpoint file after unpickling (was a leaked
        # open() inside cPickle.load).
        # NOTE(security): unpickling executes arbitrary code -- only load
        # checkpoints you produced yourself.
        with open(save_path, "rb") as src:
            main_loop = cPickle.load(src)
        # NOTE(review): this treats the unpickled model as the generator
        # directly; confirm that `main_loop.model` exposes `generate`.
        generator = main_loop.model

        sample = ComputationGraph(
            generator.generate(n_steps=steps, batch_size=1,
                               iterate=True)).get_theano_function()
        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        # Compare empirical state frequencies to the chain's equilibrium.
        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(
            freqs, MarkovChainDataset.equilibrium))

        # Empirical transition frequencies vs. true transition matrix.
        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, MarkovChainDataset.trans_prob))
    else:
        # FIX: was `assert False`, which is stripped under `python -O` and
        # gives no diagnostic; raise a real error instead.
        raise ValueError("mode must be 'train' or 'sample', got {!r}".format(mode))
def main(save_to, num_epochs, feature_maps=None, mlp_hiddens=None,
         conv_sizes=None, pool_sizes=None, batch_size=500, num_batches=None):
    """Train a LeNet-style convnet on MNIST with plain SGD.

    :param save_to: checkpoint path
    :param num_epochs: stop after this many epochs
    :param feature_maps: feature maps per conv layer (default [20, 50])
    :param mlp_hiddens: hidden sizes of the top MLP (default [500])
    :param conv_sizes: square filter edge per conv layer (default [5, 5])
    :param pool_sizes: square pooling edge per conv layer (default [2, 2])
    :param batch_size: minibatch size for both train and test streams
    :param num_batches: optional batch-count stopping criterion
    """
    if feature_maps is None:
        feature_maps = [20, 50]
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    if conv_sizes is None:
        conv_sizes = [5, 5]
    if pool_sizes is None:
        pool_sizes = [2, 2]
    image_size = (28, 28)
    output_size = 10

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    # NOTE(review): zip(...) returns a lazy iterator on Python 3; this
    # assumes Python 2 (list) semantics -- confirm if ever ported.
    convnet = LeNet(conv_activations, 1, image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    logging.info("Input dim: {} {} {}".format(
        *convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        logging.info("Layer {} ({}) dim: {} {} {}".format(
            i, layer.__class__.__name__, *layer.get_dim('output')))

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cost = CategoricalCrossEntropy().apply(y.flatten(),
                                           probs).copy(name='cost')
    error_rate = MisclassificationRate().apply(y.flatten(), probs).copy(
        name='error_rate')

    cg = ComputationGraph([cost, error_rate])

    mnist_train = MNIST(("train",))
    mnist_train_stream = DataStream.default_stream(
        mnist_train, iteration_scheme=ShuffledScheme(
            mnist_train.num_examples, batch_size))

    mnist_test = MNIST(("test",))
    mnist_test_stream = DataStream.default_stream(
        mnist_test, iteration_scheme=ShuffledScheme(
            mnist_test.num_examples, batch_size))

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=0.1))
    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  DataStreamMonitoring(
                      [cost, error_rate],
                      mnist_test_stream,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [cost, error_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      after_epoch=True),
                  Checkpoint(save_to),
                  ProgressBar(),
                  Printing()]

    model = Model(cost)

    main_loop = MainLoop(
        algorithm,
        mnist_train_stream,
        model=model,
        extensions=extensions)
    main_loop.run()
def train(step_rule, label_dim, state_dim, epochs, seed, dropout, test_cost,
          experiment_path, features, weight_noise, to_watch, patience,
          batch_size, batch_norm, **kwargs):
    """Train an LSTM with a CTC objective on TIMIT phoneme recognition.

    Builds train/dev/test streams, a TimLSTM + Linear + Softmax graph,
    a warp-ctc cost, and a Blocks main loop with CTC decoding monitors and
    early stopping.

    :param step_rule: blocks step rule instance for GradientDescent
    :param label_dim: number of phoneme labels (output is label_dim + 1 for
        the CTC blank)
    :param state_dim: LSTM state size
    :param epochs: maximum number of epochs
    :param seed: RNG seed for the stream iterators
    :param dropout, weight_noise, experiment_path: accepted for interface
        compatibility; not used in this body
    :param test_cost: if True, also monitor cost/decoding on the test set
    :param to_watch, patience: EarlyStopping channel and patience
    :param batch_size: minibatch size for all streams
    :param batch_norm: disables TimLSTM's non-batch-norm path when True
    :param kwargs: may contain 'load_path' to resume from a checkpoint
    """
    print '.. TIMIT experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()

    # ------------------------------------------------------------------------
    # Streams
    rng = np.random.RandomState(seed)
    stream_args = dict(rng=rng, batch_size=batch_size)
    print '.. initializing iterators'
    train_dataset = Timit('train', features=features)
    train_stream = construct_stream(train_dataset, **stream_args)
    dev_dataset = Timit('dev', features=features)
    dev_stream = construct_stream(dev_dataset, **stream_args)
    test_dataset = Timit('test', features=features)
    test_stream = construct_stream(test_dataset, **stream_args)
    # NOTE: update_stream is built but not referenced below.
    update_stream = construct_stream(train_dataset, n_batches=100,
                                     **stream_args)

    # Map raw phones to the reduced phoneme set where a mapping exists.
    phone_dict = train_dataset.get_phoneme_dict()
    phoneme_dict = {
        k: phone_to_phoneme_dict[v] if v in phone_to_phoneme_dict else v
        for k, v in phone_dict.iteritems()
    }
    ind_to_phoneme = {v: k for k, v in phoneme_dict.iteritems()}
    eol_symbol = ind_to_phoneme['<STOP>']

    # ------------------------------------------------------------------------
    # Graph
    print '.. building model'
    x = T.tensor3('features')
    y = T.matrix('phonemes')
    input_mask = T.matrix('features_mask')
    output_mask = T.matrix('phonemes_mask')

    # Test values are attached but compute_test_value stays off; flip it on
    # for shape debugging.
    theano.config.compute_test_value = 'off'
    x.tag.test_value = np.random.randn(100, 24, 123).astype(floatX)
    y.tag.test_value = np.ones((30, 24), dtype=floatX)
    input_mask.tag.test_value = np.ones((100, 24), dtype=floatX)
    output_mask.tag.test_value = np.ones((30, 24), dtype=floatX)

    # NOTE: seq_len/recurrent_init are set but unused below.
    seq_len = 100
    input_dim = 123
    activation = Tanh()
    recurrent_init = IdentityInit(0.99)

    rec1 = TimLSTM(not batch_norm, input_dim, state_dim, activation,
                   name='LSTM')
    rec1.initialize()
    l1 = Linear(state_dim, label_dim + 1, name='out_linear',
                weights_init=Orthogonal(), biases_init=Constant(0.0))
    l1.initialize()
    o1 = rec1.apply(x)
    y_hat_o = l1.apply(o1)
    shape = y_hat_o.shape
    # Softmax over the label axis, applied on a 2D view and reshaped back.
    y_hat = Softmax().apply(y_hat_o.reshape((-1, shape[-1]))).reshape(shape)

    y_mask = output_mask
    y_hat_mask = input_mask

    # ------------------------------------------------------------------------
    # Costs and Algorithm
    # warp-ctc expects 1-based labels, hence y + ones_like(y); mask sums give
    # per-sequence lengths.
    ctc_cost = T.sum(
        ctc.cpu_ctc_th(y_hat_o, T.sum(y_hat_mask, axis=0),
                       y + T.ones_like(y), T.sum(y_mask, axis=0)))
    batch_cost = ctc_cost.copy(name='batch_cost')

    bs = y.shape[1]
    cost_train = aggregation.mean(batch_cost, bs).copy("sequence_cost")
    cost_per_character = aggregation.mean(
        batch_cost, output_mask.sum()).copy("character_cost")
    cg_train = ComputationGraph(cost_train)
    model = Model(cost_train)
    train_cost_per_character = aggregation.mean(
        cost_train, output_mask.sum()).copy("train_character_cost")

    algorithm = GradientDescent(step_rule=step_rule, cost=cost_train,
                                parameters=cg_train.parameters,
                                on_unused_sources='warn')

    # ------------------------------------------------------------------------
    # Monitoring and extensions
    parameters = model.get_parameter_dict()
    observed_vars = [
        cost_train, train_cost_per_character,
        aggregation.mean(algorithm.total_gradient_norm)
    ]
    # Also monitor the L2 norm of each parameter and its gradient.
    for name, param in parameters.iteritems():
        observed_vars.append(param.norm(2).copy(name + "_norm"))
        observed_vars.append(
            algorithm.gradients[param].norm(2).copy(name + "_grad_norm"))
    train_monitor = TrainingDataMonitoring(variables=observed_vars,
                                           prefix="train",
                                           after_epoch=True)

    dev_monitor = DataStreamMonitoring(
        variables=[cost_train, cost_per_character],
        data_stream=dev_stream,
        prefix="dev")
    train_ctc_monitor = CTCMonitoring(x, input_mask, y_hat, eol_symbol,
                                      train_stream, prefix='train',
                                      every_n_epochs=1, before_training=True,
                                      phoneme_dict=phoneme_dict,
                                      black_list=black_list, train=True)
    dev_ctc_monitor = CTCMonitoring(x, input_mask, y_hat, eol_symbol,
                                    dev_stream, prefix='dev',
                                    every_n_epochs=1,
                                    phoneme_dict=phoneme_dict,
                                    black_list=black_list)

    extensions = []
    if 'load_path' in kwargs:
        extensions.append(Load(kwargs['load_path']))

    extensions.extend([
        FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor,
        train_ctc_monitor, dev_ctc_monitor
    ])

    if test_cost:
        test_monitor = DataStreamMonitoring(
            variables=[cost_train, cost_per_character],
            data_stream=test_stream,
            prefix="test")
        test_ctc_monitor = CTCMonitoring(x, input_mask, y_hat, eol_symbol,
                                         test_stream, prefix='test',
                                         every_n_epochs=1,
                                         phoneme_dict=phoneme_dict,
                                         black_list=black_list)
        extensions.append(test_monitor)
        extensions.append(test_ctc_monitor)

    #if not os.path.exists(experiment_path):
    #    os.makedirs(experiment_path)
    #best_path = os.path.join(experiment_path, 'best/')
    #if not os.path.exists(best_path):
    #    os.mkdir(best_path)
    #best_path = os.path.join(best_path, 'model.bin')
    extensions.append(EarlyStopping(to_watch, patience, '/dev/null'))
    extensions.extend([ProgressBar(), Printing()])

    # ------------------------------------------------------------------------
    # Main Loop
    main_loop = MainLoop(model=model, data_stream=train_stream,
                         algorithm=algorithm, extensions=extensions)

    print "Building time: %f" % (time.time() - t0)
    # if write_predictions:
    #     with open('predicted.txt', 'w') as f_pred:
    #         with open('targets.txt', 'w') as f_targets:
    #             evaluator = CTCEvaluator(
    #                 eol_symbol, x, input_mask, y_hat,
    #                 phoneme_dict, black_list)
    #             evaluator.evaluate(dev_stream, file_pred=f_pred,
    #                                file_targets=f_targets)
    #             return
    main_loop.run()
# FIXME hard-coded for 3-layered LSTM modifier_functions = { network.transitions[0].name : modifier_function_0, network.transitions[1].name : modifier_function_1, network.transitions[2].name : modifier_function_2 } #init_state_modifier = SharedVariableModifier(network.transitions[-1].initial_state_, function=modifier_function, after_batch=True) init_state_modifiers = [SharedVariableModifier(trans.initial_state_, function=modifier_functions[trans.name], after_batch=True) for trans in network.transitions] #state_function = function([state_to_compare], initial_states[2], updates=[(init_state_2, state_to_compare[0][-1])]) #TODO look at this, this is how it basically works! monitor_grad = TrainingDataMonitoring(variables=[cross_ent, aggregation.mean(algorithm.total_gradient_norm), aggregation.mean(algorithm.total_step_norm)], #+initial_states+[state_to_compare_1], prefix="training", after_batch=True) early_stopping = EarlyStopping(variables=[cross_ent], data_stream=data_stream_valid, path="seqgen_" + args.type + "_" + "_".join([str(d) for d in network.hidden_dims]) + ".pkl", tolerance=4, prefix="validation") prkwargs = { #'after_batch':True # use this for prints after every batch } main_loop = MainLoop(algorithm=algorithm, data_stream=data_stream, model=cost_model, extensions=[monitor_grad, early_stopping, FinishAfter(after_n_epochs=args.epochs), ProgressBar(), Timing(), Printing(**prkwargs)]+init_state_modifiers)
def main(config, tr_stream, dev_stream):
    """Build and train a character-level RNN encoder-decoder NMT model.

    Wires source/target Theano variables, a BidirectionalEncoder and a
    Decoder, orthogonal initialization of all recurrent transitions, clipped
    Adam training, cost-curve monitoring, checkpointing, and (optionally)
    sampling hooks and model reloading.

    :param config: dict of model/training hyper-parameters (vocab sizes,
        layer dims/depths, clipping, save/print/sample frequencies, ...)
    :param tr_stream: training stream; must expose `trg_bos` and `space_idx`
    :param dev_stream: development stream (accepted for interface
        compatibility; not referenced in this body)
    """
    # Create Theano variables
    logger.info('Creating theano variables')
    source_char_seq = tensor.lmatrix('source_char_seq')
    source_sample_matrix = tensor.btensor3('source_sample_matrix')
    source_char_aux = tensor.bmatrix('source_char_aux')
    source_word_mask = tensor.bmatrix('source_word_mask')
    target_char_seq = tensor.lmatrix('target_char_seq')
    target_char_aux = tensor.bmatrix('target_char_aux')
    target_char_mask = tensor.bmatrix('target_char_mask')
    target_sample_matrix = tensor.btensor3('target_sample_matrix')
    target_word_mask = tensor.bmatrix('target_word_mask')
    target_resample_matrix = tensor.btensor3('target_resample_matrix')
    target_prev_char_seq = tensor.lmatrix('target_prev_char_seq')
    target_prev_char_aux = tensor.bmatrix('target_prev_char_aux')

    target_bos_idx = tr_stream.trg_bos
    target_space_idx = tr_stream.space_idx['target']

    # Construct model
    logger.info('Building RNN encoder-decoder')

    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'],
                                   config['src_dgru_nhids'],
                                   config['enc_nhids'],
                                   config['src_dgru_depth'],
                                   config['bidir_encoder_depth'])

    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['trg_dgru_nhids'], config['trg_igru_nhids'],
                      config['dec_nhids'], config['enc_nhids'] * 2,
                      config['transition_depth'], config['trg_igru_depth'],
                      config['trg_dgru_depth'], target_space_idx,
                      target_bos_idx)

    representation = encoder.apply(source_char_seq, source_sample_matrix,
                                   source_char_aux, source_word_mask)
    cost = decoder.cost(representation, source_word_mask, target_char_seq,
                        target_sample_matrix, target_resample_matrix,
                        target_char_aux, target_char_mask, target_word_mask,
                        target_prev_char_seq, target_prev_char_aux)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model: Gaussian weights / zero biases everywhere, then
    # orthogonal init for every recurrent transition.
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.decimator.bidir_w.prototype.recurrent.weights_init = Orthogonal()
    for layer_n in range(config['src_dgru_depth']):
        encoder.decimator.dgru.transitions[layer_n].weights_init = Orthogonal()
    for layer_n in range(config['bidir_encoder_depth']):
        encoder.children[1 + layer_n].prototype.recurrent.weights_init = \
            Orthogonal()
    if config['trg_igru_depth'] == 1:
        decoder.interpolator.igru.weights_init = Orthogonal()
    else:
        for layer_n in range(config['trg_igru_depth']):
            decoder.interpolator.igru.transitions[layer_n].weights_init = \
                Orthogonal()
    for layer_n in range(config['trg_dgru_depth']):
        decoder.interpolator.feedback_brick.dgru.transitions[
            layer_n].weights_init = Orthogonal()
    for layer_n in range(config['transition_depth']):
        decoder.transition.transitions[layer_n].weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info(' {:15}: {}'.format(str(shape), count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info(' {:15}: {}'.format(str(value.get_value().shape), name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # You could use 1e-4 in Adam, however manually decay will be faster.
    # We decay it to 5e-4 when trained for about 30K
    # then decay it to 2e-4 when trained for about 90K
    # finally set it to 1e-4 when trained for about 180K
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 Adam(learning_rate=1e-3)]))

    # Set extensions
    logger.info("Initializing extensions")
    # Extensions
    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    train_monitor = CostCurve([cost, gradient_norm, step_norm],
                              config=config, after_batch=True,
                              before_first_epoch=True, prefix='tra')
    extensions = [
        train_monitor, Timing(),
        Printing(every_n_batches=config['print_freq']),
        FinishAfter(after_n_batches=config['finish_after']),
        CheckpointNMT(saveto=config['saveto'],
                      dump_freq=config['dump_freq'],
                      every_n_batches=config['save_freq'], )]

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1:
        logger.info("Building sampling model")
        generated = decoder.generate(representation, source_word_mask)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[config['transition_depth']]))
        # generated[transition_depth] is next_outputs

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model, data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    transition_depth=config['transition_depth'],
                    every_n_batches=config['sampling_freq'],
                    src_vocab_size=config['src_vocab_size']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=extensions
    )

    # Train!
    main_loop.run()
def run():
    """Train a recurrent regression model with RMSProp + step clipping.

    Builds the graph via `nn_fprop`, monitors cost and gradient/step norms on
    train and test streams, optionally decays the learning rate, and runs the
    Blocks main loop. Relies on module-level config: `hdf5_file`,
    `batch_size`, `output_columns`, `input_columns`, `hidden_size`,
    `num_recurrent_layers`, `learning_rate`, `decay_rate`, `step_clipping`,
    `nepochs`, `load_path`, `last_path`, `save_path`,
    `learning_rate_decay`, `lr_decay_every_n_epochs`.
    """
    # Load Model
    net_size = 256  #Hard-code instead of loading model (takes too long to set up network)
    #net = vaegan.VAEGAN()
    #network_saver = saver.NetworkSaver('vaegan/models/', net=net)
    #network_saver.load()

    # DATA
    train_stream = get_stream(hdf5_file, 'train', batch_size)  #TODO jonathan ?
    test_stream = get_stream(hdf5_file, 'test', batch_size)  #TODO jonathan ?

    # MODEL
    x = T.TensorType('floatX', [False] * 3)('features')
    y = T.tensor3('targets', dtype='floatX')
    # Wrapped in a list so extensions can flip the shared flag in place.
    train_flag = [theano.shared(0)]
    # Swap batch and time axes (recurrent layers expect time-major input).
    x = x.swapaxes(0, 1)
    y = y.swapaxes(0, 1)

    # More Config
    out_size = len(output_columns) - 1  # code_mode=RL-MDN
    latent_size = net_size
    in_size = latent_size + len(input_columns)

    # NN fprop
    y_hat, cost, cells = nn_fprop(x, y, in_size, out_size, hidden_size,
                                  num_recurrent_layers, train_flag)

    # COST
    cg = ComputationGraph(cost)
    extra_updates = []

    # RMS Prop training optimizer
    step_rules = [
        RMSProp(learning_rate=learning_rate, decay_rate=decay_rate),
        StepClipping(step_clipping)
    ]
    parameters_to_update = cg.parameters
    algorithm = GradientDescent(cost=cg.outputs[0],
                                parameters=parameters_to_update,
                                step_rule=CompositeRule(step_rules))
    algorithm.add_updates(
        extra_updates)  # TODO jonathan what is this, is this needed?

    # Extensions
    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    monitored_vars = [
        cost, step_rules[0].learning_rate, gradient_norm, step_norm
    ]

    test_monitor = DataStreamMonitoring(variables=[cost],
                                        after_epoch=True,
                                        before_first_epoch=True,
                                        data_stream=test_stream,
                                        prefix="test")
    train_monitor = TrainingDataMonitoring(variables=monitored_vars,
                                           after_epoch=True,
                                           before_first_epoch=True,
                                           prefix='train')
    set_train_flag = SetTrainFlag(after_epoch=True, before_epoch=True,
                                  flag=train_flag)

    # plot = Plot('Plotting example', channels=[['cost']], after_batch=True, open_browser=True)
    extensions = [
        set_train_flag, test_monitor, train_monitor, Timing(),
        Printing(after_epoch=True),
        FinishAfter(after_n_epochs=nepochs),
        saveload.Load(load_path),
        saveload.Checkpoint(last_path, every_n_epochs=10000),
    ] + track_best('test_cost', save_path)  #+ track_best('train_cost', last_path)

    # Geometric learning-rate decay via a shared-variable modifier
    # (skipped for decay factors 0 and 1, which would zero or freeze the rate).
    if learning_rate_decay not in (0, 1):
        extensions.append(
            SharedVariableModifier(step_rules[0].learning_rate,
                                   lambda n, lr: np.cast[theano.config.floatX]
                                   (learning_rate_decay * lr),
                                   after_epoch=False,
                                   every_n_epochs=lr_decay_every_n_epochs,
                                   after_batch=False))

    print 'number of parameters in the model: ' + str(
        T.sum([p.size for p in cg.parameters]).eval())
    # Finally build the main loop and train the model
    mainLoop = MainLoop(data_stream=train_stream,
                        algorithm=algorithm,
                        model=Model(cost),
                        extensions=extensions)
    mainLoop.run()
def main(name, dataset, epochs, batch_size, learning_rate, attention, n_iter,
         enc_dim, dec_dim, z_dim, oldmodel, live_plotting):
    """Train a DRAW generative model (Gregor et al.) on the given dataset.

    attention: "" for full read/write, or "readN,writeN" glimpse sizes.
    oldmodel: optional path to a pickled model whose parameters seed this run.
    live_plotting: enable a Bokeh ``Plot`` extension when True.
    """
    image_size, channels, data_train, data_valid, data_test = datasets.get_data(
        dataset)
    train_stream = Flatten(
        DataStream.default_stream(data_train,
                                  iteration_scheme=SequentialScheme(
                                      data_train.num_examples, batch_size)))
    valid_stream = Flatten(
        DataStream.default_stream(data_valid,
                                  iteration_scheme=SequentialScheme(
                                      data_valid.num_examples, batch_size)))
    test_stream = Flatten(
        DataStream.default_stream(data_test,
                                  iteration_scheme=SequentialScheme(
                                      data_test.num_examples, batch_size)))
    if name is None:
        name = dataset
    img_height, img_width = image_size
    x_dim = channels * img_height * img_width
    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    # Configure attention mechanism
    if attention != "":
        read_N, write_N = attention.split(',')
        read_N = int(read_N)
        write_N = int(write_N)
        # read glimpse carries both x and x_hat, hence the factor 2
        read_dim = 2 * channels * read_N**2
        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 channels=channels, width=img_width,
                                 height=img_height, N=read_N, **inits)
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 channels=channels, width=img_width,
                                 height=img_height, N=write_N, **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)
    else:
        read_dim = 2 * x_dim
        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)
        attention_tag = "full"
    #----------------------------------------------------------------------
    if name is None:
        name = dataset

    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation.
            E.g.:
                0.1   -> 11
                0.01  -> 12
                0.001 -> 13
                0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)
    subdir = name + "-" + time.strftime("%Y%m%d-%H%M%S")
    longname = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (
        dataset, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)
    pickle_file = subdir + "/" + longname + ".pkl"

    print("\nRunning experiment %s" % longname)
    print(" dataset: %s" % dataset)
    print(" subdirectory: %s" % subdir)
    print(" learning rate: %g" % learning_rate)
    print(" attention: %s" % attention)
    print(" n_iterations: %d" % n_iter)
    print(" encoder dimension: %d" % enc_dim)
    print(" z dimension: %d" % z_dim)
    print(" decoder dimension: %d" % dec_dim)
    print(" batch size: %d" % batch_size)
    print(" epochs: %d" % epochs)
    print()

    #----------------------------------------------------------------------
    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    # LSTMs take 4*dim inputs (gates), hence the 4* factors below.
    encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc", **inits)
    decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim],
                      name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)
    draw = DrawModel(n_iter,
                     reader=reader,
                     encoder_mlp=encoder_mlp,
                     encoder_rnn=encoder_rnn,
                     sampler=q_sampler,
                     decoder_mlp=decoder_mlp,
                     decoder_rnn=decoder_rnn,
                     writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')
    x_recons, kl_terms = draw.reconstruct(x)
    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"
    # Variational bound: reconstruction + KL summed over iterations,
    # averaged over the batch.
    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)
    algorithm = GradientDescent(
        cost=cost,
        parameters=params,
        step_rule=CompositeRule([
            StepClipping(10.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )

    #------------------------------------------------------------------------
    # Setup monitors: track the per-iteration KL terms individually.
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t
        #x_recons_t = T.nnet.sigmoid(c[t,:,:])
        #recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        #recons_term_t = recons_term_t.mean()
        #recons_term_t.name = "recons_term_%d" % t
        monitors += [kl_term_t]
    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]
    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        #["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------
    if not os.path.exists(subdir):
        os.makedirs(subdir)
    plotting_extensions = []
    if live_plotting:
        plotting_extensions = [Plot(name, channels=plot_channels)]

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(train_monitors, prefix="train",
                                   after_epoch=True),
            # DataStreamMonitoring(
            #     monitors,
            #     valid_stream,
            ##     updates=scan_updates,
            #     prefix="valid"),
            DataStreamMonitoring(
                monitors,
                test_stream,
                # updates=scan_updates,
                prefix="test"),
            #Checkpoint(name, before_training=False, after_epoch=True, save_separately=['log', 'model']),
            Checkpoint("{}/{}".format(subdir, name),
                       save_main_loop=False,
                       before_training=True,
                       after_epoch=True,
                       save_separately=['log', 'model']),
            SampleCheckpoint(image_size=image_size[0],
                             channels=channels,
                             save_subdir=subdir,
                             before_training=True,
                             after_epoch=True),
            ProgressBar(),
            Printing()
        ] + plotting_extensions)

    if oldmodel is not None:
        print("Initializing parameters with old model %s" % oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            # NOTE(review): uses get_param_values(); another script in this
            # file calls get_parameter_values() — API renamed across Blocks
            # versions, verify against the installed version.
            main_loop.model.set_parameter_values(oldmodel.get_param_values())
        del oldmodel
    main_loop.run()
def test_training_data_monitoring():
    """Exercise TrainingDataMonitoring on a tiny linear-regression problem.

    Trains one epoch of three single-example batches and checks that:
    per-batch monitoring matches a ground-truth extension, epoch-level
    aggregation averages correctly, shared variables are read, and
    non-Theano MonitoredQuantity objects are aggregated.
    """
    weights = numpy.array([-1, 1], dtype=theano.config.floatX)
    features = [
        numpy.array(f, dtype=theano.config.floatX)
        for f in [[1, 2], [3, 5], [5, 8]]
    ]
    targets = numpy.array([(weights * f).sum() for f in features])
    n_batches = 3
    dataset = IterableDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    V = shared_floatx(7, name='V')
    W_sum = W.sum().copy(name='W_sum')
    cost = ((x * W).sum() - y)**2
    cost.name = 'cost'

    class TrueCostExtension(TrainingExtension):
        # Records the pre-update cost so it can be compared against the
        # monitored (post-update-log) value below.
        def before_batch(self, data):
            self.main_loop.log.current_row['true_cost'] = ((
                (W.get_value() * data["features"]).sum() -
                data["targets"])**2)

    # Note, that unlike a Theano variable, a monitored
    # quantity can't be reused in more than one TrainingDataMonitoring
    ftt1 = MeanFeaturesTimesTarget(requires=[x, y], name='ftt1')
    ftt2 = MeanFeaturesTimesTarget(requires=[x, y], name='ftt2')

    main_loop = MainLoop(model=None,
                         data_stream=dataset.get_example_stream(),
                         algorithm=GradientDescent(cost=cost,
                                                   parameters=[W],
                                                   step_rule=Scale(0.001)),
                         extensions=[
                             FinishAfter(after_n_epochs=1),
                             TrainingDataMonitoring([W_sum, cost, V, ftt1],
                                                    prefix="train1",
                                                    after_batch=True),
                             TrainingDataMonitoring(
                                 [aggregation.mean(W_sum), cost, ftt2],
                                 prefix="train2",
                                 after_epoch=True),
                             TrueCostExtension()
                         ])

    main_loop.run()

    # Check monitoring of a shared variable
    assert_allclose(main_loop.log.current_row['train1_V'], 7.0)

    for i in range(n_batches):
        # The ground truth is written to the log before the batch is
        # processed, whereas the extension writes after the batch is
        # processed. This is why the iteration numbers differ here.
        assert_allclose(main_loop.log[i]['true_cost'],
                        main_loop.log[i + 1]['train1_cost'])
    assert_allclose(
        main_loop.log[n_batches]['train2_cost'],
        sum([main_loop.log[i]['true_cost']
             for i in range(n_batches)]) / n_batches)
    assert_allclose(
        main_loop.log[n_batches]['train2_W_sum'],
        sum([
            main_loop.log[i]['train1_W_sum'] for i in range(1, n_batches + 1)
        ]) / n_batches)

    # Check monitoring of non-Theano quantities
    for i in range(n_batches):
        assert_allclose(main_loop.log[i + 1]['train1_ftt1'],
                        features[i] * targets[i])
        assert_allclose(main_loop.log[n_batches]['train2_ftt2'],
                        (features * targets[:, None]).mean(axis=0))
def main(dataset):
    """Train a DRAW-based classifier on SVRT sketch images.

    Hyperparameters are hard-coded below; ``dataset`` selects the data
    subdirectory under /home/ubuntu/svrt_data/ and names the run.
    """
    #----------------------------------------------------------------------------
    epochs = 50
    batch_size = 200
    learning_rate = 3e-4
    attention = '16,16'
    n_iter = 8
    enc_dim = 1024
    dec_dim = 1024
    z_dim = 100
    oldmodel = None
    #dataset = 'sketch'
    data_dir = '/home/ubuntu/svrt_data/'+dataset
    name = dataset
    #----------------------------------------------------------------------------
    #----------------------------------------------------------------------------
    # image_size, channels, data_train, data_valid, data_test = datasets.get_data(dataset)
    # train_ind = np.arange(data_train.num_examples)
    # test_ind = np.arange(data_test.num_examples)
    # rng = np.random.RandomState(seed=1)
    # rng.shuffle(train_ind)
    # rng.shuffle(test_ind)
    # train_stream = Flatten(DataStream.default_stream(
    #     data_train, iteration_scheme=ShuffledScheme(train_ind, batch_size)))
    # test_stream = Flatten(DataStream.default_stream(
    #     data_test, iteration_scheme=ShuffledScheme(test_ind, batch_size)))

    #Get shuffled data
    test_X, train_X, test_y, train_y = package_sketch_images.import_sketch(data_dir)
    data_test = package_sketch_images.assign_datastream(test_X,test_y)
    data_train = package_sketch_images.assign_datastream(train_X,train_y)
    # Images are assumed square: side = sqrt(feature length) — TODO confirm.
    image_size = (int(np.sqrt(test_X.shape[1])),int(np.sqrt(test_X.shape[1])))
    channels = 1
    target_categories = np.unique(train_y).shape[0]
    train_ind = np.arange(data_train.num_examples)
    test_ind = np.arange(data_test.num_examples)
    rng = np.random.RandomState(seed=1)
    rng.shuffle(train_ind)
    rng.shuffle(test_ind)
    #####
    #Comparisons to humans:
    #Is there a neural signature for changes in read delta parameter (glimpse size)?
    #Do machines/humans make similar mistakes?
    #Learning time::: compare this somehow...
    #####

    #Convert datasets into fuel
    #valid_stream = Flatten(DataStream.default_stream(data_valid, iteration_scheme=SequentialScheme(data_valid.num_examples, batch_size)))
    test_stream = Flatten(DataStream.default_stream(
        data_test, iteration_scheme=ShuffledScheme(test_ind, batch_size)))
    train_stream = Flatten(DataStream.default_stream(
        data_train, iteration_scheme=ShuffledScheme(train_ind, batch_size)))
    if name is None:
        name = dataset
    img_height, img_width = image_size
    x_dim = channels * img_height * img_width
    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    # Configure attention mechanism
    if attention != "":
        read_N, write_N = attention.split(',')
        read_N = int(read_N)
        write_N = int(write_N)
        # glimpse carries both x and x_hat, hence the factor 2
        read_dim = 2 * channels * read_N ** 2
        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 channels=channels, width=img_width,
                                 height=img_height, N=read_N, **inits)
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 channels=channels, width=img_width,
                                 height=img_height, N=write_N, **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)
    else:
        read_dim = 2*x_dim
        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)
        attention_tag = "full"
    #----------------------------------------------------------------------
    if name is None:
        name = dataset

    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation.
            E.g.:
                0.1   -> 11
                0.01  -> 12
                0.001 -> 13
                0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e"%value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)
    subdir = name
    longname = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (
        dataset, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)
    pickle_file = subdir + "/" + longname + ".pkl"

    print("\nRunning experiment %s" % longname)
    print(" dataset: %s" % dataset)
    print(" subdirectory: %s" % subdir)
    print(" learning rate: %g" % learning_rate)
    print(" attention: %s" % attention)
    print(" n_iterations: %d" % n_iter)
    print(" encoder dimension: %d" % enc_dim)
    print(" z dimension: %d" % z_dim)
    print(" decoder dimension: %d" % dec_dim)
    print(" batch size: %d" % batch_size)
    print(" epochs: %d" % epochs)
    print()

    #----------------------------------------------------------------------
    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    #/////
    #Insert a conv/deconv before the encoder MLP? -- add normalization at some point
    conv_layer = Convolutional(filter_size=(3, 3), num_filters=30,
                               border_mode='half', step=(1,1))
    act = Rectifier()
    pool_layer = MaxPooling(pooling_size=(2, 2), step=(1,1), padding=(1,1))
    encoder_cnn = ConvolutionalSequence(
        [
            conv_layer,
            act,
            pool_layer,
        ],
        num_channels=1,
        image_size=(read_N, read_N),
        **inits)
    dummy_cnn = encoder_cnn
    dummy_cnn.initialize()
    cnn_output_dim = np.prod(dummy_cnn.get_dim('output'))  #Take product now so that you can flatten later
    cnn_mlp = MLP([Identity()], [cnn_output_dim, read_N ** 2],
                  name="CNN_encoder", **inits)  #convert CNN feature maps to encoder_mlp dimensions
    flattener = Flattener()
    #/////
    encoder_mlp = MLP([Identity()], [(read_dim+enc_dim), 4*enc_dim],
                      name="LSTM_encoder", **inits)  #260 read_dim+dec_dim
    classifier_mlp = MLP([Identity(),Softmax()],
                         [4*dec_dim, z_dim, target_categories],
                         name="classifier", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)
    draw = DrawClassifierModel(
        n_iter,
        reader=reader,
        writer=writer,
        encoder_cnn=encoder_cnn,
        cnn_mlp=cnn_mlp,
        encoder_mlp=encoder_mlp,
        encoder_rnn=encoder_rnn,
        sampler = q_sampler,
        classifier=classifier_mlp,
        flattener=flattener)
    draw.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')
    y = tensor.imatrix('targets')
    probs, h_enc, c_enc, center_y, center_x, delta = draw.reconstruct(x)
    trim_probs = probs  #Only take information from the last iteration
    labels = y  #tensor.lt(y, .5)
    #Apply a max to probs (get position of max index)
    #Do the same for labels/dont use one hot
    cost = (CategoricalCrossEntropy().apply(labels, trim_probs).copy(name='cost'))
    error_rate = tensor.neq(y.argmax(axis=1),
                            trim_probs.argmax(axis=1)).mean(dtype=theano.config.floatX)
    # NOTE(review): name says "BCE" but the cost is categorical cross-entropy.
    cost.name = "BCE"
    error_rate.name = "error_rate"
    guesses = labels.argmax(axis=1)  #tensor.lt(y, .5)#T.sum(y)#.argmax(axis=0)
    ps = trim_probs
    guesses.name = "guesses"
    ps.name = "probs_shape"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)
    algorithm = GradientDescent(
        cost=cost,
        parameters=params,
        step_rule=CompositeRule([
            StepClipping(10.),
            Adam(learning_rate),
        ])
    )

    #------------------------------------------------------------------------
    # Setup monitors
    #monitors = [cost,error_rate,guesses,ps]
    monitors = [cost,error_rate]
    #monitors = [cost]
    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]
    # Live plotting...

    #------------------------------------------------------------
    if not os.path.exists(subdir):
        os.makedirs(subdir)

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(
                train_monitors, prefix="train", after_epoch=True),
            # DataStreamMonitoring(
            #     monitors,
            #     valid_stream,
            ##     updates=scan_updates,
            #     prefix="valid"),
            DataStreamMonitoring(
                monitors,
                test_stream,
                # updates=scan_updates,
                prefix="test"),
            #Checkpoint(name, before_training=True, after_epoch=True, save_separately=['log', 'model']),
            PartsOnlyCheckpoint("{}/{}".format(subdir,name),
                                before_training=True, after_epoch=True,
                                save_separately=['log', 'model']),
            ProgressBar(),
            Printing()])

    if oldmodel is not None:
        print("Initializing parameters with old model %s"%oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_parameter_values(oldmodel.get_parameter_values())
        del oldmodel
    main_loop.run()
def create_main_loop(save_to, num_epochs, unit_order=None,
                     batch_size=500, num_batches=None):
    """Build a MainLoop training LeNet-5 on MNIST with synpic visualization.

    save_to: checkpoint path.  unit_order: optional pickle of histograms used
    to order units in the SaveImages extension.  Returns the (unrun) MainLoop
    with the synpic extension attached as ``main_loop.synpic``.
    """
    image_size = (28, 28)
    output_size = 10
    convnet = create_lenet_5()

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    # Per-example costs are kept so SynpicExtension can attribute cost
    # to individual cases.
    case_costs = CasewiseCrossEntropy().apply(y.flatten(), probs)
    cost = case_costs.mean().copy(name='cost')
    error_rate = (MisclassificationRate().apply(y.flatten(),
                                                probs).copy(name='error_rate'))

    cg = ComputationGraph([cost, error_rate, case_costs])

    # Apply regularization to the cost (L2 on all weights)
    weights = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + sum([0.0003 * (W**2).sum() for W in weights])
    cost.name = 'cost_with_regularization'

    mnist_train = MNIST(("train", ))
    mnist_train_stream = DataStream.default_stream(
        mnist_train,
        iteration_scheme=ShuffledScheme(mnist_train.num_examples, batch_size))

    mnist_test = MNIST(("test", ))
    mnist_test_stream = DataStream.default_stream(
        mnist_test,
        iteration_scheme=ShuffledScheme(mnist_test.num_examples, batch_size))

    # Generate pics for biases
    biases = VariableFilter(roles=[BIAS])(cg.parameters)

    # Train with simple SGD
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=AdaDelta())

    synpic_extension = SynpicExtension(synpic_parameters=biases,
                                       case_costs=case_costs,
                                       case_labels=y,
                                       pics=x,
                                       batch_size=batch_size,
                                       pic_size=image_size,
                                       label_count=output_size,
                                       after_batch=True)

    # Impose an ordering for the SaveImages extension
    if unit_order is not None:
        with open(unit_order, 'rb') as handle:
            histograms = pickle.load(handle)
        unit_order = compute_unit_order(histograms)

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches),
        synpic_extension,
        SaveImages(picsources=[synpic_extension],
                   title="LeNet-5: batch {i}, " +
                   "cost {cost_with_regularization:.2f}, " +
                   "trainerr {error_rate:.3f}",
                   data=[cost, error_rate],
                   graph='error_rate',
                   graph_len=500,
                   unit_order=unit_order,
                   after_batch=True),
        DataStreamMonitoring([cost, error_rate],
                             mnist_test_stream,
                             prefix="test"),
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm)
        ],
                               prefix="train",
                               after_epoch=True),
        Checkpoint(save_to),
        ProgressBar(),
        Printing()
    ]

    model = Model(cost)

    main_loop = MainLoop(algorithm,
                         mnist_train_stream,
                         model=model,
                         extensions=extensions)
    main_loop.synpic = synpic_extension

    return main_loop
def main(feature_maps=None, mlp_hiddens=None, conv_sizes=None, pool_sizes=None, batch_size=None, num_batches=None): if feature_maps is None: feature_maps = [32, 48, 64, 96, 96, 128] if mlp_hiddens is None: mlp_hiddens = [1000] if conv_sizes is None: conv_sizes = [9, 7, 5, 3, 2, 1] if pool_sizes is None: pool_sizes = [2, 2, 2, 2, 1, 1] if batch_size is None: batch_size = 64 conv_steps=[2, 1, 1, 1, 1, 1] #same as stride image_size = (128, 128) output_size = 2 learningRate = 0.001 drop_prob = 0.4 weight_noise = 0.75 num_epochs = 150 num_batches = None host_plot='http://*****:*****@ %s' % (graph_name, datetime.datetime.now(), socket.gethostname()), channels=[['train_error_rate', 'valid_error_rate'], ['train_total_gradient_norm']], after_epoch=True, server_url=host_plot)) PLOT_AVAILABLE = True except ImportError: PLOT_AVAILABLE = False extensions.append(Checkpoint(save_to, after_epoch=True, after_training=True, save_separately=['log'])) logger.info("Building the model") model = Model(cost) ########### Loading images ##################### main_loop = MainLoop( algorithm, stream_data_train, model=model, extensions=extensions) main_loop.run()
def main(job_id, params):
    """Train an ADNI MLP classifier; entry point for Spearmint tuning.

    job_id: Spearmint job identifier (unused in the body).
    params: optional dict of hyperparameter overrides from Spearmint.
    Returns the Spearmint loss: validation error plus the train/valid gap.
    """
    config = ConfigParser.ConfigParser()
    config.readfp(open('./params'))
    # NOTE(review): Py2 ConfigParser.get(section, option, raw) — the third
    # positional argument is the `raw` flag, NOT a default value. Missing
    # options will raise NoOptionError; the "defaults" below are never used
    # as defaults. Verify intent.
    max_epoch = int(config.get('hyperparams', 'max_iter', 100))
    base_lr = float(config.get('hyperparams', 'base_lr', 0.01))
    train_batch = int(config.get('hyperparams', 'train_batch', 256))
    valid_batch = int(config.get('hyperparams', 'valid_batch', 512))
    # NOTE(review): reads 'valid_batch' for the test batch size — looks like
    # a copy-paste slip ('test_batch'?), confirm before changing.
    test_batch = int(config.get('hyperparams', 'valid_batch', 512))

    W_sd = float(config.get('hyperparams', 'W_sd', 0.01))
    W_mu = float(config.get('hyperparams', 'W_mu', 0.0))
    b_sd = float(config.get('hyperparams', 'b_sd', 0.01))
    b_mu = float(config.get('hyperparams', 'b_mu', 0.0))

    hidden_units = int(config.get('hyperparams', 'hidden_units', 32))
    input_dropout_ratio = float(
        config.get('hyperparams', 'input_dropout_ratio', 0.2))
    dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2))
    weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001))
    max_norm = float(config.get('hyperparams', 'max_norm', 100.0))
    solver = config.get('hyperparams', 'solver_type', 'rmsprop')
    data_file = config.get('hyperparams', 'data_file')
    side = config.get('hyperparams', 'side', 'b')

    # Spearmint optimization parameters:
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        hidden_units = params['hidden_units'][0]
        weight_decay = params['weight_decay'][0]

    if 'adagrad' in solver:
        solver_type = CompositeRule([
            AdaGrad(learning_rate=base_lr),
            VariableClipping(threshold=max_norm)
        ])
    else:
        solver_type = CompositeRule([
            RMSProp(learning_rate=base_lr),
            VariableClipping(threshold=max_norm)
        ])

    # Input dimensionality per hemisphere: left, right, or both concatenated.
    input_dim = {'l': 11427, 'r': 10519, 'b': 10519 + 11427}
    data_file = config.get('hyperparams', 'data_file')

    if 'b' in side:
        train = H5PYDataset(data_file, which_set='train')
        valid = H5PYDataset(data_file, which_set='valid')
        test = H5PYDataset(data_file, which_set='test')
        x_l = tensor.matrix('l_features')
        x_r = tensor.matrix('r_features')
        x = tensor.concatenate([x_l, x_r], axis=1)
    else:
        train = H5PYDataset(data_file,
                            which_set='train',
                            sources=['{}_features'.format(side), 'targets'])
        valid = H5PYDataset(data_file,
                            which_set='valid',
                            sources=['{}_features'.format(side), 'targets'])
        test = H5PYDataset(data_file,
                           which_set='test',
                           sources=['{}_features'.format(side), 'targets'])
        x = tensor.matrix('{}_features'.format(side))

    y = tensor.lmatrix('targets')

    # Define a feed-forward net with an input, two hidden layers, and a softmax output:
    model = MLP(activations=[
        Rectifier(name='h1'),
        Rectifier(name='h2'),
        Softmax(name='output'),
    ],
                dims=[input_dim[side], hidden_units, hidden_units, 2],
                weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
                biases_init=IsotropicGaussian(b_sd, b_mu))

    # Don't forget to initialize params:
    model.initialize()

    # y_hat is the output of the neural net with x as its inputs
    y_hat = model.apply(x)

    # Define a cost function to optimize, and a classification error rate.
    # Also apply the outputs from the net and corresponding targets:
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # This is the model: before applying dropout
    model = Model(cost)

    # Need to define the computation graph for the cost func:
    cost_graph = ComputationGraph([cost])

    # This returns a list of weight vectors for each layer
    W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

    # Add some regularization to this model:
    cost += weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # computational graph with l2 reg
    cost_graph = ComputationGraph([cost])

    # Apply dropout to inputs:
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [
        input for input in inputs if input.name.startswith('linear_')
    ]
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]],
                                  input_dropout_ratio)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:],
                                  dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # Learning Algorithm (notice: we use the dropout cost for learning):
    algo = GradientDescent(step_rule=solver_type,
                           params=dropout_graph.parameters,
                           cost=dropout_cost)

    # algo.step_rule.learning_rate.name = 'learning_rate'

    # Data stream used for training model:
    training_stream = Flatten(
        DataStream.default_stream(dataset=train,
                                  iteration_scheme=ShuffledScheme(
                                      train.num_examples,
                                      batch_size=train_batch)))

    training_monitor = TrainingDataMonitoring([
        dropout_cost,
        aggregation.mean(error),
        aggregation.mean(algo.total_gradient_norm)
    ],
                                              after_batch=True)

    # Use the 'valid' set for validation during training:
    validation_stream = Flatten(
        DataStream.default_stream(dataset=valid,
                                  iteration_scheme=ShuffledScheme(
                                      valid.num_examples,
                                      batch_size=valid_batch)))

    validation_monitor = DataStreamMonitoring(variables=[cost, error],
                                              data_stream=validation_stream,
                                              prefix='validation',
                                              after_epoch=True)

    test_stream = Flatten(
        DataStream.default_stream(dataset=test,
                                  iteration_scheme=ShuffledScheme(
                                      test.num_examples,
                                      batch_size=test_batch)))

    test_monitor = DataStreamMonitoring(variables=[error],
                                        data_stream=test_stream,
                                        prefix='test',
                                        after_training=True)

    plotting = Plot('AdniNet_{}'.format(side),
                    channels=[
                        ['dropout_entropy', 'validation_entropy'],
                        ['error', 'validation_error'],
                    ],
                    after_batch=False)

    # Checkpoint class used to save model and log:
    stamp = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint('./models/{}net/{}'.format(side, stamp),
                            save_separately=['model', 'log'],
                            every_n_epochs=1)

    # Home-brewed class for early stopping when we detect we have started to overfit
    early_stopper = FinishIfOverfitting(error_name='error',
                                        validation_name='validation_error',
                                        threshold=0.1,
                                        epochs=5,
                                        burn_in=100)

    # The main loop will train the network and output reports, etc
    main_loop = MainLoop(data_stream=training_stream,
                         model=model,
                         algorithm=algo,
                         extensions=[
                             validation_monitor,
                             training_monitor,
                             plotting,
                             FinishAfter(after_n_epochs=max_epoch),
                             early_stopper,
                             Printing(),
                             ProgressBar(),
                             checkpoint,
                             test_monitor,
                         ])
    main_loop.run()

    ve = float(main_loop.log.last_epoch_row['validation_error'])
    te = float(main_loop.log.last_epoch_row['error'])
    # Penalize the gap between train and validation error to discourage
    # overfit configurations during hyperparameter search.
    spearmint_loss = ve + abs(te - ve)
    print 'Spearmint Loss: {}'.format(spearmint_loss)
    return spearmint_loss
def train(model, configs):
    """Train ``model`` with Adam + gradient clipping per the ``configs`` dict.

    Expects ``model`` to expose ``cost`` and ``monitorings``; configs supplies
    the data-stream factory, save path, epoch count, batch size, LR schedule,
    clipping threshold, and optional weight-noise / L2 regularization.
    """
    get_streams = configs['get_streams']
    save_path = configs['save_path']
    num_epochs = configs['num_epochs']
    batch_size = configs['batch_size']
    lrs = configs['lrs']
    until_which_epoch = configs['until_which_epoch']
    grad_clipping = configs['grad_clipping']
    monitorings = model.monitorings

    # Training
    if configs['weight_noise'] > 0:
        cg = ComputationGraph(model.cost)
        weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        cg = apply_noise(cg, weights, configs['weight_noise'])
        model.cost = cg.outputs[0].copy(name='CE')

    if configs['l2_reg'] > 0:
        cg = ComputationGraph(model.cost)
        weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        new_cost = model.cost + configs['l2_reg'] * sum([
            (weight ** 2).sum() for weight in weights])
        model.cost = new_cost.copy(name='CE')

    blocks_model = Model(model.cost)
    all_params = blocks_model.parameters
    print "Number of found parameters:" + str(len(all_params))
    print all_params

    # Learning rate lives in a shared variable so LRDecay can modify it.
    default_lr = np.float32(configs['lrs'][0])
    lr_var = theano.shared(default_lr, name="learning_rate")

    clipping = StepClipping(threshold=np.cast[floatX](grad_clipping))
    # sgd_momentum = Momentum(
    #     learning_rate=0.0001,
    #     momentum=0.95)
    # step_rule = CompositeRule([clipping, sgd_momentum])
    adam = Adam(learning_rate=lr_var)
    step_rule = CompositeRule([clipping, adam])
    training_algorithm = GradientDescent(
        cost=model.cost, parameters=all_params,
        step_rule=step_rule, on_unused_sources='warn')

    monitored_variables = [
        lr_var,
        aggregation.mean(training_algorithm.total_gradient_norm)] + monitorings

    # Also monitor the L2 norm of every parameter and of its gradient,
    # named "<brick>.<param>_grad_norm" / "<brick>.<param>_norm".
    for param in all_params:
        name = param.tag.annotations[0].name + "." + param.name
        to_monitor = training_algorithm.gradients[param].norm(2)
        to_monitor.name = name + "_grad_norm"
        monitored_variables.append(to_monitor)
        to_monitor = param.norm(2)
        to_monitor.name = name + "_norm"
        monitored_variables.append(to_monitor)

    train_data_stream, valid_data_stream = get_streams(batch_size)

    train_monitoring = TrainingDataMonitoring(
        variables=monitored_variables,
        prefix="train",
        after_epoch=True)

    valid_monitoring = DataStreamMonitoring(
        variables=monitored_variables,
        data_stream=valid_data_stream,
        prefix="valid",
        after_epoch=True)

    main_loop = MainLoop(
        algorithm=training_algorithm,
        data_stream=train_data_stream,
        model=blocks_model,
        extensions=[
            train_monitoring,
            valid_monitoring,
            FinishAfter(after_n_epochs=num_epochs),
            SaveParams('valid_CE', blocks_model, save_path, after_epoch=True),
            SaveLog(after_epoch=True),
            ProgressBar(),
            # ErrorPerVideo(model, after_epoch=True, on_interrupt=True),
            LRDecay(lr_var, lrs, until_which_epoch, after_epoch=True),
            Printing(after_epoch=True)])
    main_loop.run()
def main(save_to, cost_name, learning_rate, momentum, num_epochs):
    """Train a single-layer softmax classifier on MNIST under one of several
    surrogate losses and dump the final train/test metrics as JSON to stdout.

    save_to: checkpoint path.
    cost_name: one of 'lr', 'mse', 'perceptron', 'minmin', 'minmin_cut',
        'minmin2', 'direct', 'svm'.
    learning_rate / momentum: SGD hyperparameters; None selects defaults
        (1e-4 and 0.0 respectively).
    num_epochs: number of training epochs.

    Raises ValueError for an unrecognized cost_name.
    """
    mlp = MLP([None], [784, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()

    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    scores = mlp.apply(x)

    batch_size = y.shape[0]
    indices = tensor.arange(y.shape[0])
    # One-hot targets: 1 at the ground-truth class, 0 elsewhere.
    target_scores = tensor.set_subtensor(
        tensor.zeros((batch_size, 10))[indices, y.flatten()], 1)
    score_diff = scores - target_scores

    # Logistic Regression
    if cost_name == 'lr':
        cost = Softmax().categorical_cross_entropy(y.flatten(), scores).mean()
    # MSE
    elif cost_name == 'mse':
        cost = (score_diff**2).mean()
    # Perceptron
    elif cost_name == 'perceptron':
        cost = (scores.max(axis=1) - scores[indices, y.flatten()]).mean()
    # TLE
    elif cost_name == 'minmin':
        cost = abs(score_diff[indices, y.flatten()]).mean()
        cost += abs(score_diff[indices, scores.argmax(axis=1)]).mean()
    # TLEcut
    elif cost_name == 'minmin_cut':
        # Score of the groundtruth should be greater or equal than its target score
        cost = tensor.maximum(0, -score_diff[indices, y.flatten()]).mean()
        # Score of the prediction should be less or equal than its actual score
        cost += tensor.maximum(0,
                               score_diff[indices,
                                          scores.argmax(axis=1)]).mean()
    # TLE2
    elif cost_name == 'minmin2':
        cost = ((score_diff[tensor.arange(y.shape[0]),
                            y.flatten()])**2).mean()
        cost += ((score_diff[tensor.arange(y.shape[0]),
                             scores.argmax(axis=1)])**2).mean()
    # Direct loss minimization
    elif cost_name == 'direct':
        epsilon = 0.1
        cost = (-scores[indices,
                        (scores +
                         epsilon * target_scores).argmax(axis=1)] +
                scores[indices, scores.argmax(axis=1)]).mean()
        cost /= epsilon
    elif cost_name == 'svm':
        cost = (scores[indices, (scores - 1 * target_scores).argmax(axis=1)] -
                scores[indices, y.flatten()]).mean()
    else:
        # BUG FIX: previously concatenated the unbound local `cost`,
        # which raised NameError instead of the intended ValueError.
        raise ValueError("Unknown cost " + cost_name)

    error_rate = MisclassificationRate().apply(y.flatten(), scores)
    error_rate.name = 'error_rate'

    cg = ComputationGraph([cost])
    cost.name = 'cost'

    mnist_train = MNIST(("train", ))
    mnist_test = MNIST(("test", ))

    # `is None` (identity) rather than `== None` per PEP 8.
    if learning_rate is None:
        learning_rate = 0.0001
    if momentum is None:
        momentum = 0.0
    rule = Momentum(learning_rate=learning_rate, momentum=momentum)
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=rule)
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate],
                             Flatten(DataStream.default_stream(
                                 mnist_test,
                                 iteration_scheme=SequentialScheme(
                                     mnist_test.num_examples, 500)),
                                     which_sources=('features', )),
                             prefix="test"),
        # CallbackExtension(
        #    lambda: rule.learning_rate.set_value(rule.learning_rate.get_value() * 0.9),
        #    after_epoch=True),
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm),
            rule.learning_rate
        ],
                               prefix="train",
                               after_epoch=True),
        Checkpoint(save_to),
        Printing()
    ]

    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(
            Plot('MNIST example',
                 channels=[['test_cost', 'test_error_rate'],
                           ['train_total_gradient_norm']]))

    main_loop = MainLoop(algorithm,
                         Flatten(DataStream.default_stream(
                             mnist_train,
                             iteration_scheme=SequentialScheme(
                                 mnist_train.num_examples, 50)),
                                 which_sources=('features', )),
                         model=Model(cost),
                         extensions=extensions)

    main_loop.run()

    # Collect final metrics from the training log and emit them as JSON.
    df = pandas.DataFrame.from_dict(main_loop.log, orient='index')
    res = {
        'cost': cost_name,
        'learning_rate': learning_rate,
        'momentum': momentum,
        'train_cost': df.train_cost.iloc[-1],
        'test_cost': df.test_cost.iloc[-1],
        'best_test_cost': df.test_cost.min(),
        'train_error': df.train_error_rate.iloc[-1],
        'test_error': df.test_error_rate.iloc[-1],
        'best_test_error': df.test_error_rate.min()
    }
    # numpy scalars are not JSON-serializable; coerce them to floats.
    res = {
        k: float(v) if isinstance(v, numpy.ndarray) else v
        for k, v in res.items()
    }
    json.dump(res, sys.stdout)
    sys.stdout.flush()
def train(step_rule, layer_size, epochs, seed, experiment_path, initialization,
          weight_noise, to_watch, patience, z_prob, z_prob_states,
          z_prob_cells, drop_igates, ogates_zoneout, batch_size, stoch_depth,
          share_mask, gaussian_drop, rnn_type, num_layers, norm_cost_coeff,
          penalty, seq_len, input_drop, **kwargs):
    # Character-level PTB language-modeling experiment with zoneout-style
    # regularization (Python 2 code: `print` statements, `.next()`).
    #
    # NOTE(review): several names used below are not defined in this
    # function or its arguments (`pool_size`, `maximum_frames`,
    # `pretrain_alignment`, `uniform_alignment`, `window_features`,
    # `z_prob_igates`, `test_cost`, `test_stream`, `logger`, `floatX`,
    # `np`/`numpy`) — presumably module-level globals; verify at module top.
    print '.. CharPTB experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()

    def numpy_rng(random_seed=None):
        # Helper RNG factory with a fixed fallback seed.
        if random_seed == None:
            random_seed = 1223
        return numpy.random.RandomState(random_seed)

    ###########################################
    #
    # MAKE STREAMS
    #
    ###########################################
    rng = np.random.RandomState(seed)
    # NOTE(review): `stream_args` is built but never used below — TODO confirm
    # whether it is leftover from a previous data pipeline.
    stream_args = dict(rng=rng, pool_size=pool_size,
                       maximum_frames=maximum_frames,
                       pretrain_alignment=pretrain_alignment,
                       uniform_alignment=uniform_alignment,
                       window_features=window_features)

    if share_mask:
        z_prob_cells = z_prob
        # we don't want to actually use these masks, so this is to debug
        z_prob_states = None
    print '.. initializing iterators'
    # Streams: (split, batch, seq_len, drop probabilities, size, eval-mode?).
    train_stream = get_ptb_stream('train', batch_size, seq_len, z_prob_states,
                                  z_prob_cells, z_prob_igates, layer_size,
                                  False)
    train_stream_evaluation = get_ptb_stream('train', batch_size, seq_len,
                                             z_prob_states, z_prob_cells,
                                             z_prob_igates, layer_size, True)
    dev_stream = get_ptb_stream('valid', batch_size, seq_len, z_prob_states,
                                z_prob_cells, z_prob_igates, layer_size, True)

    # One sample batch, used only to attach Theano test values below.
    data = train_stream.get_epoch_iterator(as_dict=True).next()

    ###########################################
    #
    # BUILD MODEL
    #
    ###########################################
    print '.. building model'
    x = T.tensor3('features', dtype=floatX)
    # Next-character prediction: inputs are chars [0..T-1], targets [1..T].
    x, y = x[:-1], x[1:]
    drops_states = T.tensor3('drops_states')
    drops_cells = T.tensor3('drops_cells')
    drops_igates = T.tensor3('drops_igates')

    x.tag.test_value = data['features']
    drops_states.tag.test_value = data['drops_states']
    drops_cells.tag.test_value = data['drops_cells']
    drops_igates.tag.test_value = data['drops_igates']

    if initialization == 'glorot':
        weights_init = NormalizedInitialization()
    elif initialization == 'uniform':
        weights_init = Uniform(width=.2)
    elif initialization == 'ortho':
        weights_init = OrthogonalInitialization()
    else:
        raise ValueError('No such initialization')

    # Input projection width depends on the cell type: LSTM needs 4 gates,
    # GRU 3, simple RNN 1. Vocabulary size is hard-coded to 50 here.
    if rnn_type.lower() == 'lstm':
        in_to_hid = Linear(50, layer_size * 4, name='in_to_hid',
                           weights_init=weights_init,
                           biases_init=Constant(0.0))
        recurrent_layer = ZoneoutLSTM(dim=layer_size,
                                      weights_init=weights_init,
                                      activation=Tanh(), model_type=6,
                                      name='rnn',
                                      ogates_zoneout=ogates_zoneout)
    elif rnn_type.lower() == 'gru':
        in_to_hid = Linear(50, layer_size * 3, name='in_to_hid',
                           weights_init=weights_init,
                           biases_init=Constant(0.0))
        recurrent_layer = ZoneoutGRU(dim=layer_size,
                                     weights_init=weights_init,
                                     activation=Tanh(), name='rnn')
    elif rnn_type.lower() == 'srnn':  #FIXME!!! make ReLU
        in_to_hid = Linear(50, layer_size, name='in_to_hid',
                           weights_init=weights_init,
                           biases_init=Constant(0.0))
        recurrent_layer = ZoneoutSimpleRecurrent(dim=layer_size,
                                                 weights_init=weights_init,
                                                 activation=Rectifier(),
                                                 name='rnn')
    else:
        raise NotImplementedError

    hid_to_out = Linear(layer_size, 50, name='hid_to_out',
                        weights_init=weights_init,
                        biases_init=Constant(0.0))

    in_to_hid.initialize()
    recurrent_layer.initialize()
    hid_to_out.initialize()

    h = in_to_hid.apply(x)

    # ZoneoutLSTM.apply returns (states, cells); keep only the states.
    if rnn_type.lower() == 'lstm':
        yh = recurrent_layer.apply(h, drops_states, drops_cells,
                                   drops_igates)[0]
    else:
        yh = recurrent_layer.apply(h, drops_states, drops_cells, drops_igates)

    y_hat_pre_softmax = hid_to_out.apply(yh)
    shape_ = y_hat_pre_softmax.shape
    # y_hat = Softmax().apply(
    #     y_hat_pre_softmax.reshape((-1, shape_[-1])))# .reshape(shape_)

    ####################

    ###########################################
    #
    # SET UP COSTS AND MONITORS
    #
    ###########################################

    def crossentropy_lastaxes(yhat, y):
        # for sequence of distributions/targets
        return -(y * T.log(yhat)).sum(axis=yhat.ndim - 1)

    def softmax_lastaxis(x):
        # for sequence of distributions
        return T.nnet.softmax(x.reshape((-1, x.shape[-1]))).reshape(x.shape)

    yhat = softmax_lastaxis(y_hat_pre_softmax)
    cross_entropies = crossentropy_lastaxes(yhat, y)
    cross_entropy = cross_entropies.mean().copy(name="cross_entropy")
    cost = cross_entropy.copy(name="cost")

    batch_cost = cost.copy(name='batch_cost')
    nll_cost = cost.copy(name='nll_cost')
    # Bits per character = nats / ln(2). NOTE(review): named 'bpr' — looks
    # like a typo for 'bpc', but the dev monitor records under this name.
    bpc = (nll_cost / np.log(2.0)).copy(name='bpr')

    #nll_cost = aggregation.mean(batch_cost, batch_size).copy(name='nll_cost')

    cost_monitor = aggregation.mean(
        batch_cost, batch_size).copy(name='sequence_cost_monitor')
    cost_per_character = aggregation.mean(
        batch_cost, (seq_len - 1) * batch_size).copy(name='character_cost')
    cost_train = cost.copy(name='train_batch_cost')
    cost_train_monitor = cost_monitor.copy('train_batch_cost_monitor')
    cg_train = ComputationGraph([cost_train, cost_train_monitor])

    ###########################################
    #
    # NORM STABILIZER
    #
    ###########################################
    norm_cost = 0.

    def _magnitude(x, axis=-1):
        # Per-timestep L2 norm, clamped away from 0 to keep sqrt finite.
        return T.sqrt(
            T.maximum(T.sqr(x).sum(axis=axis), numpy.finfo(x.dtype).tiny))

    # Penalize changes of the hidden-state (or cell) norm across time
    # ("norm stabilizer"-style regularization).
    if penalty == 'cells':
        assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables)
        for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables):
            norms = _magnitude(cell)
            norm_cost += T.mean(
                T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1))
    elif penalty == 'hids':
        assert 'rnn_apply_states' in [
            o.name for o in VariableFilter(roles=[OUTPUT])(cg_train.variables)
        ]
        for output in VariableFilter(roles=[OUTPUT])(cg_train.variables):
            if output.name == 'rnn_apply_states':
                norms = _magnitude(output)
                norm_cost += T.mean(
                    T.sum((norms[1:] - norms[:-1])**2, axis=0) /
                    (seq_len - 1))

    norm_cost.name = 'norm_cost'
    #cost_valid = cost_train
    cost_train += norm_cost_coeff * norm_cost
    cost_train = cost_train.copy('cost_train')

    cg_train = ComputationGraph([cost_train, cost_train_monitor])  #, norm_cost])

    ###########################################
    #
    # WEIGHT NOISE
    #
    ###########################################
    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg_train.variables)
        cg_train = apply_noise(cg_train, weights, weight_noise)
        cost_train = cg_train.outputs[0].copy(name='cost_train')
        cost_train_monitor = cg_train.outputs[1].copy(
            'train_batch_cost_monitor')

    ###########################################
    #
    # MAKE MODEL
    #
    ###########################################
    model = Model(cost_train)

    train_cost_per_character = aggregation.mean(
        cost_train_monitor,
        (seq_len - 1) * batch_size).copy(name='train_character_cost')

    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost_train,
                                parameters=cg_train.parameters)
    observed_vars = [
        cost_train, cost_train_monitor, train_cost_per_character,
        aggregation.mean(algorithm.total_gradient_norm)
    ]
    train_monitor = TrainingDataMonitoring(variables=observed_vars,
                                           prefix="train",
                                           after_epoch=True)

    dev_monitor = DataStreamMonitoring(variables=[nll_cost, bpc],
                                       data_stream=dev_stream,
                                       prefix="dev")

    extensions = []

    ###########################################
    #
    # LOADING PRETRAINED MODELS (Mohammad Pezeshki)
    #
    ###########################################
    if 'load_path' in kwargs:
        with open(kwargs['load_path']) as f:
            loaded = np.load(f)
            model = Model(cost_train)
            params_dicts = model.get_parameter_dict()
            params_names = params_dicts.keys()
            for param_name in params_names:
                param = params_dicts[param_name]
                # '/f_6_.W' --> 'f_6_.W'
                slash_index = param_name.find('/')
                param_name = param_name[slash_index + 1:]
                # Only restore parameters whose shapes match the checkpoint.
                if param.get_value().shape == loaded[param_name].shape:
                    print 'Found: ' + param_name
                    param.set_value(loaded[param_name])
                else:
                    print 'Not found: ' + param_name

    ###########################################
    #
    # MOAR EXTENSIONS
    #
    ###########################################
    extensions.extend(
        [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor])
    #train_ctc_monitor,
    #dev_ctc_monitor])

    # NOTE(review): `test_cost` and `test_stream` are not defined in this
    # function — if `test_cost` is not a module-level global this branch
    # raises NameError; confirm at module top.
    if test_cost:
        test_monitor = DataStreamMonitoring(
            variables=[cost_monitor, cost_per_character],
            data_stream=test_stream,
            prefix="test")
        extensions.append(test_monitor)

    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
    log_path = os.path.join(experiment_path, 'log.txt')
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    extensions.append(
        SaveParams('dev_nll_cost', model, experiment_path, every_n_epochs=1))
    extensions.append(SaveLog(every_n_epochs=1))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    ###########################################
    #
    # MAIN LOOP
    #
    ###########################################
    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    t1 = time.time()
    print "Building time: %f" % (t1 - t0)

    main_loop.run()
    print "Execution time: %f" % (time.time() - t1)
def build_and_run(save_to, modelconfig, experimentconfig):
    """Build a residual CNN with Lasagne and train it with a Blocks main loop.

    part of this is adapted from lasagne tutorial

    Parameters
    ----------
    save_to : str
        Base name used for the Plot channel set and the best-model
        checkpoint (``models/best_<save_to>.tar``).
    modelconfig : dict
        Architecture settings; must contain ``depth``, ``num_filters``,
        ``image_size`` and ``num_blockstack``.
    experimentconfig : dict
        Training settings; must contain ``batch_size``, ``learning_rate``,
        ``num_epochs``, ``num_batches`` and may contain ``step_rule`` and
        ``test``.
    """
    n, num_filters, image_size, num_blockstack = (modelconfig['depth'],
                                                  modelconfig['num_filters'],
                                                  modelconfig['image_size'],
                                                  modelconfig['num_blockstack'])

    print("Amount of bottlenecks: %d" % n)

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('image_features')
    #target_value = T.ivector('targets')
    target_var = T.lmatrix('targets')
    # Binary classification: one-hot encode the first target column.
    target_vec = T.extra_ops.to_one_hot(target_var[:, 0], 2)
    #target_var = T.matrix('targets')

    # Create residual net model
    print("Building model...")
    network = build_cnn(input_var, image_size, n, num_blockstack, num_filters)
    get_info(network)
    # Wrap Lasagne outputs as Theano expressions so Blocks can consume them.
    prediction = lasagne.utils.as_theano_expression(
        lasagne.layers.get_output(network))
    test_prediction = lasagne.utils.as_theano_expression(
        lasagne.layers.get_output(network, deterministic=True))

    # Loss function -> The objective to minimize
    print("Instanciation of loss function...")

    #loss = CategoricalCrossEntropy().apply(target_var.flatten(), prediction)
    #test_loss = CategoricalCrossEntropy().apply(target_var.flatten(),
    #                                            test_prediction)
    # loss = lasagne.objectives.categorical_crossentropy(
    #     prediction, target_var.flatten()).mean()
    # test_loss = lasagne.objectives.categorical_crossentropy(
    #     test_prediction, target_var.flatten()).mean()
    loss = lasagne.objectives.squared_error(prediction, target_vec).mean()
    test_loss = lasagne.objectives.squared_error(test_prediction,
                                                 target_vec).mean()
    # loss = tensor.nnet.binary_crossentropy(prediction, target_var).mean()
    # test_loss = tensor.nnet.binary_crossentropy(test_prediction,
    #                                             target_var).mean()
    test_loss.name = "loss"
    # loss.name = 'x-ent_error'
    # loss.name = 'sqr_error'
    layers = lasagne.layers.get_all_layers(network)

    #l1 and l2 regularization
    #pondlayers = {x: 0.000025 for i, x in enumerate(layers)}
    #l1_penality = lasagne.regularization.regularize_layer_params_weighted(
    #    pondlayers, lasagne.regularization.l2)
    #l2_penality = lasagne.regularization.regularize_layer_params(
    #    layers[len(layers) / 4:], lasagne.regularization.l1) * 25e-6
    #reg_penalty = l1_penality + l2_penality
    #reg_penalty.name = 'reg_penalty'
    #loss = loss + reg_penalty
    loss.name = 'reg_loss'
    error_rate = MisclassificationRate().apply(
        target_var.flatten(), test_prediction).copy(name='error_rate')

    # Load the dataset
    print("Loading data...")
    # FIX: membership test on the dict directly instead of `.keys()`.
    istest = 'test' in experimentconfig
    if istest:
        print("Using test stream")
    train_stream, valid_stream, test_stream = get_stream(
        experimentconfig['batch_size'], image_size, test=istest)

    # Defining step rule and algorithm
    # FIX: `x is not None` instead of `not x is None`, and no `.keys()`.
    if ('step_rule' in experimentconfig and
            experimentconfig['step_rule'] is not None):
        step_rule = experimentconfig['step_rule'](
            learning_rate=experimentconfig['learning_rate'])
    else:
        step_rule = Scale(learning_rate=experimentconfig['learning_rate'])

    # list() so the parameters survive multiple iterations on Python 3.
    params = list(map(lasagne.utils.as_theano_expression,
                      lasagne.layers.get_all_params(network, trainable=True)))
    print("Initializing algorithm")
    # Gradients are supplied explicitly because the parameters come from
    # Lasagne, not from a Blocks ComputationGraph.
    algorithm = GradientDescent(
        cost=loss,
        gradients={var: T.grad(loss, var) for var in params},
        step_rule=step_rule)

    #algorithm.add_updates(extra_updates)

    grad_norm = aggregation.mean(algorithm.total_gradient_norm)
    grad_norm.name = "grad_norm"

    print("Initializing extensions...")
    plot = Plot(save_to,
                channels=[['train_loss', 'valid_loss'],
                          ['train_grad_norm'],
                          #['train_grad_norm', 'train_reg_penalty'],
                          ['train_error_rate', 'valid_error_rate']],
                server_url='http://hades.calculquebec.ca:5042')

    checkpoint = Checkpoint('models/best_' + save_to + '.tar')
    #  checkpoint.add_condition(['after_n_batches=25'],
    checkpoint.add_condition(
        ['after_epoch'],
        predicate=OnLogRecord('valid_error_rate_best_so_far'))

    #Defining extensions
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=experimentconfig['num_epochs'],
                    after_n_batches=experimentconfig['num_batches']),
        # Note: the deterministic `test_loss` is monitored for training too.
        TrainingDataMonitoring([test_loss, error_rate, grad_norm],
                               # reg_penalty],
                               prefix="train", after_epoch=True),
        #after_n_epochs=1
        DataStreamMonitoring([test_loss, error_rate], valid_stream,
                             prefix="valid", after_epoch=True),
        #after_n_epochs=1
        plot,
        #Checkpoint(save_to,after_n_epochs=5),
        #ProgressBar(),
        # Plot(save_to, channels=[['train_loss','valid_loss'],
        #      ['train_error_rate','valid_error_rate']],
        #      server_url='http://hades.calculquebec.ca:5042'), #'grad_norm'
        #      after_batch=True),
        Printing(after_epoch=True),
        TrackTheBest('valid_error_rate', min),  #Keep best
        checkpoint,  #Save best
        FinishIfNoImprovementAfter('valid_error_rate_best_so_far', epochs=5)
    ]  # Early-stopping

    # model = Model(loss)
    # print("Model", model)

    main_loop = MainLoop(
        algorithm,
        train_stream,
        # model=model,
        extensions=extensions)
    print("Starting main loop...")
    main_loop.run()
    # NOTE(review): this is a fragment of a larger function whose beginning
    # is not visible here — the first line below closes a call opened
    # earlier (something like `...(..., False, port=5551)`), and `cost`,
    # `cg`, `error_rate`, `error_rate2`, `stream_valid` and `num_epochs`
    # must be defined in the missing part. Verify against the full file.
    False, port=5551)

    ########### DEFINE THE ALGORITHM #############
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Adam())

    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate, error_rate2], stream_valid,
                             prefix="valid"),
        TrainingDataMonitoring(
            [cost, error_rate,
             aggregation.mean(algorithm.total_gradient_norm)],
            prefix="train",
            after_epoch=True),
        Checkpoint("catsVsDogs128.pkl"),
        ProgressBar(),
        Printing()
    ]

    #Adding a live plot with the bokeh server
    extensions.append(
        Plot('CatsVsDogs_128_Layer3',
             channels=[['train_error_rate', 'valid_error_rate'],
                       ['valid_cost', 'valid_error_rate2'],
                       ['train_total_gradient_norm']],
             after_epoch=True))
def main(mode, save_path, steps, num_batches):
    """Train a recurrent sequence generator on a Markov chain, or sample
    from a previously trained one.

    Parameters
    ----------
    mode : str
        ``"train"`` to fit the generator, ``"sample"`` to load a saved
        main loop from ``save_path`` and generate from it.
    save_path : str
        Checkpoint path (written in train mode, read in sample mode).
    steps : int
        Number of generation steps in sample mode.
    num_batches : int
        Number of training batches in train mode.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``"train"`` nor ``"sample"``.
    """
    num_states = MarkovChainDataset.num_states

    if mode == "train":
        # Experiment configuration
        rng = numpy.random.RandomState(1)
        batch_size = 50
        seq_len = 100
        dim = 10
        feedback_dim = 8

        # Build the bricks and initialize them
        transition = GatedRecurrent(name="transition", dim=dim,
                                    activation=Tanh())
        generator = SequenceGenerator(
            Readout(readout_dim=num_states, source_names=["states"],
                    emitter=SoftmaxEmitter(name="emitter"),
                    feedback_brick=LookupFeedback(
                        num_states, feedback_dim, name='feedback'),
                    name="readout"),
            transition,
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
            name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        # Give an idea of what's going on.
        logger.info("Parameters:\n" + pprint.pformat(
            [(key, value.get_value().shape) for key, value
             in Selector(generator).get_params().items()],
            width=120))
        logger.info("Markov chain entropy: {}".format(
            MarkovChainDataset.entropy))
        logger.info("Expected min error: {}".format(
            -MarkovChainDataset.entropy * seq_len))

        # Build the cost computation graph.
        x = tensor.lmatrix('data')
        # Per-sequence log-likelihood, averaged over the batch axis.
        cost = aggregation.mean(generator.cost_matrix(x[:, :]).sum(),
                                x.shape[1])
        cost.name = "sequence_log_likelihood"

        algorithm = GradientDescent(
            cost=cost,
            params=list(Selector(generator).get_params().values()),
            step_rule=Scale(0.001))
        main_loop = MainLoop(
            algorithm=algorithm,
            data_stream=DataStream(
                MarkovChainDataset(rng, seq_len),
                iteration_scheme=ConstantScheme(batch_size)),
            model=Model(cost),
            extensions=[FinishAfter(after_n_batches=num_batches),
                        TrainingDataMonitoring([cost], prefix="this_step",
                                               after_batch=True),
                        TrainingDataMonitoring([cost], prefix="average",
                                               every_n_batches=100),
                        Checkpoint(save_path, every_n_batches=500),
                        Printing(every_n_batches=100)])
        main_loop.run()
    elif mode == "sample":
        # FIX: use a context manager so the checkpoint file handle is
        # closed (the original leaked the handle from a bare open()).
        with open(save_path, "rb") as source:
            main_loop = cPickle.load(source)
        generator = main_loop.model

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1, iterate=True)).get_theano_function()

        # Drop the singleton batch axis from each returned array.
        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        # Compare empirical state frequencies with the chain's equilibrium.
        freqs = numpy.bincount(outputs).astype(theano.config.floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(
            freqs, MarkovChainDataset.equilibrium))

        # Compare empirical transition frequencies with the true matrix.
        trans_freqs = numpy.zeros((num_states, num_states),
                                  dtype=theano.config.floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, MarkovChainDataset.trans_prob))
    else:
        # FIX: `assert False` is stripped under `python -O`, silently
        # falling through; raise an explicit error instead.
        raise ValueError("Unknown mode: {}".format(mode))
def initialize_all(config, save_path, bokeh_name, params, bokeh_server, bokeh,
                   test_tag, use_load_ext, load_log, fast_start):
    # Build the full training setup for a speech recognizer: model,
    # regularized cost graph, gradient-descent algorithm and the list of
    # Blocks extensions. Returns (model, algorithm, data, extensions).
    # NOTE(review): relies on many module-level helpers (Data, create_model,
    # named_copy, rename, monotonicity_penalty, entropy, l2_norm,
    # SpeechModel, apply_adaptive_noise, _gradient_norm_is_none, ...) —
    # confirm they are imported at module top.
    root_path, extension = os.path.splitext(save_path)

    data = Data(**config['data'])
    train_conf = config['training']
    recognizer = create_model(config, data, test_tag)

    # Separate attention_params to be handled differently
    # when regularization is applied
    attention = recognizer.generator.transition.attention
    attention_params = Selector(attention).get_parameters().values()

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        # Recursively collect every *_init attribute of the brick tree,
        # for logging only.
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result
    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    prediction, prediction_mask = add_exploration(recognizer, data, train_conf)

    #
    # Observables:
    #
    primary_observables = []  # monitored each batch
    secondary_observables = []  # monitored every 10 batches
    validation_observables = []  # monitored on the validation set

    cg = recognizer.get_cost_graph(batch=True, prediction=prediction,
                                   prediction_mask=prediction_mask)
    labels, = VariableFilter(applications=[recognizer.cost],
                             name='labels')(cg)
    labels_mask, = VariableFilter(applications=[recognizer.cost],
                                  name='labels_mask')(cg)

    gain_matrix = VariableFilter(
        theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg)
    if len(gain_matrix):
        gain_matrix, = gain_matrix
        primary_observables.append(named_copy(gain_matrix.min(), 'min_gain'))
        primary_observables.append(named_copy(gain_matrix.max(), 'max_gain'))

    batch_cost = cg.outputs[0].sum()
    batch_size = named_copy(recognizer.recordings.shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_total_cost"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(applications=[r.generator.readout.readout],
                               name="output_0")(cost_cg)
    bottom_output = VariableFilter(applications=[r.bottom.apply],
                                   name="output")(cost_cg)[-1]
    attended, = VariableFilter(applications=[r.generator.transition.apply],
                               name="attended")(cost_cg)
    attended_mask, = VariableFilter(applications=[
        r.generator.transition.apply
    ], name="attended_mask")(cost_cg)
    weights, = VariableFilter(applications=[r.generator.evaluate],
                              name="weights")(cost_cg)
    max_recording_length = named_copy(r.recordings.shape[0],
                                      "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = named_copy(attended_mask.shape[0],
                                          "max_attended_mask_length")
    max_attended_length = named_copy(attended.shape[0],
                                     "max_attended_length")
    max_num_phonemes = named_copy(labels.shape[0], "max_num_phonemes")
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    mean_attended = named_copy(abs(attended).mean(), "mean_attended")
    mean_bottom_output = named_copy(
        abs(bottom_output).mean(), "mean_bottom_output")
    weights_penalty = named_copy(monotonicity_penalty(weights, labels_mask),
                                 "weights_penalty")
    weights_entropy = named_copy(entropy(weights, labels_mask),
                                 "weights_entropy")
    mask_density = named_copy(labels_mask.mean(), "mask_density")
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy, min_energy, max_energy,
        mean_attended, mean_bottom_output, batch_size, max_num_phonemes,
        mask_density
    ])

    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config['regularization']
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [
            p for p in cg.parameters if p not in attention_params
        ]
        regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise'])

    train_cost = regularized_cg.outputs[0]
    if reg_config.get("penalty_coof", .0) > 0:
        # big warning!!!
        # here we assume that:
        # regularized_weights_penalty = regularized_cg.outputs[1]
        train_cost = (train_cost +
                      reg_config.get("penalty_coof", .0) *
                      regularized_cg.outputs[1] / batch_size)
    if reg_config.get("decay", .0) > 0:
        train_cost = (
            train_cost + reg_config.get("decay", .0) *
            l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters))**2)

    train_cost = named_copy(train_cost, 'train_cost')

    gradients = None
    if reg_config.get('adaptive_noise'):
        logger.info('apply adaptive noise')
        if ((reg_config.get("penalty_coof", .0) > 0) or
                (reg_config.get("decay", .0) > 0)):
            logger.error('using adaptive noise with alignment weight panalty '
                         'or weight decay is probably stupid')
        train_cost, regularized_cg, gradients, noise_brick = \
            apply_adaptive_noise(
                cg, cg.outputs[0],
                variables=cg.parameters,
                num_examples=data.get_dataset('train').num_examples,
                parameters=SpeechModel(
                    regularized_cg.outputs[0]).get_parameter_dict().values(),
                **reg_config.get('adaptive_noise'))
        train_cost.name = 'train_cost'
        adapt_noise_cg = ComputationGraph(train_cost)
        model_prior_mean = named_copy(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_mean')(adapt_noise_cg)[0],
            'model_prior_mean')
        model_cost = named_copy(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_cost')(adapt_noise_cg)[0],
            'model_cost')
        model_prior_variance = named_copy(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_variance')(adapt_noise_cg)[0],
            'model_prior_variance')
        regularized_cg = ComputationGraph(
            [train_cost, model_cost] + regularized_cg.outputs +
            [model_prior_mean, model_prior_variance])
        primary_observables += [
            regularized_cg.outputs[1],  # model cost
            regularized_cg.outputs[2],  # task cost
            regularized_cg.outputs[-2],  # model prior mean
            regularized_cg.outputs[-1]
        ]  # model prior variance

    # Model is weird class, we spend lots of time arguing with Bart
    # what it should be. However it can already nice things, e.g.
    # one extract all the parameters from the computation graphs
    # and give them hierahical names. This help to notice when a
    # because of some bug a parameter is not in the computation
    # graph.
    model = SpeechModel(train_cost)
    if params:
        logger.info("Load parameters from " + params)
        # please note: we cannot use recognizer.load_params
        # as it builds a new computation graph that dies not have
        # shapred variables added by adaptive weight noise
        param_values = load_parameter_values(params)
        model.set_parameter_values(param_values)

    parameters = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat([(key, parameters[key].get_value().shape)
                                for key in sorted(parameters.keys())],
                               width=120))

    # Define the training algorithm.
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'],
                                   train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(
            AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    max_norm_rules = []
    if reg_config.get('max_norm', False) > 0:
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [
                v for v in maxnorm_subjects
                if not isinstance(get_brick(v), LookupTable)
            ]
        logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat(
            [name for name, p in parameters.items()
             if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([
            name for name, p in parameters.items()
            if not p in maxnorm_subjects
        ]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)
        ]
    burn_in = []
    if train_conf.get('burn_in_steps', 0):
        burn_in.append(BurnIn(num_steps=train_conf['burn_in_steps']))
    algorithm = GradientDescent(
        cost=train_cost,
        parameters=parameters.values(),
        gradients=gradients,
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)] + burn_in),
        on_unused_sources='warn')

    logger.debug("Scan Ops in the gradients")
    gradient_cg = ComputationGraph(algorithm.gradients.values())
    for op in ComputationGraph(gradient_cg).scans:
        logger.debug(op)

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    secondary_observables += list(regularized_cg.outputs)
    if not 'train_cost' in [v.name for v in secondary_observables]:
        secondary_observables += [train_cost]
    secondary_observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold
    ]
    for name, param in parameters.items():
        # Per-parameter RMS-normalized norms of value, gradient and step.
        num_elements = numpy.product(param.get_value().shape)
        norm = param.norm(2) / num_elements**0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements**0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements**0.5
        stats = tensor.stack(norm, grad_norm, step_norm,
                             step_norm / grad_norm)
        stats.name = name + '_stats'
        secondary_observables.append(stats)

    primary_observables += [
        train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm,
        clipping.threshold, max_recording_length, max_attended_length,
        max_attended_mask_length
    ]

    validation_observables += [
        rename(aggregation.mean(batch_cost, batch_size), cost.name),
        rename(aggregation.sum_(batch_size), 'num_utterances'),
        weights_entropy, weights_penalty
    ]

    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name == 'weights_penalty':
                result.append(
                    rename(aggregation.mean(var, batch_size),
                           'weights_penalty_per_recording'))
            elif var.name == 'weights_entropy':
                result.append(
                    rename(aggregation.mean(var, labels_mask.sum()),
                           'weights_entropy_per_label'))
            else:
                result.append(var)
        return result

    mon_conf = config['monitoring']

    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(
            Load(params, load_iteration_state=True, load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
        #CodeVersion(['lvsr']),
    ]
    extensions.append(
        TrainingDataMonitoring(primary_observables, after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(secondary_observables),
        prefix="average",
        every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes(validation_observables),
        data.get_stream("valid", shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['validate_every_epochs'],
            every_n_batches=mon_conf['validate_every_batches'],
            after_training=False)
    extensions.append(validation)
    per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search'])
    per_monitoring = DataStreamMonitoring(
        [per],
        data.get_stream("valid", batches=False, shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['search_every_epochs'],
            every_n_batches=mon_conf['search_every_batches'],
            after_training=False)
    extensions.append(per_monitoring)
    track_the_best_per = TrackTheBest(
        per_monitoring.record_name(per)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_cost = TrackTheBest(
        validation.record_name(cost)).set_conditions(before_first_epoch=True,
                                                     after_epoch=True)
    extensions += [track_the_best_cost, track_the_best_per]
    extensions.append(
        AdaptiveClipping(algorithm.total_gradient_norm.name,
                         clipping,
                         train_conf['gradient_threshold'],
                         decay_rate=0.998,
                         burnin_period=500))
    extensions += [
        SwitchOffLengthFilter(
            data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf['num_batches'],
                    after_n_epochs=train_conf['num_epochs']).add_condition(
                        ["after_batch"], _gradient_norm_is_none),
    ]
    channels = [
        # Plot 1: training and validation costs
        [
            average_monitoring.record_name(train_cost),
            validation.record_name(cost)
        ],
        # Plot 2: gradient norm,
        [
            average_monitoring.record_name(algorithm.total_gradient_norm),
            average_monitoring.record_name(clipping.threshold)
        ],
        # Plot 3: phoneme error rate
        [per_monitoring.record_name(per)],
        # Plot 4: training and validation mean weight entropy
        [
            average_monitoring._record_name('weights_entropy_per_label'),
            validation._record_name('weights_entropy_per_label')
        ],
        # Plot 5: training and validation monotonicity penalty
        [
            average_monitoring._record_name('weights_penalty_per_recording'),
            validation._record_name('weights_penalty_per_recording')
        ]
    ]
    if bokeh:
        extensions += [
            Plot(bokeh_name if bokeh_name else os.path.basename(save_path),
                 channels,
                 every_n_batches=10,
                 server_url=bokeh_server),
        ]
    extensions += [
        Checkpoint(save_path,
                   before_first_epoch=not fast_start,
                   after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True).add_condition(
                       ['after_epoch'],
                       OnLogRecord(track_the_best_per.notification_name),
                       (root_path + "_best" + extension, )).add_condition(
                           ['after_epoch'],
                           OnLogRecord(track_the_best_cost.notification_name),
                           (root_path + "_best_ll" + extension, )),
        ProgressBar()
    ]
    extensions.append(EmbedIPython(use_main_loop_run_caller_env=True))
    if config['net']['criterion']['name'].startswith('mse'):
        extensions.append(
            LogInputsGains(labels, cg, recognizer.generator.readout.emitter,
                           data))

    if train_conf.get('patience'):
        patience_conf = train_conf['patience']
        if not patience_conf.get('notification_names'):
            # setdefault will not work for empty list
            patience_conf['notification_names'] = [
                track_the_best_per.notification_name,
                track_the_best_cost.notification_name
            ]
        extensions.append(Patience(**patience_conf))

    extensions.append(
        Printing(every_n_batches=1, attribute_filter=PrintingFilterList()))

    return model, algorithm, data, extensions
def main(save_to, num_epochs):
    """Train a Tanh/Softmax MLP on multi-scale attention patches of MNIST.

    A ``SoftRectangularCropper`` extracts one centered patch per zoom scale
    from each input image; the stacked patches are flattened and classified
    by the MLP.

    Parameters
    ----------
    save_to : str
        Path given to the ``Checkpoint`` extension.
    num_epochs : int
        Number of epochs before ``FinishAfter`` stops the main loop.
    """
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()

    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')

    # --- attention patch extraction -------------------------------------
    patch_shape = (16, 16)
    # NOTE(review): (784, 100) is an odd image shape -- MNIST is 28x28
    # (784 flattened).  Kept from the original; confirm against
    # SoftRectangularCropper's expected input layout.
    image_shape = (784, 100)
    import numpy
    import theano.tensor as T
    n_spatial_dims = 2
    cropper = SoftRectangularCropper(n_spatial_dims=n_spatial_dims,
                                     patch_shape=patch_shape,
                                     image_shape=image_shape,
                                     kernel=Gaussian())
    batch_size = 10
    # Geometric ladder of zoom factors; one patch per scale.
    scales = 1.3**numpy.arange(-7, 6)
    n_patches = len(scales)
    # Every patch is centered on the middle of the image.
    locations = (numpy.ones((n_patches, batch_size, 2)) *
                 image_shape / 2).astype(numpy.float32)
    scales = numpy.tile(scales[:, numpy.newaxis, numpy.newaxis],
                        (1, batch_size, 2)).astype(numpy.float32)
    Tpatches = T.stack(*[cropper.apply(x, T.constant(location),
                                       T.constant(scale))[0]
                         for location, scale in zip(locations, scales)])

    # FIX: the original compiled a throwaway theano.function and called it
    # on the undefined name `batch` (NameError), then left an
    # `ipdb.set_trace()` breakpoint that would halt training.  Keep the
    # patch pipeline symbolic and feed it straight into the MLP instead.
    probs = mlp.apply(tensor.flatten(Tpatches, outdim=2))
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost])
    # L2-regularize the MLP's two weight matrices.
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + .00005 * (W1 ** 2).sum() + .00005 * (W2 ** 2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST(("train",))
    mnist_test = MNIST(("test",))

    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=0.1))
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs),
                  DataStreamMonitoring(
                      [cost, error_rate],
                      Flatten(
                          DataStream.default_stream(
                              mnist_test,
                              iteration_scheme=SequentialScheme(
                                  mnist_test.num_examples, 500)),
                          which_sources=('features',)),
                      prefix="test"),
                  TrainingDataMonitoring(
                      [cost, error_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      after_epoch=True),
                  Checkpoint(save_to),
                  Printing()]

    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(Plot(
            'MNIST example',
            channels=[
                ['test_final_cost',
                 'test_misclassificationrate_apply_error_rate'],
                ['train_total_gradient_norm']]))

    main_loop = MainLoop(
        algorithm,
        Flatten(
            DataStream.default_stream(
                mnist_train,
                iteration_scheme=SequentialScheme(
                    mnist_train.num_examples, 50)),
            which_sources=('features',)),
        model=Model(cost),
        extensions=extensions)
    main_loop.run()
def main(save_to, num_epochs, feature_maps=None, mlp_hiddens=None,
         conv_sizes=None, pool_sizes=None, batch_size=500, num_batches=None):
    """Train a LeNet-style convnet on MNIST with plain SGD.

    Parameters
    ----------
    save_to : str
        Checkpoint path.
    num_epochs : int
        Epoch budget for ``FinishAfter``.
    feature_maps : list of int, optional
        Feature maps per conv layer (default ``[20, 50]``).
    mlp_hiddens : list of int, optional
        Hidden sizes of the top MLP (default ``[500]``).
    conv_sizes, pool_sizes : list of int, optional
        Square filter / pooling sizes per conv layer.
    batch_size : int
        Minibatch size for both train and test streams.
    num_batches : int, optional
        Optional batch budget for ``FinishAfter``.
    """
    if feature_maps is None:
        feature_maps = [20, 50]
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    if conv_sizes is None:
        conv_sizes = [5, 5]
    if pool_sizes is None:
        pool_sizes = [2, 2]
    image_size = (28, 28)
    output_size = 10

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    # Single input channel (grayscale MNIST); square filters/pools.
    convnet = LeNet(conv_activations, 1, image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    logging.info("Input dim: {} {} {}".format(
        *convnet.children[0].get_dim('input_')))
    # Log the output dimensionality of every layer (activations have none).
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(
                i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cost = (CategoricalCrossEntropy().apply(y.flatten(), probs)
            .copy(name='cost'))
    error_rate = (MisclassificationRate().apply(y.flatten(), probs)
                  .copy(name='error_rate'))

    cg = ComputationGraph([cost, error_rate])

    mnist_train = MNIST(("train",))
    mnist_train_stream = DataStream.default_stream(
        mnist_train, iteration_scheme=ShuffledScheme(
            mnist_train.num_examples, batch_size))

    mnist_test = MNIST(("test",))
    mnist_test_stream = DataStream.default_stream(
        mnist_test, iteration_scheme=ShuffledScheme(
            mnist_test.num_examples, batch_size))

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=0.1))
    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  DataStreamMonitoring(
                      [cost, error_rate],
                      mnist_test_stream,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [cost, error_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      after_epoch=True),
                  Checkpoint(save_to),
                  ProgressBar(),
                  Printing()]

    model = Model(cost)

    main_loop = MainLoop(
        algorithm, mnist_train_stream, model=model, extensions=extensions)
    main_loop.run()
def train(step_rule, label_dim, state_dim, epochs, seed, dropout, test_cost,
          experiment_path, features, weight_noise, to_watch, patience,
          batch_size, batch_norm, **kwargs):
    """Train a (batch-normalized) LSTM with a CTC cost on TIMIT phonemes.

    Parameters
    ----------
    step_rule : blocks step rule
        Optimizer step rule passed to ``GradientDescent``.
    label_dim : int
        Number of phoneme labels (output dim is ``label_dim + 1`` for the
        CTC blank).
    state_dim : int
        Hidden state size of the recurrent layer.
    epochs : int
        Epoch budget for ``FinishAfter``.
    seed : int
        Seed for the stream RNG.
    test_cost : bool
        If true, also monitor cost/decoding on the test stream.
    experiment_path : str
        Experiment directory.  NOTE(review): only the commented-out
        checkpointing code used it; ``EarlyStopping`` currently saves to
        '/dev/null'.
    batch_norm : bool
        Selects ``LSTMBatchNorm`` instead of plain ``LSTM``.
    **kwargs
        Optionally ``load_path`` to resume from a saved state.

    NOTE(review): ``dropout`` and ``weight_noise`` are accepted but never
    used below; ``update_stream`` is built and never used -- confirm
    whether they belong to removed code paths.
    """
    print '.. TIMIT experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()

    # ------------------------------------------------------------------------
    # Streams
    rng = np.random.RandomState(seed)
    stream_args = dict(rng=rng, batch_size=batch_size)
    print '.. initializing iterators'
    train_dataset = Timit('train', features=features)
    train_stream = construct_stream(train_dataset, **stream_args)
    dev_dataset = Timit('dev', features=features)
    dev_stream = construct_stream(dev_dataset, **stream_args)
    test_dataset = Timit('test', features=features)
    test_stream = construct_stream(test_dataset, **stream_args)
    update_stream = construct_stream(train_dataset, n_batches=100,
                                     **stream_args)

    # Map raw phones to the reduced phoneme set where a mapping exists.
    phone_dict = train_dataset.get_phoneme_dict()
    phoneme_dict = {k: phone_to_phoneme_dict[v]
                    if v in phone_to_phoneme_dict else v
                    for k, v in phone_dict.iteritems()}
    ind_to_phoneme = {v: k for k, v in phoneme_dict.iteritems()}
    # End-of-line symbol used by the CTC decoder/monitors.
    eol_symbol = ind_to_phoneme['<STOP>']

    # ------------------------------------------------------------------------
    # Graph
    print '.. building model'
    # x: (time, batch, feature); masks are (time, batch).
    x = T.tensor3('features')
    y = T.matrix('phonemes')
    input_mask = T.matrix('features_mask')
    output_mask = T.matrix('phonemes_mask')

    # Test values are attached but compute_test_value is 'off'; flip the
    # flag when debugging shapes.
    theano.config.compute_test_value = 'off'
    x.tag.test_value = np.random.randn(100, 24, 123).astype(floatX)
    y.tag.test_value = np.ones((30, 24), dtype=floatX)
    input_mask.tag.test_value = np.ones((100, 24), dtype=floatX)
    output_mask.tag.test_value = np.ones((30, 24), dtype=floatX)

    seq_len = 100       # matches the test values above
    input_dim = 123     # TIMIT feature dimensionality
    activation = Tanh()
    recurrent_init = IdentityInit(0.99)

    if batch_norm:
        rec1 = LSTMBatchNorm(name='rec1',
                             dim=state_dim,
                             activation=activation,
                             weights_init=NormalizedInitialization())
    else:
        rec1 = LSTM(name='rec1', dim=state_dim, activation=activation,
                    weights_init=NormalizedInitialization())
    rec1.initialize()

    # MyRecurrent wraps the recurrent brick between input/output linear
    # maps: input_dim -> state_dim -> label_dim + 1 (+1 for the CTC blank).
    s1 = MyRecurrent(rec1, [input_dim, state_dim, label_dim + 1],
                     activations=[Identity(), Identity()], name='s1')
    s1.initialize()

    o1 = s1.apply(x, input_mask)
    y_hat_o = o1
    shape = y_hat_o.shape
    # Softmax over the label axis, applied timestep-wise via reshape.
    y_hat = Softmax().apply(y_hat_o.reshape((-1, shape[-1]))).reshape(shape)
    y_mask = output_mask
    y_hat_mask = input_mask

    # ------------------------------------------------------------------------
    # Costs and Algorithm
    # cpu_ctc_th takes unnormalized activations and per-sequence lengths;
    # labels are shifted by +1 (presumably to reserve 0 for the blank --
    # confirm against the ctc binding's convention).
    ctc_cost = T.sum(ctc.cpu_ctc_th(
        y_hat_o, T.sum(y_hat_mask, axis=0),
        y + T.ones_like(y), T.sum(y_mask, axis=0)))

    batch_cost = ctc_cost.copy(name='batch_cost')
    bs = y.shape[1]
    # Mean cost per sequence and per output character, for monitoring.
    cost_train = aggregation.mean(batch_cost, bs).copy("sequence_cost")
    cost_per_character = aggregation.mean(
        batch_cost, output_mask.sum()).copy("character_cost")
    cg_train = ComputationGraph(cost_train)
    model = Model(cost_train)
    train_cost_per_character = aggregation.mean(
        cost_train, output_mask.sum()).copy("train_character_cost")

    algorithm = GradientDescent(step_rule=step_rule, cost=cost_train,
                                parameters=cg_train.parameters,
                                on_unused_sources='warn')

    # ------------------------------------------------------------------------
    # Monitoring and extensions
    parameters = model.get_parameter_dict()
    observed_vars = [cost_train, train_cost_per_character,
                     aggregation.mean(algorithm.total_gradient_norm)]
    # Track the L2 norm of every parameter and of its gradient.
    for name, param in parameters.iteritems():
        observed_vars.append(param.norm(2).copy(name + "_norm"))
        observed_vars.append(
            algorithm.gradients[param].norm(2).copy(name + "_grad_norm"))
    train_monitor = TrainingDataMonitoring(variables=observed_vars,
                                           prefix="train", after_epoch=True)
    dev_monitor = DataStreamMonitoring(
        variables=[cost_train, cost_per_character],
        data_stream=dev_stream, prefix="dev")
    # CTCMonitoring decodes and reports error rates once per epoch.
    train_ctc_monitor = CTCMonitoring(x, input_mask, y_hat, eol_symbol,
                                      train_stream, prefix='train',
                                      every_n_epochs=1,
                                      before_training=True,
                                      phoneme_dict=phoneme_dict,
                                      black_list=black_list, train=True)
    dev_ctc_monitor = CTCMonitoring(x, input_mask, y_hat, eol_symbol,
                                    dev_stream, prefix='dev',
                                    every_n_epochs=1,
                                    phoneme_dict=phoneme_dict,
                                    black_list=black_list)

    extensions = []
    # Optionally resume from a previous run.
    if 'load_path' in kwargs:
        extensions.append(Load(kwargs['load_path']))

    extensions.extend([FinishAfter(after_n_epochs=epochs),
                       train_monitor,
                       dev_monitor,
                       train_ctc_monitor,
                       dev_ctc_monitor])

    if test_cost:
        test_monitor = DataStreamMonitoring(
            variables=[cost_train, cost_per_character],
            data_stream=test_stream,
            prefix="test")
        test_ctc_monitor = CTCMonitoring(x, input_mask, y_hat, eol_symbol,
                                         test_stream, prefix='test',
                                         every_n_epochs=1,
                                         phoneme_dict=phoneme_dict,
                                         black_list=black_list)
        extensions.append(test_monitor)
        extensions.append(test_ctc_monitor)

    # NOTE(review): checkpointing to experiment_path is disabled; early
    # stopping writes to /dev/null, so the best model is not persisted.
    extensions.append(EarlyStopping(to_watch, patience, '/dev/null'))
    extensions.extend([ProgressBar(), Printing()])

    # ------------------------------------------------------------------------
    # Main Loop
    main_loop = MainLoop(model=model, data_stream=train_stream,
                         algorithm=algorithm, extensions=extensions)

    print "Building time: %f" % (time.time() - t0)
    main_loop.run()
def train(model, configs):
    """Train ``model`` with Adam + gradient clipping, per ``configs``.

    Parameters
    ----------
    model : object
        Must expose ``cost`` (a Theano scalar, name 'CE' expected by
        ``SaveParams('valid_CE', ...)``) and ``monitorings`` (extra
        variables to monitor).  ``model.cost`` is REASSIGNED in place when
        weight noise or L2 regularization is enabled.
    configs : dict
        Keys used: 'get_streams', 'save_path', 'num_epochs', 'batch_size',
        'lrs', 'until_which_epoch', 'grad_clipping', 'weight_noise',
        'l2_reg'.
    """
    get_streams = configs['get_streams']
    save_path = configs['save_path']
    num_epochs = configs['num_epochs']
    batch_size = configs['batch_size']
    lrs = configs['lrs']
    until_which_epoch = configs['until_which_epoch']
    grad_clipping = configs['grad_clipping']
    monitorings = model.monitorings

    # Training
    # Order matters: noise is applied to the original cost graph first,
    # then L2 is added on top of the (possibly noisy) cost.
    if configs['weight_noise'] > 0:
        cg = ComputationGraph(model.cost)
        weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        cg = apply_noise(cg, weights, configs['weight_noise'])
        model.cost = cg.outputs[0].copy(name='CE')

    if configs['l2_reg'] > 0:
        cg = ComputationGraph(model.cost)
        weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        new_cost = model.cost + configs['l2_reg'] * sum([
            (weight ** 2).sum() for weight in weights])
        model.cost = new_cost.copy(name='CE')

    blocks_model = Model(model.cost)
    all_params = blocks_model.parameters
    print "Number of found parameters:" + str(len(all_params))
    print all_params

    # Learning rate lives in a shared variable so LRDecay can update it.
    default_lr = np.float32(configs['lrs'][0])
    lr_var = theano.shared(default_lr, name="learning_rate")

    clipping = StepClipping(threshold=np.cast[floatX](grad_clipping))
    adam = Adam(learning_rate=lr_var)
    # Clip first, then Adam.
    step_rule = CompositeRule([clipping, adam])
    training_algorithm = GradientDescent(
        cost=model.cost, parameters=all_params, step_rule=step_rule)

    monitored_variables = [
        lr_var,
        aggregation.mean(training_algorithm.total_gradient_norm)] + monitorings

    # Monitor the L2 norm of each parameter and of its gradient, named
    # "<brick>.<param>_norm" / "_grad_norm".
    for param in all_params:
        name = param.tag.annotations[0].name + "." + param.name
        to_monitor = training_algorithm.gradients[param].norm(2)
        to_monitor.name = name + "_grad_norm"
        monitored_variables.append(to_monitor)
        to_monitor = param.norm(2)
        to_monitor.name = name + "_norm"
        monitored_variables.append(to_monitor)

    train_data_stream, valid_data_stream = get_streams(batch_size)

    train_monitoring = TrainingDataMonitoring(
        variables=monitored_variables,
        prefix="train",
        after_epoch=True)

    valid_monitoring = DataStreamMonitoring(
        variables=monitored_variables,
        data_stream=valid_data_stream,
        prefix="valid",
        after_epoch=True)

    main_loop = MainLoop(
        algorithm=training_algorithm,
        data_stream=train_data_stream,
        model=blocks_model,
        extensions=[
            train_monitoring,
            valid_monitoring,
            FinishAfter(after_n_epochs=num_epochs),
            # Saves on improvement of the validation CE.
            SaveParams('valid_CE', blocks_model, save_path,
                       after_epoch=True),
            SaveLog(after_epoch=True),
            ProgressBar(),
            LRDecay(lr_var, lrs, until_which_epoch, after_epoch=True),
            Printing(after_epoch=True)])
    main_loop.run()
def main(save_to, num_epochs):
    """Train a Tanh/Softmax MLP on flat MNIST, then load the pickled log.

    Parameters
    ----------
    save_to : str
        Checkpoint path (the log is saved separately alongside it).
    num_epochs : int
        Epoch budget for ``FinishAfter``.
    """
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()

    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(tensor.flatten(x, outdim=2))
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost])
    # L2-regularize both weight matrices of the MLP.
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + .00005 * (W1 ** 2).sum() + .00005 * (W2 ** 2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST(("train",))
    mnist_test = MNIST(("test",))

    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=0.1))
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs),
                  DataStreamMonitoring(
                      [cost, error_rate],
                      Flatten(
                          DataStream.default_stream(
                              mnist_test,
                              iteration_scheme=SequentialScheme(
                                  mnist_test.num_examples, 500)),
                          which_sources=('features',)),
                      prefix="test"),
                  TrainingDataMonitoring(
                      [cost, error_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      after_epoch=True),
                  # Persist the log separately after every batch so the
                  # pickle read below has data to load.
                  Checkpoint(save_to, save_separately=['log'],
                             after_batch=True),
                  Printing()]

    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(Plot(
            'MNIST example',
            channels=[
                ['test_final_cost',
                 'test_misclassificationrate_apply_error_rate'],
                ['train_total_gradient_norm']]))

    main_loop = MainLoop(
        algorithm,
        Flatten(
            DataStream.default_stream(
                mnist_train,
                iteration_scheme=SequentialScheme(
                    mnist_train.num_examples, 50)),
            which_sources=('features',)),
        model=Model(cost),
        extensions=extensions)
    main_loop.run()

    # Post-training: load the pickled training log into a DataFrame.
    # NOTE(review): the filename 'mnist_log.pkl' is hard-coded and only
    # matches if save_to == 'mnist.pkl' (Checkpoint appends '_log');
    # confirm against the caller.  `data_frame` is also never used here.
    import cPickle
    import pandas
    with open('mnist_log.pkl') as f:
        log = cPickle.load(f)
    data_frame = pandas.DataFrame.from_dict(log, orient='index')
def train(step_rule, state_dim, epochs, seed, experiment_path, initialization,
          to_watch, patience, static_mask, batch_size, rnn_type, num_layers,
          augment, seq_len, drop_prob, drop_prob_states, drop_prob_cells,
          drop_prob_igates, ogates_zoneout, stoch_depth, share_mask,
          gaussian_drop, weight_noise, norm_cost_coeff, penalty, input_drop,
          **kwargs):
    """Train a zoneout RNN (LSTM/GRU/SRNN) character LM on cPTB.

    Inputs are one-hot (dim 50) character sequences; the target is the
    input shifted by one step.  Zoneout masks arrive from the data stream
    as extra sources ('drops_states', 'drops_cells', 'drops_igates').

    Key parameters
    --------------
    rnn_type : str
        'lstm', 'gru' or 'srnn' (case-insensitive).
    initialization : str
        'glorot', 'uniform' or 'ortho'; anything else raises ValueError.
    penalty : str
        'cells' or 'hids' selects which activations get the norm
        stabilizer; weighted by ``norm_cost_coeff``.
    share_mask : bool
        Reuse ``drop_prob`` for cell masks and disable state masks.
    **kwargs
        Optionally 'load_path': an .npz of parameters to warm-start from.

    NOTE(review): several parameters (num_layers, stoch_depth,
    gaussian_drop, input_drop, drop_prob_igates when share_mask) are
    accepted but unused in this body -- presumably consumed by the stream
    constructors or left over from experiments; confirm.
    """
    print '.. cPTB experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()

    def numpy_rng(random_seed=None):
        # Default-seeded numpy RNG helper (currently unused below).
        if random_seed == None:
            random_seed = 1223
        return numpy.random.RandomState(random_seed)

    ###########################################
    #
    # MAKE DATA STREAMS
    #
    ###########################################
    rng = np.random.RandomState(seed)

    if share_mask:
        drop_prob_cells = drop_prob
        # we don't want to actually use these masks, so this is to debug
        drop_prob_states = None

    print '.. initializing iterators'
    # The evaluation streams (flag True) use deterministic masks.
    if static_mask:
        train_stream = get_static_mask_ptb_stream(
            'train', batch_size, seq_len, drop_prob_states, drop_prob_cells,
            drop_prob_igates, state_dim, False, augment=augment)
        train_stream_evaluation = get_static_mask_ptb_stream(
            'train', batch_size, seq_len, drop_prob_states, drop_prob_cells,
            drop_prob_igates, state_dim, True, augment=augment)
        dev_stream = get_static_mask_ptb_stream(
            'valid', batch_size, seq_len, drop_prob_states, drop_prob_cells,
            drop_prob_igates, state_dim, True, augment=augment)
    else:
        train_stream = get_ptb_stream(
            'train', batch_size, seq_len, drop_prob_states, drop_prob_cells,
            drop_prob_igates, state_dim, False, augment=augment)
        train_stream_evaluation = get_ptb_stream(
            'train', batch_size, seq_len, drop_prob_states, drop_prob_cells,
            drop_prob_igates, state_dim, True, augment=augment)
        dev_stream = get_ptb_stream(
            'valid', batch_size, seq_len, drop_prob_states, drop_prob_cells,
            drop_prob_igates, state_dim, True, augment=augment)

    # One real batch, used only to attach Theano test values below.
    data = train_stream.get_epoch_iterator(as_dict=True).next()

    ###########################################
    #
    # BUILD MODEL
    #
    ###########################################
    print '.. building model'
    x = T.tensor3('features', dtype=floatX)
    # Next-step prediction: input is all but the last step, target is the
    # sequence shifted left by one.
    x, y = x[:-1], x[1:]
    drops_states = T.tensor3('drops_states')
    drops_cells = T.tensor3('drops_cells')
    drops_igates = T.tensor3('drops_igates')

    x.tag.test_value = data['features']
    drops_states.tag.test_value = data['drops_states']
    drops_cells.tag.test_value = data['drops_cells']
    drops_igates.tag.test_value = data['drops_igates']

    if initialization == 'glorot':
        weights_init = NormalizedInitialization()
    elif initialization == 'uniform':
        weights_init = Uniform(width=.2)
    elif initialization == 'ortho':
        weights_init = OrthogonalInitialization()
    else:
        raise ValueError('No such initialization')

    # Input projection width depends on the cell type's gate count
    # (LSTM: 4 gates, GRU: 3, SRNN: 1); vocabulary size is 50.
    if rnn_type.lower() == 'lstm':
        in_to_hid = Linear(50, state_dim * 4, name='in_to_hid',
                           weights_init=weights_init,
                           biases_init=Constant(0.0))
        recurrent_layer = ZoneoutLSTM(dim=state_dim,
                                      weights_init=weights_init,
                                      activation=Tanh(),
                                      model_type=6,
                                      name='rnn',
                                      ogates_zoneout=ogates_zoneout)
    elif rnn_type.lower() == 'gru':
        in_to_hid = Linear(50, state_dim * 3, name='in_to_hid',
                           weights_init=weights_init,
                           biases_init=Constant(0.0))
        recurrent_layer = ZoneoutGRU(dim=state_dim,
                                     weights_init=weights_init,
                                     activation=Tanh(),
                                     name='rnn')
    elif rnn_type.lower() == 'srnn':
        in_to_hid = Linear(50, state_dim, name='in_to_hid',
                           weights_init=weights_init,
                           biases_init=Constant(0.0))
        recurrent_layer = ZoneoutSimpleRecurrent(dim=state_dim,
                                                 weights_init=weights_init,
                                                 activation=Rectifier(),
                                                 name='rnn')
    else:
        raise NotImplementedError

    hid_to_out = Linear(state_dim, 50, name='hid_to_out',
                        weights_init=weights_init,
                        biases_init=Constant(0.0))

    in_to_hid.initialize()
    recurrent_layer.initialize()
    hid_to_out.initialize()

    h = in_to_hid.apply(x)

    # ZoneoutLSTM.apply returns (states, cells); keep only the states.
    if rnn_type.lower() == 'lstm':
        yh = recurrent_layer.apply(h, drops_states, drops_cells,
                                   drops_igates)[0]
    else:
        yh = recurrent_layer.apply(h, drops_states, drops_cells,
                                   drops_igates)

    y_hat_pre_softmax = hid_to_out.apply(yh)
    shape_ = y_hat_pre_softmax.shape

    ###########################################
    #
    # SET UP COSTS, MONITORS, and REGULARIZATION
    #
    ###########################################
    def crossentropy_lastaxes(yhat, y):
        # for sequence of distributions/targets
        return -(y * T.log(yhat)).sum(axis=yhat.ndim - 1)

    def softmax_lastaxis(x):
        # for sequence of distributions
        return T.nnet.softmax(x.reshape((-1, x.shape[-1]))).reshape(x.shape)

    yhat = softmax_lastaxis(y_hat_pre_softmax)
    cross_entropies = crossentropy_lastaxes(yhat, y)
    cross_entropy = cross_entropies.mean().copy(name="cross_entropy")
    cost = cross_entropy.copy(name="cost")

    batch_cost = cost.copy(name='batch_cost')
    nll_cost = cost.copy(name='nll_cost')
    # Bits per character (name 'bpr' kept from the original).
    bpc = (nll_cost / np.log(2.0)).copy(name='bpr')

    cost_monitor = aggregation.mean(
        batch_cost, batch_size).copy(name='sequence_cost_monitor')
    cost_per_character = aggregation.mean(
        batch_cost, (seq_len - 1) * batch_size).copy(name='character_cost')
    cost_train = cost.copy(name='train_batch_cost')
    cost_train_monitor = cost_monitor.copy('train_batch_cost_monitor')

    cg_train = ComputationGraph([cost_train, cost_train_monitor])

    ##################
    # NORM STABILIZER
    ##################
    # Penalizes step-to-step changes in the activation magnitudes
    # (Krueger & Memisevic style norm stabilizer).
    norm_cost = 0.

    def _magnitude(x, axis=-1):
        # Guard the sqrt against exactly-zero sums.
        return T.sqrt(
            T.maximum(T.sqr(x).sum(axis=axis), numpy.finfo(x.dtype).tiny))

    if penalty == 'cells':
        assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables)
        for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables):
            norms = _magnitude(cell)
            norm_cost += T.mean(
                T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1))
    elif penalty == 'hids':
        assert 'rnn_apply_states' in [
            o.name
            for o in VariableFilter(roles=[OUTPUT])(cg_train.variables)]
        for output in VariableFilter(roles=[OUTPUT])(cg_train.variables):
            if output.name == 'rnn_apply_states':
                norms = _magnitude(output)
                norm_cost += T.mean(
                    T.sum((norms[1:] - norms[:-1])**2, axis=0)
                    / (seq_len - 1))

    norm_cost.name = 'norm_cost'
    cost_train += norm_cost_coeff * norm_cost
    cost_train = cost_train.copy(
        'cost_train')  # should this be cost_train.outputs[0]?

    # Rebuild the graph so the stabilizer is part of the training cost.
    cg_train = ComputationGraph([cost_train, cost_train_monitor])

    ##################
    # WEIGHT NOISE
    ##################
    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg_train.variables)
        cg_train = apply_noise(cg_train, weights, weight_noise)
        # apply_noise returns a new graph; re-extract the two outputs.
        cost_train = cg_train.outputs[0].copy(name='cost_train')
        cost_train_monitor = cg_train.outputs[1].copy(
            'train_batch_cost_monitor')

    model = Model(cost_train)
    train_cost_per_character = aggregation.mean(
        cost_train_monitor,
        (seq_len - 1) * batch_size).copy(name='train_character_cost')

    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost_train,
                                parameters=cg_train.parameters)
    observed_vars = [
        cost_train, cost_train_monitor, train_cost_per_character,
        aggregation.mean(algorithm.total_gradient_norm)
    ]
    train_monitor = TrainingDataMonitoring(variables=observed_vars,
                                           prefix="train",
                                           after_epoch=True)
    dev_monitor = DataStreamMonitoring(variables=[nll_cost, bpc],
                                       data_stream=dev_stream,
                                       prefix="dev")

    extensions = []
    # Warm-start from an .npz of named parameters, matching on shape.
    if 'load_path' in kwargs:
        with open(kwargs['load_path']) as f:
            loaded = np.load(f)
            model = Model(cost_train)
            params_dicts = model.get_parameter_dict()
            params_names = params_dicts.keys()
            for param_name in params_names:
                param = params_dicts[param_name]
                # '/f_6_.W' --> 'f_6_.W'
                slash_index = param_name.find('/')
                param_name = param_name[slash_index + 1:]
                if param.get_value().shape == loaded[param_name].shape:
                    print 'Found: ' + param_name
                    param.set_value(loaded[param_name])
                else:
                    print 'Not found: ' + param_name

    extensions.extend(
        [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor])

    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
    log_path = os.path.join(experiment_path, 'log.txt')
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    extensions.append(
        SaveParams('dev_nll_cost', model, experiment_path, every_n_epochs=1))
    extensions.append(SaveLog(every_n_epochs=1))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    ###########################################
    #
    # MAIN LOOOOOOOOOOOP
    #
    ###########################################
    main_loop = MainLoop(model=model, data_stream=train_stream,
                         algorithm=algorithm, extensions=extensions)

    t1 = time.time()
    print "Building time: %f" % (t1 - t0)
    main_loop.run()
    print "Execution time: %f" % (time.time() - t1)
error_rate = error.copy(name='error_rate') error_rate2 = error.copy(name='error_rate2') cg = ComputationGraph([cost, error_rate]) ### Gradient Descent algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Scale(learning_rate=learning_rate)) extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs), DataStreamMonitoring( [cost, error_rate, error_rate2], data_valid_stream, prefix="valid"), TrainingDataMonitoring( [cost, error_rate, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", after_epoch=True), Checkpoint(save_to), ProgressBar(), Printing()] ### Plotting extensions if mode == ("GPU_run" or "data_server"): try: from plot import Plot extensions.append(Plot('%s %s @ %s' % (graph_name, datetime.datetime.now(), socket.gethostname()), channels=[['train_error_rate', 'valid_error_rate'], ['train_total_gradient_norm']], after_epoch=True, server_url=host_plot)) PLOT_AVAILABLE = True except ImportError:
def main(name, epochs, batch_size, learning_rate, attention, n_iter,
         enc_dim, dec_dim, z_dim, oldmodel, image_size):
    """Train a DRAW model on binarized MNIST or an HDF5 dataset.

    Parameters
    ----------
    name : str
        Data source name ('mnist' or an HDF5 dataset under fuel's data
        path); also the stem of the experiment/checkpoint name.
    attention : str
        '' for the full read/write model, or 'READ_N,WRITE_N' to enable
        attention with those window sizes.
    n_iter : int
        Number of DRAW canvas iterations.
    enc_dim, dec_dim, z_dim : int
        Encoder/decoder LSTM sizes and latent dimensionality.
    oldmodel : str or None
        Optional pickled main-loop model to copy parameters from.
    image_size : int or None
        Required for non-MNIST sources; must be None for MNIST.
    """
    datasource = name

    if datasource == 'mnist':
        if image_size is not None:
            raise Exception('image size for data source %s is pre configured' % datasource)
        image_size = 28
    else:
        if image_size is None:
            raise Exception('Undefined image size for data source %s' % datasource)

    x_dim = image_size * image_size
    img_height = img_width = image_size

    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # Attention window sizes come as "read_N,write_N"; empty string means
    # the non-attentive reader/writer over the full image.
    if attention != "":
        read_N, write_N = attention.split(',')
        read_N = int(read_N)
        write_N = int(write_N)
        read_dim = 2 * read_N**2
        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 width=img_width, height=img_height,
                                 N=write_N, **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)
    else:
        read_dim = 2 * x_dim
        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)
        attention_tag = "full"

    #----------------------------------------------------------------------
    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation.
            E.g.: 0.1 -> 11, 0.01 -> 12, 0.001 -> 13, 0.005 -> 53
            (leading digit followed by the negated base-10 exponent). """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)

    # Experiment name encodes the full hyperparameter configuration.
    name = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (
        name, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)

    print("\nRunning experiment %s" % name)
    print(" learning rate: %g" % learning_rate)
    print(" attention: %s" % attention)
    print(" n_iterations: %d" % n_iter)
    print(" encoder dimension: %d" % enc_dim)
    print(" z dimension: %d" % z_dim)
    print(" decoder dimension: %d" % dec_dim)
    print(" batch size: %d" % batch_size)
    print()

    #----------------------------------------------------------------------
    # LSTMs take 4x their dim for the gate pre-activations, hence the
    # 4 * enc_dim / 4 * dec_dim MLP output sizes.
    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc", **inits)
    decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim],
                      name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(n_iter,
                     reader=reader,
                     encoder_mlp=encoder_mlp,
                     encoder_rnn=encoder_rnn,
                     sampler=q_sampler,
                     decoder_mlp=decoder_mlp,
                     decoder_rnn=decoder_rnn,
                     writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    x_recons, kl_terms = draw.reconstruct(x)

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    # Variational bound: reconstruction + sum of per-iteration KL terms.
    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    # NOTE(review): `params=` is the old Blocks keyword; the other
    # training scripts in this file use `parameters=`.  Confirm which
    # Blocks version this script targets.
    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(10.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    # One KL term per DRAW iteration, for the per-step plot.
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t
        monitors += [kl_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------
    if datasource == 'mnist':
        train_ds = BinarizedMNIST("train", sources=['features'],
                                  flatten=['features'])
        test_ds = BinarizedMNIST("test", sources=['features'],
                                 flatten=['features'])
    else:
        datasource_fname = os.path.join(fuel.config.data_path, datasource,
                                        datasource + '.hdf5')
        train_ds = H5PYDataset(datasource_fname, which_set='train',
                               sources=['features'], flatten=['features'])
        test_ds = H5PYDataset(datasource_fname, which_set='test',
                              sources=['features'], flatten=['features'])

    train_stream = DataStream(train_ds,
                              iteration_scheme=SequentialScheme(
                                  train_ds.num_examples, batch_size))
    test_stream = DataStream(test_ds,
                             iteration_scheme=SequentialScheme(
                                 test_ds.num_examples, batch_size))

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(
                train_monitors,
                prefix="train",
                after_epoch=True),
            DataStreamMonitoring(
                monitors,
                test_stream,
                prefix="test"),
            # Saves the canvas images alongside log and model.
            MyCheckpoint(image_size=image_size,
                         path=name + ".pkl",
                         before_training=False,
                         after_epoch=True,
                         save_separately=['log', 'model']),
            ProgressBar(),
            Printing()
        ])

    # Optionally warm-start all parameters from a pickled old model.
    if oldmodel is not None:
        print("Initializing parameters with old model %s" % oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_param_values(oldmodel.get_param_values())
        del oldmodel

    main_loop.run()
def main(save_to, num_epochs, weight_decay=0.0001, noise_pressure=0,
         subset=None, num_batches=None, batch_size=None, histogram=None,
         resume=False):
    """Train a noisy residual network on CIFAR-10.

    Parameters
    ----------
    save_to : str
        Checkpoint path template; a literal '.%d' in it is stripped to
        form the experiment name used for plots and for resuming.
    num_epochs : int
        Maximum number of training epochs.
    weight_decay : float
        Coefficient of the L2 penalty on non-mask weights.
    noise_pressure : float
        Coefficient of the NITs (noise information) penalty.
    subset : int or None
        Unused here; kept for interface compatibility with sibling scripts.
    num_batches : int or None
        Optional hard limit on the number of training batches.
    batch_size : int or None
        Training mini-batch size.
    histogram : str or None
        If set, per-component attributions are recorded during training
        and saved to this filename afterwards.
    resume : bool
        If True, reload the main-loop state from a previous run.
    """
    output_size = 10

    prior_noise_level = -10
    # The noise step rule's learning rate and the noise rate are shared
    # variables so the EpochSchedule extensions can anneal them below.
    noise_step_rule = Scale(1e-6)
    noise_rate = theano.shared(numpy.asarray(1e-5, dtype=theano.config.floatX))
    convnet = create_res_net(out_noise=True, tied_noise=True, tied_sigma=True,
                             noise_rate=noise_rate,
                             prior_noise_level=prior_noise_level)

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Test graph: plain forward pass (no batch-norm update, no noise).
    test_probs = convnet.apply(x)
    test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs)
                 .copy(name='cost'))
    test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs)
                       .copy(name='error_rate'))
    test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs)
                      .copy(name='confusion'))
    # Confusion counts are summed (not averaged) across monitored batches.
    test_confusion.tag.aggregation_scheme = Sum(test_confusion)

    test_cg = ComputationGraph([test_cost, test_error_rate])

    # Apply dropout to all layer outputs except final softmax
    # dropout_vars = VariableFilter(
    #     roles=[OUTPUT], bricks=[Convolutional],
    #     theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables)
    # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5)

    # Apply 0.2 dropout to the pre-averaging layer
    # dropout_vars_2 = VariableFilter(
    #     roles=[OUTPUT], bricks=[Convolutional],
    #     theano_name_regex="^conv_8_apply_output$")(test_cg.variables)
    # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2)

    # Apply 0.2 dropout to the input, as in the paper
    # train_cg = apply_dropout(test_cg, [x], 0.2)
    # train_cg = drop_cg
    # train_cg = apply_batch_normalization(test_cg)
    # train_cost, train_error_rate, train_components = train_cg.outputs

    # Training graph: batch normalization active and noise injected.
    with batch_normalization(convnet):
        with training_noise(convnet):
            train_probs = convnet.apply(x)
    train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs)
                  .copy(name='cost'))
    train_components = (ComponentwiseCrossEntropy().apply(
        y.flatten(), train_probs).copy(name='components'))
    train_error_rate = (MisclassificationRate().apply(
        y.flatten(), train_probs).copy(name='error_rate'))
    train_cg = ComputationGraph([train_cost,
                                 train_error_rate, train_components])
    # Exponential moving average of batch statistics for test-time BN.
    population_updates = get_batch_normalization_updates(train_cg)
    bn_alpha = 0.9
    extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha))
                     for p, m in population_updates]

    # for annealing
    nit_penalty = theano.shared(numpy.asarray(noise_pressure,
                                              dtype=theano.config.floatX))
    nit_penalty.name = 'nit_penalty'

    # Compute noise rates for training graph
    train_logsigma = VariableFilter(roles=[LOG_SIGMA])(train_cg.variables)
    train_mean_log_sigma = tensor.concatenate(
        [n.flatten() for n in train_logsigma]).mean()
    train_mean_log_sigma.name = 'mean_log_sigma'
    train_nits = VariableFilter(roles=[NITS])(train_cg.auxiliary_variables)
    train_nit_rate = tensor.concatenate(
        [n.flatten() for n in train_nits]).mean()
    train_nit_rate.name = 'nit_rate'
    train_nit_regularization = nit_penalty * train_nit_rate
    train_nit_regularization.name = 'nit_regularization'

    # Apply regularization to the cost.  'mask' parameters get their own
    # (much smaller) learning rate and are excluded from weight decay.
    trainable_parameters = VariableFilter(roles=[WEIGHT, BIAS])(
        train_cg.parameters)
    mask_parameters = [p for p in trainable_parameters
                       if get_brick(p).name == 'mask']
    noise_parameters = VariableFilter(roles=[NOISE])(train_cg.parameters)
    biases = VariableFilter(roles=[BIAS])(train_cg.parameters)
    weights = VariableFilter(roles=[WEIGHT])(train_cg.variables)
    nonmask_weights = [p for p in weights if get_brick(p).name != 'mask']
    l2_norm = sum([(W ** 2).sum() for W in nonmask_weights])
    l2_norm.name = 'l2_norm'
    l2_regularization = weight_decay * l2_norm
    l2_regularization.name = 'l2_regularization'

    # testversion
    test_cost = test_cost + l2_regularization
    test_cost.name = 'cost_with_regularization'

    # Training version of cost
    train_cost_without_regularization = train_cost
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + l2_regularization + train_nit_regularization
    train_cost.name = 'cost_with_regularization'

    cifar10_train = CIFAR10(("train",))
    # Standard CIFAR augmentation: 4-pixel pad, random crop and flip.
    cifar10_train_stream = RandomPadCropFlip(
        NormalizeBatchLevels(DataStream.default_stream(
            cifar10_train, iteration_scheme=ShuffledScheme(
                cifar10_train.num_examples, batch_size)),
            which_sources=('features',)),
        (32, 32), pad=4, which_sources=('features',))

    test_batch_size = 128
    cifar10_test = CIFAR10(("test",))
    cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_test, iteration_scheme=ShuffledScheme(
            cifar10_test.num_examples, test_batch_size)),
        which_sources=('features',))

    momentum = Momentum(0.01, 0.9)

    # Create a step rule that doubles the learning rate of biases, like Caffe.
    # scale_bias = Restrict(Scale(2), biases)
    # step_rule = CompositeRule([scale_bias, momentum])

    # Create a step rule that reduces the learning rate of noise
    scale_mask = Restrict(noise_step_rule, mask_parameters)
    step_rule = CompositeRule([scale_mask, momentum])

    # from theano.compile.nanguardmode import NanGuardMode

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=train_cost, parameters=trainable_parameters,
        step_rule=step_rule)
    algorithm.add_updates(extra_updates)
    # theano_func_kwargs={
    #     'mode': NanGuardMode(
    #         nan_is_error=True, inf_is_error=True, big_is_error=True)})

    exp_name = save_to.replace('.%d', '')

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  EpochSchedule(momentum.learning_rate, [
                      (0, 0.01),   # Warm up with 0.01 learning rate
                      (50, 0.1),   # Then go back to 0.1
                      (100, 0.01),
                      (150, 0.001)
                      # (83, 0.01),  # Follow the schedule in the paper
                      # (125, 0.001)
                  ]),
                  EpochSchedule(noise_step_rule.learning_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1)
                      # (0, 1e-6),
                      # (2, 1e-5),
                      # (4, 1e-4)
                  ]),
                  EpochSchedule(noise_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1)
                      # (0, 1e-6),
                      # (2, 1e-5),
                      # (4, 1e-4),
                      # (6, 3e-4),
                      # (8, 1e-3),  # Causes nit rate to jump
                      # (10, 3e-3),
                      # (12, 1e-2),
                      # (15, 3e-2),
                      # (19, 1e-1),
                      # (24, 3e-1),
                      # (30, 1)
                  ]),
                  NoiseExtension(
                      noise_parameters=noise_parameters),
                  NoisyDataStreamMonitoring(
                      [test_cost, test_error_rate, test_confusion],
                      cifar10_test_stream,
                      noise_parameters=noise_parameters,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [train_cost, train_error_rate, train_nit_rate,
                       train_cost_without_regularization,
                       l2_regularization,
                       train_nit_regularization,
                       momentum.learning_rate,
                       train_mean_log_sigma,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      every_n_batches=17),
                  # after_epoch=True),
                  Plot('Training performance for ' + exp_name,
                       channels=[
                           ['train_cost_with_regularization',
                            'train_cost_without_regularization',
                            'train_nit_regularization',
                            'train_l2_regularization'],
                           ['train_error_rate'],
                           ['train_total_gradient_norm'],
                           ['train_mean_log_sigma'],
                       ],
                       every_n_batches=17),
                  Plot('Test performance for ' + exp_name,
                       channels=[[
                           'train_error_rate',
                           'test_error_rate',
                       ]],
                       after_epoch=True),
                  EpochCheckpoint(save_to, use_cpickle=True, after_epoch=True),
                  ProgressBar(),
                  Printing()]

    if histogram:
        attribution = AttributionExtension(
            components=train_components,
            # BUG FIX: this previously read `cg.parameters`, but no `cg`
            # is defined in this function (only test_cg/train_cg), so the
            # histogram path raised NameError.  The training graph's
            # parameters are the intended ones.
            parameters=train_cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(exp_name, True, True))

    model = Model(train_cost)

    main_loop = MainLoop(
        algorithm,
        cifar10_train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
cg = ComputationGraph(cost) if dropout > 0: # Apply dropout only to the non-recurrent inputs (Zaremba et al. 2015) inputs = VariableFilter(theano_name_regex=r'.*apply_input.*')(cg.variables) cg = apply_dropout(cg, inputs, dropout) cost = cg.outputs[0] # Learning algorithm step_rules = [RMSProp(learning_rate=learning_rate, decay_rate=decay_rate), StepClipping(step_clipping)] algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule(step_rules)) # Extensions gradient_norm = aggregation.mean(algorithm.total_gradient_norm) step_norm = aggregation.mean(algorithm.total_step_norm) monitored_vars = [cost, gradient_norm, step_norm] dev_monitor = DataStreamMonitoring(variables=[cost], after_epoch=True, before_first_epoch=True, data_stream=dev_stream, prefix="dev") train_monitor = TrainingDataMonitoring(variables=monitored_vars, after_batch=True, before_first_epoch=True, prefix='tra') extensions = [dev_monitor, train_monitor, Timing(), Printing(after_batch=True), FinishAfter(after_n_epochs=nepochs), saveload.Load(load_path), saveload.Checkpoint(last_path), ] + track_best('dev_cost', save_path) if learning_rate_decay not in (0, 1):
def main(save_to, model, train, test, num_epochs, input_size=(150, 150),
         learning_rate=0.01, batch_size=50, num_batches=None,
         flatten_stream=False):
    """Train an already-initialised model on an image dataset with plain SGD.

    Parameters
    ----------
    save_to : str
        Path the `Checkpoint` extension saves the main loop to.
    model : brick
        Model given in input; must already be initialised (works with
        convnet and mlp).
    train, test : fuel datasets
        Datasets providing 'image_features' and 'targets' sources.
    num_epochs : int
        Maximum number of training epochs.
    input_size : tuple of int
        The shape images are resized to (before flattening is applied,
        if `flatten_stream` is True).
    learning_rate : float
        SGD step size.
    batch_size : int
        Mini-batch size for both the training and the test stream.
    num_batches : int or None
        Optional hard limit on the number of training batches.
    flatten_stream : bool
        If True, images are flattened to vectors and the symbolic input
        is a matrix instead of a 4D tensor.
    """
    # The symbolic input's rank must match what the stream will produce.
    if flatten_stream:
        x = tensor.matrix('image_features')
    else:
        x = tensor.tensor4('image_features')
    y = tensor.lmatrix('targets')

    # Data augmentation
    # insert data augmentation here

    # Generating streams
    train_stream = DataStream.default_stream(
        train,
        iteration_scheme=ShuffledScheme(train.num_examples, batch_size)
    )

    test_stream = DataStream.default_stream(
        test,
        iteration_scheme=ShuffledScheme(test.num_examples, batch_size)
    )

    # Reshaping procedure
    # TODO: add a crop option in ScikitResize so that the image is not deformed

    # Resize to desired square shape
    train_stream = ScikitResize(train_stream, input_size,
                                which_sources=('image_features',))
    test_stream = ScikitResize(test_stream, input_size,
                               which_sources=('image_features',))

    # Flattening the stream
    if flatten_stream is True:
        train_stream = Flatten(train_stream,
                               which_sources=('image_features',))
        test_stream = Flatten(test_stream,
                              which_sources=('image_features',))

    # Apply input to model
    probs = model.apply(x)

    # Defining cost and various indices to watch
    # print(probs)
    # cost = SquaredError().apply(y.flatten(),probs)
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs).copy(
        name='cost')
    error_rate = MisclassificationRate().apply(y.flatten(), probs).copy(
        name='error_rate')

    # Building computation graph
    cg = ComputationGraph([cost, error_rate])

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=learning_rate))

    # Defining extensions.
    # `Timing` reports time for reading data, aggregating a batch and
    # monitoring; `ProgressBar` displays a progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  TrainingDataMonitoring(
                      [cost, error_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train", every_n_batches=5),
                  DataStreamMonitoring([cost, error_rate], test_stream,
                                       prefix="test", every_n_batches=25),
                  Checkpoint(save_to),
                  ProgressBar(),
                  Printing(every_n_batches=5)]

    # NOTE(review): this rebinds the `model` parameter to a blocks Model
    # wrapping the cost graph; the brick passed in is no longer referenced.
    model = Model(cost)
    main_loop = MainLoop(
        algorithm,
        train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()
def main(save_to, num_epochs):
    """Train a simple recurrent next-slice predictor on MNIST sequences.

    The image is presented as a sequence of slices (via `_meanize`); at
    each step the network sees one slice and is trained with binary
    cross-entropy to reproduce the neighbouring slice.

    Parameters
    ----------
    save_to : str
        Stem of the best-model checkpoint filename
        ('best_<save_to>.pkl').
    num_epochs : int
        Maximum number of training epochs.
    """
    batch_size = 128
    dim = 100      # hidden state size
    n_steps = 20   # sequence length produced by _meanize

    # input->hidden and hidden->output MLPs plus the recurrent transition.
    i2h1 = MLP([Identity()], [784, dim], biases_init=Constant(0.),
               weights_init=IsotropicGaussian(.001))
    h2o1 = MLP([Rectifier(), Logistic()], [dim, dim, 784],
               biases_init=Constant(0.),
               weights_init=IsotropicGaussian(.001))
    rec1 = SimpleRecurrent(dim=dim, activation=Tanh(),
                           weights_init=Orthogonal())
    i2h1.initialize()
    h2o1.initialize()
    rec1.initialize()

    x = tensor.tensor3('features')
    # NOTE(review): x1 is the sequence shifted forward and x2 shifted
    # backward, so the network predicts the *previous* element from the
    # current one -- confirm this direction is intended.
    x1 = x[1:, :, :]
    x2 = x[:-1, :, :]

    preproc = i2h1.apply(x1)
    h1 = rec1.apply(preproc)
    x_hat = h2o1.apply(h1)
    cost = tensor.nnet.binary_crossentropy(x_hat, x2).mean()
    # cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    cost.name = 'final_cost'

    cg = ComputationGraph([cost, ])

    # 50k train / 10k validation split of the MNIST training set.
    mnist_train = MNIST("train", subset=slice(0, 50000),
                        sources=('features', ))
    mnist_valid = MNIST("train", subset=slice(50000, 60000),
                        sources=('features',))
    mnist_test = MNIST("test")
    trainstream = Mapping(
        Flatten(DataStream(
            mnist_train,
            iteration_scheme=SequentialScheme(50000, batch_size))),
        _meanize(n_steps))
    validstream = Mapping(
        Flatten(DataStream(
            mnist_valid,
            iteration_scheme=SequentialScheme(10000, batch_size))),
        _meanize(n_steps))
    teststream = Mapping(
        Flatten(DataStream(
            mnist_test,
            iteration_scheme=SequentialScheme(10000, batch_size))),
        _meanize(n_steps))

    algorithm = GradientDescent(
        cost=cost, params=cg.parameters,
        step_rule=CompositeRule([Adam(), StepClipping(100)]))
    main_loop = MainLoop(
        algorithm,
        trainstream,
        extensions=[Timing(),
                    FinishAfter(after_n_epochs=num_epochs),
                    # DataStreamMonitoring(
                    #     [cost, ],
                    #     teststream,
                    #     prefix="test"),
                    DataStreamMonitoringAndSaving(
                        [cost, ],
                        validstream,
                        [i2h1, h2o1, rec1],
                        'best_' + save_to + '.pkl',
                        cost_name=cost.name,
                        after_epoch=True,
                        prefix='valid'),
                    TrainingDataMonitoring(
                        [cost,
                         aggregation.mean(algorithm.total_gradient_norm)],
                        prefix="train",
                        after_epoch=True),
                    # Plot(
                    #     save_to,
                    #     channels=[
                    #         ['test_final_cost',
                    #          'test_misclassificationrate_apply_error_rate'],
                    #         ['train_total_gradient_norm']]),
                    Printing()])
    main_loop.run()
def main(name, epochs, batch_size, learning_rate, attention, n_iter,
         enc_dim, dec_dim, z_dim, oldmodel):
    """Train a DRAW model on binarized MNIST (or 'sketch') images.

    Parameters
    ----------
    name : str
        Dataset name ('mnist' or 'sketch'); also seeds the experiment
        name used for checkpoints and plots.
    epochs : int
        Number of training epochs.
    batch_size : int
        Mini-batch size.
    learning_rate : float
        Adam learning rate.
    attention : str
        '' for the attention-less reader/writer, or 'readN,writeN'
        giving the read and write attention grid sizes.
    n_iter : int
        Number of DRAW canvas iterations.
    enc_dim, dec_dim, z_dim : int
        Encoder LSTM, decoder LSTM and latent dimensionalities.
    oldmodel : str or None
        Optional pickle of a previous main loop whose parameter values
        initialise this run.
    """
    datasource = name

    if datasource == 'mnist':
        x_dim = 28*28
        img_height, img_width = (28, 28)
    elif datasource == 'sketch':
        x_dim = 56*56
        img_height, img_width = (56, 56)
    else:
        raise Exception('Unknown name %s' % datasource)

    rnninits = {
        # 'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        # 'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    if attention != "":
        # Attention reader/writer: 'attention' encodes "readN,writeN".
        read_N, write_N = attention.split(',')
        read_N = int(read_N)
        write_N = int(write_N)
        # The reader returns the x and x_hat glimpses concatenated.
        read_dim = 2*read_N**2

        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 width=img_width, height=img_height,
                                 N=write_N, **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)
    else:
        read_dim = 2*x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

        attention_tag = "full"

    #----------------------------------------------------------------------

    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation.
            E.g.:
                0.1   -> 11
                0.01  -> 12
                0.001 -> 13
                0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)

    # Experiment name encodes all the hyperparameters.
    name = "DRAW-%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (
        name, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)

    print("\nRunning experiment %s" % name)
    print(" learning rate: %5.5f" % learning_rate)
    print(" attention: %s" % attention)
    print(" n_iterations: %d" % n_iter)
    print(" encoder dimension: %d" % enc_dim)
    print(" z dimension: %d" % z_dim)
    print(" decoder dimension: %d" % dec_dim)
    print()

    #----------------------------------------------------------------------

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    # LSTMs take 4*dim inputs (one slice per gate).
    encoder_mlp = MLP([Identity()], [(read_dim+dec_dim), 4*enc_dim],
                      name="MLP_enc", **inits)
    decoder_mlp = MLP([Identity()], [z_dim, 4*dec_dim],
                      name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(
        n_iter,
        reader=reader,
        encoder_mlp=encoder_mlp,
        encoder_rnn=encoder_rnn,
        sampler=q_sampler,
        decoder_mlp=decoder_mlp,
        decoder_rnn=decoder_rnn,
        writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    # x_recons = 1. + x
    x_recons, kl_terms = draw.reconstruct(x)
    # x_recons, _, _, _, _ = draw.silly(x, n_steps=10, batch_size=100)
    # x_recons = x_recons[-1,:,:]

    # samples = draw.sample(100)
    # x_recons = samples[-1, :, :]
    # x_recons = samples[-1, :, :]

    # Variational free-energy bound: reconstruction NLL + summed KLs.
    nll_term = BinaryCrossEntropy().apply(x, x_recons)
    nll_term.name = "nll_term"

    kld_term = kl_terms.sum(axis=0).mean()
    kld_term.name = "kld_term"

    nll_bound = nll_term + kld_term
    nll_bound.name = "nll_bound"

    # grab the computation graph for the VFE bound on NLL
    cg = ComputationGraph([nll_bound])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    # apply some l2 regularization to the model parameters
    reg_term = 1e-5 * sum([tensor.sum(p**2.0) for p in params])
    reg_term.name = "reg_term"

    # compute the final cost of VFE + regularization
    cost = nll_bound + reg_term
    cost.name = "full_cost"

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(10.),
            Adam(learning_rate),
        ])
        # step_rule=RMSProp(learning_rate),
        # step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )
    # algorithm.add_updates(scan_updates)

    #------------------------------------------------------------------------
    # Setup monitors: one KL channel per canvas iteration.
    monitors = [cost, nll_bound]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t

        # x_recons_t = T.nnet.sigmoid(c[t,:,:])
        # recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        # recons_term_t = recons_term_t.mean()
        # recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]
    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "valid_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        # ["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------

    if datasource == 'mnist':
        # NOTE(review): the validation split is the *test* set here.
        mnist_train = BinarizedMNIST("train", sources=['features'],
                                     flatten=['features'])
        mnist_valid = BinarizedMNIST("test", sources=['features'],
                                     flatten=['features'])
        # mnist_test = BinarizedMNIST("test", sources=['features'], flatten=['features'])
        train_stream = DataStream(
            mnist_train,
            iteration_scheme=SequentialScheme(mnist_train.num_examples,
                                              batch_size))
        valid_stream = DataStream(
            mnist_valid,
            iteration_scheme=SequentialScheme(mnist_valid.num_examples,
                                              batch_size))
        # test_stream = DataStream(mnist_test, iteration_scheme=SequentialScheme(mnist_test.num_examples, batch_size))
    else:
        raise Exception('Unknown name %s' % datasource)

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(
                train_monitors,
                prefix="train",
                after_epoch=True),
            # DataStreamMonitoring(
            #     monitors,
            #     valid_stream,
            #     updates=scan_updates,
            #     prefix="valid"),
            DataStreamMonitoring(
                monitors,
                test_stream,
                # updates=scan_updates,
                prefix="test"),
            MyCheckpoint(image_size=image_size,
                         path=name + ".pkl",
                         before_training=False,
                         after_epoch=True,
                         save_separately=['log', 'model']),
            # Dump(name),
            # Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()])

    if oldmodel is not None:
        print("Initializing parameters with old model %s" % oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_param_values(oldmodel.get_param_values())
        del oldmodel

    main_loop.run()
def main(name, epochs, batch_size, learning_rate, attention, n_iter,
         enc_dim, dec_dim, z_dim):
    """Train a DRAW model on binarized MNIST.

    Parameters
    ----------
    name : str or None
        Experiment name; if None, one is derived from the
        hyperparameters.
    epochs : int
        Number of training epochs.
    batch_size : int
        Mini-batch size.
    learning_rate : float
        Adam learning rate.
    attention : bool
        Whether to use the attention-based reader/writer.
    n_iter : int
        Number of DRAW canvas iterations.
    enc_dim, dec_dim, z_dim : int
        Encoder LSTM, decoder LSTM and latent dimensionalities.
    """
    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation.
            E.g.:
                0.1   -> 11
                0.01  -> 12
                0.001 -> 13
                0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    if name is None:
        tag = "watt" if attention else "woatt"
        lr_str = lr_tag(learning_rate)
        name = "%s-t%d-enc%d-dec%d-z%d-lr%s" % (
            tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)

    print("\nRunning experiment %s" % name)
    print(" learning rate: %5.3f" % learning_rate)
    print(" attention: %s" % attention)
    print(" n_iterations: %d" % n_iter)
    print(" encoder dimension: %d" % enc_dim)
    print(" z dimension: %d" % z_dim)
    print(" decoder dimension: %d" % dec_dim)
    print()

    #------------------------------------------------------------------------

    x_dim = 28*28
    img_height, img_width = (28, 28)

    rnninits = {
        # 'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        # 'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    if attention:
        read_N = 4
        write_N = 7
        # The reader returns the x and x_hat glimpses concatenated.
        read_dim = 2*read_N**2

        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
        # BUG FIX: the writer was previously built with `N=read_N`,
        # leaving `write_N` unused -- the writer grid is meant to use
        # write_N (cf. the sibling DRAW script in this file).
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 width=img_width, height=img_height,
                                 N=write_N, **inits)
    else:
        read_dim = 2*x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    # LSTMs take 4*dim inputs (one slice per gate).
    encoder_mlp = MLP([Tanh()], [(read_dim+dec_dim), 4*enc_dim],
                      name="MLP_enc", **inits)
    decoder_mlp = MLP([Tanh()], [z_dim, 4*dec_dim],
                      name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(
        n_iter,
        reader=reader,
        encoder_mlp=encoder_mlp,
        encoder_rnn=encoder_rnn,
        sampler=q_sampler,
        decoder_mlp=decoder_mlp,
        decoder_rnn=decoder_rnn,
        writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    # x_recons = 1. + x
    x_recons, kl_terms = draw.reconstruct(x)
    # x_recons, _, _, _, _ = draw.silly(x, n_steps=10, batch_size=100)
    # x_recons = x_recons[-1,:,:]

    # samples = draw.sample(100)
    # x_recons = samples[-1, :, :]
    # x_recons = samples[-1, :, :]

    # Variational bound: reconstruction cross-entropy + summed KL terms.
    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(3.),
            Adam(learning_rate),
        ])
        # step_rule=RMSProp(learning_rate),
        # step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )
    # algorithm.add_updates(scan_updates)

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    """
    for t in range(n_iter):
        kl_term_t = kl_terms[t,:].mean()
        kl_term_t.name = "kl_term_%d" % t

        x_recons_t = T.nnet.sigmoid(c[t,:,:])
        recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        recons_term_t = recons_term_t.mean()
        recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t, recons_term_t]
    """
    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]
    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        ["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------

    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=ForceFloatX(DataStream(
            mnist_train,
            iteration_scheme=SequentialScheme(
                mnist_train.num_examples, batch_size))),
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            DataStreamMonitoring(
                monitors,
                ForceFloatX(DataStream(
                    mnist_test,
                    iteration_scheme=SequentialScheme(
                        mnist_test.num_examples, batch_size))),
                ## updates=scan_updates,
                prefix="test"),
            TrainingDataMonitoring(
                train_monitors,
                prefix="train",
                after_every_epoch=True),
            SerializeMainLoop(name + ".pkl"),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()])
    main_loop.run()
def main(save_to, num_epochs, regularization=0.0003, subset=None,
         num_batches=None, histogram=None, resume=False):
    """Train LeNet-5 on MNIST with AdaDelta and L2 regularization.

    Parameters
    ----------
    save_to : str
        Checkpoint path.
    num_epochs : int
        Maximum number of training epochs.
    regularization : float
        Coefficient of the L2 weight penalty.
    subset : int or None
        If given, train on a `subset`-sized slice centred in the
        training set instead of all examples.
    num_batches : int or None
        Optional hard limit on the number of training batches.
    histogram : str or None
        If set, per-component attributions are recorded during training
        and saved to this filename afterwards.
    resume : bool
        If True, reload the main-loop state from `save_to`.
    """
    batch_size = 500
    output_size = 10
    # NOTE: removed an unused local (`layers = convnet.layers`).
    convnet = create_lenet_5()

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cost = (CategoricalCrossEntropy().apply(y.flatten(), probs)
            .copy(name='cost'))
    components = (ComponentwiseCrossEntropy().apply(y.flatten(), probs)
                  .copy(name='components'))
    error_rate = (MisclassificationRate().apply(y.flatten(), probs)
                  .copy(name='error_rate'))
    confusion = (ConfusionMatrix().apply(y.flatten(), probs)
                 .copy(name='confusion'))
    # Confusion counts are summed (not averaged) across monitored batches.
    confusion.tag.aggregation_scheme = Sum(confusion)

    cg = ComputationGraph([cost, error_rate, components])

    # Apply regularization to the cost
    weights = VariableFilter(roles=[WEIGHT])(cg.variables)
    l2_norm = sum([(W ** 2).sum() for W in weights])
    l2_norm.name = 'l2_norm'
    cost = cost + regularization * l2_norm
    cost.name = 'cost_with_regularization'

    if subset:
        # Take a `subset`-sized slice centred on the middle of the set.
        start = 30000 - subset // 2
        mnist_train = MNIST(("train",), subset=slice(start, start + subset))
    else:
        mnist_train = MNIST(("train",))
    mnist_train_stream = DataStream.default_stream(
        mnist_train, iteration_scheme=ShuffledScheme(
            mnist_train.num_examples, batch_size))

    mnist_test = MNIST(("test",))
    mnist_test_stream = DataStream.default_stream(
        mnist_test, iteration_scheme=ShuffledScheme(
            mnist_test.num_examples, batch_size))

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=AdaDelta(decay_rate=0.99))

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  DataStreamMonitoring(
                      [cost, error_rate, confusion],
                      mnist_test_stream,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [cost, error_rate, l2_norm,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      after_epoch=True),
                  Checkpoint(save_to),
                  ProgressBar(),
                  Printing()]

    if histogram:
        attribution = AttributionExtension(
            components=components,
            parameters=cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(save_to, True, True))

    model = Model(cost)

    main_loop = MainLoop(
        algorithm,
        mnist_train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
def main(save_to, num_epochs, regularization=0.001, subset=None,
         num_batches=None, batch_size=None, histogram=None, resume=False):
    """Train an all-convolutional network on CIFAR-10 with momentum SGD.

    Parameters
    ----------
    save_to : str
        Checkpoint path; also labels the live plots.
    num_epochs : int
        Maximum number of training epochs.
    regularization : float
        Coefficient of the L2 weight penalty.
    subset : int or None
        Unused here; kept for interface compatibility with sibling scripts.
    num_batches : int or None
        Optional hard limit on the number of training batches.
    batch_size : int or None
        Training mini-batch size.
    histogram : str or None
        If set, per-component attributions are recorded during training
        and saved to this filename afterwards.
    resume : bool
        If True, reload the main-loop state from `save_to`.
    """
    output_size = 10
    convnet = create_all_conv_net()

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    test_cost = (CategoricalCrossEntropy().apply(y.flatten(), probs)
                 .copy(name='cost'))
    test_components = (ComponentwiseCrossEntropy().apply(
        y.flatten(), probs).copy(name='components'))
    test_error_rate = (MisclassificationRate().apply(
        y.flatten(), probs).copy(name='error_rate'))
    test_confusion = (ConfusionMatrix().apply(y.flatten(), probs)
                      .copy(name='confusion'))
    # Confusion counts are summed (not averaged) across monitored batches.
    test_confusion.tag.aggregation_scheme = Sum(test_confusion)

    test_cg = ComputationGraph([test_cost, test_error_rate, test_components])

    # Apply dropout to all layer outputs except final softmax
    dropout_vars = VariableFilter(
        roles=[OUTPUT], bricks=[Convolutional],
        theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables)
    drop_cg = apply_dropout(test_cg, dropout_vars, 0.5)

    # Apply 0.2 dropout to the pre-averaging layer
    # dropout_vars_2 = VariableFilter(
    #     roles=[OUTPUT], bricks=[Convolutional],
    #     theano_name_regex="^conv_8_apply_output$")(drop_cg.variables)
    # train_cg = apply_dropout(drop_cg, dropout_vars_2, 0.2)

    # Apply 0.2 dropout to the input, as in the paper
    # train_cg = apply_dropout(drop_cg, [x], 0.2)
    train_cg = drop_cg
    # train_cg = test_cg

    train_cost, train_error_rate, train_components = train_cg.outputs

    # Apply regularization to the cost
    # (`biases` is only consumed by the commented-out Caffe-style rule.)
    biases = VariableFilter(roles=[BIAS])(train_cg.parameters)
    weights = VariableFilter(roles=[WEIGHT])(train_cg.variables)
    l2_norm = sum([(W**2).sum() for W in weights])
    l2_norm.name = 'l2_norm'
    l2_regularization = regularization * l2_norm
    l2_regularization.name = 'l2_regularization'
    test_cost = test_cost + l2_regularization
    test_cost.name = 'cost_with_regularization'

    # Training version of cost
    train_cost_without_regularization = train_cost
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + regularization * l2_norm
    train_cost.name = 'cost_with_regularization'

    cifar10_train = CIFAR10(("train", ))
    # cifar10_train_stream = RandomPadCropFlip(
    #     NormalizeBatchLevels(DataStream.default_stream(
    #         cifar10_train, iteration_scheme=ShuffledScheme(
    #             cifar10_train.num_examples, batch_size)),
    #         which_sources=('features',)),
    #     (32, 32), pad=5, which_sources=('features',))
    cifar10_train_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_train,
        iteration_scheme=ShuffledScheme(cifar10_train.num_examples,
                                        batch_size)),
        which_sources=('features', ))

    test_batch_size = 1000
    cifar10_test = CIFAR10(("test", ))
    cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_test,
        iteration_scheme=ShuffledScheme(cifar10_test.num_examples,
                                        test_batch_size)),
        which_sources=('features', ))

    momentum = Momentum(0.002, 0.9)

    # Create a step rule that doubles the learning rate of biases, like Caffe.
    # scale_bias = Restrict(Scale(2), biases)
    # step_rule = CompositeRule([scale_bias, momentum])

    # step_rule = CompositeRule([StepClipping(100), momentum])
    step_rule = momentum

    # Train with simple SGD
    algorithm = GradientDescent(cost=train_cost,
                                parameters=train_cg.parameters,
                                step_rule=step_rule)

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches),
        EpochSchedule(momentum.learning_rate, [(1, 0.005), (3, 0.01),
                                               (5, 0.02), (200, 0.002),
                                               (250, 0.0002),
                                               (300, 0.00002)]),
        DataStreamMonitoring([test_cost, test_error_rate, test_confusion],
                             cifar10_test_stream,
                             prefix="test"),
        TrainingDataMonitoring([
            train_cost, train_error_rate,
            train_cost_without_regularization,
            l2_regularization, momentum.learning_rate,
            aggregation.mean(algorithm.total_gradient_norm)
        ],
            prefix="train",
            every_n_batches=10),
        # after_epoch=True),
        Plot('Training performance for ' + save_to,
             channels=[
                 [
                     'train_cost_with_regularization',
                     'train_cost_without_regularization',
                     'train_l2_regularization'
                 ],
                 ['train_error_rate'],
                 ['train_total_gradient_norm'],
             ],
             every_n_batches=10),
        # after_batch=True),
        Plot('Test performance for ' + save_to,
             channels=[[
                 'train_error_rate',
                 'test_error_rate',
             ]],
             after_epoch=True),
        Checkpoint(save_to),
        ProgressBar(),
        Printing()
    ]

    if histogram:
        attribution = AttributionExtension(
            components=train_components,
            # BUG FIX: this previously read `cg.parameters`, but no `cg`
            # exists in this function (only test_cg/drop_cg/train_cg),
            # so the histogram path raised NameError.  Use the training
            # graph's parameters.
            parameters=train_cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(save_to, True, True))

    model = Model(train_cost)

    main_loop = MainLoop(algorithm,
                         cifar10_train_stream,
                         model=model,
                         extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
def main(mode, save_path, num_batches, data_path=None):
    """Train, sample from, or beam-search the character-level word reverser.

    Parameters
    ----------
    mode : str
        One of ``"train"``, ``"sample"`` or ``"beam_search"``.
    save_path : str
        Checkpoint path: written during training, loaded for
        sampling / beam search.
    num_batches : int
        Finish training after this many batches.
    data_path : str, optional
        Path to a text file to train on; if omitted, the
        One Billion Word corpus is used.
    """
    reverser = WordReverser(100, len(char2code), name="reverser")
    if mode == "train":
        # Data processing pipeline: characters -> filtered -> reversed
        # targets -> batched -> padded -> time-major arrays.
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings: Gaussian weights / zero biases by
        # default, Orthogonal for the recurrent transitions.
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph.
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        # Mean negative log-likelihood per sequence (batch axis is axis 1,
        # since streams were transposed to time-major above).
        batch_size = chars.shape[1].copy(name="batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on: log every parameter's shape.
        model = Model(cost)
        parameters = model.get_parameter_dict()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in parameters.items()],
                        width=120))

        # Initialize parameters.
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging: attention energies and
        # transition activations pulled out of the computation graph.
        generator = reverser.generator
        (energies,) = VariableFilter(
            applications=[generator.readout.readout],
            name_regex="output")(cg.variables)
        (activations,) = VariableFilter(
            applications=[generator.transition.apply],
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = chars.shape[0].copy(name="max_length")
        cost_per_character = aggregation.mean(
            batch_cost, batch_size * max_length).copy(
                name="character_log_likelihood")
        min_energy = energies.min().copy(name="min_energy")
        max_energy = energies.max().copy(name="max_energy")
        mean_activation = abs(activations).mean().copy(
            name="mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        # Also monitor the norm of every parameter and of its gradient.
        for name, parameter in parameters.items():
            observables.append(parameter.norm(2).copy(name + "_norm"))
            observables.append(algorithm.gradients[parameter].norm(2).copy(
                name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition(["after_batch"], _is_nan),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_parameter_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    applications=[reverser.generator.generate],
                    name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile beam search functions
                # every time user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
            # Trim each sequence at its first end-of-sequence marker and
            # keep only the cost accumulated up to that point.
            for i in range(len(outputs)):
                outputs[i] = list(outputs[i])
                try:
                    true_length = outputs[i].index(char2code['</S>']) + 1
                except ValueError:
                    true_length = len(outputs[i])
                outputs[i] = outputs[i][:true_length]
                costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        # Interactive loop: read a sentence and a sample count / beam
        # size, then print candidate reversals sorted by cost.
        while True:
            try:
                line = input("Enter a sentence\n")
                message = ("Enter the number of samples\n" if mode == "sample"
                           else "Enter the beam size\n")
                batch_size = int(input(message))
            except EOFError:
                break
            except Exception:
                traceback.print_exc()
                continue

            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)

            # Duplicate the input `batch_size` times along the batch axis
            # so each sample / beam hypothesis gets the same sentence.
            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
def main(mode, save_path, num_batches, data_path=None):
    """Train, sample from, or beam-search the character-level word reverser.

    Parameters
    ----------
    mode : str
        One of ``"train"``, ``"sample"`` or ``"beam_search"``.
    save_path : str
        Checkpoint path: written during training, loaded for
        sampling / beam search.
    num_batches : int
        Finish training after this many batches.
    data_path : str, optional
        Path to a text file to train on; if omitted, the
        One Billion Word corpus is used.
    """
    reverser = WordReverser(100, len(char2code), name="reverser")
    if mode == "train":
        # Data processing pipeline: characters -> filtered -> reversed
        # targets -> batched -> padded -> time-major arrays.
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings.
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        # BUG FIX: was `weghts_init`, which silently set an unused
        # attribute and left the encoder without the Orthogonal init.
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph.
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        # Batch axis is axis 1 (streams were transposed to time-major).
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on: log every parameter's shape.
        model = Model(cost)
        params = model.get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in params.items()],
                        width=120))

        # Initialize parameters.
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, params=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging: attention energies and
        # transition activations pulled out of the computation graph.
        generator = reverser.generator
        (energies,) = VariableFilter(
            application=generator.readout.readout,
            name="output")(cg.variables)
        (activations,) = VariableFilter(
            application=generator.transition.apply,
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        mean_activation = named_copy(abs(activations).mean(),
                                     "mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        # Also monitor the norm of every parameter and of its gradient.
        for name, param in params.items():
            observables.append(named_copy(
                param.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[param].norm(2), name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition("after_batch", _is_nan),
                Plot(os.path.basename(save_path),
                     [[average_monitoring.record_name(cost)],
                      [average_monitoring.record_name(cost_per_character)]],
                     every_n_batches=10),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_param_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    bricks=[reverser.generator], name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile beam search functions
                # every time user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(input_.shape[1], samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
            # Trim each sequence at its first end-of-sequence marker and
            # keep only the cost accumulated up to that point.
            for i in range(len(outputs)):
                outputs[i] = list(outputs[i])
                try:
                    true_length = outputs[i].index(char2code['</S>']) + 1
                except ValueError:
                    true_length = len(outputs[i])
                outputs[i] = outputs[i][:true_length]
                costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        # Interactive loop: read a sentence and a sample count / beam
        # size, then print candidate reversals sorted by cost.
        while True:
            # ROBUSTNESS FIX: the original called input() bare, so
            # Ctrl-D (EOF) crashed with a traceback and a malformed
            # number aborted the whole session; now EOF exits cleanly
            # and other input errors just re-prompt (matches the
            # companion implementation of this script).
            try:
                line = input("Enter a sentence\n")
                message = ("Enter the number of samples\n" if mode == "sample"
                           else "Enter the beam size\n")
                batch_size = int(input(message))
            except EOFError:
                break
            except Exception:
                traceback.print_exc()
                continue

            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)

            # Duplicate the input `batch_size` times along the batch axis
            # so each sample / beam hypothesis gets the same sentence.
            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)