def setup_model(configs):
    tensor5 = theano.tensor.TensorType(config.floatX, (False,) * 5)
    # shape: T x B x C x X x Y
    input_ = tensor5('features')
    # shape: B x Classes
    target = T.lmatrix('targets')
    model = LSTMAttention(
        configs,
        weights_init=Glorot(),
        biases_init=Constant(0))
    model.initialize()
    (h, c, location, scale, patch, down_sampled_input,
        conved_part_1, conved_part_2, pre_lstm) = model.apply(input_)
    classifier = MLP(
        [Rectifier(), Logistic()],
        configs['classifier_dims'],
        weights_init=Glorot(),
        biases_init=Constant(0))
    classifier.initialize()

    probabilities = classifier.apply(h[-1])
    cost = BinaryCrossEntropy().apply(target, probabilities)
    cost.name = 'CE'
    error_rate = MisclassificationRate().apply(target, probabilities)
    error_rate.name = 'ER'
    model.cost = cost

    if configs['load_pretrained']:
        blocks_model = Model(model.cost)
        all_params = blocks_model.parameters
        # npz files must be opened in binary mode; keep the arrays in scope
        # while the file is open because np.load reads them lazily
        with open('VGG_CNN_params.npz', 'rb') as f:
            loaded = np.load(f)
            all_conv_params = loaded.keys()
            for param in all_params:
                if param.name in loaded.keys():
                    assert param.get_value().shape == loaded[param.name].shape
                    param.set_value(loaded[param.name])
                    all_conv_params.pop(all_conv_params.index(param.name))
        print "the following parameters did not match: " + str(all_conv_params)

    if configs['test_model']:
        cg = ComputationGraph(model.cost)
        f = theano.function(cg.inputs, [model.cost],
                            on_unused_input='ignore',
                            allow_input_downcast=True)
        data = np.random.randn(10, 40, 3, 224, 224)
        targs = np.random.randn(40, 101)
        f(data, targs)
        print "Test passed! ;)"

    model.monitorings = [cost, error_rate]

    return model
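# For context, a minimal sketch of how setup_model might be invoked. Only the
# keys read above ('classifier_dims', 'load_pretrained', 'test_model') are
# grounded in the code; the values, and any further keys LSTMAttention
# expects, are assumptions.
configs = {
    'classifier_dims': [512, 256, 101],  # hypothetical MLP dimensions
    'load_pretrained': False,
    'test_model': True,
}
model = setup_model(configs)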
# `x` and `y` are assumed to be defined earlier in the script, e.g. as
# x = tensor.matrix('features') and y = tensor.lmatrix('targets').
mlp = MLP(activations=[Logistic(), Softmax()],
          dims=[117, 55, 2],
          weights_init=IsotropicGaussian(),
          biases_init=Constant(0.01))
mlp.initialize()

y_hat = mlp.apply(x)

cost = BinaryCrossEntropy().apply(y, y_hat)
cg = ComputationGraph(cost)
W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
# L1 regularization on both weight matrices
cost = cost + 0.001 * abs(W1).sum() + 0.001 * abs(W2).sum()
cost.name = 'cost'
error_rate = MisclassificationRate().apply(y.argmax(axis=1), y_hat)
error_rate.name = 'error_rate'

algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            step_rule=Scale(learning_rate=0.1))

train_set = H5PYDataset('mushrooms.hdf5', which_sets=('train',))
train_stream = DataStream.default_stream(
    train_set,
    iteration_scheme=SequentialScheme(
        train_set.num_examples, batch_size=128))

test_set = H5PYDataset('mushrooms.hdf5', which_sets=('test',))
# the original call was truncated here; completed to mirror train_stream
test_stream = DataStream.default_stream(
    test_set,
    iteration_scheme=SequentialScheme(
        test_set.num_examples, batch_size=128))
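# A minimal sketch of wiring the fragment above into a Blocks main loop; the
# extension set and epoch budget are assumptions mirroring the other training
# scripts in this collection.
main_loop = MainLoop(
    algorithm=algorithm,
    data_stream=train_stream,
    extensions=[
        DataStreamMonitoring([cost, error_rate], test_stream, prefix="test"),
        TrainingDataMonitoring([cost], prefix="train", after_epoch=True),
        FinishAfter(after_n_epochs=10),  # hypothetical number of epochs
        Printing()])
main_loop.run()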
def main(name, dataset, epochs, batch_size, learning_rate, attention,
         n_iter, enc_dim, dec_dim, z_dim, oldmodel, max_total_duration,
         results_root_dir, nosync, adam_args_json, rmsprop_args_json):

    image_size, channels, data_train, data_valid, data_test = datasets.get_data(dataset)

    # Sequential scheme as originally implemented.
    #train_stream = Flatten(DataStream.default_stream(data_train, iteration_scheme=SequentialScheme(data_train.num_examples, batch_size)))
    #valid_stream = Flatten(DataStream.default_stream(data_valid, iteration_scheme=SequentialScheme(data_valid.num_examples, batch_size)))
    #test_stream = Flatten(DataStream.default_stream(data_test, iteration_scheme=SequentialScheme(data_test.num_examples, batch_size)))

    # Shuffled version makes more sense for distributed training.
    train_stream = Flatten(DataStream.default_stream(
        data_train, iteration_scheme=ShuffledScheme(data_train.num_examples, batch_size)))
    valid_stream = Flatten(DataStream.default_stream(
        data_valid, iteration_scheme=ShuffledScheme(data_valid.num_examples, batch_size)))
    test_stream = Flatten(DataStream.default_stream(
        data_test, iteration_scheme=ShuffledScheme(data_test.num_examples, batch_size)))

    if name is None:
        name = dataset

    img_height, img_width = image_size
    x_dim = channels * img_height * img_width

    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # Configure attention mechanism
    if attention != "":
        read_N, write_N = attention.split(',')
        read_N = int(read_N)
        write_N = int(write_N)
        read_dim = 2 * channels * read_N ** 2

        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 channels=channels,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 channels=channels,
                                 width=img_width, height=img_height,
                                 N=write_N, **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)
    else:
        read_dim = 2 * x_dim
        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)
        attention_tag = "full"

    #----------------------------------------------------------------------

    if os.environ.get('MOAB_JOBARRAYINDEX') is not None:  # has_key() is Python 2 only
        name = name + ("-%0.3d" % int(os.environ['MOAB_JOBARRAYINDEX']))

    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation.

        E.g.:
            0.1   -> 11
            0.01  -> 12
            0.001 -> 13
            0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)

    subdir = name + "-" + time.strftime("%Y%m%d-%H%M%S")
    if results_root_dir is not None:
        # nest the run directory under the requested results root
        # (the original reversed the arguments, placing the root inside subdir)
        subdir = os.path.join(results_root_dir, subdir)
    longname = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (
        dataset, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)
    pickle_file = subdir + "/" + longname + ".pkl"

    print("\nRunning experiment %s" % longname)
    print("               dataset: %s" % dataset)
    print("          subdirectory: %s" % subdir)
    print("         learning rate: %g" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print("            batch size: %d" % batch_size)
    print("                epochs: %d" % epochs)
    print("    max_total_duration: %d" % max_total_duration)
    print("                nosync: %s" % str(nosync))
    print("        adam_args_json: %s" % str(adam_args_json))
    print("     rmsprop_args_json: %s" % str(rmsprop_args_json))
    print()

    #----------------------------------------------------------------------

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc", **inits)
    decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim],
                      name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(
        n_iter,
        reader=reader,
        encoder_mlp=encoder_mlp,
        encoder_rnn=encoder_rnn,
        sampler=q_sampler,
        decoder_mlp=decoder_mlp,
        decoder_rnn=decoder_rnn,
        writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------

    x = tensor.matrix('features')

    x_recons, kl_terms = draw.reconstruct(x)

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    if adam_args_json is not None:
        print("Setup for Adam with arguments passed by command-line.")
        import json
        adam_args = json.loads(adam_args_json)
        if learning_rate is not None:
            adam_args['learning_rate'] = learning_rate
        algorithm = GradientDescent(
            cost=cost,
            parameters=params,
            step_rule=CompositeRule([
                StepClipping(10.),
                Adam(**adam_args),
            ])
        )
    elif rmsprop_args_json is not None:
        print("Setup for RMSProp with arguments passed by command-line.")
        import json
        rmsprop_args = json.loads(rmsprop_args_json)
        if learning_rate is not None:
            rmsprop_args['learning_rate'] = learning_rate
        algorithm = GradientDescent(
            cost=cost,
            parameters=params,
            step_rule=CompositeRule([
                StepClipping(10.),
                RMSProp(**rmsprop_args),
            ])
        )
    else:
        print("Setup for Adam by default.")
        # default original behavior
        algorithm = GradientDescent(
            cost=cost,
            parameters=params,
            step_rule=CompositeRule([
                StepClipping(10.),
                Adam(learning_rate),
            ])
            #step_rule=RMSProp(learning_rate),
            #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
        )

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t
        #x_recons_t = T.nnet.sigmoid(c[t,:,:])
        #recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        #recons_term_t = recons_term_t.mean()
        #recons_term_t.name = "recons_term_%d" % t
        monitors += [kl_term_t]
    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        #["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------------------
    # Setup for legion

    cg = ComputationGraph(cost)
    params_to_sync = {}  #cg.variables
    counter = 0
    print("---- cg.parameters ----")
    for p in cg.parameters:
        # `p` is of type theano.sandbox.cuda.var.CudaNdarraySharedVariable
        # Warning. This is not as deterministic as we would want.
        # For now, however, we don't have much of a choice.
        new_name = p.name
        while new_name in params_to_sync:  # has_key() is Python 2 only
            counter += 1
            new_name = p.name + ("_%d" % counter)
        params_to_sync[new_name] = p
        print("Parameter %s now referred to as %s." % (p.name, new_name))
    print("---- --.---------- ----")

    #------------------------------------------------------------

    if not os.path.exists(subdir):
        os.makedirs(subdir)

    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=epochs),
        TrainingDataMonitoring(
            train_monitors,
            prefix="train",
            after_epoch=True),
        # DataStreamMonitoring(
        #     monitors,
        #     valid_stream,
        ##    updates=scan_updates,
        #     prefix="valid"),
        DataStreamMonitoring(
            monitors,
            test_stream,
        #   updates=scan_updates,
            prefix="test"),
        PartsOnlyCheckpoint("{}/{}".format(subdir, name),
                            before_training=True,
                            after_epoch=True,
                            save_separately=['log', 'model'])
        #SampleCheckpoint(image_size=image_size[0], channels=channels, save_subdir=subdir, before_training=True, after_epoch=True),
    ]

    if not nosync:
        # With parameter sync on legion.
        #extensions.append(SharedParamsRateLimited(params=params_to_sync, before_training=True, alpha=1.0, beta=0.0, maximum_rate=1.0))
        #extensions.append(SharedParamsRateLimited(params=params_to_sync, before_training=True, every_n_batches=4, alpha=0.5, beta=0.5, maximum_rate=0.2, want_sync_timing_log=True))
        extensions.append(SharedParamsRateLimited(params=params_to_sync,
                                                  before_training=True,
                                                  every_n_batches=1,
                                                  alpha=0.99,
                                                  beta=0.01,
                                                  maximum_rate=0.2,
                                                  want_sync_timing_log=True))

    extensions = extensions + [
        StopAfterTimeElapsed(every_n_batches=4,
                             total_duration=max_total_duration),
        # Timing information to facilitate plotting.
        Timing(every_n_epochs=1),
        Timestamp(every_n_batches=4),
        # Plot(name, channels=plot_channels),
        ProgressBar(),
        Printing()]

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=extensions)

    if oldmodel is not None:
        print("Initializing parameters with old model %s" % oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_parameter_values(oldmodel.get_parameter_values())
        del oldmodel

    main_loop.run()
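# The lr_tag scheme used above can be sanity-checked in isolation; this is a
# standalone copy of the helper (assuming only numpy as np) together with the
# mapping documented in its docstring.
import numpy as np

def lr_tag(value):
    exp = np.floor(np.log10(value))
    leading = ("%e" % value)[0]
    return "%s%d" % (leading, -exp)

assert lr_tag(0.1) == "11"
assert lr_tag(0.01) == "12"
assert lr_tag(0.001) == "13"
assert lr_tag(0.005) == "53"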
def main(max_seq_length, lstm_dim, batch_size, num_batches, num_epochs):
    dataset_train = IterableDataset(
        generate_data(max_seq_length, batch_size, num_batches))
    dataset_test = IterableDataset(
        generate_data(max_seq_length, batch_size, 100))

    stream_train = DataStream(dataset=dataset_train)
    stream_test = DataStream(dataset=dataset_test)

    x = T.tensor3('x')
    y = T.matrix('y')

    # we need to provide data for the LSTM layer of size 4 * lstm_dim, see
    # LSTM layer documentation for the explanation
    x_to_h = Linear(1, lstm_dim * 4, name='x_to_h',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
    lstm = LSTM(lstm_dim, name='lstm',
                weights_init=IsotropicGaussian(),
                biases_init=Constant(0.0))
    h_to_o = Linear(lstm_dim, 1, name='h_to_o',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))

    x_transform = x_to_h.apply(x)
    h, c = lstm.apply(x_transform)

    # only values of hidden units of the last timeframe are used for
    # the classification
    y_hat = h_to_o.apply(h[-1])
    y_hat = Logistic().apply(y_hat)

    cost = BinaryCrossEntropy().apply(y, y_hat)
    cost.name = 'cost'

    lstm.initialize()
    x_to_h.initialize()
    h_to_o.initialize()

    cg = ComputationGraph(cost)

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Adam())
    test_monitor = DataStreamMonitoring(variables=[cost],
                                        data_stream=stream_test,
                                        prefix="test")
    train_monitor = TrainingDataMonitoring(variables=[cost],
                                           prefix="train",
                                           after_epoch=True)

    main_loop = MainLoop(algorithm, stream_train,
                         extensions=[
                             test_monitor, train_monitor,
                             FinishAfter(after_n_epochs=num_epochs),
                             Printing(), ProgressBar()
                         ])
    main_loop.run()

    print('Learned weights:')
    for layer in (x_to_h, lstm, h_to_o):
        print("Layer '%s':" % layer.name)
        for param in layer.parameters:
            print(param.name, ': ', param.get_value())
        print()

    return main_loop
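# A minimal sketch of an entry point for the function above; the
# hyperparameter values are hypothetical, not taken from the original script.
if __name__ == '__main__':
    main(max_seq_length=100, lstm_dim=10, batch_size=20,
         num_batches=100, num_epochs=5)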
y = tensor.lmatrix(u'targets')
#y = theano.tensor.extra_ops.to_one_hot(tensor.lmatrix(u'targets'), 2)

probs, h_enc, c_enc, h_dec, c_dec, center_y, center_x, delta = draw.reconstruct(x)
#probs, h_enc, c_enc, center_y, center_x, delta = draw.reconstruct(x)

#trim_probs = probs[-1, :]  # Only take information from the last iteration
trim_probs = probs
labels = y

cost = BinaryCrossEntropy().apply(labels, trim_probs)
#cost = SquaredError().apply(labels, trim_probs)
#cost = AbsoluteError().apply(tensor.concatenate([center_y, center_x, deltaY, deltaX]), tensor.concatenate([orig_y, orig_x, orig_dy, orig_dx]))
#cost = (CategoricalCrossEntropy().apply(labels, trim_probs).copy(name='cost'))
#cost = tensor.nnet.categorical_crossentropy(trim_probs, labels)

#error_rate = tensor.neq(labels, trim_probs).mean(dtype=theano.config.floatX)
error_rate = tensor.neq(labels, trim_probs.argmax(axis=0)).mean(dtype=theano.config.floatX)
#error_rate = (MisclassificationRate().apply(labels, trim_probs).copy(name='error_rate'))
cost.name = "BCE"
error_rate.name = "error_rate"

#------------------------------------------------------------
cg = ComputationGraph([cost])
params = VariableFilter(roles=[PARAMETER])(cg.variables)

algorithm = GradientDescent(
    cost=cost,
    parameters=params,
    step_rule=CompositeRule([
        StepClipping(10.),
        Adam(learning_rate),
    ])
    #step_rule=RMSProp(learning_rate),
)
def train(self):
    #print(self.sharedBatch.keys())
    x = self.sharedBatch['x']
    x.name = 'x_myinput'
    xmask = self.sharedBatch['xmask']
    xmask.name = 'xmask_myinput'
    xmini = self.sharedBatch['xmini']
    xmini.name = 'xmini_myinput'
    xmini_mask = self.sharedBatch['xmini_mask']
    xmini_mask.name = 'xmini_mask_myinput'
    y = self.sharedBatch['y']
    y.name = 'y_myinput'
    xattr = self.sharedBatch['xattr']
    xattr.name = 'xattr_myinput'

    # we need to provide data for the LSTM layer of size 4 * lstm_dim, see
    # LSTM layer documentation for the explanation
    x_to_h = Linear(self.input_dimx, self.dim, name='x_to_h',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
    xmini_to_h = Linear(self.input_dimxmini, self.mini_dim * 4,
                        name='xmini_to_h',
                        weights_init=IsotropicGaussian(),
                        biases_init=Constant(0.0))
    comb_to_h = Linear(self.input_dimxattr + self.summary_dim,
                       self.input_dimxattr + self.summary_dim,
                       name='comb_to_h',
                       weights_init=IsotropicGaussian(),
                       biases_init=Constant(0.0))

    lstmwmini = LSTMwMini(dim=self.dim, mini_dim=self.mini_dim,
                          summary_dim=self.summary_dim)

    mlp = MLP(activations=[Rectifier()],
              dims=[self.summary_dim + self.input_dimxattr, self.summary_dim],
              weights_init=IsotropicGaussian(),
              biases_init=Constant(0.0))

    h_to_o = Linear(self.summary_dim, 1, name='h_to_o',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))

    x_transform = x_to_h.apply(x)
    xmini_transform = xmini_to_h.apply(xmini)

    h, c = lstmwmini.apply(x=x_transform, xmini=xmini_transform,
                           xmask=xmask, xmini_mask=xmini_mask)

    attr_and_rnn = T.concatenate([xattr, h[-1]], axis=1)
    #self.f = theano.function(inputs=[], outputs=attr_and_rnn)

    comb_transform = comb_to_h.apply(attr_and_rnn)
    mlp_transform = mlp.apply(comb_transform)

    # only values of hidden units of the last timeframe are used for
    # the classification
    #y_hat = h_to_o.apply(h[-1])
    y_hat = h_to_o.apply(mlp_transform)
    y_hat = Logistic().apply(y_hat)

    cost = BinaryCrossEntropy().apply(y, y_hat)
    cost.name = 'cost'

    lstmwmini.initialize()
    x_to_h.initialize()
    xmini_to_h.initialize()
    comb_to_h.initialize()
    mlp.initialize()
    h_to_o.initialize()

    self.f = theano.function(inputs=[], outputs=y_hat)

    self.cg = ComputationGraph(cost)
    m = Model(cost)

    algorithm = GradientDescent(cost=cost,
                                parameters=self.cg.parameters,
                                step_rule=RMSProp(learning_rate=0.01),
                                on_unused_sources='ignore')

    valid_monitor = DataStreamMonitoringShared(
        variables=[cost],
        data_stream=self.stream_valid_int,
        prefix="valid",
        sharedBatch=self.sharedBatch,
        sharedData=self.sharedData)
    train_monitor = TrainingDataMonitoring(variables=[cost],
                                           prefix="train",
                                           after_epoch=True)

    sharedVarMonitor = SwitchSharedReferences(self.sharedBatch,
                                              self.sharedData)
    tBest = self.track_best('valid_cost', self.cg)
    self.tracker = tBest[0]
    extensions = [sharedVarMonitor, valid_monitor] + tBest
    if self.debug:
        extensions.append(Printing())

    self.algorithm = algorithm
    self.extensions = extensions
    self.model = m
    self.mainloop = MainLoop(self.algorithm, self.stream_train_int,
                             extensions=self.extensions,
                             model=self.model)
    self.main_loop(True)
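# `track_best` is not shown in this excerpt. A plausible implementation from
# standard Blocks extensions (an assumption, not the original helper) pairs
# TrackTheBest with a Checkpoint that fires when the tracked channel improves:
from blocks.extensions.training import TrackTheBest
from blocks.extensions.saveload import Checkpoint
from blocks.extensions.predicates import OnLogRecord

def track_best(channel, save_path='best_model.pkl'):
    # log the best value of `channel` seen so far under
    # '<channel>_best_so_far', and checkpoint whenever it improves
    tracker = TrackTheBest(channel)
    checkpoint = Checkpoint(save_path, after_training=False)
    checkpoint.add_condition(['after_epoch'],
                             predicate=OnLogRecord(channel + '_best_so_far'))
    return [tracker, checkpoint]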
def main(name, epochs, batch_size, learning_rate, attention, n_iter,
         mix_dim, write_dim, enc_dim, dec_dim, z_dim, oldmodel):
    datasource = name
    if datasource == 'mnist':
        x_dim = 28 * 28
        img_height, img_width = (28, 28)
    elif datasource == 'sketch':
        x_dim = 56 * 56
        img_height, img_width = (56, 56)
    else:
        raise Exception('Unknown name %s' % datasource)

    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # if attention != "":
    #     read_N, write_N = attention.split(',')
    #     read_N = int(read_N)
    #     write_N = int(write_N)
    #     read_dim = 2*read_N**2
    #     reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
    #                              width=img_width, height=img_height,
    #                              N=read_N, **inits)
    #     writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
    #                              width=img_width, height=img_height,
    #                              N=write_N, **inits)
    #     attention_tag = "r%d-w%d" % (read_N, write_N)
    # else:
    #     read_dim = 2*x_dim
    #     reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
    #     writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)
    #     attention_tag = "full"
    attention_tag = "full"

    #----------------------------------------------------------------------

    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation.

        E.g.:
            0.1   -> 11
            0.01  -> 12
            0.001 -> 13
            0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)

    name = "BIKLD-%s-%s-t%d-enc%d-dec%d-z%d-md%d-wd%d-lr%s" % \
        (name, attention_tag, n_iter, enc_dim, dec_dim, z_dim,
         mix_dim, write_dim, lr_str)

    print("\nRunning experiment %s" % name)
    print("         learning rate: %5.5f" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print()

    #----------------------------------------------------------------------

    # setup the reader and writer
    read_dim = 2 * x_dim
    reader_mlp = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
    writer_mlp = MLP([None, None], [dec_dim, write_dim, x_dim],
                     name="writer_mlp", **inits)
    attention_tag = "full"

    # setup the mixture weight sampler
    mix_enc_mlp = CondNet([Tanh()], [x_dim, 250, mix_dim],
                          name="mix_enc_mlp", **inits)
    mix_dec_mlp = MLP([Tanh(), Tanh()],
                      [mix_dim, 250, (2 * enc_dim + 2 * dec_dim + mix_dim)],
                      name="mix_dec_mlp", **inits)

    # setup the components of the generative DRAW model
    enc_mlp_in = MLP([Identity()], [(read_dim + dec_dim + mix_dim), 4 * enc_dim],
                     name="enc_mlp_in", **inits)
    dec_mlp_in = MLP([Identity()], [(z_dim + mix_dim), 4 * dec_dim],
                     name="dec_mlp_in", **inits)
    enc_mlp_out = CondNet([], [enc_dim, z_dim], name="enc_mlp_out", **inits)
    dec_mlp_out = CondNet([], [dec_dim, z_dim], name="dec_mlp_out", **inits)
    enc_rnn = LSTM(dim=enc_dim, name="enc_rnn", **rnninits)
    dec_rnn = LSTM(dim=dec_dim, name="dec_rnn", **rnninits)

    draw = IMoDrawModels(
        n_iter,
        mix_enc_mlp=mix_enc_mlp,
        mix_dec_mlp=mix_dec_mlp,
        reader_mlp=reader_mlp,
        enc_mlp_in=enc_mlp_in,
        enc_mlp_out=enc_mlp_out,
        enc_rnn=enc_rnn,
        dec_mlp_in=dec_mlp_in,
        dec_mlp_out=dec_mlp_out,
        dec_rnn=dec_rnn,
        writer_mlp=writer_mlp)
    draw.initialize()

    #------------------------------------------------------------------------

    x = tensor.matrix('features')

    # collect reconstructions of x produced by the IMoDRAW model
    x_recons, kl_q2p, kl_p2q = draw.reconstruct(x, x)

    # get the expected NLL part of the VFE bound
    nll_term = BinaryCrossEntropy().apply(x, x_recons)
    nll_term.name = "nll_term"

    # get KL(q || p) and KL(p || q)
    kld_q2p_term = kl_q2p.sum(axis=0).mean()
    kld_q2p_term.name = "kld_q2p_term"
    kld_p2q_term = kl_p2q.sum(axis=0).mean()
    kld_p2q_term.name = "kld_p2q_term"

    # get the proper VFE bound on NLL
    nll_bound = nll_term + kld_q2p_term
    nll_bound.name = "nll_bound"

    # grab handles for all the optimizable parameters in our cost
    cg = ComputationGraph([nll_bound])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    # apply some l2 regularization to the model parameters
    reg_term = (1e-5 * sum([tensor.sum(p ** 2.0) for p in params]))
    reg_term.name = "reg_term"

    # compute the full cost w.r.t. which we will optimize
    total_cost = nll_term + (0.9 * kld_q2p_term) + \
                 (0.1 * kld_p2q_term) + reg_term
    total_cost.name = "total_cost"

    algorithm = GradientDescent(
        cost=total_cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(10.),
            Adam(learning_rate),
        ])
    )

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [total_cost, nll_bound, nll_term,
                kld_q2p_term, kld_p2q_term, reg_term]
    for t in range(n_iter + 1):
        kl_q2p_t = kl_q2p[t, :].mean()
        kl_q2p_t.name = "kl_q2p_%d" % t
        kl_p2q_t = kl_p2q[t, :].mean()
        kl_p2q_t.name = "kl_p2q_%d" % t
        monitors += [kl_q2p_t, kl_p2q_t]
    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "valid_nll_bound"],
        ["train_kl_q2p_%d" % t for t in range(n_iter + 1)],
        ["train_kl_p2q_%d" % t for t in range(n_iter + 1)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------

    if datasource == 'mnist':
        mnist_train = BinarizedMNIST("train", sources=['features'], flatten=['features'])
        mnist_valid = BinarizedMNIST("test", sources=['features'], flatten=['features'])
        #mnist_test = BinarizedMNIST("test", sources=['features'], flatten=['features'])
        train_stream = DataStream(mnist_train, iteration_scheme=SequentialScheme(mnist_train.num_examples, batch_size))
        valid_stream = DataStream(mnist_valid, iteration_scheme=SequentialScheme(mnist_valid.num_examples, batch_size))
        #test_stream = DataStream(mnist_test, iteration_scheme=SequentialScheme(mnist_test.num_examples, batch_size))
    else:
        raise Exception('Unknown name %s' % datasource)

    main_loop = MainLoop(
        model=Model(total_cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(
                train_monitors,
                prefix="train",
                after_epoch=True),
            DataStreamMonitoring(
                monitors,
                valid_stream,
                prefix="valid"),
            # DataStreamMonitoring(
            #     monitors,
            #     test_stream,
            #     prefix="test"),
            Checkpoint(name + ".pkl", after_epoch=True,
                       save_separately=['log', 'model']),
            # Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()])

    if oldmodel is not None:
        print("Initializing parameters with old model %s" % oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_param_values(oldmodel.get_param_values())
        del oldmodel

    main_loop.run()
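# Note on the objective above: total_cost is not the plain variational bound.
# It splits the KL weight between the two directions,
#
#   total_cost = nll_term + 0.9 * KL(q||p) + 0.1 * KL(p||q) + reg_term,
#
# which additionally pushes the prior toward the posterior, while the proper
# bound nll_bound = nll_term + KL(q||p) is still monitored for comparability.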
def main(name, epochs, batch_size, learning_rate, attention,
         n_iter, enc_dim, dec_dim, z_dim):
    if name is None:
        tag = "watt" if attention else "woatt"
        name = "%s-t%d-enc%d-dec%d-z%d" % (tag, n_iter, enc_dim, dec_dim, z_dim)

    print("\nRunning experiment %s" % name)
    print("         learning rate: %5.3f" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print()

    #------------------------------------------------------------------------

    x_dim = 28 * 28
    img_height, img_width = (28, 28)

    rnninits = {
        'weights_init': Orthogonal(),
        #'weights_init': IsotropicGaussian(0.001),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': Orthogonal(),
        #'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    prior_mu = T.zeros([z_dim])
    prior_log_sigma = T.zeros([z_dim])

    if attention:
        read_N = 4
        write_N = 6
        read_dim = 2 * read_N**2

        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 width=img_width, height=img_height,
                                 N=write_N, **inits)  # was N=read_N, which left write_N unused
    else:
        read_dim = 2 * x_dim
        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

    encoder = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Tanh()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc", **inits)
    decoder_mlp = MLP([Tanh()], [z_dim, 4 * dec_dim],
                      name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    for brick in [reader, writer, encoder, decoder,
                  encoder_mlp, decoder_mlp, q_sampler]:
        brick.allocate()
        brick.initialize()

    #------------------------------------------------------------------------

    x = tensor.matrix('features')

    # This is one iteration
    def one_iteration(c, h_enc, c_enc, z_mean, z_log_sigma, z,
                      h_dec, c_dec, x):
        x_hat = x - T.nnet.sigmoid(c)
        r = reader.apply(x, x_hat, h_dec)
        i_enc = encoder_mlp.apply(T.concatenate([r, h_dec], axis=1))
        h_enc, c_enc = encoder.apply(states=h_enc, cells=c_enc,
                                     inputs=i_enc, iterate=False)
        z_mean, z_log_sigma, z = q_sampler.apply(h_enc)
        i_dec = decoder_mlp.apply(z)
        h_dec, c_dec = decoder.apply(states=h_dec, cells=c_dec,
                                     inputs=i_dec, iterate=False)
        c = c + writer.apply(h_dec)
        return c, h_enc, c_enc, z_mean, z_log_sigma, z, h_dec, c_dec

    outputs_info = [
        T.zeros([batch_size, x_dim]),    # c
        T.zeros([batch_size, enc_dim]),  # h_enc
        T.zeros([batch_size, enc_dim]),  # c_enc
        T.zeros([batch_size, z_dim]),    # z_mean
        T.zeros([batch_size, z_dim]),    # z_log_sigma
        T.zeros([batch_size, z_dim]),    # z
        T.zeros([batch_size, dec_dim]),  # h_dec
        T.zeros([batch_size, dec_dim]),  # c_dec
    ]

    outputs, scan_updates = theano.scan(fn=one_iteration,
                                        sequences=[],
                                        outputs_info=outputs_info,
                                        non_sequences=[x],
                                        n_steps=n_iter)
    c, h_enc, c_enc, z_mean, z_log_sigma, z, h_dec, c_dec = outputs

    kl_terms = (
        prior_log_sigma - z_log_sigma
        + 0.5 * (tensor.exp(2 * z_log_sigma) + (z_mean - prior_mu) ** 2)
        / tensor.exp(2 * prior_log_sigma)
        - 0.5
    ).sum(axis=-1)

    x_recons = T.nnet.sigmoid(c[-1, :, :])
    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            #StepClipping(3.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )
    algorithm.add_updates(scan_updates)

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t

        x_recons_t = T.nnet.sigmoid(c[t, :, :])
        recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        recons_term_t = recons_term_t.mean()
        recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t, recons_term_t]
    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        ["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------

    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])
    #mnist_train = MNIST("train", binary=True, sources=['features'])
    #mnist_test = MNIST("test", binary=True, sources=['features'])

    main_loop = MainLoop(
        model=None,
        data_stream=ForceFloatX(
            DataStream(mnist_train,
                       iteration_scheme=SequentialScheme(
                           mnist_train.num_examples, batch_size))),
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            DataStreamMonitoring(
                monitors,
                ForceFloatX(
                    DataStream(mnist_test,
                               iteration_scheme=SequentialScheme(
                                   mnist_test.num_examples, batch_size))),
                updates=scan_updates,
                prefix="test"),
            TrainingDataMonitoring(train_monitors,
                                   prefix="train",
                                   after_every_epoch=True),
            SerializeMainLoop(name + ".pkl"),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()
        ])
    main_loop.run()
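# The kl_terms expression above is the closed-form KL divergence between the
# diagonal-Gaussian posterior q = N(z_mean, exp(z_log_sigma)^2) and the prior
# p = N(prior_mu, exp(prior_log_sigma)^2), summed over latent dimensions:
#
#   KL(q || p) = sum_i [ log(sigma_p / sigma_q)
#                        + (sigma_q^2 + (mu_q - mu_p)^2) / (2 sigma_p^2)
#                        - 1/2 ]
#
# With the zero-mean, unit-variance prior used here (prior_mu = 0,
# prior_log_sigma = 0) it reduces to the familiar
#
#   KL = -1/2 * sum_i (1 + 2 log sigma_i - sigma_i^2 - mu_i^2).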
def main(name, epochs, batch_size, learning_rate, attention,
         n_iter, enc_dim, dec_dim, z_dim, oldmodel):
    datasource = name
    if datasource == 'mnist':
        x_dim = 28 * 28
        img_height, img_width = (28, 28)
    elif datasource == 'sketch':
        x_dim = 56 * 56
        img_height, img_width = (56, 56)
    else:
        raise Exception('Unknown name %s' % datasource)

    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    if attention != "":
        read_N, write_N = attention.split(',')
        read_N = int(read_N)
        write_N = int(write_N)
        read_dim = 2 * read_N**2

        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 width=img_width, height=img_height,
                                 N=write_N, **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)
    else:
        read_dim = 2 * x_dim
        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)
        attention_tag = "full"

    #----------------------------------------------------------------------

    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation.

        E.g.:
            0.1   -> 11
            0.01  -> 12
            0.001 -> 13
            0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)

    name = "DRAW-%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (
        name, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)

    print("\nRunning experiment %s" % name)
    print("         learning rate: %5.5f" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print()

    #----------------------------------------------------------------------

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc", **inits)
    decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim],
                      name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(
        n_iter,
        reader=reader,
        encoder_mlp=encoder_mlp,
        encoder_rnn=encoder_rnn,
        sampler=q_sampler,
        decoder_mlp=decoder_mlp,
        decoder_rnn=decoder_rnn,
        writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------

    x = tensor.matrix('features')

    #x_recons = 1. + x
    x_recons, kl_terms = draw.reconstruct(x)
    #x_recons, _, _, _, _ = draw.silly(x, n_steps=10, batch_size=100)
    #x_recons = x_recons[-1,:,:]

    #samples = draw.sample(100)
    #x_recons = samples[-1, :, :]
    #x_recons = samples[-1, :, :]

    nll_term = BinaryCrossEntropy().apply(x, x_recons)
    nll_term.name = "nll_term"

    kld_term = kl_terms.sum(axis=0).mean()
    kld_term.name = "kld_term"

    nll_bound = nll_term + kld_term
    nll_bound.name = "nll_bound"

    # grab the computation graph for the VFE bound on NLL
    cg = ComputationGraph([nll_bound])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    # apply some l2 regularization to the model parameters
    reg_term = 1e-5 * sum([tensor.sum(p ** 2.0) for p in params])
    reg_term.name = "reg_term"

    # compute the final cost of VFE + regularization
    cost = nll_bound + reg_term
    cost.name = "full_cost"

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(10.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )
    #algorithm.add_updates(scan_updates)

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost, nll_bound]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t
        #x_recons_t = T.nnet.sigmoid(c[t,:,:])
        #recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        #recons_term_t = recons_term_t.mean()
        #recons_term_t.name = "recons_term_%d" % t
        monitors += [kl_term_t]
    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "valid_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        #["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------

    if datasource == 'mnist':
        mnist_train = BinarizedMNIST("train", sources=['features'], flatten=['features'])
        mnist_valid = BinarizedMNIST("test", sources=['features'], flatten=['features'])
        #mnist_test = BinarizedMNIST("test", sources=['features'], flatten=['features'])
        train_stream = DataStream(mnist_train, iteration_scheme=SequentialScheme(mnist_train.num_examples, batch_size))
        valid_stream = DataStream(mnist_valid, iteration_scheme=SequentialScheme(mnist_valid.num_examples, batch_size))
        #test_stream = DataStream(mnist_test, iteration_scheme=SequentialScheme(mnist_test.num_examples, batch_size))
    else:
        raise Exception('Unknown name %s' % datasource)

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(
                train_monitors,
                prefix="train",
                after_epoch=True),
            DataStreamMonitoring(
                monitors,
                valid_stream,
                prefix="valid"),
            # DataStreamMonitoring(
            #     monitors,
            #     test_stream,
            #     prefix="test"),
            Checkpoint(name + ".pkl", after_epoch=True,
                       save_separately=['log', 'model']),
            # Dump(name),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()])

    if oldmodel is not None:
        print("Initializing parameters with old model %s" % oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_param_values(oldmodel.get_param_values())
        del oldmodel

    main_loop.run()
def main(name, model, epochs, batch_size, learning_rate, bokeh, layers,
         gamma, rectifier, predict, dropout, qlinear, sparse):
    runname = "vae%s-L%s%s%s%s-l%s-g%s-b%d" % (
        name, layers,
        'r' if rectifier else '',
        'd' if dropout else '',
        'l' if qlinear else '',
        shnum(learning_rate), shnum(gamma), batch_size//100)
    if rectifier:
        activation = Rectifier()
        full_weights_init = Orthogonal()
    else:
        activation = Tanh()
        full_weights_init = Orthogonal()

    if sparse:
        runname += '-s%d' % sparse
        weights_init = Sparse(num_init=sparse, weights_init=full_weights_init)
    else:
        weights_init = full_weights_init

    layers = map(int, layers.split(','))

    encoder_layers = layers[:-1]
    encoder_mlp = MLP([activation] * (len(encoder_layers)-1),
                      encoder_layers,
                      name="MLP_enc", biases_init=Constant(0.),
                      weights_init=weights_init)

    enc_dim = encoder_layers[-1]
    z_dim = layers[-1]
    if qlinear:
        sampler = Qlinear(input_dim=enc_dim, output_dim=z_dim,
                          biases_init=Constant(0.),
                          weights_init=full_weights_init)
    else:
        sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim,
                           biases_init=Constant(0.),
                           weights_init=full_weights_init)

    decoder_layers = layers[:]  ## includes z_dim as first layer
    decoder_layers.reverse()
    decoder_mlp = MLP([activation] * (len(decoder_layers)-2) + [Logistic()],
                      decoder_layers,
                      name="MLP_dec", biases_init=Constant(0.),
                      weights_init=weights_init)

    vae = VAEModel(encoder_mlp, sampler, decoder_mlp)
    vae.initialize()

    x = tensor.matrix('features') / 256.
    x.tag.test_value = np.random.random((batch_size, layers[0])).astype(np.float32)

    if predict:
        mean_z, enc = vae.mean_z(x)
        # cg = ComputationGraph([mean_z, enc])
        newmodel = Model([mean_z, enc])
    else:
        x_recons, kl_terms = vae.reconstruct(x)
        recons_term = BinaryCrossEntropy().apply(x, x_recons)
        recons_term.name = "recons_term"

        cost = recons_term + kl_terms.mean()
        cg = ComputationGraph([cost])
        if gamma > 0:
            weights = VariableFilter(roles=[WEIGHT])(cg.variables)
            cost += gamma * blocks.theano_expressions.l2_norm(weights)
        cost.name = "nll_bound"
        newmodel = Model(cost)

        if dropout:
            from blocks.roles import INPUT
            inputs = VariableFilter(roles=[INPUT])(cg.variables)
            # dropout_target = [v for k,v in newmodel.get_params().iteritems()
            #                   if k.find('MLP')>=0 and k.endswith('.W')
            #                   and not k.endswith('MLP_enc/linear_0.W')]
            dropout_target = filter(lambda x: x.name.startswith('linear_'), inputs)
            cg = apply_dropout(cg, dropout_target, 0.5)
            target_cost = cg.outputs[0]
        else:
            target_cost = cost

    if name == 'mnist':
        if predict:
            train_ds = MNIST("train")
        else:
            train_ds = MNIST("train", sources=['features'])
        test_ds = MNIST("test")
    else:
        datasource_dir = os.path.join(fuel.config.data_path, name)
        datasource_fname = os.path.join(datasource_dir, name + '.hdf5')
        if predict:
            train_ds = H5PYDataset(datasource_fname, which_set='train')
        else:
            train_ds = H5PYDataset(datasource_fname, which_set='train',
                                   sources=['features'])
        test_ds = H5PYDataset(datasource_fname, which_set='test')

    train_s = Flatten(DataStream(train_ds,
                                 iteration_scheme=ShuffledScheme(
                                     train_ds.num_examples, batch_size)))
    test_s = Flatten(DataStream(test_ds,
                                iteration_scheme=ShuffledScheme(
                                    test_ds.num_examples, batch_size)))

    if predict:
        from itertools import chain
        fprop = newmodel.get_theano_function()
        allpdata = None
        alledata = None
        f = train_s.sources.index('features')
        assert f == test_s.sources.index('features')
        sources = test_s.sources
        alllabels = dict((s, []) for s in sources if s != 'features')
        for data in chain(train_s.get_epoch_iterator(),
                          test_s.get_epoch_iterator()):
            for s, d in zip(sources, data):
                if s != 'features':
                    alllabels[s].extend(list(d))
            pdata, edata = fprop(data[f])
            if allpdata is None:
                allpdata = pdata
            else:
                allpdata = np.vstack((allpdata, pdata))
            if alledata is None:
                alledata = edata
            else:
                alledata = np.vstack((alledata, edata))
        print 'Saving', allpdata.shape, 'intermediate layer, for all training and test examples, to', name+'_z.npy'
        np.save(name+'_z', allpdata)
        print 'Saving', alledata.shape, 'last encoder layer to', name+'_e.npy'
        np.save(name+'_e', alledata)
        print 'Saving additional labels/targets:', ','.join(alllabels.keys()),
        print ' of size', ','.join(map(lambda x: str(len(x)), alllabels.values())),
        print 'to', name+'_labels.pkl'
        with open(name+'_labels.pkl', 'wb') as fp:
            pickle.dump(alllabels, fp, -1)
    else:
        cg = ComputationGraph([target_cost])
        algorithm = GradientDescent(
            cost=target_cost, params=cg.parameters,
            step_rule=Adam(learning_rate)
            # Scale(learning_rate=learning_rate)
        )

        extensions = []
        if model:
            extensions.append(Load(model))
        extensions += [Timing(),
                       FinishAfter(after_n_epochs=epochs),
                       DataStreamMonitoring(
                           [cost, recons_term],
                           test_s,
                           prefix="test"),
                       TrainingDataMonitoring(
                           [cost,
                            aggregation.mean(algorithm.total_gradient_norm)],
                           prefix="train",
                           after_epoch=True),
                       Checkpoint(runname, every_n_epochs=10),
                       Printing()]
        if bokeh:
            extensions.append(Plot(
                'Auto',
                channels=[
                    ['test_recons_term', 'test_nll_bound', 'train_nll_bound'],
                    ['train_total_gradient_norm']]))

        main_loop = MainLoop(
            algorithm, train_s,
            model=newmodel,
            extensions=extensions)

        main_loop.run()
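# A short sketch of consuming the artifacts written by the predict branch
# above; the dataset name 'mnist' is just an illustrative value.
import numpy as np
import pickle

z = np.load('mnist_z.npy')    # per-example intermediate-layer codes
e = np.load('mnist_e.npy')    # per-example last-encoder-layer activations
with open('mnist_labels.pkl', 'rb') as fp:
    labels = pickle.load(fp)  # dict: source name -> list of labels/targets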
def main(name, epochs, batch_size, learning_rate):
    if name is None:
        name = "att-rw"

    print("\nRunning experiment %s" % name)
    print("         learning rate: %5.3f" % learning_rate)
    print()

    #------------------------------------------------------------------------

    img_height, img_width = 28, 28

    read_N = 12
    write_N = 14

    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.001),
        'biases_init': Constant(0.),
    }

    x_dim = img_height * img_width

    reader = ZoomableAttentionWindow(img_height, img_width, read_N)
    writer = ZoomableAttentionWindow(img_height, img_width, write_N)

    # Parameterize the attention reader and writer
    mlpr = MLP(activations=[Tanh(), Identity()],
               dims=[x_dim, 50, 5],
               name="RMLP",
               **inits)
    mlpw = MLP(activations=[Tanh(), Identity()],
               dims=[x_dim, 50, 5],
               name="WMLP",
               **inits)

    # MLP between the reader and writer
    mlp = MLP(activations=[Tanh(), Identity()],
              dims=[read_N**2, 300, write_N**2],
              name="MLP",
              **inits)

    for brick in [mlpr, mlpw, mlp]:
        brick.allocate()
        brick.initialize()

    #------------------------------------------------------------------------

    x = tensor.matrix('features')

    hr = mlpr.apply(x)
    hw = mlpw.apply(x)

    center_y, center_x, delta, sigma, gamma = reader.nn2att(hr)
    r = reader.read(x, center_y, center_x, delta, sigma)

    h = mlp.apply(r)

    center_y, center_x, delta, sigma, gamma = writer.nn2att(hw)
    c = writer.write(h, center_y, center_x, delta, sigma) / gamma
    x_recons = T.nnet.sigmoid(c)

    cost = BinaryCrossEntropy().apply(x, x_recons)
    cost.name = "cost"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            RemoveNotFinite(),
            Adam(learning_rate),
            StepClipping(3.),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    #for v in [center_y, center_x, log_delta, log_sigma, log_gamma]:
    #    v_mean = v.mean()
    #    v_mean.name = v.name
    #    monitors += [v_mean]
    #    monitors += [aggregation.mean(v)]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["cost"],
    ]

    #------------------------------------------------------------

    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])
    #mnist_train = MNIST("train", binary=True, sources=['features'])
    #mnist_test = MNIST("test", binary=True, sources=['features'])

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=ForceFloatX(
            DataStream(mnist_train,
                       iteration_scheme=SequentialScheme(
                           mnist_train.num_examples, batch_size))),
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            DataStreamMonitoring(
                monitors,
                ForceFloatX(
                    DataStream(mnist_test,
                               iteration_scheme=SequentialScheme(
                                   mnist_test.num_examples, batch_size))),
                prefix="test"),
            TrainingDataMonitoring(train_monitors,
                                   prefix="train",
                                   after_every_epoch=True),
            SerializeMainLoop(name + ".pkl"),
            #Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()
        ])
    main_loop.run()
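# For reference, the five outputs of nn2att(...) parameterize the attention
# grid as in the DRAW paper (Gregor et al., 2015): (center_y, center_x) place
# the N x N grid of filters on the image, delta is the stride between grid
# points, sigma is the standard deviation of the Gaussian filters, and gamma
# is a scalar intensity: reads are scaled by gamma while writes are divided
# by it, which is the `/ gamma` in the write path above.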
def test_communication(path_vae_mnist, path_maxout_mnist):
    # load models
    vae_mnist = load(path_vae_mnist)
    # get params : to be removed from the computation graph

    # write an object maxout
    classifier = Maxout()
    # get params : to be removed from the computation graph

    # vae whose prior is a zero mean unit variance normal distribution
    activation = Rectifier()
    full_weights_init = Orthogonal()
    weights_init = full_weights_init

    # SVHN in grayscale
    layers = [32*32, 200, 200, 200, 50]
    encoder_layers = layers[:-1]
    encoder_mlp = MLP([activation] * (len(encoder_layers)-1),
                      encoder_layers,
                      name="MLP_SVHN_encode", biases_init=Constant(0.),
                      weights_init=weights_init)

    enc_dim = encoder_layers[-1]
    z_dim = layers[-1]
    sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim,
                       biases_init=Constant(0.),
                       weights_init=full_weights_init)
    decoder_layers = layers[:]  ## includes z_dim as first layer
    decoder_layers.reverse()
    decoder_mlp = MLP([activation] * (len(decoder_layers)-2) + [Rectifier()],
                      decoder_layers,
                      name="MLP_SVHN_decode", biases_init=Constant(0.),
                      weights_init=weights_init)

    vae_svhn = VAEModel(encoder_mlp, sampler, decoder_mlp)
    vae_svhn.initialize()

    # do the connection
    x = T.tensor4('x')  # SVHN samples preprocessed with local contrast normalization
    x_ = (T.sum(x, axis=1)).flatten(ndim=2)
    y = T.imatrix('y')
    batch_size = 512

    svhn_z, _ = vae_svhn.sampler.sample(vae_svhn.encoder_mlp.apply(x_))
    mnist_decode = vae_mnist.decoder_mlp.apply(svhn_z)
    # reshape
    shape = mnist_decode.shape
    mnist_decode = mnist_decode.reshape((shape[0], 1, 28, 28))
    prediction = classifier.apply(mnist_decode)
    y_hat = Softmax().apply(prediction)

    x_recons, kl_terms = vae_svhn.reconstruct(x_)
    recons_term = BinaryCrossEntropy().apply(x_, T.clip(x_recons, 1e-4, 1 - 1e-4))
    recons_term.name = "recons_term"

    cost_A = recons_term + kl_terms.mean()
    cost_A.name = "cost_A"

    cost_B = Softmax().categorical_cross_entropy(y.flatten(), prediction)
    cost_B.name = 'cost_B'

    cost = cost_B
    cost.name = "cost"
    cg = ComputationGraph(cost)

    # probably discard some of the parameters
    parameters = cg.parameters
    params = []
    for t in parameters:
        if not re.match(".*mnist", t.name):
            params.append(t)

    """
    f = theano.function([x], cost_A)
    value_x = np.random.ranf((1, 3, 32, 32)).astype("float32")
    print f(value_x)
    return
    """

    error_brick = MisclassificationRate()
    error_rate = error_brick.apply(y.flatten(), y_hat)
    error_rate.name = "error_rate"

    # training here
    step_rule = RMSProp(0.001, 0.99)
    dataset_hdf5_file = "/Tmp/ducoffem/SVHN/"
    train_set = H5PYDataset(os.path.join(dataset_hdf5_file, "all.h5"),
                            which_set='train')
    test_set = H5PYDataset(os.path.join(dataset_hdf5_file, "all.h5"),
                           which_set='valid')

    data_stream = DataStream.default_stream(
        train_set,
        iteration_scheme=SequentialScheme(train_set.num_examples, batch_size))

    data_stream_test = DataStream.default_stream(
        test_set,
        iteration_scheme=SequentialScheme(2000, batch_size))

    algorithm = GradientDescent(cost=cost, params=params,
                                step_rule=step_rule)

    monitor_train = TrainingDataMonitoring(
        variables=[cost], prefix="train", every_n_batches=10)
    monitor_valid = DataStreamMonitoring(
        variables=[cost, error_rate], data_stream=data_stream_test,
        prefix="valid", every_n_batches=10)

    # drawing_samples = ImagesSamplesSave("../data_svhn", vae, (3, 32, 32), every_n_epochs=1)
    extensions = [monitor_train,
                  monitor_valid,
                  FinishAfter(after_n_batches=10000),
                  Printing(every_n_batches=10)]

    main_loop = MainLoop(data_stream=data_stream,
                         algorithm=algorithm,
                         model=Model(cost),
                         extensions=extensions)

    main_loop.run()
def main(name, epochs, batch_size, learning_rate, attention,
         n_iter, enc_dim, dec_dim, z_dim):

    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation.

        E.g.:
            0.1   -> 11
            0.01  -> 12
            0.001 -> 13
            0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    if name is None:
        tag = "watt" if attention else "woatt"
        lr_str = lr_tag(learning_rate)
        name = "%s-t%d-enc%d-dec%d-z%d-lr%s" % (
            tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)

    print("\nRunning experiment %s" % name)
    print("         learning rate: %5.3f" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print()

    #------------------------------------------------------------------------

    x_dim = 28 * 28
    img_height, img_width = (28, 28)

    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    if attention:
        read_N = 4
        write_N = 7
        read_dim = 2 * read_N**2

        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 width=img_width, height=img_height,
                                 N=write_N, **inits)  # was N=read_N, which left write_N unused
    else:
        read_dim = 2 * x_dim
        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Tanh()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc", **inits)
    decoder_mlp = MLP([Tanh()], [z_dim, 4 * dec_dim],
                      name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(
        n_iter,
        reader=reader,
        encoder_mlp=encoder_mlp,
        encoder_rnn=encoder_rnn,
        sampler=q_sampler,
        decoder_mlp=decoder_mlp,
        decoder_rnn=decoder_rnn,
        writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------

    x = tensor.matrix('features')

    #x_recons = 1. + x
    x_recons, kl_terms = draw.reconstruct(x)
    #x_recons, _, _, _, _ = draw.silly(x, n_steps=10, batch_size=100)
    #x_recons = x_recons[-1,:,:]

    #samples = draw.sample(100)
    #x_recons = samples[-1, :, :]
    #x_recons = samples[-1, :, :]

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(3.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )
    #algorithm.add_updates(scan_updates)

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    """
    for t in range(n_iter):
        kl_term_t = kl_terms[t,:].mean()
        kl_term_t.name = "kl_term_%d" % t

        x_recons_t = T.nnet.sigmoid(c[t,:,:])
        recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        recons_term_t = recons_term_t.mean()
        recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t, recons_term_t]
    """
    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        ["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------

    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=ForceFloatX(
            DataStream(mnist_train,
                       iteration_scheme=SequentialScheme(
                           mnist_train.num_examples, batch_size))),
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            DataStreamMonitoring(
                monitors,
                ForceFloatX(
                    DataStream(mnist_test,
                               iteration_scheme=SequentialScheme(
                                   mnist_test.num_examples, batch_size))),
                ## updates=scan_updates,
                prefix="test"),
            TrainingDataMonitoring(
                train_monitors,
                prefix="train",
                after_every_epoch=True),
            SerializeMainLoop(name + ".pkl"),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()])
    main_loop.run()
def test_vae():
    activation = Rectifier()
    full_weights_init = Orthogonal()
    weights_init = full_weights_init

    layers = [3 * 32 * 32, 500, 100, 100, 80]
    encoder_layers = layers[:-1]
    encoder_mlp = MLP([activation] * (len(encoder_layers) - 1),
                      encoder_layers,
                      name="MLP_enc", biases_init=Constant(0.),
                      weights_init=weights_init)

    enc_dim = encoder_layers[-1]
    z_dim = layers[-1]
    #sampler = Qlinear(input_dim=enc_dim, output_dim=z_dim,
    #                  biases_init=Constant(0.), weights_init=full_weights_init)
    sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim,
                       biases_init=Constant(0.), weights_init=full_weights_init)

    decoder_layers = layers[:]  # includes z_dim as first layer
    decoder_layers.reverse()
    decoder_mlp = MLP([activation] * (len(decoder_layers) - 2) + [Rectifier()],
                      decoder_layers,
                      name="MLP_dec", biases_init=Constant(0.),
                      weights_init=weights_init)

    vae = VAEModel(encoder_mlp, sampler, decoder_mlp)
    vae.initialize()

    x = T.matrix('features')
    batch_size = 128

    x_recons, kl_terms = vae.reconstruct(x)
    # clip the reconstruction away from {0, 1} so the cross-entropy stays finite
    recons_term = BinaryCrossEntropy().apply(x, T.clip(x_recons, 1e-4, 1 - 1e-4))
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.mean()
    cost.name = "cost"

    cg = ComputationGraph([cost])
    step_rule = Momentum(0.001, 0.2)

    dataset_path = "/Tmp/ducoffem/SVHN/all.h5"
    train_set = H5PYDataset(dataset_path, which_set='train')
    test_set = H5PYDataset(dataset_path, which_set='valid')

    data_stream = DataStream.default_stream(
        train_set,
        iteration_scheme=SequentialScheme(train_set.num_examples, batch_size))
    data_stream_test = DataStream.default_stream(
        test_set,
        iteration_scheme=SequentialScheme(test_set.num_examples, batch_size))

    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=step_rule)

    monitor_train = TrainingDataMonitoring(
        variables=[cost], prefix="train")
    monitor_valid = DataStreamMonitoring(
        variables=[cost], data_stream=data_stream_test, prefix="valid")

    drawing_samples = ImagesSamplesSave("./data_svhn", vae, (3, 32, 32),
                                        every_n_epochs=1)
    extensions = [monitor_train,
                  monitor_valid,
                  FinishAfter(after_n_epochs=400),
                  Printing(every_n_epochs=10),
                  drawing_samples,
                  Dump("./data_svhn/model_svhn")]

    main_loop = MainLoop(data_stream=data_stream,
                         algorithm=algorithm,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
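
# For reference, a self-contained sketch of the reparameterized draw that a
# Qsampler-style brick computes: z = mu + sigma * eps with eps ~ N(0, I).
# The weight names (W_mu, W_ls) and the plain-dot parameterization are
# illustrative assumptions, not the actual Qsampler internals; dimensions
# mirror the 100 -> 80 encoder/latent sizes used above.
import numpy as np
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

rng = MRG_RandomStreams(seed=1234)
h_enc = T.matrix('h_enc')  # encoder output, batch x enc_dim
W_mu = theano.shared(np.zeros((100, 80), dtype=theano.config.floatX))
W_ls = theano.shared(np.zeros((100, 80), dtype=theano.config.floatX))
mu = T.dot(h_enc, W_mu)            # mean of q(z|x)
log_sigma = T.dot(h_enc, W_ls)     # log std-dev of q(z|x)
eps = rng.normal(size=mu.shape)    # noise; gradient flows through mu, sigma
z = mu + T.exp(log_sigma) * eps    # reparameterization trick
# KL(q(z|x) || N(0, I)), summed over latent dims, one value per example:
kl = 0.5 * T.sum(mu ** 2 + T.exp(2 * log_sigma) - 2 * log_sigma - 1, axis=1)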
def test_vae():
    activation = Rectifier()
    full_weights_init = Orthogonal()
    weights_init = full_weights_init

    layers = [784, 400, 20]
    encoder_layers = layers[:-1]
    encoder_mlp = MLP([activation] * (len(encoder_layers) - 1),
                      encoder_layers,
                      name="MLP_enc", biases_init=Constant(0.),
                      weights_init=weights_init)

    enc_dim = encoder_layers[-1]
    z_dim = layers[-1]
    #sampler = Qlinear(input_dim=enc_dim, output_dim=z_dim,
    #                  biases_init=Constant(0.), weights_init=full_weights_init)
    sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim,
                       biases_init=Constant(0.), weights_init=full_weights_init)

    decoder_layers = layers[:]  # includes z_dim as first layer
    decoder_layers.reverse()
    decoder_mlp = MLP([activation] * (len(decoder_layers) - 2) + [Rectifier()],
                      decoder_layers,
                      name="MLP_dec", biases_init=Constant(0.),
                      weights_init=weights_init)

    vae = VAEModel(encoder_mlp, sampler, decoder_mlp)
    vae.initialize()

    x = T.matrix('features')
    batch_size = 124

    x_recons, kl_terms = vae.reconstruct(x)
    recons_term = BinaryCrossEntropy().apply(x, T.clip(x_recons, 1e-5, 1 - 1e-5))
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.mean()
    cost.name = "cost"

    cg = ComputationGraph(cost)
    # give every parameter a unique, experiment-specific name
    for i, t in enumerate(cg.parameters):
        t.name = t.name + str(i) + "vae_mnist"

    step_rule = RMSProp(0.001, 0.95)

    train_set = MNIST('train')
    train_set.sources = ("features", )
    test_set = MNIST("test")
    test_set.sources = ("features", )

    data_stream = Flatten(DataStream.default_stream(
        train_set,
        iteration_scheme=SequentialScheme(train_set.num_examples, batch_size)))
    data_stream_monitoring = Flatten(DataStream.default_stream(
        train_set,
        iteration_scheme=SequentialScheme(train_set.num_examples, batch_size)))
    data_stream_test = Flatten(DataStream.default_stream(
        test_set,
        iteration_scheme=SequentialScheme(test_set.num_examples, batch_size)))

    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=step_rule)

    monitor_train = TrainingDataMonitoring(
        variables=[cost], prefix="train", every_n_batches=10)
    monitor_valid = DataStreamMonitoring(
        variables=[cost], data_stream=data_stream_test, prefix="valid",
        every_n_batches=10)

    # drawing_samples = ImagesSamplesSave("../data_mnist", vae, (28, 28),
    #                                     every_n_epochs=1)
    extensions = [monitor_train,
                  monitor_valid,
                  FinishAfter(after_n_batches=1500),
                  Printing(every_n_batches=10)]

    main_loop = MainLoop(data_stream=data_stream,
                         algorithm=algorithm,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()

    from contextlib import closing
    from blocks.serialization import dump
    # serialization needs a binary file handle
    with closing(open('../data_mnist/model_0', 'wb')) as f:
        dump(vae, f)
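
# Assuming the dump above succeeded, the trained model can be restored later
# with the matching loader from the same serialization module (same path as
# used above):
from blocks.serialization import load
with open('../data_mnist/model_0', 'rb') as f:
    restored_vae = load(f)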
def main(name, dataset, epochs, batch_size, learning_rate, attention,
         n_iter, enc_dim, dec_dim, z_dim, oldmodel, live_plotting):

    image_size, channels, data_train, data_valid, data_test = \
        datasets.get_data(dataset)

    train_stream = Flatten(DataStream.default_stream(
        data_train,
        iteration_scheme=SequentialScheme(data_train.num_examples, batch_size)))
    valid_stream = Flatten(DataStream.default_stream(
        data_valid,
        iteration_scheme=SequentialScheme(data_valid.num_examples, batch_size)))
    test_stream = Flatten(DataStream.default_stream(
        data_test,
        iteration_scheme=SequentialScheme(data_test.num_examples, batch_size)))

    if name is None:
        name = dataset

    img_height, img_width = image_size
    x_dim = channels * img_height * img_width

    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # Configure attention mechanism
    if attention != "":
        read_N, write_N = attention.split(',')
        read_N = int(read_N)
        write_N = int(write_N)
        read_dim = 2 * channels * read_N ** 2

        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 channels=channels,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 channels=channels,
                                 width=img_width, height=img_height,
                                 N=write_N, **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)
    else:
        read_dim = 2 * x_dim
        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)
        attention_tag = "full"

    #----------------------------------------------------------------------

    # Learning rate
    def lr_tag(value):
        """Convert a float into a short tag-usable string representation.

        E.g.:
            0.1   -> 11
            0.01  -> 12
            0.001 -> 13
            0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)

    subdir = name + "-" + time.strftime("%Y%m%d-%H%M%S")
    longname = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (
        dataset, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)
    pickle_file = subdir + "/" + longname + ".pkl"

    print("\nRunning experiment %s" % longname)
    print("            dataset: %s" % dataset)
    print("       subdirectory: %s" % subdir)
    print("      learning rate: %g" % learning_rate)
    print("          attention: %s" % attention)
    print("       n_iterations: %d" % n_iter)
    print("  encoder dimension: %d" % enc_dim)
    print("        z dimension: %d" % z_dim)
    print("  decoder dimension: %d" % dec_dim)
    print("         batch size: %d" % batch_size)
    print("             epochs: %d" % epochs)
    print()

    #----------------------------------------------------------------------

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc", **inits)
    decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim],
                      name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(
        n_iter,
        reader=reader,
        encoder_mlp=encoder_mlp,
        encoder_rnn=encoder_rnn,
        sampler=q_sampler,
        decoder_mlp=decoder_mlp,
        decoder_rnn=decoder_rnn,
        writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------

    x = tensor.matrix('features')

    x_recons, kl_terms = draw.reconstruct(x)

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------

    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        parameters=params,
        step_rule=CompositeRule([
            StepClipping(10.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t
        monitors += [kl_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"],
    ]

    #------------------------------------------------------------

    if not os.path.exists(subdir):
        os.makedirs(subdir)

    plotting_extensions = []
    if live_plotting:
        plotting_extensions = [
            Plot(name, channels=plot_channels)
        ]

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(
                train_monitors,
                prefix="train",
                after_epoch=True),
            # DataStreamMonitoring(
            #     monitors,
            #     valid_stream,
            #     prefix="valid"),
            DataStreamMonitoring(
                monitors,
                test_stream,
                prefix="test"),
            PartsOnlyCheckpoint(
                "{}/{}".format(subdir, name),
                before_training=True, after_epoch=True,
                save_separately=['log', 'model']),
            SampleCheckpoint(
                image_size=image_size[0], channels=channels,
                save_subdir=subdir,
                before_training=True, after_epoch=True),
            ProgressBar(),
            Printing()] + plotting_extensions)

    if oldmodel is not None:
        print("Initializing parameters with old model %s" % oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_param_values(oldmodel.get_param_values())
        del oldmodel

    main_loop.run()
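
# A hypothetical command-line wrapper for main() above; the flag names and
# defaults are illustrative, not the original script's interface.
if __name__ == "__main__":
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("--name", default=None)
    parser.add_argument("--dataset", default="cifar10")
    parser.add_argument("--epochs", type=int, default=100)
    parser.add_argument("--batch-size", type=int, dest="batch_size", default=100)
    parser.add_argument("--learning-rate", type=float, dest="learning_rate",
                        default=3e-4)
    parser.add_argument("--attention", default="2,5",
                        help='"read_N,write_N", or "" for no attention')
    parser.add_argument("--n-iter", type=int, dest="n_iter", default=64)
    parser.add_argument("--enc-dim", type=int, dest="enc_dim", default=256)
    parser.add_argument("--dec-dim", type=int, dest="dec_dim", default=256)
    parser.add_argument("--z-dim", type=int, dest="z_dim", default=100)
    parser.add_argument("--oldmodel", default=None)
    parser.add_argument("--live-plotting", action="store_true",
                        dest="live_plotting")
    args = parser.parse_args()
    main(**vars(args))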
# Define a cost function to optimize, and a classification error rate.
# Also apply the outputs from the net and corresponding targets:
cost = BinaryCrossEntropy().apply(x, x_hat)

# This is the model: before applying dropout
autoencoder = Model(cost)

# Need to define the computation graph for the cost func:
cost_graph = ComputationGraph([cost])

# This returns a list of weight vectors for each layer
W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

# Add some regularization to this model:
cost += weight_decay * l2_norm(W)
cost.name = 'entropy'

# computational graph with l2 reg
cost_graph = ComputationGraph([cost])

# Apply dropout to inputs:
inputs = VariableFilter([INPUT])(cost_graph.variables)
dropout_inputs = [v for v in inputs if v.name.startswith('linear_')]
dropout_graph = apply_dropout(cost_graph, dropout_inputs, dropout_ratio)
dropout_cost = dropout_graph.outputs[0]
dropout_cost.name = 'dropout_entropy'

# Learning Algorithm: train on the dropout cost, with the parameters
# taken from the dropout graph
algo = GradientDescent(
    cost=dropout_cost,
    parameters=dropout_graph.parameters,
    step_rule=solver_type)
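
# Optional sanity check (illustrative): list which graph inputs dropout was
# actually applied to; only variables feeding bricks whose names start with
# "linear_" should appear.
for variable in dropout_inputs:
    print(variable.name)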
def main(name, dataset, epochs, batch_size, learning_rate, attention,
         n_iter, enc_dim, dec_dim, z_dim, oldmodel):

    image_size, data_train, data_valid, data_test = datasets.get_data(dataset)

    train_stream = Flatten(
        DataStream(data_train,
                   iteration_scheme=SequentialScheme(
                       data_train.num_examples, batch_size)))
    valid_stream = Flatten(
        DataStream(data_valid,
                   iteration_scheme=SequentialScheme(
                       data_valid.num_examples, batch_size)))
    test_stream = Flatten(
        DataStream(data_test,
                   iteration_scheme=SequentialScheme(
                       data_test.num_examples, batch_size)))

    if name is None:
        name = dataset

    img_height, img_width = image_size
    x_dim = img_height * img_width

    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    if attention != "":
        read_N, write_N = attention.split(',')
        read_N = int(read_N)
        write_N = int(write_N)
        read_dim = 2 * read_N ** 2

        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 width=img_width, height=img_height,
                                 N=write_N, **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)
    else:
        read_dim = 2 * x_dim
        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)
        attention_tag = "full"

    #----------------------------------------------------------------------

    # Learning rate
    def lr_tag(value):
        """Convert a float into a short tag-usable string representation.

        E.g.:
            0.1   -> 11
            0.01  -> 12
            0.001 -> 13
            0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)

    subdir = time.strftime("%Y%m%d-%H%M%S") + "-" + name
    longname = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (
        dataset, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)
    pickle_file = subdir + "/" + longname + ".pkl"

    print("\nRunning experiment %s" % longname)
    print("            dataset: %s" % dataset)
    print("       subdirectory: %s" % subdir)
    print("      learning rate: %g" % learning_rate)
    print("          attention: %s" % attention)
    print("       n_iterations: %d" % n_iter)
    print("  encoder dimension: %d" % enc_dim)
    print("        z dimension: %d" % z_dim)
    print("  decoder dimension: %d" % dec_dim)
    print("         batch size: %d" % batch_size)
    print("             epochs: %d" % epochs)
    print()

    #----------------------------------------------------------------------

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc", **inits)
    decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim],
                      name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(
        n_iter,
        reader=reader,
        encoder_mlp=encoder_mlp,
        encoder_rnn=encoder_rnn,
        sampler=q_sampler,
        decoder_mlp=decoder_mlp,
        decoder_rnn=decoder_rnn,
        writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------

    x = tensor.matrix('features')

    x_recons, kl_terms = draw.reconstruct(x)

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------

    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        parameters=params,
        step_rule=CompositeRule([
            StepClipping(10.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t
        monitors += [kl_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"],
    ]

    #------------------------------------------------------------

    if not os.path.exists(subdir):
        os.makedirs(subdir)

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(
                train_monitors,
                prefix="train",
                after_epoch=True),
            # DataStreamMonitoring(
            #     monitors,
            #     valid_stream,
            #     prefix="valid"),
            DataStreamMonitoring(
                monitors,
                test_stream,
                prefix="test"),
            Checkpoint(name,
                       before_training=False, after_epoch=True,
                       save_separately=['log', 'model']),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()])

    if oldmodel is not None:
        print("Initializing parameters with old model %s" % oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_param_values(oldmodel.get_param_values())
        del oldmodel

    main_loop.run()
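
# Illustrative call with hypothetical settings: attention is passed as
# "read_N,write_N", or "" to read and write the full image without attention.
main(name=None, dataset="mnist", epochs=100, batch_size=100,
     learning_rate=3e-4, attention="2,5", n_iter=64,
     enc_dim=256, dec_dim=256, z_dim=100, oldmodel=None)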
def main(name, epochs, batch_size, learning_rate, z_dim, mix_dim,
         enc_dim, dec_dim, oldmodel):

    datasource = 'mnist'
    x_dim = 28 * 28
    im_shape = (28, 28)
    im_rows = 28
    im_cols = 28

    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    #----------------------------------------------------------------------

    # Learning rate
    def lr_tag(value):
        """Convert a float into a short tag-usable string representation.

        E.g.:
            0.1   -> 11
            0.01  -> 12
            0.001 -> 13
            0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)
    name = "%s-enc%d-dec%d-zd%d-md%d-lr%s" % (
        name, enc_dim, dec_dim, z_dim, mix_dim, lr_str)

    print("\nRunning experiment %s" % name)
    print("      learning rate: %5.3f" % learning_rate)
    print("  encoder dimension: %d" % enc_dim)
    print("  decoder dimension: %d" % dec_dim)
    print("        z dimension: %d" % z_dim)
    print("      mix dimension: %d" % mix_dim)
    print()

    #----------------------------------------------------------------------

    # setup the mixture weight sampler
    enc_x_to_z = CondNet(activations=[Rectifier()],
                         dims=[x_dim, enc_dim, z_dim], **inits)
    enc_z_to_mix = MLP(activations=[Rectifier(), Tanh()],
                       dims=[z_dim, enc_dim, (2 * dec_dim + mix_dim)], **inits)
    dec_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    dec_mlp_in = MLP(activations=[None],
                     dims=[(im_rows + dec_dim + mix_dim), 4 * dec_dim], **inits)
    dec_mlp_out = MLP(activations=[None],
                      dims=[dec_dim, im_rows], **inits)

    dm_model = DotMatrix(
        enc_x_to_z=enc_x_to_z,
        enc_z_to_mix=enc_z_to_mix,
        dec_rnn=dec_rnn,
        dec_mlp_in=dec_mlp_in,
        dec_mlp_out=dec_mlp_out,
        im_shape=im_shape,
        mix_dim=mix_dim)
    dm_model.initialize()

    #------------------------------------------------------------------------

    x = tensor.matrix('features')

    x_recons, kl_terms = dm_model.reconstruct(x, x)
    x_recons.name = "OO_x_recons_OO"

    nll_term = BinaryCrossEntropy().apply(x, x_recons)
    nll_term.name = "nll_term"

    kld_term = kl_terms.mean()
    kld_term.name = "kld_term"

    nll_bound = nll_term + kld_term
    nll_bound.name = "nll_bound"

    # grab the computation graph for the VFE bound on NLL
    cg = ComputationGraph([nll_bound])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    # apply some l2 regularization to the model parameters
    reg_term = 1e-5 * sum([tensor.sum(p ** 2.0) for p in params])
    reg_term.name = "reg_term"

    # compute the final cost of VFE + regularization
    total_cost = nll_bound + reg_term

    algorithm = GradientDescent(
        cost=total_cost,
        parameters=params,
        step_rule=CompositeRule([
            StepClipping(10.),
            Adam(learning_rate),
        ]))

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [nll_bound, nll_term, kld_term, reg_term]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "valid_nll_bound"],
        ["train_total_gradient_norm", "train_total_step_norm"],
    ]

    #------------------------------------------------------------

    if datasource == 'mnist':
        mnist_train = BinarizedMNIST("train", sources=['features'],
                                     flatten=['features'])
        mnist_valid = BinarizedMNIST("valid", sources=['features'],
                                     flatten=['features'])
        train_stream = DataStream(
            mnist_train,
            iteration_scheme=SequentialScheme(
                mnist_train.num_examples, batch_size))
        valid_stream = DataStream(
            mnist_valid,
            iteration_scheme=SequentialScheme(
                mnist_valid.num_examples, batch_size))
    else:
        raise Exception('Unknown datasource %s' % datasource)

    main_loop = MainLoop(
        model=Model(total_cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(
                train_monitors,
                prefix="train",
                after_epoch=True),
            DataStreamMonitoring(
                monitors,
                valid_stream,
                prefix="valid"),
            Checkpoint(name + ".pkl", after_epoch=True,
                       save_separately=['log', 'model']),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()])

    if oldmodel is not None:
        print("Initializing parameters with old model %s" % oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_param_values(oldmodel.get_param_values())
        del oldmodel

    main_loop.run()
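
# Illustrative call with hypothetical settings for the encoder/decoder and
# mixture dimensions; oldmodel=None starts from freshly initialized parameters.
main(name="dot-matrix", epochs=100, batch_size=100, learning_rate=1e-3,
     z_dim=100, mix_dim=20, enc_dim=256, dec_dim=256, oldmodel=None)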